Diffstat (limited to 'media/libvpx/libvpx/vp9/encoder')
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c | 2173
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_denoiser_neon.c | 356
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c | 296
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_error_neon.c | 102
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c | 844
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_highbd_error_neon.c | 49
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_highbd_temporal_filter_neon.c | 872
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c | 408
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_temporal_filter_neon.c | 849
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_error_msa.c | 108
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c | 501
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct4x4_msa.c | 98
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct8x8_msa.c | 65
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct_msa.h | 116
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/ppc/vp9_quantize_vsx.c | 287
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_alt_ref_aq.c | 63
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_alt_ref_aq.h | 127
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_aq_360.c | 75
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_aq_360.h | 27
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_aq_complexity.c | 160
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_aq_complexity.h | 36
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c | 702
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h | 147
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_aq_variance.c | 247
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_aq_variance.h | 34
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_bitstream.c | 1387
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_bitstream.h | 49
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_block.h | 225
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_blockiness.c | 135
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_blockiness.h | 26
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_context_tree.c | 161
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_context_tree.h | 106
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_cost.c | 65
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_cost.h | 57
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_dct.c | 687
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_denoiser.c | 839
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_denoiser.h | 132
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.c | 6539
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.h | 57
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_encodemb.c | 1078
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_encodemb.h | 60
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_encodemv.c | 271
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_encodemv.h | 38
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_encoder.c | 7030
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_encoder.h | 1642
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_ethread.c | 689
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_ethread.h | 77
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.c | 261
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.h | 59
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_extend.c | 203
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_extend.h | 31
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_firstpass.c | 3898
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_firstpass.h | 301
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_frame_scale.c | 116
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_job_queue.h | 46
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_lookahead.c | 235
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_lookahead.h | 127
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_mbgraph.c | 395
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_mbgraph.h | 40
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_mcomp.c | 3035
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_mcomp.h | 178
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.c | 334
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.h | 41
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_noise_estimate.c | 302
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_noise_estimate.h | 54
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_non_greedy_mv.c | 536
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_non_greedy_mv.h | 129
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_partition_models.h | 975
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_picklpf.c | 203
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_picklpf.h | 29
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_pickmode.c | 2995
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_pickmode.h | 35
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_quantize.c | 320
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_quantize.h | 59
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.c | 3339
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.h | 357
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_rd.c | 795
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_rd.h | 235
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_rdopt.c | 4932
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_rdopt.h | 63
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_resize.c | 826
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_resize.h | 68
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_segmentation.c | 325
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_segmentation.h | 55
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_skin_detection.c | 174
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_skin_detection.h | 40
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_speed_features.c | 1088
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_speed_features.h | 674
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_subexp.c | 196
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_subexp.h | 41
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_svc_layercontext.c | 1348
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_svc_layercontext.h | 288
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_temporal_filter.c | 1205
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_temporal_filter.h | 46
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_temporal_filter_constants.h | 410
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_tokenize.c | 490
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_tokenize.h | 130
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.c | 1485
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.h | 46
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_treewriter.c | 58
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/vp9_treewriter.h | 51
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/x86/highbd_temporal_filter_sse4.c | 893
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/x86/temporal_filter_sse4.c | 875
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c | 1537
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/x86/vp9_dct_sse2.asm | 69
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c | 327
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/x86/vp9_diamond_search_sad_avx.c | 291
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/x86/vp9_error_avx2.c | 161
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/x86/vp9_error_sse2.asm | 115
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c | 907
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c | 72
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_avx2.c | 441
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c | 125
-rw-r--r--  media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.c | 253
114 files changed, 69860 insertions, 0 deletions
diff --git a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c
new file mode 100644
index 0000000000..997b5477e1
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c
@@ -0,0 +1,2173 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/fdct_neon.h"
+#include "vpx_dsp/arm/fdct4x4_neon.h"
+#include "vpx_dsp/arm/fdct8x8_neon.h"
+#include "vpx_dsp/arm/fdct16x16_neon.h"
+
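+// NEON implementations of the VP9 hybrid forward transforms (vp9_fht4x4,
+// vp9_fht8x8 and vp9_fht16x16). The pure DCT_DCT case falls through to the
+// vpx_dsp NEON fDCTs; the ADST passes are implemented below. High bit-depth
+// variants follow under CONFIG_VP9_HIGHBITDEPTH.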
+static INLINE void load_buffer_4x4(const int16_t *input, int16x8_t *in,
+ int stride) {
+ // { 0, 1, 1, 1 };
+ const int16x4_t nonzero_bias_a = vext_s16(vdup_n_s16(0), vdup_n_s16(1), 3);
+ // { 1, 0, 0, 0 };
+ const int16x4_t nonzero_bias_b = vext_s16(vdup_n_s16(1), vdup_n_s16(0), 3);
+ int16x4_t mask;
+
+ int16x4_t input_0 = vshl_n_s16(vld1_s16(input + 0 * stride), 4);
+ int16x4_t input_1 = vshl_n_s16(vld1_s16(input + 1 * stride), 4);
+ int16x4_t input_2 = vshl_n_s16(vld1_s16(input + 2 * stride), 4);
+ int16x4_t input_3 = vshl_n_s16(vld1_s16(input + 3 * stride), 4);
+
+ // Copy the SSE method: use a mask instead of an 'if' branch to add one to
+ // the first element when it is non-zero.
+ mask = vreinterpret_s16_u16(vceq_s16(input_0, nonzero_bias_a));
+ input_0 = vadd_s16(input_0, mask);
+ input_0 = vadd_s16(input_0, nonzero_bias_b);
+
+ in[0] = vcombine_s16(input_0, input_1);
+ in[1] = vcombine_s16(input_2, input_3);
+}
+
+static INLINE void write_buffer_4x4(tran_low_t *output, int16x8_t *res) {
+ const int16x8_t one_s16 = vdupq_n_s16(1);
+ res[0] = vaddq_s16(res[0], one_s16);
+ res[1] = vaddq_s16(res[1], one_s16);
+ res[0] = vshrq_n_s16(res[0], 2);
+ res[1] = vshrq_n_s16(res[1], 2);
+ store_s16q_to_tran_low(output + 0 * 8, res[0]);
+ store_s16q_to_tran_low(output + 1 * 8, res[1]);
+}
+
+static INLINE void fadst4x4_neon(int16x8_t *in) {
+ int32x4_t u[4], t[4];
+ int16x4_t s[4], out[4];
+
+ s[0] = vget_low_s16(in[0]); // | x_00 | x_01 | x_02 | x_03 |
+ s[1] = vget_high_s16(in[0]); // | x_10 | x_11 | x_12 | x_13 |
+ s[2] = vget_low_s16(in[1]); // | x_20 | x_21 | x_22 | x_23 |
+ s[3] = vget_high_s16(in[1]); // | x_30 | x_31 | x_32 | x_33 |
+
+ // Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c.
+ // t0 = s0 * sinpi_1_9 + s1 * sinpi_2_9 + s3 * sinpi_4_9
+ t[0] = vmull_n_s16(s[0], sinpi_1_9);
+ t[0] = vmlal_n_s16(t[0], s[1], sinpi_2_9);
+ t[0] = vmlal_n_s16(t[0], s[3], sinpi_4_9);
+
+ // t1 = (s0 + s1) * sinpi_3_9 - s3 * sinpi_3_9
+ t[1] = vmull_n_s16(s[0], sinpi_3_9);
+ t[1] = vmlal_n_s16(t[1], s[1], sinpi_3_9);
+ t[1] = vmlsl_n_s16(t[1], s[3], sinpi_3_9);
+
+ // t2 = s0 * sinpi_4_9 - s1 * sinpi_1_9 + s3 * sinpi_2_9
+ t[2] = vmull_n_s16(s[0], sinpi_4_9);
+ t[2] = vmlsl_n_s16(t[2], s[1], sinpi_1_9);
+ t[2] = vmlal_n_s16(t[2], s[3], sinpi_2_9);
+
+ // t3 = s2 * sinpi_3_9
+ t[3] = vmull_n_s16(s[2], sinpi_3_9);
+
+ /*
+ * u0 = t0 + t3
+ * u1 = t1
+ * u2 = t2 - t3
+ * u3 = t2 - t0 + t3
+ */
+ u[0] = vaddq_s32(t[0], t[3]);
+ u[1] = t[1];
+ u[2] = vsubq_s32(t[2], t[3]);
+ u[3] = vaddq_s32(vsubq_s32(t[2], t[0]), t[3]);
+
+ // fdct_round_shift
+ out[0] = vrshrn_n_s32(u[0], DCT_CONST_BITS);
+ out[1] = vrshrn_n_s32(u[1], DCT_CONST_BITS);
+ out[2] = vrshrn_n_s32(u[2], DCT_CONST_BITS);
+ out[3] = vrshrn_n_s32(u[3], DCT_CONST_BITS);
+
+ transpose_s16_4x4d(&out[0], &out[1], &out[2], &out[3]);
+
+ in[0] = vcombine_s16(out[0], out[1]);
+ in[1] = vcombine_s16(out[2], out[3]);
+}
+
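+// 4x4 hybrid transform: DCT_DCT uses the full NEON fDCT; the mixed types run
+// fadst4x4_neon for the ADST dimension and vpx_fdct4x4_pass2_neon for the
+// DCT dimension (the pass1 variant is not accurate enough, see below).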
+void vp9_fht4x4_neon(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ int16x8_t in[2];
+
+ switch (tx_type) {
+ case DCT_DCT: vpx_fdct4x4_neon(input, output, stride); break;
+ case ADST_DCT:
+ load_buffer_4x4(input, in, stride);
+ fadst4x4_neon(in);
+ // pass1 variant is not accurate enough
+ vpx_fdct4x4_pass2_neon((int16x4_t *)in);
+ write_buffer_4x4(output, in);
+ break;
+ case DCT_ADST:
+ load_buffer_4x4(input, in, stride);
+ // pass1 variant is not accurate enough
+ vpx_fdct4x4_pass2_neon((int16x4_t *)in);
+ fadst4x4_neon(in);
+ write_buffer_4x4(output, in);
+ break;
+ default:
+ assert(tx_type == ADST_ADST);
+ load_buffer_4x4(input, in, stride);
+ fadst4x4_neon(in);
+ fadst4x4_neon(in);
+ write_buffer_4x4(output, in);
+ break;
+ }
+}
+
+static INLINE void load_buffer_8x8(const int16_t *input, int16x8_t *in,
+ int stride) {
+ in[0] = vshlq_n_s16(vld1q_s16(input + 0 * stride), 2);
+ in[1] = vshlq_n_s16(vld1q_s16(input + 1 * stride), 2);
+ in[2] = vshlq_n_s16(vld1q_s16(input + 2 * stride), 2);
+ in[3] = vshlq_n_s16(vld1q_s16(input + 3 * stride), 2);
+ in[4] = vshlq_n_s16(vld1q_s16(input + 4 * stride), 2);
+ in[5] = vshlq_n_s16(vld1q_s16(input + 5 * stride), 2);
+ in[6] = vshlq_n_s16(vld1q_s16(input + 6 * stride), 2);
+ in[7] = vshlq_n_s16(vld1q_s16(input + 7 * stride), 2);
+}
+
+/* right shift and rounding
+ * first get the sign bit (bit 15).
+ * If bit == 1, it's the simple case of shifting right by one bit.
+ * If bit == 2, it essentially computes the expression:
+ *
+ * out[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
+ *
+ * for each row.
+ */
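+// For example, with bit == 2: temp_out[j] = 7 gives (7 + 1 + 0) >> 2 = 2 and
+// temp_out[j] = -5 gives (-5 + 1 + 1) >> 2 = -1, i.e. round-to-nearest with
+// ties broken toward zero.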
+static INLINE void right_shift_8x8(int16x8_t *res, const int bit) {
+ int16x8_t sign0 = vshrq_n_s16(res[0], 15);
+ int16x8_t sign1 = vshrq_n_s16(res[1], 15);
+ int16x8_t sign2 = vshrq_n_s16(res[2], 15);
+ int16x8_t sign3 = vshrq_n_s16(res[3], 15);
+ int16x8_t sign4 = vshrq_n_s16(res[4], 15);
+ int16x8_t sign5 = vshrq_n_s16(res[5], 15);
+ int16x8_t sign6 = vshrq_n_s16(res[6], 15);
+ int16x8_t sign7 = vshrq_n_s16(res[7], 15);
+
+ if (bit == 2) {
+ const int16x8_t const_rounding = vdupq_n_s16(1);
+ res[0] = vaddq_s16(res[0], const_rounding);
+ res[1] = vaddq_s16(res[1], const_rounding);
+ res[2] = vaddq_s16(res[2], const_rounding);
+ res[3] = vaddq_s16(res[3], const_rounding);
+ res[4] = vaddq_s16(res[4], const_rounding);
+ res[5] = vaddq_s16(res[5], const_rounding);
+ res[6] = vaddq_s16(res[6], const_rounding);
+ res[7] = vaddq_s16(res[7], const_rounding);
+ }
+
+ res[0] = vsubq_s16(res[0], sign0);
+ res[1] = vsubq_s16(res[1], sign1);
+ res[2] = vsubq_s16(res[2], sign2);
+ res[3] = vsubq_s16(res[3], sign3);
+ res[4] = vsubq_s16(res[4], sign4);
+ res[5] = vsubq_s16(res[5], sign5);
+ res[6] = vsubq_s16(res[6], sign6);
+ res[7] = vsubq_s16(res[7], sign7);
+
+ if (bit == 1) {
+ res[0] = vshrq_n_s16(res[0], 1);
+ res[1] = vshrq_n_s16(res[1], 1);
+ res[2] = vshrq_n_s16(res[2], 1);
+ res[3] = vshrq_n_s16(res[3], 1);
+ res[4] = vshrq_n_s16(res[4], 1);
+ res[5] = vshrq_n_s16(res[5], 1);
+ res[6] = vshrq_n_s16(res[6], 1);
+ res[7] = vshrq_n_s16(res[7], 1);
+ } else {
+ res[0] = vshrq_n_s16(res[0], 2);
+ res[1] = vshrq_n_s16(res[1], 2);
+ res[2] = vshrq_n_s16(res[2], 2);
+ res[3] = vshrq_n_s16(res[3], 2);
+ res[4] = vshrq_n_s16(res[4], 2);
+ res[5] = vshrq_n_s16(res[5], 2);
+ res[6] = vshrq_n_s16(res[6], 2);
+ res[7] = vshrq_n_s16(res[7], 2);
+ }
+}
+
+static INLINE void write_buffer_8x8(tran_low_t *output, int16x8_t *res,
+ int stride) {
+ store_s16q_to_tran_low(output + 0 * stride, res[0]);
+ store_s16q_to_tran_low(output + 1 * stride, res[1]);
+ store_s16q_to_tran_low(output + 2 * stride, res[2]);
+ store_s16q_to_tran_low(output + 3 * stride, res[3]);
+ store_s16q_to_tran_low(output + 4 * stride, res[4]);
+ store_s16q_to_tran_low(output + 5 * stride, res[5]);
+ store_s16q_to_tran_low(output + 6 * stride, res[6]);
+ store_s16q_to_tran_low(output + 7 * stride, res[7]);
+}
+
+static INLINE void fadst8x8_neon(int16x8_t *in) {
+ int16x4_t x_lo[8], x_hi[8];
+ int32x4_t s_lo[8], s_hi[8];
+ int32x4_t t_lo[8], t_hi[8];
+
+ x_lo[0] = vget_low_s16(in[7]);
+ x_hi[0] = vget_high_s16(in[7]);
+ x_lo[1] = vget_low_s16(in[0]);
+ x_hi[1] = vget_high_s16(in[0]);
+ x_lo[2] = vget_low_s16(in[5]);
+ x_hi[2] = vget_high_s16(in[5]);
+ x_lo[3] = vget_low_s16(in[2]);
+ x_hi[3] = vget_high_s16(in[2]);
+ x_lo[4] = vget_low_s16(in[3]);
+ x_hi[4] = vget_high_s16(in[3]);
+ x_lo[5] = vget_low_s16(in[4]);
+ x_hi[5] = vget_high_s16(in[4]);
+ x_lo[6] = vget_low_s16(in[1]);
+ x_hi[6] = vget_high_s16(in[1]);
+ x_lo[7] = vget_low_s16(in[6]);
+ x_hi[7] = vget_high_s16(in[6]);
+
+ // stage 1
+ // s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
+ // s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
+ butterfly_two_coeff_s16_s32_noround(x_lo[0], x_hi[0], x_lo[1], x_hi[1],
+ cospi_2_64, cospi_30_64, &s_lo[0],
+ &s_hi[0], &s_lo[1], &s_hi[1]);
+
+ // s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
+ // s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
+ butterfly_two_coeff_s16_s32_noround(x_lo[2], x_hi[2], x_lo[3], x_hi[3],
+ cospi_10_64, cospi_22_64, &s_lo[2],
+ &s_hi[2], &s_lo[3], &s_hi[3]);
+
+ // s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
+ // s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
+ butterfly_two_coeff_s16_s32_noround(x_lo[4], x_hi[4], x_lo[5], x_hi[5],
+ cospi_18_64, cospi_14_64, &s_lo[4],
+ &s_hi[4], &s_lo[5], &s_hi[5]);
+
+ // s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
+ // s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
+ butterfly_two_coeff_s16_s32_noround(x_lo[6], x_hi[6], x_lo[7], x_hi[7],
+ cospi_26_64, cospi_6_64, &s_lo[6],
+ &s_hi[6], &s_lo[7], &s_hi[7]);
+
+ // fdct_round_shift
+ t_lo[0] = vrshrq_n_s32(vaddq_s32(s_lo[0], s_lo[4]), DCT_CONST_BITS);
+ t_hi[0] = vrshrq_n_s32(vaddq_s32(s_hi[0], s_hi[4]), DCT_CONST_BITS);
+ t_lo[1] = vrshrq_n_s32(vaddq_s32(s_lo[1], s_lo[5]), DCT_CONST_BITS);
+ t_hi[1] = vrshrq_n_s32(vaddq_s32(s_hi[1], s_hi[5]), DCT_CONST_BITS);
+ t_lo[2] = vrshrq_n_s32(vaddq_s32(s_lo[2], s_lo[6]), DCT_CONST_BITS);
+ t_hi[2] = vrshrq_n_s32(vaddq_s32(s_hi[2], s_hi[6]), DCT_CONST_BITS);
+ t_lo[3] = vrshrq_n_s32(vaddq_s32(s_lo[3], s_lo[7]), DCT_CONST_BITS);
+ t_hi[3] = vrshrq_n_s32(vaddq_s32(s_hi[3], s_hi[7]), DCT_CONST_BITS);
+ t_lo[4] = vrshrq_n_s32(vsubq_s32(s_lo[0], s_lo[4]), DCT_CONST_BITS);
+ t_hi[4] = vrshrq_n_s32(vsubq_s32(s_hi[0], s_hi[4]), DCT_CONST_BITS);
+ t_lo[5] = vrshrq_n_s32(vsubq_s32(s_lo[1], s_lo[5]), DCT_CONST_BITS);
+ t_hi[5] = vrshrq_n_s32(vsubq_s32(s_hi[1], s_hi[5]), DCT_CONST_BITS);
+ t_lo[6] = vrshrq_n_s32(vsubq_s32(s_lo[2], s_lo[6]), DCT_CONST_BITS);
+ t_hi[6] = vrshrq_n_s32(vsubq_s32(s_hi[2], s_hi[6]), DCT_CONST_BITS);
+ t_lo[7] = vrshrq_n_s32(vsubq_s32(s_lo[3], s_lo[7]), DCT_CONST_BITS);
+ t_hi[7] = vrshrq_n_s32(vsubq_s32(s_hi[3], s_hi[7]), DCT_CONST_BITS);
+
+ // stage 2
+ s_lo[0] = t_lo[0];
+ s_hi[0] = t_hi[0];
+ s_lo[1] = t_lo[1];
+ s_hi[1] = t_hi[1];
+ s_lo[2] = t_lo[2];
+ s_hi[2] = t_hi[2];
+ s_lo[3] = t_lo[3];
+ s_hi[3] = t_hi[3];
+ // s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
+ // s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
+ butterfly_two_coeff_s32_noround(t_lo[4], t_hi[4], t_lo[5], t_hi[5],
+ cospi_8_64, cospi_24_64, &s_lo[4], &s_hi[4],
+ &s_lo[5], &s_hi[5]);
+
+ // s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
+ // s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
+ butterfly_two_coeff_s32_noround(t_lo[6], t_hi[6], t_lo[7], t_hi[7],
+ -cospi_24_64, cospi_8_64, &s_lo[6], &s_hi[6],
+ &s_lo[7], &s_hi[7]);
+
+ // fdct_round_shift
+ // s0 + s2
+ t_lo[0] = vaddq_s32(s_lo[0], s_lo[2]);
+ t_hi[0] = vaddq_s32(s_hi[0], s_hi[2]);
+ // s1 + s3
+ t_lo[1] = vaddq_s32(s_lo[1], s_lo[3]);
+ t_hi[1] = vaddq_s32(s_hi[1], s_hi[3]);
+ // s0 - s2
+ t_lo[2] = vsubq_s32(s_lo[0], s_lo[2]);
+ t_hi[2] = vsubq_s32(s_hi[0], s_hi[2]);
+ // s1 - s3
+ t_lo[3] = vsubq_s32(s_lo[1], s_lo[3]);
+ t_hi[3] = vsubq_s32(s_hi[1], s_hi[3]);
+ // s4 + s6
+ t_lo[4] = vrshrq_n_s32(vaddq_s32(s_lo[4], s_lo[6]), DCT_CONST_BITS);
+ t_hi[4] = vrshrq_n_s32(vaddq_s32(s_hi[4], s_hi[6]), DCT_CONST_BITS);
+ // s5 + s7
+ t_lo[5] = vrshrq_n_s32(vaddq_s32(s_lo[5], s_lo[7]), DCT_CONST_BITS);
+ t_hi[5] = vrshrq_n_s32(vaddq_s32(s_hi[5], s_hi[7]), DCT_CONST_BITS);
+ // s4 - s6
+ t_lo[6] = vrshrq_n_s32(vsubq_s32(s_lo[4], s_lo[6]), DCT_CONST_BITS);
+ t_hi[6] = vrshrq_n_s32(vsubq_s32(s_hi[4], s_hi[6]), DCT_CONST_BITS);
+ // s5 - s7
+ t_lo[7] = vrshrq_n_s32(vsubq_s32(s_lo[5], s_lo[7]), DCT_CONST_BITS);
+ t_hi[7] = vrshrq_n_s32(vsubq_s32(s_hi[5], s_hi[7]), DCT_CONST_BITS);
+
+ // stage 3
+ // cospi_16_64 * (x2 + x3)
+ // cospi_16_64 * (x2 - x3)
+ butterfly_one_coeff_s32_noround(t_lo[2], t_hi[2], t_lo[3], t_hi[3],
+ cospi_16_64, &s_lo[2], &s_hi[2], &s_lo[3],
+ &s_hi[3]);
+
+ // cospi_16_64 * (x6 + x7)
+ // cospi_16_64 * (x6 - x7)
+ butterfly_one_coeff_s32_noround(t_lo[6], t_hi[6], t_lo[7], t_hi[7],
+ cospi_16_64, &s_lo[6], &s_hi[6], &s_lo[7],
+ &s_hi[7]);
+
+ // final fdct_round_shift
+ x_lo[2] = vrshrn_n_s32(s_lo[2], DCT_CONST_BITS);
+ x_hi[2] = vrshrn_n_s32(s_hi[2], DCT_CONST_BITS);
+ x_lo[3] = vrshrn_n_s32(s_lo[3], DCT_CONST_BITS);
+ x_hi[3] = vrshrn_n_s32(s_hi[3], DCT_CONST_BITS);
+ x_lo[6] = vrshrn_n_s32(s_lo[6], DCT_CONST_BITS);
+ x_hi[6] = vrshrn_n_s32(s_hi[6], DCT_CONST_BITS);
+ x_lo[7] = vrshrn_n_s32(s_lo[7], DCT_CONST_BITS);
+ x_hi[7] = vrshrn_n_s32(s_hi[7], DCT_CONST_BITS);
+
+ // x0, x1, x4, x5 narrow down to 16-bits directly
+ x_lo[0] = vmovn_s32(t_lo[0]);
+ x_hi[0] = vmovn_s32(t_hi[0]);
+ x_lo[1] = vmovn_s32(t_lo[1]);
+ x_hi[1] = vmovn_s32(t_hi[1]);
+ x_lo[4] = vmovn_s32(t_lo[4]);
+ x_hi[4] = vmovn_s32(t_hi[4]);
+ x_lo[5] = vmovn_s32(t_lo[5]);
+ x_hi[5] = vmovn_s32(t_hi[5]);
+
+ in[0] = vcombine_s16(x_lo[0], x_hi[0]);
+ in[1] = vnegq_s16(vcombine_s16(x_lo[4], x_hi[4]));
+ in[2] = vcombine_s16(x_lo[6], x_hi[6]);
+ in[3] = vnegq_s16(vcombine_s16(x_lo[2], x_hi[2]));
+ in[4] = vcombine_s16(x_lo[3], x_hi[3]);
+ in[5] = vnegq_s16(vcombine_s16(x_lo[7], x_hi[7]));
+ in[6] = vcombine_s16(x_lo[5], x_hi[5]);
+ in[7] = vnegq_s16(vcombine_s16(x_lo[1], x_hi[1]));
+
+ transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
+ &in[7]);
+}
+
+void vp9_fht8x8_neon(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ int16x8_t in[8];
+
+ switch (tx_type) {
+ case DCT_DCT: vpx_fdct8x8_neon(input, output, stride); break;
+ case ADST_DCT:
+ load_buffer_8x8(input, in, stride);
+ fadst8x8_neon(in);
+ // pass1 variant is not accurate enough
+ vpx_fdct8x8_pass2_neon(in);
+ right_shift_8x8(in, 1);
+ write_buffer_8x8(output, in, 8);
+ break;
+ case DCT_ADST:
+ load_buffer_8x8(input, in, stride);
+ // pass1 variant is not accurate enough
+ vpx_fdct8x8_pass2_neon(in);
+ fadst8x8_neon(in);
+ right_shift_8x8(in, 1);
+ write_buffer_8x8(output, in, 8);
+ break;
+ default:
+ assert(tx_type == ADST_ADST);
+ load_buffer_8x8(input, in, stride);
+ fadst8x8_neon(in);
+ fadst8x8_neon(in);
+ right_shift_8x8(in, 1);
+ write_buffer_8x8(output, in, 8);
+ break;
+ }
+}
+
+static INLINE void load_buffer_16x16(const int16_t *input, int16x8_t *in0,
+ int16x8_t *in1, int stride) {
+ // load first 8 columns
+ load_buffer_8x8(input, in0, stride);
+ load_buffer_8x8(input + 8 * stride, in0 + 8, stride);
+
+ input += 8;
+ // load second 8 columns
+ load_buffer_8x8(input, in1, stride);
+ load_buffer_8x8(input + 8 * stride, in1 + 8, stride);
+}
+
+static INLINE void write_buffer_16x16(tran_low_t *output, int16x8_t *in0,
+ int16x8_t *in1, int stride) {
+ // write first 8 columns
+ write_buffer_8x8(output, in0, stride);
+ write_buffer_8x8(output + 8 * stride, in0 + 8, stride);
+
+ // write second 8 columns
+ output += 8;
+ write_buffer_8x8(output, in1, stride);
+ write_buffer_8x8(output + 8 * stride, in1 + 8, stride);
+}
+
+static INLINE void right_shift_16x16(int16x8_t *res0, int16x8_t *res1) {
+ // perform rounding operations
+ right_shift_8x8(res0, 2);
+ right_shift_8x8(res0 + 8, 2);
+ right_shift_8x8(res1, 2);
+ right_shift_8x8(res1 + 8, 2);
+}
+
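+// The 16-point DCT is split in the usual even/odd fashion: the even-indexed
+// outputs come from an 8-point DCT of the paired sums in[i] + in[15 - i],
+// and the odd-indexed outputs from the butterfly network applied to the
+// differences in[7 - i] - in[8 + i].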
+static void fdct16_8col(int16x8_t *in) {
+ // perform 16x16 1-D DCT for 8 columns
+ int16x8_t i[8], s1[8], s2[8], s3[8], t[8];
+ int16x4_t t_lo[8], t_hi[8];
+ int32x4_t u_lo[8], u_hi[8];
+
+ // stage 1
+ i[0] = vaddq_s16(in[0], in[15]);
+ i[1] = vaddq_s16(in[1], in[14]);
+ i[2] = vaddq_s16(in[2], in[13]);
+ i[3] = vaddq_s16(in[3], in[12]);
+ i[4] = vaddq_s16(in[4], in[11]);
+ i[5] = vaddq_s16(in[5], in[10]);
+ i[6] = vaddq_s16(in[6], in[9]);
+ i[7] = vaddq_s16(in[7], in[8]);
+
+ // pass1 variant is not accurate enough
+ vpx_fdct8x8_pass2_neon(i);
+ transpose_s16_8x8(&i[0], &i[1], &i[2], &i[3], &i[4], &i[5], &i[6], &i[7]);
+
+ // step 2
+ s1[0] = vsubq_s16(in[7], in[8]);
+ s1[1] = vsubq_s16(in[6], in[9]);
+ s1[2] = vsubq_s16(in[5], in[10]);
+ s1[3] = vsubq_s16(in[4], in[11]);
+ s1[4] = vsubq_s16(in[3], in[12]);
+ s1[5] = vsubq_s16(in[2], in[13]);
+ s1[6] = vsubq_s16(in[1], in[14]);
+ s1[7] = vsubq_s16(in[0], in[15]);
+
+ t[2] = vsubq_s16(s1[5], s1[2]);
+ t[3] = vsubq_s16(s1[4], s1[3]);
+ t[4] = vaddq_s16(s1[4], s1[3]);
+ t[5] = vaddq_s16(s1[5], s1[2]);
+
+ t_lo[2] = vget_low_s16(t[2]);
+ t_hi[2] = vget_high_s16(t[2]);
+ t_lo[3] = vget_low_s16(t[3]);
+ t_hi[3] = vget_high_s16(t[3]);
+ t_lo[4] = vget_low_s16(t[4]);
+ t_hi[4] = vget_high_s16(t[4]);
+ t_lo[5] = vget_low_s16(t[5]);
+ t_hi[5] = vget_high_s16(t[5]);
+
+ u_lo[2] = vmull_n_s16(t_lo[2], cospi_16_64);
+ u_hi[2] = vmull_n_s16(t_hi[2], cospi_16_64);
+ u_lo[3] = vmull_n_s16(t_lo[3], cospi_16_64);
+ u_hi[3] = vmull_n_s16(t_hi[3], cospi_16_64);
+ u_lo[4] = vmull_n_s16(t_lo[4], cospi_16_64);
+ u_hi[4] = vmull_n_s16(t_hi[4], cospi_16_64);
+ u_lo[5] = vmull_n_s16(t_lo[5], cospi_16_64);
+ u_hi[5] = vmull_n_s16(t_hi[5], cospi_16_64);
+
+ t_lo[2] = vrshrn_n_s32(u_lo[2], DCT_CONST_BITS);
+ t_hi[2] = vrshrn_n_s32(u_hi[2], DCT_CONST_BITS);
+ t_lo[3] = vrshrn_n_s32(u_lo[3], DCT_CONST_BITS);
+ t_hi[3] = vrshrn_n_s32(u_hi[3], DCT_CONST_BITS);
+ t_lo[4] = vrshrn_n_s32(u_lo[4], DCT_CONST_BITS);
+ t_hi[4] = vrshrn_n_s32(u_hi[4], DCT_CONST_BITS);
+ t_lo[5] = vrshrn_n_s32(u_lo[5], DCT_CONST_BITS);
+ t_hi[5] = vrshrn_n_s32(u_hi[5], DCT_CONST_BITS);
+
+ s2[2] = vcombine_s16(t_lo[2], t_hi[2]);
+ s2[3] = vcombine_s16(t_lo[3], t_hi[3]);
+ s2[4] = vcombine_s16(t_lo[4], t_hi[4]);
+ s2[5] = vcombine_s16(t_lo[5], t_hi[5]);
+
+ // step 3
+ s3[0] = vaddq_s16(s1[0], s2[3]);
+ s3[1] = vaddq_s16(s1[1], s2[2]);
+ s3[2] = vsubq_s16(s1[1], s2[2]);
+ s3[3] = vsubq_s16(s1[0], s2[3]);
+ s3[4] = vsubq_s16(s1[7], s2[4]);
+ s3[5] = vsubq_s16(s1[6], s2[5]);
+ s3[6] = vaddq_s16(s1[6], s2[5]);
+ s3[7] = vaddq_s16(s1[7], s2[4]);
+
+ // step 4
+ t_lo[0] = vget_low_s16(s3[0]);
+ t_hi[0] = vget_high_s16(s3[0]);
+ t_lo[1] = vget_low_s16(s3[1]);
+ t_hi[1] = vget_high_s16(s3[1]);
+ t_lo[2] = vget_low_s16(s3[2]);
+ t_hi[2] = vget_high_s16(s3[2]);
+ t_lo[3] = vget_low_s16(s3[3]);
+ t_hi[3] = vget_high_s16(s3[3]);
+ t_lo[4] = vget_low_s16(s3[4]);
+ t_hi[4] = vget_high_s16(s3[4]);
+ t_lo[5] = vget_low_s16(s3[5]);
+ t_hi[5] = vget_high_s16(s3[5]);
+ t_lo[6] = vget_low_s16(s3[6]);
+ t_hi[6] = vget_high_s16(s3[6]);
+ t_lo[7] = vget_low_s16(s3[7]);
+ t_hi[7] = vget_high_s16(s3[7]);
+
+ // u[1] = -cospi_8_64 * t[1] + cospi_24_64 * t[6]
+ // u[6] = cospi_24_64 * t[1] + cospi_8_64 * t[6]
+ butterfly_two_coeff_s16_s32_noround(t_lo[1], t_hi[1], t_lo[6], t_hi[6],
+ -cospi_8_64, cospi_24_64, &u_lo[1],
+ &u_hi[1], &u_lo[6], &u_hi[6]);
+
+ // u[5] = -cospi_24_64 * t[5] + cospi_8_64 * t[2]
+ // u[2] = cospi_8_64 * t[5] + cospi_24_64 * t[2]
+ butterfly_two_coeff_s16_s32_noround(t_lo[5], t_hi[5], t_lo[2], t_hi[2],
+ -cospi_24_64, cospi_8_64, &u_lo[5],
+ &u_hi[5], &u_lo[2], &u_hi[2]);
+
+ t_lo[1] = vrshrn_n_s32(u_lo[1], DCT_CONST_BITS);
+ t_hi[1] = vrshrn_n_s32(u_hi[1], DCT_CONST_BITS);
+ t_lo[2] = vrshrn_n_s32(u_lo[2], DCT_CONST_BITS);
+ t_hi[2] = vrshrn_n_s32(u_hi[2], DCT_CONST_BITS);
+ t_lo[5] = vrshrn_n_s32(u_lo[5], DCT_CONST_BITS);
+ t_hi[5] = vrshrn_n_s32(u_hi[5], DCT_CONST_BITS);
+ t_lo[6] = vrshrn_n_s32(u_lo[6], DCT_CONST_BITS);
+ t_hi[6] = vrshrn_n_s32(u_hi[6], DCT_CONST_BITS);
+
+ s2[1] = vcombine_s16(t_lo[1], t_hi[1]);
+ s2[2] = vcombine_s16(t_lo[2], t_hi[2]);
+ s2[5] = vcombine_s16(t_lo[5], t_hi[5]);
+ s2[6] = vcombine_s16(t_lo[6], t_hi[6]);
+
+ // step 5
+ s1[0] = vaddq_s16(s3[0], s2[1]);
+ s1[1] = vsubq_s16(s3[0], s2[1]);
+ s1[2] = vaddq_s16(s3[3], s2[2]);
+ s1[3] = vsubq_s16(s3[3], s2[2]);
+ s1[4] = vsubq_s16(s3[4], s2[5]);
+ s1[5] = vaddq_s16(s3[4], s2[5]);
+ s1[6] = vsubq_s16(s3[7], s2[6]);
+ s1[7] = vaddq_s16(s3[7], s2[6]);
+
+ // step 6
+ t_lo[0] = vget_low_s16(s1[0]);
+ t_hi[0] = vget_high_s16(s1[0]);
+ t_lo[1] = vget_low_s16(s1[1]);
+ t_hi[1] = vget_high_s16(s1[1]);
+ t_lo[2] = vget_low_s16(s1[2]);
+ t_hi[2] = vget_high_s16(s1[2]);
+ t_lo[3] = vget_low_s16(s1[3]);
+ t_hi[3] = vget_high_s16(s1[3]);
+ t_lo[4] = vget_low_s16(s1[4]);
+ t_hi[4] = vget_high_s16(s1[4]);
+ t_lo[5] = vget_low_s16(s1[5]);
+ t_hi[5] = vget_high_s16(s1[5]);
+ t_lo[6] = vget_low_s16(s1[6]);
+ t_hi[6] = vget_high_s16(s1[6]);
+ t_lo[7] = vget_low_s16(s1[7]);
+ t_hi[7] = vget_high_s16(s1[7]);
+
+ // u[0] = step1[7] * cospi_2_64 + step1[0] * cospi_30_64
+ // u[7] = step1[7] * cospi_30_64 - step1[0] * cospi_2_64
+ butterfly_two_coeff_s16_s32_noround(t_lo[7], t_hi[7], t_lo[0], t_hi[0],
+ cospi_2_64, cospi_30_64, &u_lo[0],
+ &u_hi[0], &u_lo[7], &u_hi[7]);
+
+ // u[1] = step1[6] * cospi_18_64 + step1[1] * cospi_14_64
+ // u[6] = step1[6] * cospi_14_64 - step1[1] * cospi_18_64
+ butterfly_two_coeff_s16_s32_noround(t_lo[6], t_hi[6], t_lo[1], t_hi[1],
+ cospi_18_64, cospi_14_64, &u_lo[1],
+ &u_hi[1], &u_lo[6], &u_hi[6]);
+
+ // u[2] = step1[5] * cospi_10_64 + step1[2] * cospi_22_64
+ // u[5] = step1[5] * cospi_22_64 - step1[2] * cospi_10_64
+ butterfly_two_coeff_s16_s32_noround(t_lo[5], t_hi[5], t_lo[2], t_hi[2],
+ cospi_10_64, cospi_22_64, &u_lo[2],
+ &u_hi[2], &u_lo[5], &u_hi[5]);
+
+ // u[3] = step1[4] * cospi_26_64 + step1[3] * cospi_6_64
+ // u[4] = step1[4] * cospi_6_64 - step1[3] * cospi_26_64
+ butterfly_two_coeff_s16_s32_noround(t_lo[4], t_hi[4], t_lo[3], t_hi[3],
+ cospi_26_64, cospi_6_64, &u_lo[3],
+ &u_hi[3], &u_lo[4], &u_hi[4]);
+
+ // final fdct_round_shift
+ t_lo[0] = vrshrn_n_s32(u_lo[0], DCT_CONST_BITS);
+ t_hi[0] = vrshrn_n_s32(u_hi[0], DCT_CONST_BITS);
+ t_lo[1] = vrshrn_n_s32(u_lo[1], DCT_CONST_BITS);
+ t_hi[1] = vrshrn_n_s32(u_hi[1], DCT_CONST_BITS);
+ t_lo[2] = vrshrn_n_s32(u_lo[2], DCT_CONST_BITS);
+ t_hi[2] = vrshrn_n_s32(u_hi[2], DCT_CONST_BITS);
+ t_lo[3] = vrshrn_n_s32(u_lo[3], DCT_CONST_BITS);
+ t_hi[3] = vrshrn_n_s32(u_hi[3], DCT_CONST_BITS);
+ t_lo[4] = vrshrn_n_s32(u_lo[4], DCT_CONST_BITS);
+ t_hi[4] = vrshrn_n_s32(u_hi[4], DCT_CONST_BITS);
+ t_lo[5] = vrshrn_n_s32(u_lo[5], DCT_CONST_BITS);
+ t_hi[5] = vrshrn_n_s32(u_hi[5], DCT_CONST_BITS);
+ t_lo[6] = vrshrn_n_s32(u_lo[6], DCT_CONST_BITS);
+ t_hi[6] = vrshrn_n_s32(u_hi[6], DCT_CONST_BITS);
+ t_lo[7] = vrshrn_n_s32(u_lo[7], DCT_CONST_BITS);
+ t_hi[7] = vrshrn_n_s32(u_hi[7], DCT_CONST_BITS);
+
+ in[0] = i[0];
+ in[2] = i[1];
+ in[4] = i[2];
+ in[6] = i[3];
+ in[8] = i[4];
+ in[10] = i[5];
+ in[12] = i[6];
+ in[14] = i[7];
+ in[1] = vcombine_s16(t_lo[0], t_hi[0]);
+ in[3] = vcombine_s16(t_lo[4], t_hi[4]);
+ in[5] = vcombine_s16(t_lo[2], t_hi[2]);
+ in[7] = vcombine_s16(t_lo[6], t_hi[6]);
+ in[9] = vcombine_s16(t_lo[1], t_hi[1]);
+ in[11] = vcombine_s16(t_lo[5], t_hi[5]);
+ in[13] = vcombine_s16(t_lo[3], t_hi[3]);
+ in[15] = vcombine_s16(t_lo[7], t_hi[7]);
+}
+
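+// 16-point ADST on 8 columns at a time. The x[] assignments below reorder the
+// input rows (15, 0, 13, 2, ...) as required by the ADST flow graph, and the
+// vnegq_s16() calls at the end apply the sign flips of the ADST outputs.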
+static void fadst16_8col(int16x8_t *in) {
+ // perform 16x16 1-D ADST for 8 columns
+ int16x4_t x_lo[16], x_hi[16];
+ int32x4_t s_lo[16], s_hi[16];
+ int32x4_t t_lo[16], t_hi[16];
+
+ x_lo[0] = vget_low_s16(in[15]);
+ x_hi[0] = vget_high_s16(in[15]);
+ x_lo[1] = vget_low_s16(in[0]);
+ x_hi[1] = vget_high_s16(in[0]);
+ x_lo[2] = vget_low_s16(in[13]);
+ x_hi[2] = vget_high_s16(in[13]);
+ x_lo[3] = vget_low_s16(in[2]);
+ x_hi[3] = vget_high_s16(in[2]);
+ x_lo[4] = vget_low_s16(in[11]);
+ x_hi[4] = vget_high_s16(in[11]);
+ x_lo[5] = vget_low_s16(in[4]);
+ x_hi[5] = vget_high_s16(in[4]);
+ x_lo[6] = vget_low_s16(in[9]);
+ x_hi[6] = vget_high_s16(in[9]);
+ x_lo[7] = vget_low_s16(in[6]);
+ x_hi[7] = vget_high_s16(in[6]);
+ x_lo[8] = vget_low_s16(in[7]);
+ x_hi[8] = vget_high_s16(in[7]);
+ x_lo[9] = vget_low_s16(in[8]);
+ x_hi[9] = vget_high_s16(in[8]);
+ x_lo[10] = vget_low_s16(in[5]);
+ x_hi[10] = vget_high_s16(in[5]);
+ x_lo[11] = vget_low_s16(in[10]);
+ x_hi[11] = vget_high_s16(in[10]);
+ x_lo[12] = vget_low_s16(in[3]);
+ x_hi[12] = vget_high_s16(in[3]);
+ x_lo[13] = vget_low_s16(in[12]);
+ x_hi[13] = vget_high_s16(in[12]);
+ x_lo[14] = vget_low_s16(in[1]);
+ x_hi[14] = vget_high_s16(in[1]);
+ x_lo[15] = vget_low_s16(in[14]);
+ x_hi[15] = vget_high_s16(in[14]);
+
+ // stage 1
+ // s0 = cospi_1_64 * x0 + cospi_31_64 * x1;
+ // s1 = cospi_31_64 * x0 - cospi_1_64 * x1;
+ butterfly_two_coeff_s16_s32_noround(x_lo[0], x_hi[0], x_lo[1], x_hi[1],
+ cospi_1_64, cospi_31_64, &s_lo[0],
+ &s_hi[0], &s_lo[1], &s_hi[1]);
+ // s2 = cospi_5_64 * x2 + cospi_27_64 * x3;
+ // s3 = cospi_27_64 * x2 - cospi_5_64 * x3;
+ butterfly_two_coeff_s16_s32_noround(x_lo[2], x_hi[2], x_lo[3], x_hi[3],
+ cospi_5_64, cospi_27_64, &s_lo[2],
+ &s_hi[2], &s_lo[3], &s_hi[3]);
+ // s4 = cospi_9_64 * x4 + cospi_23_64 * x5;
+ // s5 = cospi_23_64 * x4 - cospi_9_64 * x5;
+ butterfly_two_coeff_s16_s32_noround(x_lo[4], x_hi[4], x_lo[5], x_hi[5],
+ cospi_9_64, cospi_23_64, &s_lo[4],
+ &s_hi[4], &s_lo[5], &s_hi[5]);
+ // s6 = cospi_13_64 * x6 + cospi_19_64 * x7;
+ // s7 = cospi_19_64 * x6 - cospi_13_64 * x7;
+ butterfly_two_coeff_s16_s32_noround(x_lo[6], x_hi[6], x_lo[7], x_hi[7],
+ cospi_13_64, cospi_19_64, &s_lo[6],
+ &s_hi[6], &s_lo[7], &s_hi[7]);
+ // s8 = cospi_17_64 * x8 + cospi_15_64 * x9;
+ // s9 = cospi_15_64 * x8 - cospi_17_64 * x9;
+ butterfly_two_coeff_s16_s32_noround(x_lo[8], x_hi[8], x_lo[9], x_hi[9],
+ cospi_17_64, cospi_15_64, &s_lo[8],
+ &s_hi[8], &s_lo[9], &s_hi[9]);
+ // s10 = cospi_21_64 * x10 + cospi_11_64 * x11;
+ // s11 = cospi_11_64 * x10 - cospi_21_64 * x11;
+ butterfly_two_coeff_s16_s32_noround(x_lo[10], x_hi[10], x_lo[11], x_hi[11],
+ cospi_21_64, cospi_11_64, &s_lo[10],
+ &s_hi[10], &s_lo[11], &s_hi[11]);
+ // s12 = cospi_25_64 * x12 + cospi_7_64 * x13;
+ // s13 = cospi_7_64 * x12 - cospi_25_64 * x13;
+ butterfly_two_coeff_s16_s32_noround(x_lo[12], x_hi[12], x_lo[13], x_hi[13],
+ cospi_25_64, cospi_7_64, &s_lo[12],
+ &s_hi[12], &s_lo[13], &s_hi[13]);
+ // s14 = cospi_29_64 * x14 + cospi_3_64 * x15;
+ // s15 = cospi_3_64 * x14 - cospi_29_64 * x15;
+ butterfly_two_coeff_s16_s32_noround(x_lo[14], x_hi[14], x_lo[15], x_hi[15],
+ cospi_29_64, cospi_3_64, &s_lo[14],
+ &s_hi[14], &s_lo[15], &s_hi[15]);
+
+ // fdct_round_shift
+ t_lo[0] = vrshrq_n_s32(vaddq_s32(s_lo[0], s_lo[8]), DCT_CONST_BITS);
+ t_hi[0] = vrshrq_n_s32(vaddq_s32(s_hi[0], s_hi[8]), DCT_CONST_BITS);
+ t_lo[1] = vrshrq_n_s32(vaddq_s32(s_lo[1], s_lo[9]), DCT_CONST_BITS);
+ t_hi[1] = vrshrq_n_s32(vaddq_s32(s_hi[1], s_hi[9]), DCT_CONST_BITS);
+ t_lo[2] = vrshrq_n_s32(vaddq_s32(s_lo[2], s_lo[10]), DCT_CONST_BITS);
+ t_hi[2] = vrshrq_n_s32(vaddq_s32(s_hi[2], s_hi[10]), DCT_CONST_BITS);
+ t_lo[3] = vrshrq_n_s32(vaddq_s32(s_lo[3], s_lo[11]), DCT_CONST_BITS);
+ t_hi[3] = vrshrq_n_s32(vaddq_s32(s_hi[3], s_hi[11]), DCT_CONST_BITS);
+ t_lo[4] = vrshrq_n_s32(vaddq_s32(s_lo[4], s_lo[12]), DCT_CONST_BITS);
+ t_hi[4] = vrshrq_n_s32(vaddq_s32(s_hi[4], s_hi[12]), DCT_CONST_BITS);
+ t_lo[5] = vrshrq_n_s32(vaddq_s32(s_lo[5], s_lo[13]), DCT_CONST_BITS);
+ t_hi[5] = vrshrq_n_s32(vaddq_s32(s_hi[5], s_hi[13]), DCT_CONST_BITS);
+ t_lo[6] = vrshrq_n_s32(vaddq_s32(s_lo[6], s_lo[14]), DCT_CONST_BITS);
+ t_hi[6] = vrshrq_n_s32(vaddq_s32(s_hi[6], s_hi[14]), DCT_CONST_BITS);
+ t_lo[7] = vrshrq_n_s32(vaddq_s32(s_lo[7], s_lo[15]), DCT_CONST_BITS);
+ t_hi[7] = vrshrq_n_s32(vaddq_s32(s_hi[7], s_hi[15]), DCT_CONST_BITS);
+ t_lo[8] = vrshrq_n_s32(vsubq_s32(s_lo[0], s_lo[8]), DCT_CONST_BITS);
+ t_hi[8] = vrshrq_n_s32(vsubq_s32(s_hi[0], s_hi[8]), DCT_CONST_BITS);
+ t_lo[9] = vrshrq_n_s32(vsubq_s32(s_lo[1], s_lo[9]), DCT_CONST_BITS);
+ t_hi[9] = vrshrq_n_s32(vsubq_s32(s_hi[1], s_hi[9]), DCT_CONST_BITS);
+ t_lo[10] = vrshrq_n_s32(vsubq_s32(s_lo[2], s_lo[10]), DCT_CONST_BITS);
+ t_hi[10] = vrshrq_n_s32(vsubq_s32(s_hi[2], s_hi[10]), DCT_CONST_BITS);
+ t_lo[11] = vrshrq_n_s32(vsubq_s32(s_lo[3], s_lo[11]), DCT_CONST_BITS);
+ t_hi[11] = vrshrq_n_s32(vsubq_s32(s_hi[3], s_hi[11]), DCT_CONST_BITS);
+ t_lo[12] = vrshrq_n_s32(vsubq_s32(s_lo[4], s_lo[12]), DCT_CONST_BITS);
+ t_hi[12] = vrshrq_n_s32(vsubq_s32(s_hi[4], s_hi[12]), DCT_CONST_BITS);
+ t_lo[13] = vrshrq_n_s32(vsubq_s32(s_lo[5], s_lo[13]), DCT_CONST_BITS);
+ t_hi[13] = vrshrq_n_s32(vsubq_s32(s_hi[5], s_hi[13]), DCT_CONST_BITS);
+ t_lo[14] = vrshrq_n_s32(vsubq_s32(s_lo[6], s_lo[14]), DCT_CONST_BITS);
+ t_hi[14] = vrshrq_n_s32(vsubq_s32(s_hi[6], s_hi[14]), DCT_CONST_BITS);
+ t_lo[15] = vrshrq_n_s32(vsubq_s32(s_lo[7], s_lo[15]), DCT_CONST_BITS);
+ t_hi[15] = vrshrq_n_s32(vsubq_s32(s_hi[7], s_hi[15]), DCT_CONST_BITS);
+
+ // stage 2
+ s_lo[0] = t_lo[0];
+ s_hi[0] = t_hi[0];
+ s_lo[1] = t_lo[1];
+ s_hi[1] = t_hi[1];
+ s_lo[2] = t_lo[2];
+ s_hi[2] = t_hi[2];
+ s_lo[3] = t_lo[3];
+ s_hi[3] = t_hi[3];
+ s_lo[4] = t_lo[4];
+ s_hi[4] = t_hi[4];
+ s_lo[5] = t_lo[5];
+ s_hi[5] = t_hi[5];
+ s_lo[6] = t_lo[6];
+ s_hi[6] = t_hi[6];
+ s_lo[7] = t_lo[7];
+ s_hi[7] = t_hi[7];
+ // s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
+ // s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
+ butterfly_two_coeff_s32_noround(t_lo[8], t_hi[8], t_lo[9], t_hi[9],
+ cospi_4_64, cospi_28_64, &s_lo[8], &s_hi[8],
+ &s_lo[9], &s_hi[9]);
+ // s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
+ // s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
+ butterfly_two_coeff_s32_noround(t_lo[10], t_hi[10], t_lo[11], t_hi[11],
+ cospi_20_64, cospi_12_64, &s_lo[10],
+ &s_hi[10], &s_lo[11], &s_hi[11]);
+ // s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
+ // s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
+ butterfly_two_coeff_s32_noround(t_lo[13], t_hi[13], t_lo[12], t_hi[12],
+ cospi_28_64, cospi_4_64, &s_lo[13], &s_hi[13],
+ &s_lo[12], &s_hi[12]);
+ // s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
+ // s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
+ butterfly_two_coeff_s32_noround(t_lo[15], t_hi[15], t_lo[14], t_hi[14],
+ cospi_12_64, cospi_20_64, &s_lo[15],
+ &s_hi[15], &s_lo[14], &s_hi[14]);
+
+ // s0 + s4
+ t_lo[0] = vaddq_s32(s_lo[0], s_lo[4]);
+ t_hi[0] = vaddq_s32(s_hi[0], s_hi[4]);
+ // s1 + s5
+ t_lo[1] = vaddq_s32(s_lo[1], s_lo[5]);
+ t_hi[1] = vaddq_s32(s_hi[1], s_hi[5]);
+ // s2 + s6
+ t_lo[2] = vaddq_s32(s_lo[2], s_lo[6]);
+ t_hi[2] = vaddq_s32(s_hi[2], s_hi[6]);
+ // s3 + s7
+ t_lo[3] = vaddq_s32(s_lo[3], s_lo[7]);
+ t_hi[3] = vaddq_s32(s_hi[3], s_hi[7]);
+ // s0 - s4
+ t_lo[4] = vsubq_s32(s_lo[0], s_lo[4]);
+ t_hi[4] = vsubq_s32(s_hi[0], s_hi[4]);
+ // s1 - s5
+ t_lo[5] = vsubq_s32(s_lo[1], s_lo[5]);
+ t_hi[5] = vsubq_s32(s_hi[1], s_hi[5]);
+ // s2 - s6
+ t_lo[6] = vsubq_s32(s_lo[2], s_lo[6]);
+ t_hi[6] = vsubq_s32(s_hi[2], s_hi[6]);
+ // s3 - s7
+ t_lo[7] = vsubq_s32(s_lo[3], s_lo[7]);
+ t_hi[7] = vsubq_s32(s_hi[3], s_hi[7]);
+ // s8 + s12
+ t_lo[8] = vaddq_s32(s_lo[8], s_lo[12]);
+ t_hi[8] = vaddq_s32(s_hi[8], s_hi[12]);
+ // s9 + s13
+ t_lo[9] = vaddq_s32(s_lo[9], s_lo[13]);
+ t_hi[9] = vaddq_s32(s_hi[9], s_hi[13]);
+ // s10 + s14
+ t_lo[10] = vaddq_s32(s_lo[10], s_lo[14]);
+ t_hi[10] = vaddq_s32(s_hi[10], s_hi[14]);
+ // s11 + s15
+ t_lo[11] = vaddq_s32(s_lo[11], s_lo[15]);
+ t_hi[11] = vaddq_s32(s_hi[11], s_hi[15]);
+ // s8 - s12
+ t_lo[12] = vsubq_s32(s_lo[8], s_lo[12]);
+ t_hi[12] = vsubq_s32(s_hi[8], s_hi[12]);
+ // s9 - s13
+ t_lo[13] = vsubq_s32(s_lo[9], s_lo[13]);
+ t_hi[13] = vsubq_s32(s_hi[9], s_hi[13]);
+ // s10 - s14
+ t_lo[14] = vsubq_s32(s_lo[10], s_lo[14]);
+ t_hi[14] = vsubq_s32(s_hi[10], s_hi[14]);
+ // s11 - s15
+ t_lo[15] = vsubq_s32(s_lo[11], s_lo[15]);
+ t_hi[15] = vsubq_s32(s_hi[11], s_hi[15]);
+
+ t_lo[8] = vrshrq_n_s32(t_lo[8], DCT_CONST_BITS);
+ t_hi[8] = vrshrq_n_s32(t_hi[8], DCT_CONST_BITS);
+ t_lo[9] = vrshrq_n_s32(t_lo[9], DCT_CONST_BITS);
+ t_hi[9] = vrshrq_n_s32(t_hi[9], DCT_CONST_BITS);
+ t_lo[10] = vrshrq_n_s32(t_lo[10], DCT_CONST_BITS);
+ t_hi[10] = vrshrq_n_s32(t_hi[10], DCT_CONST_BITS);
+ t_lo[11] = vrshrq_n_s32(t_lo[11], DCT_CONST_BITS);
+ t_hi[11] = vrshrq_n_s32(t_hi[11], DCT_CONST_BITS);
+ t_lo[12] = vrshrq_n_s32(t_lo[12], DCT_CONST_BITS);
+ t_hi[12] = vrshrq_n_s32(t_hi[12], DCT_CONST_BITS);
+ t_lo[13] = vrshrq_n_s32(t_lo[13], DCT_CONST_BITS);
+ t_hi[13] = vrshrq_n_s32(t_hi[13], DCT_CONST_BITS);
+ t_lo[14] = vrshrq_n_s32(t_lo[14], DCT_CONST_BITS);
+ t_hi[14] = vrshrq_n_s32(t_hi[14], DCT_CONST_BITS);
+ t_lo[15] = vrshrq_n_s32(t_lo[15], DCT_CONST_BITS);
+ t_hi[15] = vrshrq_n_s32(t_hi[15], DCT_CONST_BITS);
+
+ // stage 3
+ s_lo[0] = t_lo[0];
+ s_hi[0] = t_hi[0];
+ s_lo[1] = t_lo[1];
+ s_hi[1] = t_hi[1];
+ s_lo[2] = t_lo[2];
+ s_hi[2] = t_hi[2];
+ s_lo[3] = t_lo[3];
+ s_hi[3] = t_hi[3];
+ // s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
+ // s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
+ butterfly_two_coeff_s32_noround(t_lo[4], t_hi[4], t_lo[5], t_hi[5],
+ cospi_8_64, cospi_24_64, &s_lo[4], &s_hi[4],
+ &s_lo[5], &s_hi[5]);
+ // s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
+ // s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
+ butterfly_two_coeff_s32_noround(t_lo[7], t_hi[7], t_lo[6], t_hi[6],
+ cospi_24_64, cospi_8_64, &s_lo[7], &s_hi[7],
+ &s_lo[6], &s_hi[6]);
+ s_lo[8] = t_lo[8];
+ s_hi[8] = t_hi[8];
+ s_lo[9] = t_lo[9];
+ s_hi[9] = t_hi[9];
+ s_lo[10] = t_lo[10];
+ s_hi[10] = t_hi[10];
+ s_lo[11] = t_lo[11];
+ s_hi[11] = t_hi[11];
+ // s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
+ // s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
+ butterfly_two_coeff_s32_noround(t_lo[12], t_hi[12], t_lo[13], t_hi[13],
+ cospi_8_64, cospi_24_64, &s_lo[12], &s_hi[12],
+ &s_lo[13], &s_hi[13]);
+ // s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
+ // s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
+ butterfly_two_coeff_s32_noround(t_lo[15], t_hi[15], t_lo[14], t_hi[14],
+ cospi_24_64, cospi_8_64, &s_lo[15], &s_hi[15],
+ &s_lo[14], &s_hi[14]);
+
+ // s0 + s2
+ t_lo[0] = vaddq_s32(s_lo[0], s_lo[2]);
+ t_hi[0] = vaddq_s32(s_hi[0], s_hi[2]);
+ // s1 + s3
+ t_lo[1] = vaddq_s32(s_lo[1], s_lo[3]);
+ t_hi[1] = vaddq_s32(s_hi[1], s_hi[3]);
+ // s0 - s2
+ t_lo[2] = vsubq_s32(s_lo[0], s_lo[2]);
+ t_hi[2] = vsubq_s32(s_hi[0], s_hi[2]);
+ // s1 - s3
+ t_lo[3] = vsubq_s32(s_lo[1], s_lo[3]);
+ t_hi[3] = vsubq_s32(s_hi[1], s_hi[3]);
+ // s4 + s6
+ t_lo[4] = vaddq_s32(s_lo[4], s_lo[6]);
+ t_hi[4] = vaddq_s32(s_hi[4], s_hi[6]);
+ // s5 + s7
+ t_lo[5] = vaddq_s32(s_lo[5], s_lo[7]);
+ t_hi[5] = vaddq_s32(s_hi[5], s_hi[7]);
+ // s4 - s6
+ t_lo[6] = vsubq_s32(s_lo[4], s_lo[6]);
+ t_hi[6] = vsubq_s32(s_hi[4], s_hi[6]);
+ // s5 - s7
+ t_lo[7] = vsubq_s32(s_lo[5], s_lo[7]);
+ t_hi[7] = vsubq_s32(s_hi[5], s_hi[7]);
+ // s8 + s10
+ t_lo[8] = vaddq_s32(s_lo[8], s_lo[10]);
+ t_hi[8] = vaddq_s32(s_hi[8], s_hi[10]);
+ // s9 + s11
+ t_lo[9] = vaddq_s32(s_lo[9], s_lo[11]);
+ t_hi[9] = vaddq_s32(s_hi[9], s_hi[11]);
+ // s8 - s10
+ t_lo[10] = vsubq_s32(s_lo[8], s_lo[10]);
+ t_hi[10] = vsubq_s32(s_hi[8], s_hi[10]);
+ // s9 - s11
+ t_lo[11] = vsubq_s32(s_lo[9], s_lo[11]);
+ t_hi[11] = vsubq_s32(s_hi[9], s_hi[11]);
+ // s12 + s14
+ t_lo[12] = vaddq_s32(s_lo[12], s_lo[14]);
+ t_hi[12] = vaddq_s32(s_hi[12], s_hi[14]);
+ // s13 + s15
+ t_lo[13] = vaddq_s32(s_lo[13], s_lo[15]);
+ t_hi[13] = vaddq_s32(s_hi[13], s_hi[15]);
+ // s12 - s14
+ t_lo[14] = vsubq_s32(s_lo[12], s_lo[14]);
+ t_hi[14] = vsubq_s32(s_hi[12], s_hi[14]);
+ // s13 - s15
+ t_lo[15] = vsubq_s32(s_lo[13], s_lo[15]);
+ t_hi[15] = vsubq_s32(s_hi[13], s_hi[15]);
+
+ t_lo[4] = vrshrq_n_s32(t_lo[4], DCT_CONST_BITS);
+ t_hi[4] = vrshrq_n_s32(t_hi[4], DCT_CONST_BITS);
+ t_lo[5] = vrshrq_n_s32(t_lo[5], DCT_CONST_BITS);
+ t_hi[5] = vrshrq_n_s32(t_hi[5], DCT_CONST_BITS);
+ t_lo[6] = vrshrq_n_s32(t_lo[6], DCT_CONST_BITS);
+ t_hi[6] = vrshrq_n_s32(t_hi[6], DCT_CONST_BITS);
+ t_lo[7] = vrshrq_n_s32(t_lo[7], DCT_CONST_BITS);
+ t_hi[7] = vrshrq_n_s32(t_hi[7], DCT_CONST_BITS);
+ t_lo[12] = vrshrq_n_s32(t_lo[12], DCT_CONST_BITS);
+ t_hi[12] = vrshrq_n_s32(t_hi[12], DCT_CONST_BITS);
+ t_lo[13] = vrshrq_n_s32(t_lo[13], DCT_CONST_BITS);
+ t_hi[13] = vrshrq_n_s32(t_hi[13], DCT_CONST_BITS);
+ t_lo[14] = vrshrq_n_s32(t_lo[14], DCT_CONST_BITS);
+ t_hi[14] = vrshrq_n_s32(t_hi[14], DCT_CONST_BITS);
+ t_lo[15] = vrshrq_n_s32(t_lo[15], DCT_CONST_BITS);
+ t_hi[15] = vrshrq_n_s32(t_hi[15], DCT_CONST_BITS);
+
+ // stage 4
+ // s2 = (-cospi_16_64) * (x2 + x3);
+ // s3 = cospi_16_64 * (x2 - x3);
+ butterfly_one_coeff_s32_noround(t_lo[3], t_hi[3], t_lo[2], t_hi[2],
+ -cospi_16_64, &s_lo[2], &s_hi[2], &s_lo[3],
+ &s_hi[3]);
+ // s6 = cospi_16_64 * (x6 + x7);
+ // s7 = cospi_16_64 * (-x6 + x7);
+ butterfly_one_coeff_s32_noround(t_lo[7], t_hi[7], t_lo[6], t_hi[6],
+ cospi_16_64, &s_lo[6], &s_hi[6], &s_lo[7],
+ &s_hi[7]);
+ // s10 = cospi_16_64 * (x10 + x11);
+ // s11 = cospi_16_64 * (-x10 + x11);
+ butterfly_one_coeff_s32_noround(t_lo[11], t_hi[11], t_lo[10], t_hi[10],
+ cospi_16_64, &s_lo[10], &s_hi[10], &s_lo[11],
+ &s_hi[11]);
+ // s14 = (-cospi_16_64) * (x14 + x15);
+ // s15 = cospi_16_64 * (x14 - x15);
+ butterfly_one_coeff_s32_noround(t_lo[15], t_hi[15], t_lo[14], t_hi[14],
+ -cospi_16_64, &s_lo[14], &s_hi[14], &s_lo[15],
+ &s_hi[15]);
+
+ // final fdct_round_shift
+ x_lo[2] = vrshrn_n_s32(s_lo[2], DCT_CONST_BITS);
+ x_hi[2] = vrshrn_n_s32(s_hi[2], DCT_CONST_BITS);
+ x_lo[3] = vrshrn_n_s32(s_lo[3], DCT_CONST_BITS);
+ x_hi[3] = vrshrn_n_s32(s_hi[3], DCT_CONST_BITS);
+ x_lo[6] = vrshrn_n_s32(s_lo[6], DCT_CONST_BITS);
+ x_hi[6] = vrshrn_n_s32(s_hi[6], DCT_CONST_BITS);
+ x_lo[7] = vrshrn_n_s32(s_lo[7], DCT_CONST_BITS);
+ x_hi[7] = vrshrn_n_s32(s_hi[7], DCT_CONST_BITS);
+ x_lo[10] = vrshrn_n_s32(s_lo[10], DCT_CONST_BITS);
+ x_hi[10] = vrshrn_n_s32(s_hi[10], DCT_CONST_BITS);
+ x_lo[11] = vrshrn_n_s32(s_lo[11], DCT_CONST_BITS);
+ x_hi[11] = vrshrn_n_s32(s_hi[11], DCT_CONST_BITS);
+ x_lo[14] = vrshrn_n_s32(s_lo[14], DCT_CONST_BITS);
+ x_hi[14] = vrshrn_n_s32(s_hi[14], DCT_CONST_BITS);
+ x_lo[15] = vrshrn_n_s32(s_lo[15], DCT_CONST_BITS);
+ x_hi[15] = vrshrn_n_s32(s_hi[15], DCT_CONST_BITS);
+
+ // x0, x1, x4, x5, x8, x9, x12, x13 narrow down to 16-bits directly
+ x_lo[0] = vmovn_s32(t_lo[0]);
+ x_hi[0] = vmovn_s32(t_hi[0]);
+ x_lo[1] = vmovn_s32(t_lo[1]);
+ x_hi[1] = vmovn_s32(t_hi[1]);
+ x_lo[4] = vmovn_s32(t_lo[4]);
+ x_hi[4] = vmovn_s32(t_hi[4]);
+ x_lo[5] = vmovn_s32(t_lo[5]);
+ x_hi[5] = vmovn_s32(t_hi[5]);
+ x_lo[8] = vmovn_s32(t_lo[8]);
+ x_hi[8] = vmovn_s32(t_hi[8]);
+ x_lo[9] = vmovn_s32(t_lo[9]);
+ x_hi[9] = vmovn_s32(t_hi[9]);
+ x_lo[12] = vmovn_s32(t_lo[12]);
+ x_hi[12] = vmovn_s32(t_hi[12]);
+ x_lo[13] = vmovn_s32(t_lo[13]);
+ x_hi[13] = vmovn_s32(t_hi[13]);
+
+ in[0] = vcombine_s16(x_lo[0], x_hi[0]);
+ in[1] = vnegq_s16(vcombine_s16(x_lo[8], x_hi[8]));
+ in[2] = vcombine_s16(x_lo[12], x_hi[12]);
+ in[3] = vnegq_s16(vcombine_s16(x_lo[4], x_hi[4]));
+ in[4] = vcombine_s16(x_lo[6], x_hi[6]);
+ in[5] = vcombine_s16(x_lo[14], x_hi[14]);
+ in[6] = vcombine_s16(x_lo[10], x_hi[10]);
+ in[7] = vcombine_s16(x_lo[2], x_hi[2]);
+ in[8] = vcombine_s16(x_lo[3], x_hi[3]);
+ in[9] = vcombine_s16(x_lo[11], x_hi[11]);
+ in[10] = vcombine_s16(x_lo[15], x_hi[15]);
+ in[11] = vcombine_s16(x_lo[7], x_hi[7]);
+ in[12] = vcombine_s16(x_lo[5], x_hi[5]);
+ in[13] = vnegq_s16(vcombine_s16(x_lo[13], x_hi[13]));
+ in[14] = vcombine_s16(x_lo[9], x_hi[9]);
+ in[15] = vnegq_s16(vcombine_s16(x_lo[1], x_hi[1]));
+}
+
+static void fdct16x16_neon(int16x8_t *in0, int16x8_t *in1) {
+ // Left half.
+ fdct16_8col(in0);
+ // Right half.
+ fdct16_8col(in1);
+ transpose_s16_16x16(in0, in1);
+}
+
+static void fadst16x16_neon(int16x8_t *in0, int16x8_t *in1) {
+ fadst16_8col(in0);
+ fadst16_8col(in1);
+ transpose_s16_16x16(in0, in1);
+}
+
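+// Unlike the 4x4/8x8 hybrid transforms above, the 16x16 version rounds the
+// intermediate coefficients (right_shift_16x16) between the two 1-D passes
+// rather than once at the end.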
+void vp9_fht16x16_neon(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ int16x8_t in0[16], in1[16];
+
+ switch (tx_type) {
+ case DCT_DCT: vpx_fdct16x16_neon(input, output, stride); break;
+ case ADST_DCT:
+ load_buffer_16x16(input, in0, in1, stride);
+ fadst16x16_neon(in0, in1);
+ right_shift_16x16(in0, in1);
+ fdct16x16_neon(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
+ case DCT_ADST:
+ load_buffer_16x16(input, in0, in1, stride);
+ fdct16x16_neon(in0, in1);
+ right_shift_16x16(in0, in1);
+ fadst16x16_neon(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
+ default:
+ assert(tx_type == ADST_ADST);
+ load_buffer_16x16(input, in0, in1, stride);
+ fadst16x16_neon(in0, in1);
+ right_shift_16x16(in0, in1);
+ fadst16x16_neon(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
+ }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
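+// High bit-depth variants: coefficients stay in 32-bit lanes and the
+// butterfly products are accumulated in 64-bit lanes before being rounded
+// back down, mirroring the 16/32-bit arithmetic of the code above.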
+static INLINE void highbd_load_buffer_4x4(const int16_t *input,
+ int32x4_t *in /*[4]*/, int stride) {
+ // { 0, 1, 1, 1 };
+ const int32x4_t nonzero_bias_a = vextq_s32(vdupq_n_s32(0), vdupq_n_s32(1), 3);
+ // { 1, 0, 0, 0 };
+ const int32x4_t nonzero_bias_b = vextq_s32(vdupq_n_s32(1), vdupq_n_s32(0), 3);
+ int32x4_t mask;
+
+ in[0] = vshll_n_s16(vld1_s16(input + 0 * stride), 4);
+ in[1] = vshll_n_s16(vld1_s16(input + 1 * stride), 4);
+ in[2] = vshll_n_s16(vld1_s16(input + 2 * stride), 4);
+ in[3] = vshll_n_s16(vld1_s16(input + 3 * stride), 4);
+
+ // Copy the SSE method: use a mask instead of an 'if' branch to add one to
+ // the first element when it is non-zero.
+ mask = vreinterpretq_s32_u32(vceqq_s32(in[0], nonzero_bias_a));
+ in[0] = vaddq_s32(in[0], mask);
+ in[0] = vaddq_s32(in[0], nonzero_bias_b);
+}
+
+static INLINE void highbd_write_buffer_4x4(tran_low_t *output, int32x4_t *res) {
+ const int32x4_t one = vdupq_n_s32(1);
+ res[0] = vshrq_n_s32(vaddq_s32(res[0], one), 2);
+ res[1] = vshrq_n_s32(vaddq_s32(res[1], one), 2);
+ res[2] = vshrq_n_s32(vaddq_s32(res[2], one), 2);
+ res[3] = vshrq_n_s32(vaddq_s32(res[3], one), 2);
+ vst1q_s32(output + 0 * 4, res[0]);
+ vst1q_s32(output + 1 * 4, res[1]);
+ vst1q_s32(output + 2 * 4, res[2]);
+ vst1q_s32(output + 3 * 4, res[3]);
+}
+
+static INLINE void highbd_fadst4x4_neon(int32x4_t *in /*[4]*/) {
+ int32x2_t s_lo[4], s_hi[4];
+ int64x2_t u_lo[4], u_hi[4], t_lo[4], t_hi[4];
+
+ s_lo[0] = vget_low_s32(in[0]);
+ s_hi[0] = vget_high_s32(in[0]);
+ s_lo[1] = vget_low_s32(in[1]);
+ s_hi[1] = vget_high_s32(in[1]);
+ s_lo[2] = vget_low_s32(in[2]);
+ s_hi[2] = vget_high_s32(in[2]);
+ s_lo[3] = vget_low_s32(in[3]);
+ s_hi[3] = vget_high_s32(in[3]);
+
+ // t0 = s0 * sinpi_1_9 + s1 * sinpi_2_9 + s3 * sinpi_4_9
+ t_lo[0] = vmull_n_s32(s_lo[0], sinpi_1_9);
+ t_lo[0] = vmlal_n_s32(t_lo[0], s_lo[1], sinpi_2_9);
+ t_lo[0] = vmlal_n_s32(t_lo[0], s_lo[3], sinpi_4_9);
+ t_hi[0] = vmull_n_s32(s_hi[0], sinpi_1_9);
+ t_hi[0] = vmlal_n_s32(t_hi[0], s_hi[1], sinpi_2_9);
+ t_hi[0] = vmlal_n_s32(t_hi[0], s_hi[3], sinpi_4_9);
+
+ // t1 = (s0 + s1) * sinpi_3_9 - s3 * sinpi_3_9
+ t_lo[1] = vmull_n_s32(s_lo[0], sinpi_3_9);
+ t_lo[1] = vmlal_n_s32(t_lo[1], s_lo[1], sinpi_3_9);
+ t_lo[1] = vmlsl_n_s32(t_lo[1], s_lo[3], sinpi_3_9);
+ t_hi[1] = vmull_n_s32(s_hi[0], sinpi_3_9);
+ t_hi[1] = vmlal_n_s32(t_hi[1], s_hi[1], sinpi_3_9);
+ t_hi[1] = vmlsl_n_s32(t_hi[1], s_hi[3], sinpi_3_9);
+
+ // t2 = s0 * sinpi_4_9 - s1 * sinpi_1_9 + s3 * sinpi_2_9
+ t_lo[2] = vmull_n_s32(s_lo[0], sinpi_4_9);
+ t_lo[2] = vmlsl_n_s32(t_lo[2], s_lo[1], sinpi_1_9);
+ t_lo[2] = vmlal_n_s32(t_lo[2], s_lo[3], sinpi_2_9);
+ t_hi[2] = vmull_n_s32(s_hi[0], sinpi_4_9);
+ t_hi[2] = vmlsl_n_s32(t_hi[2], s_hi[1], sinpi_1_9);
+ t_hi[2] = vmlal_n_s32(t_hi[2], s_hi[3], sinpi_2_9);
+
+ // t3 = s2 * sinpi_3_9
+ t_lo[3] = vmull_n_s32(s_lo[2], sinpi_3_9);
+ t_hi[3] = vmull_n_s32(s_hi[2], sinpi_3_9);
+
+ /*
+ * u0 = t0 + t3
+ * u1 = t1
+ * u2 = t2 - t3
+ * u3 = t2 - t0 + t3
+ */
+ u_lo[0] = vaddq_s64(t_lo[0], t_lo[3]);
+ u_hi[0] = vaddq_s64(t_hi[0], t_hi[3]);
+ u_lo[1] = t_lo[1];
+ u_hi[1] = t_hi[1];
+ u_lo[2] = vsubq_s64(t_lo[2], t_lo[3]);
+ u_hi[2] = vsubq_s64(t_hi[2], t_hi[3]);
+ u_lo[3] = vaddq_s64(vsubq_s64(t_lo[2], t_lo[0]), t_lo[3]);
+ u_hi[3] = vaddq_s64(vsubq_s64(t_hi[2], t_hi[0]), t_hi[3]);
+
+ // fdct_round_shift
+ in[0] = vcombine_s32(vrshrn_n_s64(u_lo[0], DCT_CONST_BITS),
+ vrshrn_n_s64(u_hi[0], DCT_CONST_BITS));
+ in[1] = vcombine_s32(vrshrn_n_s64(u_lo[1], DCT_CONST_BITS),
+ vrshrn_n_s64(u_hi[1], DCT_CONST_BITS));
+ in[2] = vcombine_s32(vrshrn_n_s64(u_lo[2], DCT_CONST_BITS),
+ vrshrn_n_s64(u_hi[2], DCT_CONST_BITS));
+ in[3] = vcombine_s32(vrshrn_n_s64(u_lo[3], DCT_CONST_BITS),
+ vrshrn_n_s64(u_hi[3], DCT_CONST_BITS));
+
+ transpose_s32_4x4(&in[0], &in[1], &in[2], &in[3]);
+}
+
+void vp9_highbd_fht4x4_neon(const int16_t *input, tran_low_t *output,
+ int stride, int tx_type) {
+ int32x4_t in[4];
+ // int i;
+
+ switch (tx_type) {
+ case DCT_DCT: vpx_highbd_fdct4x4_neon(input, output, stride); break;
+ case ADST_DCT:
+ highbd_load_buffer_4x4(input, in, stride);
+ highbd_fadst4x4_neon(in);
+ vpx_highbd_fdct4x4_pass1_neon(in);
+ highbd_write_buffer_4x4(output, in);
+ break;
+ case DCT_ADST:
+ highbd_load_buffer_4x4(input, in, stride);
+ vpx_highbd_fdct4x4_pass1_neon(in);
+ highbd_fadst4x4_neon(in);
+ highbd_write_buffer_4x4(output, in);
+ break;
+ default:
+ assert(tx_type == ADST_ADST);
+ highbd_load_buffer_4x4(input, in, stride);
+ highbd_fadst4x4_neon(in);
+ highbd_fadst4x4_neon(in);
+ highbd_write_buffer_4x4(output, in);
+ break;
+ }
+}
+
+static INLINE void highbd_load_buffer_8x8(const int16_t *input,
+ int32x4_t *lo /*[8]*/,
+ int32x4_t *hi /*[8]*/, int stride) {
+ int16x8_t in[8];
+ in[0] = vld1q_s16(input + 0 * stride);
+ in[1] = vld1q_s16(input + 1 * stride);
+ in[2] = vld1q_s16(input + 2 * stride);
+ in[3] = vld1q_s16(input + 3 * stride);
+ in[4] = vld1q_s16(input + 4 * stride);
+ in[5] = vld1q_s16(input + 5 * stride);
+ in[6] = vld1q_s16(input + 6 * stride);
+ in[7] = vld1q_s16(input + 7 * stride);
+ lo[0] = vshll_n_s16(vget_low_s16(in[0]), 2);
+ hi[0] = vshll_n_s16(vget_high_s16(in[0]), 2);
+ lo[1] = vshll_n_s16(vget_low_s16(in[1]), 2);
+ hi[1] = vshll_n_s16(vget_high_s16(in[1]), 2);
+ lo[2] = vshll_n_s16(vget_low_s16(in[2]), 2);
+ hi[2] = vshll_n_s16(vget_high_s16(in[2]), 2);
+ lo[3] = vshll_n_s16(vget_low_s16(in[3]), 2);
+ hi[3] = vshll_n_s16(vget_high_s16(in[3]), 2);
+ lo[4] = vshll_n_s16(vget_low_s16(in[4]), 2);
+ hi[4] = vshll_n_s16(vget_high_s16(in[4]), 2);
+ lo[5] = vshll_n_s16(vget_low_s16(in[5]), 2);
+ hi[5] = vshll_n_s16(vget_high_s16(in[5]), 2);
+ lo[6] = vshll_n_s16(vget_low_s16(in[6]), 2);
+ hi[6] = vshll_n_s16(vget_high_s16(in[6]), 2);
+ lo[7] = vshll_n_s16(vget_low_s16(in[7]), 2);
+ hi[7] = vshll_n_s16(vget_high_s16(in[7]), 2);
+}
+
+/* right shift and rounding
+ * first get the sign bit (bit 31).
+ * If bit == 1, it's the simple case of shifting right by one bit.
+ * If bit == 2, it essentially computes the expression:
+ *
+ * out[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
+ *
+ * for each row.
+ */
+static INLINE void highbd_right_shift_8x8(int32x4_t *lo, int32x4_t *hi,
+ const int bit) {
+ int32x4_t sign_lo[8], sign_hi[8];
+ sign_lo[0] = vshrq_n_s32(lo[0], 31);
+ sign_hi[0] = vshrq_n_s32(hi[0], 31);
+ sign_lo[1] = vshrq_n_s32(lo[1], 31);
+ sign_hi[1] = vshrq_n_s32(hi[1], 31);
+ sign_lo[2] = vshrq_n_s32(lo[2], 31);
+ sign_hi[2] = vshrq_n_s32(hi[2], 31);
+ sign_lo[3] = vshrq_n_s32(lo[3], 31);
+ sign_hi[3] = vshrq_n_s32(hi[3], 31);
+ sign_lo[4] = vshrq_n_s32(lo[4], 31);
+ sign_hi[4] = vshrq_n_s32(hi[4], 31);
+ sign_lo[5] = vshrq_n_s32(lo[5], 31);
+ sign_hi[5] = vshrq_n_s32(hi[5], 31);
+ sign_lo[6] = vshrq_n_s32(lo[6], 31);
+ sign_hi[6] = vshrq_n_s32(hi[6], 31);
+ sign_lo[7] = vshrq_n_s32(lo[7], 31);
+ sign_hi[7] = vshrq_n_s32(hi[7], 31);
+
+ if (bit == 2) {
+ const int32x4_t const_rounding = vdupq_n_s32(1);
+ lo[0] = vaddq_s32(lo[0], const_rounding);
+ hi[0] = vaddq_s32(hi[0], const_rounding);
+ lo[1] = vaddq_s32(lo[1], const_rounding);
+ hi[1] = vaddq_s32(hi[1], const_rounding);
+ lo[2] = vaddq_s32(lo[2], const_rounding);
+ hi[2] = vaddq_s32(hi[2], const_rounding);
+ lo[3] = vaddq_s32(lo[3], const_rounding);
+ hi[3] = vaddq_s32(hi[3], const_rounding);
+ lo[4] = vaddq_s32(lo[4], const_rounding);
+ hi[4] = vaddq_s32(hi[4], const_rounding);
+ lo[5] = vaddq_s32(lo[5], const_rounding);
+ hi[5] = vaddq_s32(hi[5], const_rounding);
+ lo[6] = vaddq_s32(lo[6], const_rounding);
+ hi[6] = vaddq_s32(hi[6], const_rounding);
+ lo[7] = vaddq_s32(lo[7], const_rounding);
+ hi[7] = vaddq_s32(hi[7], const_rounding);
+ }
+
+ lo[0] = vsubq_s32(lo[0], sign_lo[0]);
+ hi[0] = vsubq_s32(hi[0], sign_hi[0]);
+ lo[1] = vsubq_s32(lo[1], sign_lo[1]);
+ hi[1] = vsubq_s32(hi[1], sign_hi[1]);
+ lo[2] = vsubq_s32(lo[2], sign_lo[2]);
+ hi[2] = vsubq_s32(hi[2], sign_hi[2]);
+ lo[3] = vsubq_s32(lo[3], sign_lo[3]);
+ hi[3] = vsubq_s32(hi[3], sign_hi[3]);
+ lo[4] = vsubq_s32(lo[4], sign_lo[4]);
+ hi[4] = vsubq_s32(hi[4], sign_hi[4]);
+ lo[5] = vsubq_s32(lo[5], sign_lo[5]);
+ hi[5] = vsubq_s32(hi[5], sign_hi[5]);
+ lo[6] = vsubq_s32(lo[6], sign_lo[6]);
+ hi[6] = vsubq_s32(hi[6], sign_hi[6]);
+ lo[7] = vsubq_s32(lo[7], sign_lo[7]);
+ hi[7] = vsubq_s32(hi[7], sign_hi[7]);
+
+ if (bit == 1) {
+ lo[0] = vshrq_n_s32(lo[0], 1);
+ hi[0] = vshrq_n_s32(hi[0], 1);
+ lo[1] = vshrq_n_s32(lo[1], 1);
+ hi[1] = vshrq_n_s32(hi[1], 1);
+ lo[2] = vshrq_n_s32(lo[2], 1);
+ hi[2] = vshrq_n_s32(hi[2], 1);
+ lo[3] = vshrq_n_s32(lo[3], 1);
+ hi[3] = vshrq_n_s32(hi[3], 1);
+ lo[4] = vshrq_n_s32(lo[4], 1);
+ hi[4] = vshrq_n_s32(hi[4], 1);
+ lo[5] = vshrq_n_s32(lo[5], 1);
+ hi[5] = vshrq_n_s32(hi[5], 1);
+ lo[6] = vshrq_n_s32(lo[6], 1);
+ hi[6] = vshrq_n_s32(hi[6], 1);
+ lo[7] = vshrq_n_s32(lo[7], 1);
+ hi[7] = vshrq_n_s32(hi[7], 1);
+ } else {
+ lo[0] = vshrq_n_s32(lo[0], 2);
+ hi[0] = vshrq_n_s32(hi[0], 2);
+ lo[1] = vshrq_n_s32(lo[1], 2);
+ hi[1] = vshrq_n_s32(hi[1], 2);
+ lo[2] = vshrq_n_s32(lo[2], 2);
+ hi[2] = vshrq_n_s32(hi[2], 2);
+ lo[3] = vshrq_n_s32(lo[3], 2);
+ hi[3] = vshrq_n_s32(hi[3], 2);
+ lo[4] = vshrq_n_s32(lo[4], 2);
+ hi[4] = vshrq_n_s32(hi[4], 2);
+ lo[5] = vshrq_n_s32(lo[5], 2);
+ hi[5] = vshrq_n_s32(hi[5], 2);
+ lo[6] = vshrq_n_s32(lo[6], 2);
+ hi[6] = vshrq_n_s32(hi[6], 2);
+ lo[7] = vshrq_n_s32(lo[7], 2);
+ hi[7] = vshrq_n_s32(hi[7], 2);
+ }
+}
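+
+// A scalar sketch of the trick used above (illustrative only, not called by
+// the NEON code): the arithmetic shift by 31 turns the sign into 0 or -1, so
+// subtracting that mask is the same as adding (x < 0) before the final shift.
+static INLINE int32_t highbd_round_shift_scalar(int32_t x, const int bit) {
+  const int32_t sign = x >> 31;  // 0 for x >= 0, -1 for x < 0
+  const int32_t rounding = (bit == 2) ? 1 : 0;
+  return (x + rounding - sign) >> bit;
+}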
+
+static INLINE void highbd_write_buffer_8x8(tran_low_t *output, int32x4_t *lo,
+ int32x4_t *hi, int stride) {
+ vst1q_s32(output + 0 * stride, lo[0]);
+ vst1q_s32(output + 0 * stride + 4, hi[0]);
+ vst1q_s32(output + 1 * stride, lo[1]);
+ vst1q_s32(output + 1 * stride + 4, hi[1]);
+ vst1q_s32(output + 2 * stride, lo[2]);
+ vst1q_s32(output + 2 * stride + 4, hi[2]);
+ vst1q_s32(output + 3 * stride, lo[3]);
+ vst1q_s32(output + 3 * stride + 4, hi[3]);
+ vst1q_s32(output + 4 * stride, lo[4]);
+ vst1q_s32(output + 4 * stride + 4, hi[4]);
+ vst1q_s32(output + 5 * stride, lo[5]);
+ vst1q_s32(output + 5 * stride + 4, hi[5]);
+ vst1q_s32(output + 6 * stride, lo[6]);
+ vst1q_s32(output + 6 * stride + 4, hi[6]);
+ vst1q_s32(output + 7 * stride, lo[7]);
+ vst1q_s32(output + 7 * stride + 4, hi[7]);
+}
+
+static INLINE void highbd_fadst8x8_neon(int32x4_t *lo /*[8]*/,
+ int32x4_t *hi /*[8]*/) {
+ int32x4_t s_lo[8], s_hi[8];
+ int32x4_t t_lo[8], t_hi[8];
+ int32x4_t x_lo[8], x_hi[8];
+ int64x2_t s64_lo[16], s64_hi[16];
+
+ x_lo[0] = lo[7];
+ x_hi[0] = hi[7];
+ x_lo[1] = lo[0];
+ x_hi[1] = hi[0];
+ x_lo[2] = lo[5];
+ x_hi[2] = hi[5];
+ x_lo[3] = lo[2];
+ x_hi[3] = hi[2];
+ x_lo[4] = lo[3];
+ x_hi[4] = hi[3];
+ x_lo[5] = lo[4];
+ x_hi[5] = hi[4];
+ x_lo[6] = lo[1];
+ x_hi[6] = hi[1];
+ x_lo[7] = lo[6];
+ x_hi[7] = hi[6];
+
+ // stage 1
+ // s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
+ // s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
+ butterfly_two_coeff_s32_s64_noround(
+ x_lo[0], x_hi[0], x_lo[1], x_hi[1], cospi_2_64, cospi_30_64,
+ &s64_lo[2 * 0], &s64_hi[2 * 0], &s64_lo[2 * 1], &s64_hi[2 * 1]);
+ // s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
+ // s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
+ butterfly_two_coeff_s32_s64_noround(
+ x_lo[2], x_hi[2], x_lo[3], x_hi[3], cospi_10_64, cospi_22_64,
+ &s64_lo[2 * 2], &s64_hi[2 * 2], &s64_lo[2 * 3], &s64_hi[2 * 3]);
+
+ // s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
+ // s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
+ butterfly_two_coeff_s32_s64_noround(
+ x_lo[4], x_hi[4], x_lo[5], x_hi[5], cospi_18_64, cospi_14_64,
+ &s64_lo[2 * 4], &s64_hi[2 * 4], &s64_lo[2 * 5], &s64_hi[2 * 5]);
+
+ // s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
+ // s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
+ butterfly_two_coeff_s32_s64_noround(
+ x_lo[6], x_hi[6], x_lo[7], x_hi[7], cospi_26_64, cospi_6_64,
+ &s64_lo[2 * 6], &s64_hi[2 * 6], &s64_lo[2 * 7], &s64_hi[2 * 7]);
+
+ // fdct_round_shift, indices are doubled
+ t_lo[0] = add_s64_round_narrow(&s64_lo[2 * 0], &s64_lo[2 * 4]);
+ t_hi[0] = add_s64_round_narrow(&s64_hi[2 * 0], &s64_hi[2 * 4]);
+ t_lo[1] = add_s64_round_narrow(&s64_lo[2 * 1], &s64_lo[2 * 5]);
+ t_hi[1] = add_s64_round_narrow(&s64_hi[2 * 1], &s64_hi[2 * 5]);
+ t_lo[2] = add_s64_round_narrow(&s64_lo[2 * 2], &s64_lo[2 * 6]);
+ t_hi[2] = add_s64_round_narrow(&s64_hi[2 * 2], &s64_hi[2 * 6]);
+ t_lo[3] = add_s64_round_narrow(&s64_lo[2 * 3], &s64_lo[2 * 7]);
+ t_hi[3] = add_s64_round_narrow(&s64_hi[2 * 3], &s64_hi[2 * 7]);
+ t_lo[4] = sub_s64_round_narrow(&s64_lo[2 * 0], &s64_lo[2 * 4]);
+ t_hi[4] = sub_s64_round_narrow(&s64_hi[2 * 0], &s64_hi[2 * 4]);
+ t_lo[5] = sub_s64_round_narrow(&s64_lo[2 * 1], &s64_lo[2 * 5]);
+ t_hi[5] = sub_s64_round_narrow(&s64_hi[2 * 1], &s64_hi[2 * 5]);
+ t_lo[6] = sub_s64_round_narrow(&s64_lo[2 * 2], &s64_lo[2 * 6]);
+ t_hi[6] = sub_s64_round_narrow(&s64_hi[2 * 2], &s64_hi[2 * 6]);
+ t_lo[7] = sub_s64_round_narrow(&s64_lo[2 * 3], &s64_lo[2 * 7]);
+ t_hi[7] = sub_s64_round_narrow(&s64_hi[2 * 3], &s64_hi[2 * 7]);
+
+ // stage 2
+ s_lo[0] = t_lo[0];
+ s_hi[0] = t_hi[0];
+ s_lo[1] = t_lo[1];
+ s_hi[1] = t_hi[1];
+ s_lo[2] = t_lo[2];
+ s_hi[2] = t_hi[2];
+ s_lo[3] = t_lo[3];
+ s_hi[3] = t_hi[3];
+ // s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
+ // s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
+ butterfly_two_coeff_s32_s64_noround(
+ t_lo[4], t_hi[4], t_lo[5], t_hi[5], cospi_8_64, cospi_24_64,
+ &s64_lo[2 * 4], &s64_hi[2 * 4], &s64_lo[2 * 5], &s64_hi[2 * 5]);
+
+ // s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
+ // s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
+ butterfly_two_coeff_s32_s64_noround(
+ t_lo[6], t_hi[6], t_lo[7], t_hi[7], -cospi_24_64, cospi_8_64,
+ &s64_lo[2 * 6], &s64_hi[2 * 6], &s64_lo[2 * 7], &s64_hi[2 * 7]);
+
+ // fdct_round_shift
+ // s0 + s2
+ t_lo[0] = add_s32_s64_narrow(s_lo[0], s_lo[2]);
+ t_hi[0] = add_s32_s64_narrow(s_hi[0], s_hi[2]);
+ // s0 - s2
+ t_lo[2] = sub_s32_s64_narrow(s_lo[0], s_lo[2]);
+ t_hi[2] = sub_s32_s64_narrow(s_hi[0], s_hi[2]);
+
+ // s1 + s3
+ t_lo[1] = add_s32_s64_narrow(s_lo[1], s_lo[3]);
+ t_hi[1] = add_s32_s64_narrow(s_hi[1], s_hi[3]);
+ // s1 - s3
+ t_lo[3] = sub_s32_s64_narrow(s_lo[1], s_lo[3]);
+ t_hi[3] = sub_s32_s64_narrow(s_hi[1], s_hi[3]);
+
+ // s4 + s6
+ t_lo[4] = add_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 6]);
+ t_hi[4] = add_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 6]);
+ // s4 - s6
+ t_lo[6] = sub_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 6]);
+ t_hi[6] = sub_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 6]);
+
+ // s5 + s7
+ t_lo[5] = add_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 7]);
+ t_hi[5] = add_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 7]);
+ // s5 - s7
+ t_lo[7] = sub_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 7]);
+ t_hi[7] = sub_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 7]);
+
+ // stage 3
+ // s2 = cospi_16_64 * (x2 + x3)
+ // s3 = cospi_16_64 * (x2 - x3)
+ butterfly_one_coeff_s32_fast(t_lo[2], t_hi[2], t_lo[3], t_hi[3], cospi_16_64,
+ &s_lo[2], &s_hi[2], &s_lo[3], &s_hi[3]);
+
+ // s6 = cospi_16_64 * (x6 + x7)
+ // s7 = cospi_16_64 * (x6 - x7)
+ butterfly_one_coeff_s32_fast(t_lo[6], t_hi[6], t_lo[7], t_hi[7], cospi_16_64,
+ &s_lo[6], &s_hi[6], &s_lo[7], &s_hi[7]);
+
+  // even-indexed outputs (x0, x6, x3, x5) are copied without negation
+ lo[0] = t_lo[0];
+ hi[0] = t_hi[0];
+ lo[2] = s_lo[6];
+ hi[2] = s_hi[6];
+ lo[4] = s_lo[3];
+ hi[4] = s_hi[3];
+ lo[6] = t_lo[5];
+ hi[6] = t_hi[5];
+
+ lo[1] = vnegq_s32(t_lo[4]);
+ hi[1] = vnegq_s32(t_hi[4]);
+ lo[3] = vnegq_s32(s_lo[2]);
+ hi[3] = vnegq_s32(s_hi[2]);
+ lo[5] = vnegq_s32(s_lo[7]);
+ hi[5] = vnegq_s32(s_hi[7]);
+ lo[7] = vnegq_s32(t_lo[1]);
+ hi[7] = vnegq_s32(t_hi[1]);
+
+ transpose_s32_8x8_2(lo, hi, lo, hi);
+}
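+
+// For reference, the arithmetic behind the two-coefficient butterflies used
+// above, written as scalars (an illustrative sketch only; the vector helpers
+// come from vpx_dsp and additionally split every value into lo/hi halves).
+// With a = x0, b = x1, c0 = cospi_2_64 and c1 = cospi_30_64 this yields the
+// s0/s1 pair quoted in the stage 1 comments.
+static INLINE void highbd_butterfly_two_coeff_scalar(int32_t a, int32_t b,
+                                                     int64_t c0, int64_t c1,
+                                                     int64_t *out_add,
+                                                     int64_t *out_sub) {
+  *out_add = a * c0 + b * c1;
+  *out_sub = a * c1 - b * c0;
+}
+
+// The fdct_round_shift mentioned in the comments rounds such a product back
+// down by DCT_CONST_BITS; a scalar sketch of that rounding:
+static INLINE int32_t highbd_fdct_round_shift_scalar(int64_t x) {
+  return (int32_t)((x + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
+}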
+
+void vp9_highbd_fht8x8_neon(const int16_t *input, tran_low_t *output,
+ int stride, int tx_type) {
+ int32x4_t lo[8], hi[8];
+
+ switch (tx_type) {
+ case DCT_DCT: vpx_highbd_fdct8x8_neon(input, output, stride); break;
+ case ADST_DCT:
+ highbd_load_buffer_8x8(input, lo, hi, stride);
+ highbd_fadst8x8_neon(lo, hi);
+ // pass1 variant is not precise enough
+ vpx_highbd_fdct8x8_pass2_neon(lo, hi);
+ highbd_right_shift_8x8(lo, hi, 1);
+ highbd_write_buffer_8x8(output, lo, hi, 8);
+ break;
+ case DCT_ADST:
+ highbd_load_buffer_8x8(input, lo, hi, stride);
+ // pass1 variant is not precise enough
+ vpx_highbd_fdct8x8_pass2_neon(lo, hi);
+ highbd_fadst8x8_neon(lo, hi);
+ highbd_right_shift_8x8(lo, hi, 1);
+ highbd_write_buffer_8x8(output, lo, hi, 8);
+ break;
+ default:
+ assert(tx_type == ADST_ADST);
+ highbd_load_buffer_8x8(input, lo, hi, stride);
+ highbd_fadst8x8_neon(lo, hi);
+ highbd_fadst8x8_neon(lo, hi);
+ highbd_right_shift_8x8(lo, hi, 1);
+ highbd_write_buffer_8x8(output, lo, hi, 8);
+ break;
+ }
+}
+
+static INLINE void highbd_load_buffer_16x16(
+ const int16_t *input, int32x4_t *left1 /*[16]*/, int32x4_t *right1 /*[16]*/,
+ int32x4_t *left2 /*[16]*/, int32x4_t *right2 /*[16]*/, int stride) {
+ // load first 8 columns
+ highbd_load_buffer_8x8(input, left1, right1, stride);
+ highbd_load_buffer_8x8(input + 8 * stride, left1 + 8, right1 + 8, stride);
+
+ input += 8;
+ // load second 8 columns
+ highbd_load_buffer_8x8(input, left2, right2, stride);
+ highbd_load_buffer_8x8(input + 8 * stride, left2 + 8, right2 + 8, stride);
+}
+
+static INLINE void highbd_write_buffer_16x16(
+ tran_low_t *output, int32x4_t *left1 /*[16]*/, int32x4_t *right1 /*[16]*/,
+ int32x4_t *left2 /*[16]*/, int32x4_t *right2 /*[16]*/, int stride) {
+ // write first 8 columns
+ highbd_write_buffer_8x8(output, left1, right1, stride);
+ highbd_write_buffer_8x8(output + 8 * stride, left1 + 8, right1 + 8, stride);
+
+ // write second 8 columns
+ output += 8;
+ highbd_write_buffer_8x8(output, left2, right2, stride);
+ highbd_write_buffer_8x8(output + 8 * stride, left2 + 8, right2 + 8, stride);
+}
+
+static INLINE void highbd_right_shift_16x16(int32x4_t *left1 /*[16]*/,
+ int32x4_t *right1 /*[16]*/,
+ int32x4_t *left2 /*[16]*/,
+ int32x4_t *right2 /*[16]*/,
+ const int bit) {
+ // perform rounding operations
+ highbd_right_shift_8x8(left1, right1, bit);
+ highbd_right_shift_8x8(left1 + 8, right1 + 8, bit);
+ highbd_right_shift_8x8(left2, right2, bit);
+ highbd_right_shift_8x8(left2 + 8, right2 + 8, bit);
+}
+
+static void highbd_fdct16_8col(int32x4_t *left, int32x4_t *right) {
+ // perform 16x16 1-D DCT for 8 columns
+ int32x4_t s1_lo[8], s1_hi[8], s2_lo[8], s2_hi[8], s3_lo[8], s3_hi[8];
+ int32x4_t left8[8], right8[8];
+
+ // stage 1
+ left8[0] = vaddq_s32(left[0], left[15]);
+ right8[0] = vaddq_s32(right[0], right[15]);
+ left8[1] = vaddq_s32(left[1], left[14]);
+ right8[1] = vaddq_s32(right[1], right[14]);
+ left8[2] = vaddq_s32(left[2], left[13]);
+ right8[2] = vaddq_s32(right[2], right[13]);
+ left8[3] = vaddq_s32(left[3], left[12]);
+ right8[3] = vaddq_s32(right[3], right[12]);
+ left8[4] = vaddq_s32(left[4], left[11]);
+ right8[4] = vaddq_s32(right[4], right[11]);
+ left8[5] = vaddq_s32(left[5], left[10]);
+ right8[5] = vaddq_s32(right[5], right[10]);
+ left8[6] = vaddq_s32(left[6], left[9]);
+ right8[6] = vaddq_s32(right[6], right[9]);
+ left8[7] = vaddq_s32(left[7], left[8]);
+ right8[7] = vaddq_s32(right[7], right[8]);
+
+ // step 1
+ s1_lo[0] = vsubq_s32(left[7], left[8]);
+ s1_hi[0] = vsubq_s32(right[7], right[8]);
+ s1_lo[1] = vsubq_s32(left[6], left[9]);
+ s1_hi[1] = vsubq_s32(right[6], right[9]);
+ s1_lo[2] = vsubq_s32(left[5], left[10]);
+ s1_hi[2] = vsubq_s32(right[5], right[10]);
+ s1_lo[3] = vsubq_s32(left[4], left[11]);
+ s1_hi[3] = vsubq_s32(right[4], right[11]);
+ s1_lo[4] = vsubq_s32(left[3], left[12]);
+ s1_hi[4] = vsubq_s32(right[3], right[12]);
+ s1_lo[5] = vsubq_s32(left[2], left[13]);
+ s1_hi[5] = vsubq_s32(right[2], right[13]);
+ s1_lo[6] = vsubq_s32(left[1], left[14]);
+ s1_hi[6] = vsubq_s32(right[1], right[14]);
+ s1_lo[7] = vsubq_s32(left[0], left[15]);
+ s1_hi[7] = vsubq_s32(right[0], right[15]);
+
+ // pass1 variant is not accurate enough
+ vpx_highbd_fdct8x8_pass2_notranspose_neon(left8, right8);
+
+ // step 2
+ // step2[2] = (step1[5] - step1[2]) * cospi_16_64;
+ // step2[5] = (step1[5] + step1[2]) * cospi_16_64;
+ butterfly_one_coeff_s32_s64_narrow(s1_lo[5], s1_hi[5], s1_lo[2], s1_hi[2],
+ cospi_16_64, &s2_lo[5], &s2_hi[5],
+ &s2_lo[2], &s2_hi[2]);
+ // step2[3] = (step1[4] - step1[3]) * cospi_16_64;
+ // step2[4] = (step1[4] + step1[3]) * cospi_16_64;
+ butterfly_one_coeff_s32_s64_narrow(s1_lo[4], s1_hi[4], s1_lo[3], s1_hi[3],
+ cospi_16_64, &s2_lo[4], &s2_hi[4],
+ &s2_lo[3], &s2_hi[3]);
+
+ // step 3
+ s3_lo[0] = vaddq_s32(s1_lo[0], s2_lo[3]);
+ s3_hi[0] = vaddq_s32(s1_hi[0], s2_hi[3]);
+ s3_lo[1] = vaddq_s32(s1_lo[1], s2_lo[2]);
+ s3_hi[1] = vaddq_s32(s1_hi[1], s2_hi[2]);
+ s3_lo[2] = vsubq_s32(s1_lo[1], s2_lo[2]);
+ s3_hi[2] = vsubq_s32(s1_hi[1], s2_hi[2]);
+ s3_lo[3] = vsubq_s32(s1_lo[0], s2_lo[3]);
+ s3_hi[3] = vsubq_s32(s1_hi[0], s2_hi[3]);
+ s3_lo[4] = vsubq_s32(s1_lo[7], s2_lo[4]);
+ s3_hi[4] = vsubq_s32(s1_hi[7], s2_hi[4]);
+ s3_lo[5] = vsubq_s32(s1_lo[6], s2_lo[5]);
+ s3_hi[5] = vsubq_s32(s1_hi[6], s2_hi[5]);
+ s3_lo[6] = vaddq_s32(s1_lo[6], s2_lo[5]);
+ s3_hi[6] = vaddq_s32(s1_hi[6], s2_hi[5]);
+ s3_lo[7] = vaddq_s32(s1_lo[7], s2_lo[4]);
+ s3_hi[7] = vaddq_s32(s1_hi[7], s2_hi[4]);
+
+ // step 4
+ // s2[1] = cospi_24_64 * s3[6] - cospi_8_64 * s3[1]
+ // s2[6] = cospi_8_64 * s3[6] + cospi_24_64 * s3[1]
+ butterfly_two_coeff_s32_s64_narrow(s3_lo[6], s3_hi[6], s3_lo[1], s3_hi[1],
+ cospi_8_64, cospi_24_64, &s2_lo[6],
+ &s2_hi[6], &s2_lo[1], &s2_hi[1]);
+
+ // s2[5] = cospi_8_64 * s3[2] - cospi_24_64 * s3[5]
+ // s2[2] = cospi_24_64 * s3[2] + cospi_8_64 * s3[5]
+ butterfly_two_coeff_s32_s64_narrow(s3_lo[2], s3_hi[2], s3_lo[5], s3_hi[5],
+ cospi_24_64, cospi_8_64, &s2_lo[2],
+ &s2_hi[2], &s2_lo[5], &s2_hi[5]);
+
+ // step 5
+ s1_lo[0] = vaddq_s32(s3_lo[0], s2_lo[1]);
+ s1_hi[0] = vaddq_s32(s3_hi[0], s2_hi[1]);
+ s1_lo[1] = vsubq_s32(s3_lo[0], s2_lo[1]);
+ s1_hi[1] = vsubq_s32(s3_hi[0], s2_hi[1]);
+ s1_lo[2] = vaddq_s32(s3_lo[3], s2_lo[2]);
+ s1_hi[2] = vaddq_s32(s3_hi[3], s2_hi[2]);
+ s1_lo[3] = vsubq_s32(s3_lo[3], s2_lo[2]);
+ s1_hi[3] = vsubq_s32(s3_hi[3], s2_hi[2]);
+ s1_lo[4] = vsubq_s32(s3_lo[4], s2_lo[5]);
+ s1_hi[4] = vsubq_s32(s3_hi[4], s2_hi[5]);
+ s1_lo[5] = vaddq_s32(s3_lo[4], s2_lo[5]);
+ s1_hi[5] = vaddq_s32(s3_hi[4], s2_hi[5]);
+ s1_lo[6] = vsubq_s32(s3_lo[7], s2_lo[6]);
+ s1_hi[6] = vsubq_s32(s3_hi[7], s2_hi[6]);
+ s1_lo[7] = vaddq_s32(s3_lo[7], s2_lo[6]);
+ s1_hi[7] = vaddq_s32(s3_hi[7], s2_hi[6]);
+
+ // step 6
+ // out[1] = step1[7] * cospi_2_64 + step1[0] * cospi_30_64
+ // out[15] = step1[7] * cospi_30_64 - step1[0] * cospi_2_64
+ butterfly_two_coeff_s32_s64_narrow(s1_lo[7], s1_hi[7], s1_lo[0], s1_hi[0],
+ cospi_2_64, cospi_30_64, &left[1],
+ &right[1], &left[15], &right[15]);
+
+ // out[9] = step1[6] * cospi_18_64 + step1[1] * cospi_14_64
+ // out[7] = step1[6] * cospi_14_64 - step1[1] * cospi_18_64
+ butterfly_two_coeff_s32_s64_narrow(s1_lo[6], s1_hi[6], s1_lo[1], s1_hi[1],
+ cospi_18_64, cospi_14_64, &left[9],
+ &right[9], &left[7], &right[7]);
+
+ // out[5] = step1[5] * cospi_10_64 + step1[2] * cospi_22_64
+ // out[11] = step1[5] * cospi_22_64 - step1[2] * cospi_10_64
+ butterfly_two_coeff_s32_s64_narrow(s1_lo[5], s1_hi[5], s1_lo[2], s1_hi[2],
+ cospi_10_64, cospi_22_64, &left[5],
+ &right[5], &left[11], &right[11]);
+
+ // out[13] = step1[4] * cospi_26_64 + step1[3] * cospi_6_64
+ // out[3] = step1[4] * cospi_6_64 - step1[3] * cospi_26_64
+ butterfly_two_coeff_s32_s64_narrow(s1_lo[4], s1_hi[4], s1_lo[3], s1_hi[3],
+ cospi_26_64, cospi_6_64, &left[13],
+ &right[13], &left[3], &right[3]);
+
+ left[0] = left8[0];
+ right[0] = right8[0];
+ left[2] = left8[1];
+ right[2] = right8[1];
+ left[4] = left8[2];
+ right[4] = right8[2];
+ left[6] = left8[3];
+ right[6] = right8[3];
+ left[8] = left8[4];
+ right[8] = right8[4];
+ left[10] = left8[5];
+ right[10] = right8[5];
+ left[12] = left8[6];
+ right[12] = right8[6];
+ left[14] = left8[7];
+ right[14] = right8[7];
+}
+
+static void highbd_fadst16_8col(int32x4_t *left, int32x4_t *right) {
+ // perform 16x16 1-D ADST for 8 columns
+ int32x4_t x_lo[16], x_hi[16];
+ int32x4_t s_lo[16], s_hi[16];
+ int32x4_t t_lo[16], t_hi[16];
+ int64x2_t s64_lo[32], s64_hi[32];
+
+ x_lo[0] = left[15];
+ x_hi[0] = right[15];
+ x_lo[1] = left[0];
+ x_hi[1] = right[0];
+ x_lo[2] = left[13];
+ x_hi[2] = right[13];
+ x_lo[3] = left[2];
+ x_hi[3] = right[2];
+ x_lo[4] = left[11];
+ x_hi[4] = right[11];
+ x_lo[5] = left[4];
+ x_hi[5] = right[4];
+ x_lo[6] = left[9];
+ x_hi[6] = right[9];
+ x_lo[7] = left[6];
+ x_hi[7] = right[6];
+ x_lo[8] = left[7];
+ x_hi[8] = right[7];
+ x_lo[9] = left[8];
+ x_hi[9] = right[8];
+ x_lo[10] = left[5];
+ x_hi[10] = right[5];
+ x_lo[11] = left[10];
+ x_hi[11] = right[10];
+ x_lo[12] = left[3];
+ x_hi[12] = right[3];
+ x_lo[13] = left[12];
+ x_hi[13] = right[12];
+ x_lo[14] = left[1];
+ x_hi[14] = right[1];
+ x_lo[15] = left[14];
+ x_hi[15] = right[14];
+
+ // stage 1, indices are doubled
+ // s0 = cospi_1_64 * x0 + cospi_31_64 * x1;
+ // s1 = cospi_31_64 * x0 - cospi_1_64 * x1;
+ butterfly_two_coeff_s32_s64_noround(
+ x_lo[0], x_hi[0], x_lo[1], x_hi[1], cospi_1_64, cospi_31_64,
+ &s64_lo[2 * 0], &s64_hi[2 * 0], &s64_lo[2 * 1], &s64_hi[2 * 1]);
+ // s2 = cospi_5_64 * x2 + cospi_27_64 * x3;
+ // s3 = cospi_27_64 * x2 - cospi_5_64 * x3;
+ butterfly_two_coeff_s32_s64_noround(
+ x_lo[2], x_hi[2], x_lo[3], x_hi[3], cospi_5_64, cospi_27_64,
+ &s64_lo[2 * 2], &s64_hi[2 * 2], &s64_lo[2 * 3], &s64_hi[2 * 3]);
+ // s4 = cospi_9_64 * x4 + cospi_23_64 * x5;
+ // s5 = cospi_23_64 * x4 - cospi_9_64 * x5;
+ butterfly_two_coeff_s32_s64_noround(
+ x_lo[4], x_hi[4], x_lo[5], x_hi[5], cospi_9_64, cospi_23_64,
+ &s64_lo[2 * 4], &s64_hi[2 * 4], &s64_lo[2 * 5], &s64_hi[2 * 5]);
+ // s6 = cospi_13_64 * x6 + cospi_19_64 * x7;
+ // s7 = cospi_19_64 * x6 - cospi_13_64 * x7;
+ butterfly_two_coeff_s32_s64_noround(
+ x_lo[6], x_hi[6], x_lo[7], x_hi[7], cospi_13_64, cospi_19_64,
+ &s64_lo[2 * 6], &s64_hi[2 * 6], &s64_lo[2 * 7], &s64_hi[2 * 7]);
+ // s8 = cospi_17_64 * x8 + cospi_15_64 * x9;
+ // s9 = cospi_15_64 * x8 - cospi_17_64 * x9;
+ butterfly_two_coeff_s32_s64_noround(
+ x_lo[8], x_hi[8], x_lo[9], x_hi[9], cospi_17_64, cospi_15_64,
+ &s64_lo[2 * 8], &s64_hi[2 * 8], &s64_lo[2 * 9], &s64_hi[2 * 9]);
+ // s10 = cospi_21_64 * x10 + cospi_11_64 * x11;
+ // s11 = cospi_11_64 * x10 - cospi_21_64 * x11;
+ butterfly_two_coeff_s32_s64_noround(
+ x_lo[10], x_hi[10], x_lo[11], x_hi[11], cospi_21_64, cospi_11_64,
+ &s64_lo[2 * 10], &s64_hi[2 * 10], &s64_lo[2 * 11], &s64_hi[2 * 11]);
+ // s12 = cospi_25_64 * x12 + cospi_7_64 * x13;
+ // s13 = cospi_7_64 * x12 - cospi_25_64 * x13;
+ butterfly_two_coeff_s32_s64_noround(
+ x_lo[12], x_hi[12], x_lo[13], x_hi[13], cospi_25_64, cospi_7_64,
+ &s64_lo[2 * 12], &s64_hi[2 * 12], &s64_lo[2 * 13], &s64_hi[2 * 13]);
+ // s14 = cospi_29_64 * x14 + cospi_3_64 * x15;
+ // s15 = cospi_3_64 * x14 - cospi_29_64 * x15;
+ butterfly_two_coeff_s32_s64_noround(
+ x_lo[14], x_hi[14], x_lo[15], x_hi[15], cospi_29_64, cospi_3_64,
+ &s64_lo[2 * 14], &s64_hi[2 * 14], &s64_lo[2 * 15], &s64_hi[2 * 15]);
+
+ // fdct_round_shift, indices are doubled
+ t_lo[0] = add_s64_round_narrow(&s64_lo[2 * 0], &s64_lo[2 * 8]);
+ t_hi[0] = add_s64_round_narrow(&s64_hi[2 * 0], &s64_hi[2 * 8]);
+ t_lo[1] = add_s64_round_narrow(&s64_lo[2 * 1], &s64_lo[2 * 9]);
+ t_hi[1] = add_s64_round_narrow(&s64_hi[2 * 1], &s64_hi[2 * 9]);
+ t_lo[2] = add_s64_round_narrow(&s64_lo[2 * 2], &s64_lo[2 * 10]);
+ t_hi[2] = add_s64_round_narrow(&s64_hi[2 * 2], &s64_hi[2 * 10]);
+ t_lo[3] = add_s64_round_narrow(&s64_lo[2 * 3], &s64_lo[2 * 11]);
+ t_hi[3] = add_s64_round_narrow(&s64_hi[2 * 3], &s64_hi[2 * 11]);
+ t_lo[4] = add_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 12]);
+ t_hi[4] = add_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 12]);
+ t_lo[5] = add_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 13]);
+ t_hi[5] = add_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 13]);
+ t_lo[6] = add_s64_round_narrow(&s64_lo[2 * 6], &s64_lo[2 * 14]);
+ t_hi[6] = add_s64_round_narrow(&s64_hi[2 * 6], &s64_hi[2 * 14]);
+ t_lo[7] = add_s64_round_narrow(&s64_lo[2 * 7], &s64_lo[2 * 15]);
+ t_hi[7] = add_s64_round_narrow(&s64_hi[2 * 7], &s64_hi[2 * 15]);
+ t_lo[8] = sub_s64_round_narrow(&s64_lo[2 * 0], &s64_lo[2 * 8]);
+ t_hi[8] = sub_s64_round_narrow(&s64_hi[2 * 0], &s64_hi[2 * 8]);
+ t_lo[9] = sub_s64_round_narrow(&s64_lo[2 * 1], &s64_lo[2 * 9]);
+ t_hi[9] = sub_s64_round_narrow(&s64_hi[2 * 1], &s64_hi[2 * 9]);
+ t_lo[10] = sub_s64_round_narrow(&s64_lo[2 * 2], &s64_lo[2 * 10]);
+ t_hi[10] = sub_s64_round_narrow(&s64_hi[2 * 2], &s64_hi[2 * 10]);
+ t_lo[11] = sub_s64_round_narrow(&s64_lo[2 * 3], &s64_lo[2 * 11]);
+ t_hi[11] = sub_s64_round_narrow(&s64_hi[2 * 3], &s64_hi[2 * 11]);
+ t_lo[12] = sub_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 12]);
+ t_hi[12] = sub_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 12]);
+ t_lo[13] = sub_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 13]);
+ t_hi[13] = sub_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 13]);
+ t_lo[14] = sub_s64_round_narrow(&s64_lo[2 * 6], &s64_lo[2 * 14]);
+ t_hi[14] = sub_s64_round_narrow(&s64_hi[2 * 6], &s64_hi[2 * 14]);
+ t_lo[15] = sub_s64_round_narrow(&s64_lo[2 * 7], &s64_lo[2 * 15]);
+ t_hi[15] = sub_s64_round_narrow(&s64_hi[2 * 7], &s64_hi[2 * 15]);
+
+ // stage 2
+ s_lo[0] = t_lo[0];
+ s_hi[0] = t_hi[0];
+ s_lo[1] = t_lo[1];
+ s_hi[1] = t_hi[1];
+ s_lo[2] = t_lo[2];
+ s_hi[2] = t_hi[2];
+ s_lo[3] = t_lo[3];
+ s_hi[3] = t_hi[3];
+ s_lo[4] = t_lo[4];
+ s_hi[4] = t_hi[4];
+ s_lo[5] = t_lo[5];
+ s_hi[5] = t_hi[5];
+ s_lo[6] = t_lo[6];
+ s_hi[6] = t_hi[6];
+ s_lo[7] = t_lo[7];
+ s_hi[7] = t_hi[7];
+ // s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
+ // s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
+ butterfly_two_coeff_s32_s64_noround(
+ t_lo[8], t_hi[8], t_lo[9], t_hi[9], cospi_4_64, cospi_28_64,
+ &s64_lo[2 * 8], &s64_hi[2 * 8], &s64_lo[2 * 9], &s64_hi[2 * 9]);
+ // s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
+ // s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
+ butterfly_two_coeff_s32_s64_noround(
+ t_lo[10], t_hi[10], t_lo[11], t_hi[11], cospi_20_64, cospi_12_64,
+ &s64_lo[2 * 10], &s64_hi[2 * 10], &s64_lo[2 * 11], &s64_hi[2 * 11]);
+ // s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
+ // s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
+ butterfly_two_coeff_s32_s64_noround(
+ t_lo[13], t_hi[13], t_lo[12], t_hi[12], cospi_28_64, cospi_4_64,
+ &s64_lo[2 * 13], &s64_hi[2 * 13], &s64_lo[2 * 12], &s64_hi[2 * 12]);
+ // s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
+ // s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
+ butterfly_two_coeff_s32_s64_noround(
+ t_lo[15], t_hi[15], t_lo[14], t_hi[14], cospi_12_64, cospi_20_64,
+ &s64_lo[2 * 15], &s64_hi[2 * 15], &s64_lo[2 * 14], &s64_hi[2 * 14]);
+
+ // s0 + s4
+ t_lo[0] = add_s32_s64_narrow(s_lo[0], s_lo[4]);
+ t_hi[0] = add_s32_s64_narrow(s_hi[0], s_hi[4]);
+ // s1 + s5
+ t_lo[1] = add_s32_s64_narrow(s_lo[1], s_lo[5]);
+ t_hi[1] = add_s32_s64_narrow(s_hi[1], s_hi[5]);
+ // s2 + s6
+ t_lo[2] = add_s32_s64_narrow(s_lo[2], s_lo[6]);
+ t_hi[2] = add_s32_s64_narrow(s_hi[2], s_hi[6]);
+ // s3 + s7
+ t_lo[3] = add_s32_s64_narrow(s_lo[3], s_lo[7]);
+ t_hi[3] = add_s32_s64_narrow(s_hi[3], s_hi[7]);
+
+ // s0 - s4
+ t_lo[4] = sub_s32_s64_narrow(s_lo[0], s_lo[4]);
+ t_hi[4] = sub_s32_s64_narrow(s_hi[0], s_hi[4]);
+ // s1 - s5
+ t_lo[5] = sub_s32_s64_narrow(s_lo[1], s_lo[5]);
+ t_hi[5] = sub_s32_s64_narrow(s_hi[1], s_hi[5]);
+ // s2 - s6
+ t_lo[6] = sub_s32_s64_narrow(s_lo[2], s_lo[6]);
+ t_hi[6] = sub_s32_s64_narrow(s_hi[2], s_hi[6]);
+ // s3 - s7
+ t_lo[7] = sub_s32_s64_narrow(s_lo[3], s_lo[7]);
+ t_hi[7] = sub_s32_s64_narrow(s_hi[3], s_hi[7]);
+
+ // fdct_round_shift()
+ // s8 + s12
+ t_lo[8] = add_s64_round_narrow(&s64_lo[2 * 8], &s64_lo[2 * 12]);
+ t_hi[8] = add_s64_round_narrow(&s64_hi[2 * 8], &s64_hi[2 * 12]);
+ // s9 + s13
+ t_lo[9] = add_s64_round_narrow(&s64_lo[2 * 9], &s64_lo[2 * 13]);
+ t_hi[9] = add_s64_round_narrow(&s64_hi[2 * 9], &s64_hi[2 * 13]);
+ // s10 + s14
+ t_lo[10] = add_s64_round_narrow(&s64_lo[2 * 10], &s64_lo[2 * 14]);
+ t_hi[10] = add_s64_round_narrow(&s64_hi[2 * 10], &s64_hi[2 * 14]);
+ // s11 + s15
+ t_lo[11] = add_s64_round_narrow(&s64_lo[2 * 11], &s64_lo[2 * 15]);
+ t_hi[11] = add_s64_round_narrow(&s64_hi[2 * 11], &s64_hi[2 * 15]);
+
+ // s8 - s12
+ t_lo[12] = sub_s64_round_narrow(&s64_lo[2 * 8], &s64_lo[2 * 12]);
+ t_hi[12] = sub_s64_round_narrow(&s64_hi[2 * 8], &s64_hi[2 * 12]);
+ // s9 - s13
+ t_lo[13] = sub_s64_round_narrow(&s64_lo[2 * 9], &s64_lo[2 * 13]);
+ t_hi[13] = sub_s64_round_narrow(&s64_hi[2 * 9], &s64_hi[2 * 13]);
+ // s10 - s14
+ t_lo[14] = sub_s64_round_narrow(&s64_lo[2 * 10], &s64_lo[2 * 14]);
+ t_hi[14] = sub_s64_round_narrow(&s64_hi[2 * 10], &s64_hi[2 * 14]);
+ // s11 - s15
+ t_lo[15] = sub_s64_round_narrow(&s64_lo[2 * 11], &s64_lo[2 * 15]);
+ t_hi[15] = sub_s64_round_narrow(&s64_hi[2 * 11], &s64_hi[2 * 15]);
+
+ // stage 3
+ s_lo[0] = t_lo[0];
+ s_hi[0] = t_hi[0];
+ s_lo[1] = t_lo[1];
+ s_hi[1] = t_hi[1];
+ s_lo[2] = t_lo[2];
+ s_hi[2] = t_hi[2];
+ s_lo[3] = t_lo[3];
+ s_hi[3] = t_hi[3];
+ // s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
+ // s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
+ butterfly_two_coeff_s32_s64_noround(
+ t_lo[4], t_hi[4], t_lo[5], t_hi[5], cospi_8_64, cospi_24_64,
+ &s64_lo[2 * 4], &s64_hi[2 * 4], &s64_lo[2 * 5], &s64_hi[2 * 5]);
+ // s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
+ // s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
+ butterfly_two_coeff_s32_s64_noround(
+ t_lo[7], t_hi[7], t_lo[6], t_hi[6], cospi_24_64, cospi_8_64,
+ &s64_lo[2 * 7], &s64_hi[2 * 7], &s64_lo[2 * 6], &s64_hi[2 * 6]);
+ s_lo[8] = t_lo[8];
+ s_hi[8] = t_hi[8];
+ s_lo[9] = t_lo[9];
+ s_hi[9] = t_hi[9];
+ s_lo[10] = t_lo[10];
+ s_hi[10] = t_hi[10];
+ s_lo[11] = t_lo[11];
+ s_hi[11] = t_hi[11];
+ // s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
+ // s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
+ butterfly_two_coeff_s32_s64_noround(
+ t_lo[12], t_hi[12], t_lo[13], t_hi[13], cospi_8_64, cospi_24_64,
+ &s64_lo[2 * 12], &s64_hi[2 * 12], &s64_lo[2 * 13], &s64_hi[2 * 13]);
+ // s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
+ // s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
+ butterfly_two_coeff_s32_s64_noround(
+ t_lo[15], t_hi[15], t_lo[14], t_hi[14], cospi_24_64, cospi_8_64,
+ &s64_lo[2 * 15], &s64_hi[2 * 15], &s64_lo[2 * 14], &s64_hi[2 * 14]);
+
+ // s0 + s2
+ t_lo[0] = add_s32_s64_narrow(s_lo[0], s_lo[2]);
+ t_hi[0] = add_s32_s64_narrow(s_hi[0], s_hi[2]);
+ // s1 + s3
+ t_lo[1] = add_s32_s64_narrow(s_lo[1], s_lo[3]);
+ t_hi[1] = add_s32_s64_narrow(s_hi[1], s_hi[3]);
+ // s0 - s2
+ t_lo[2] = sub_s32_s64_narrow(s_lo[0], s_lo[2]);
+ t_hi[2] = sub_s32_s64_narrow(s_hi[0], s_hi[2]);
+ // s1 - s3
+ t_lo[3] = sub_s32_s64_narrow(s_lo[1], s_lo[3]);
+ t_hi[3] = sub_s32_s64_narrow(s_hi[1], s_hi[3]);
+ // fdct_round_shift()
+ // s4 + s6
+ t_lo[4] = add_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 6]);
+ t_hi[4] = add_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 6]);
+ // s5 + s7
+ t_lo[5] = add_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 7]);
+ t_hi[5] = add_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 7]);
+ // s4 - s6
+ t_lo[6] = sub_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 6]);
+ t_hi[6] = sub_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 6]);
+ // s5 - s7
+ t_lo[7] = sub_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 7]);
+ t_hi[7] = sub_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 7]);
+ // s8 + s10
+ t_lo[8] = add_s32_s64_narrow(s_lo[8], s_lo[10]);
+ t_hi[8] = add_s32_s64_narrow(s_hi[8], s_hi[10]);
+ // s9 + s11
+ t_lo[9] = add_s32_s64_narrow(s_lo[9], s_lo[11]);
+ t_hi[9] = add_s32_s64_narrow(s_hi[9], s_hi[11]);
+ // s8 - s10
+ t_lo[10] = sub_s32_s64_narrow(s_lo[8], s_lo[10]);
+ t_hi[10] = sub_s32_s64_narrow(s_hi[8], s_hi[10]);
+ // s9 - s11
+ t_lo[11] = sub_s32_s64_narrow(s_lo[9], s_lo[11]);
+ t_hi[11] = sub_s32_s64_narrow(s_hi[9], s_hi[11]);
+ // fdct_round_shift()
+ // s12 + s14
+ t_lo[12] = add_s64_round_narrow(&s64_lo[2 * 12], &s64_lo[2 * 14]);
+ t_hi[12] = add_s64_round_narrow(&s64_hi[2 * 12], &s64_hi[2 * 14]);
+ // s13 + s15
+ t_lo[13] = add_s64_round_narrow(&s64_lo[2 * 13], &s64_lo[2 * 15]);
+ t_hi[13] = add_s64_round_narrow(&s64_hi[2 * 13], &s64_hi[2 * 15]);
+ // s12 - s14
+ t_lo[14] = sub_s64_round_narrow(&s64_lo[2 * 12], &s64_lo[2 * 14]);
+ t_hi[14] = sub_s64_round_narrow(&s64_hi[2 * 12], &s64_hi[2 * 14]);
+ // s13 - s15
+ t_lo[15] = sub_s64_round_narrow(&s64_lo[2 * 13], &s64_lo[2 * 15]);
+ t_hi[15] = sub_s64_round_narrow(&s64_hi[2 * 13], &s64_hi[2 * 15]);
+
+ // stage 4, with fdct_round_shift
+ // s2 = (-cospi_16_64) * (x2 + x3);
+ // s3 = cospi_16_64 * (x2 - x3);
+ butterfly_one_coeff_s32_s64_narrow(t_lo[3], t_hi[3], t_lo[2], t_hi[2],
+ -cospi_16_64, &x_lo[2], &x_hi[2], &x_lo[3],
+ &x_hi[3]);
+ // s6 = cospi_16_64 * (x6 + x7);
+ // s7 = cospi_16_64 * (-x6 + x7);
+ butterfly_one_coeff_s32_s64_narrow(t_lo[7], t_hi[7], t_lo[6], t_hi[6],
+ cospi_16_64, &x_lo[6], &x_hi[6], &x_lo[7],
+ &x_hi[7]);
+ // s10 = cospi_16_64 * (x10 + x11);
+ // s11 = cospi_16_64 * (-x10 + x11);
+ butterfly_one_coeff_s32_s64_narrow(t_lo[11], t_hi[11], t_lo[10], t_hi[10],
+ cospi_16_64, &x_lo[10], &x_hi[10],
+ &x_lo[11], &x_hi[11]);
+ // s14 = (-cospi_16_64) * (x14 + x15);
+ // s15 = cospi_16_64 * (x14 - x15);
+ butterfly_one_coeff_s32_s64_narrow(t_lo[15], t_hi[15], t_lo[14], t_hi[14],
+ -cospi_16_64, &x_lo[14], &x_hi[14],
+ &x_lo[15], &x_hi[15]);
+
+ // Just copy x0, x1, x4, x5, x8, x9, x12, x13
+ x_lo[0] = t_lo[0];
+ x_hi[0] = t_hi[0];
+ x_lo[1] = t_lo[1];
+ x_hi[1] = t_hi[1];
+ x_lo[4] = t_lo[4];
+ x_hi[4] = t_hi[4];
+ x_lo[5] = t_lo[5];
+ x_hi[5] = t_hi[5];
+ x_lo[8] = t_lo[8];
+ x_hi[8] = t_hi[8];
+ x_lo[9] = t_lo[9];
+ x_hi[9] = t_hi[9];
+ x_lo[12] = t_lo[12];
+ x_hi[12] = t_hi[12];
+ x_lo[13] = t_lo[13];
+ x_hi[13] = t_hi[13];
+
+ left[0] = x_lo[0];
+ right[0] = x_hi[0];
+ left[1] = vnegq_s32(x_lo[8]);
+ right[1] = vnegq_s32(x_hi[8]);
+ left[2] = x_lo[12];
+ right[2] = x_hi[12];
+ left[3] = vnegq_s32(x_lo[4]);
+ right[3] = vnegq_s32(x_hi[4]);
+ left[4] = x_lo[6];
+ right[4] = x_hi[6];
+ left[5] = x_lo[14];
+ right[5] = x_hi[14];
+ left[6] = x_lo[10];
+ right[6] = x_hi[10];
+ left[7] = x_lo[2];
+ right[7] = x_hi[2];
+ left[8] = x_lo[3];
+ right[8] = x_hi[3];
+ left[9] = x_lo[11];
+ right[9] = x_hi[11];
+ left[10] = x_lo[15];
+ right[10] = x_hi[15];
+ left[11] = x_lo[7];
+ right[11] = x_hi[7];
+ left[12] = x_lo[5];
+ right[12] = x_hi[5];
+ left[13] = vnegq_s32(x_lo[13]);
+ right[13] = vnegq_s32(x_hi[13]);
+ left[14] = x_lo[9];
+ right[14] = x_hi[9];
+ left[15] = vnegq_s32(x_lo[1]);
+ right[15] = vnegq_s32(x_hi[1]);
+}
+
+static void highbd_fdct16x16_neon(int32x4_t *left1, int32x4_t *right1,
+ int32x4_t *left2, int32x4_t *right2) {
+ // Left half.
+ highbd_fdct16_8col(left1, right1);
+ // Right half.
+ highbd_fdct16_8col(left2, right2);
+ transpose_s32_16x16(left1, right1, left2, right2);
+}
+
+static void highbd_fadst16x16_neon(int32x4_t *left1, int32x4_t *right1,
+ int32x4_t *left2, int32x4_t *right2) {
+ // Left half.
+ highbd_fadst16_8col(left1, right1);
+ // Right half.
+ highbd_fadst16_8col(left2, right2);
+ transpose_s32_16x16(left1, right1, left2, right2);
+}
+
+void vp9_highbd_fht16x16_neon(const int16_t *input, tran_low_t *output,
+ int stride, int tx_type) {
+ int32x4_t left1[16], right1[16], left2[16], right2[16];
+
+ switch (tx_type) {
+ case DCT_DCT: vpx_highbd_fdct16x16_neon(input, output, stride); break;
+ case ADST_DCT:
+ highbd_load_buffer_16x16(input, left1, right1, left2, right2, stride);
+ highbd_fadst16x16_neon(left1, right1, left2, right2);
+ highbd_right_shift_16x16(left1, right1, left2, right2, 2);
+ highbd_fdct16x16_neon(left1, right1, left2, right2);
+ highbd_write_buffer_16x16(output, left1, right1, left2, right2, 16);
+ break;
+ case DCT_ADST:
+ highbd_load_buffer_16x16(input, left1, right1, left2, right2, stride);
+ highbd_fdct16x16_neon(left1, right1, left2, right2);
+ highbd_right_shift_16x16(left1, right1, left2, right2, 2);
+ highbd_fadst16x16_neon(left1, right1, left2, right2);
+ highbd_write_buffer_16x16(output, left1, right1, left2, right2, 16);
+ break;
+ default:
+ assert(tx_type == ADST_ADST);
+ highbd_load_buffer_16x16(input, left1, right1, left2, right2, stride);
+ highbd_fadst16x16_neon(left1, right1, left2, right2);
+ highbd_right_shift_16x16(left1, right1, left2, right2, 2);
+ highbd_fadst16x16_neon(left1, right1, left2, right2);
+ highbd_write_buffer_16x16(output, left1, right1, left2, right2, 16);
+ break;
+ }
+}
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_denoiser_neon.c b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_denoiser_neon.c
new file mode 100644
index 0000000000..d631cd437d
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_denoiser_neon.c
@@ -0,0 +1,356 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+
+#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/encoder/vp9_context_tree.h"
+#include "vp9/encoder/vp9_denoiser.h"
+#include "vpx_mem/vpx_mem.h"
+
+// Compute the sum of all pixel differences of this MB.
+static INLINE int horizontal_add_s8x16(const int8x16_t v_sum_diff_total) {
+#if VPX_ARCH_AARCH64
+ return vaddlvq_s8(v_sum_diff_total);
+#else
+ const int16x8_t fe_dc_ba_98_76_54_32_10 = vpaddlq_s8(v_sum_diff_total);
+ const int32x4_t fedc_ba98_7654_3210 = vpaddlq_s16(fe_dc_ba_98_76_54_32_10);
+ const int64x2_t fedcba98_76543210 = vpaddlq_s32(fedc_ba98_7654_3210);
+ const int64x1_t x = vqadd_s64(vget_high_s64(fedcba98_76543210),
+ vget_low_s64(fedcba98_76543210));
+ const int sum_diff = vget_lane_s32(vreinterpret_s32_s64(x), 0);
+ return sum_diff;
+#endif
+}
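+
+// Scalar equivalent of the reduction above (illustrative only): sum the 16
+// signed byte lanes into a single int.
+static INLINE int horizontal_add_s8x16_scalar(const int8_t v[16]) {
+  int sum = 0;
+  int i;
+  for (i = 0; i < 16; ++i) sum += v[i];
+  return sum;
+}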
+
+// Denoise a 16x1 vector.
+static INLINE int8x16_t denoiser_16x1_neon(
+ const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y,
+ const uint8x16_t v_level1_threshold, const uint8x16_t v_level2_threshold,
+ const uint8x16_t v_level3_threshold, const uint8x16_t v_level1_adjustment,
+ const uint8x16_t v_delta_level_1_and_2,
+ const uint8x16_t v_delta_level_2_and_3, int8x16_t v_sum_diff_total) {
+ const uint8x16_t v_sig = vld1q_u8(sig);
+ const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y);
+
+ /* Calculate absolute difference and sign masks. */
+ const uint8x16_t v_abs_diff = vabdq_u8(v_sig, v_mc_running_avg_y);
+ const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, v_mc_running_avg_y);
+ const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, v_mc_running_avg_y);
+
+  /* Figure out which level the absolute difference puts us in. */
+ const uint8x16_t v_level1_mask = vcleq_u8(v_level1_threshold, v_abs_diff);
+ const uint8x16_t v_level2_mask = vcleq_u8(v_level2_threshold, v_abs_diff);
+ const uint8x16_t v_level3_mask = vcleq_u8(v_level3_threshold, v_abs_diff);
+
+ /* Calculate absolute adjustments for level 1, 2 and 3. */
+ const uint8x16_t v_level2_adjustment =
+ vandq_u8(v_level2_mask, v_delta_level_1_and_2);
+ const uint8x16_t v_level3_adjustment =
+ vandq_u8(v_level3_mask, v_delta_level_2_and_3);
+ const uint8x16_t v_level1and2_adjustment =
+ vaddq_u8(v_level1_adjustment, v_level2_adjustment);
+ const uint8x16_t v_level1and2and3_adjustment =
+ vaddq_u8(v_level1and2_adjustment, v_level3_adjustment);
+
+  /* Select the magnitude of the adjustment: the absolute difference itself
+   * while still below level 1, otherwise the combined level 1, 2 and 3
+   * adjustment.
+   */
+ const uint8x16_t v_abs_adjustment =
+ vbslq_u8(v_level1_mask, v_level1and2and3_adjustment, v_abs_diff);
+
+ /* Calculate positive and negative adjustments. Apply them to the signal
+ * and accumulate them. Adjustments are less than eight and the maximum
+ * sum of them (7 * 16) can fit in a signed char.
+ */
+ const uint8x16_t v_pos_adjustment =
+ vandq_u8(v_diff_pos_mask, v_abs_adjustment);
+ const uint8x16_t v_neg_adjustment =
+ vandq_u8(v_diff_neg_mask, v_abs_adjustment);
+
+ uint8x16_t v_running_avg_y = vqaddq_u8(v_sig, v_pos_adjustment);
+ v_running_avg_y = vqsubq_u8(v_running_avg_y, v_neg_adjustment);
+
+ /* Store results. */
+ vst1q_u8(running_avg_y, v_running_avg_y);
+
+ /* Sum all the accumulators to have the sum of all pixel differences
+ * for this macroblock.
+ */
+ {
+ const int8x16_t v_sum_diff =
+ vqsubq_s8(vreinterpretq_s8_u8(v_pos_adjustment),
+ vreinterpretq_s8_u8(v_neg_adjustment));
+ v_sum_diff_total = vaddq_s8(v_sum_diff_total, v_sum_diff);
+ }
+ return v_sum_diff_total;
+}
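+
+// Per-pixel view of the vector code above as an illustrative scalar sketch
+// (the thresholds and adjustments are the constants the callers below load
+// into the v_level*_threshold / v_*_adjustment registers). The returned
+// magnitude is then added to or subtracted from sig so that the running
+// average moves towards mc_running_avg_y.
+static INLINE int denoiser_abs_adjustment_scalar(int abs_diff, int l1, int l2,
+                                                 int l3, int adj1, int d12,
+                                                 int d23) {
+  if (abs_diff < l1) return abs_diff;    // level 0: track the source exactly
+  if (abs_diff < l2) return adj1;        // level 1
+  if (abs_diff < l3) return adj1 + d12;  // level 2
+  return adj1 + d12 + d23;               // level 3
+}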
+
+static INLINE int8x16_t denoiser_adjust_16x1_neon(
+ const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y,
+ const uint8x16_t k_delta, int8x16_t v_sum_diff_total) {
+ uint8x16_t v_running_avg_y = vld1q_u8(running_avg_y);
+ const uint8x16_t v_sig = vld1q_u8(sig);
+ const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y);
+
+ /* Calculate absolute difference and sign masks. */
+ const uint8x16_t v_abs_diff = vabdq_u8(v_sig, v_mc_running_avg_y);
+ const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, v_mc_running_avg_y);
+ const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, v_mc_running_avg_y);
+ // Clamp absolute difference to delta to get the adjustment.
+ const uint8x16_t v_abs_adjustment = vminq_u8(v_abs_diff, (k_delta));
+
+ const uint8x16_t v_pos_adjustment =
+ vandq_u8(v_diff_pos_mask, v_abs_adjustment);
+ const uint8x16_t v_neg_adjustment =
+ vandq_u8(v_diff_neg_mask, v_abs_adjustment);
+
+ v_running_avg_y = vqsubq_u8(v_running_avg_y, v_pos_adjustment);
+ v_running_avg_y = vqaddq_u8(v_running_avg_y, v_neg_adjustment);
+
+ /* Store results. */
+ vst1q_u8(running_avg_y, v_running_avg_y);
+
+ {
+ const int8x16_t v_sum_diff =
+ vqsubq_s8(vreinterpretq_s8_u8(v_neg_adjustment),
+ vreinterpretq_s8_u8(v_pos_adjustment));
+ v_sum_diff_total = vaddq_s8(v_sum_diff_total, v_sum_diff);
+ }
+ return v_sum_diff_total;
+}
+
+// Denoise 8x8 and 8x16 blocks.
+static int vp9_denoiser_8xN_neon(const uint8_t *sig, int sig_stride,
+ const uint8_t *mc_running_avg_y,
+ int mc_avg_y_stride, uint8_t *running_avg_y,
+ int avg_y_stride, int increase_denoising,
+ BLOCK_SIZE bs, int motion_magnitude,
+ int width) {
+ int sum_diff_thresh, r, sum_diff = 0;
+ const int shift_inc =
+ (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
+ ? 1
+ : 0;
+ uint8_t sig_buffer[8][16], mc_running_buffer[8][16], running_buffer[8][16];
+
+ const uint8x16_t v_level1_adjustment = vmovq_n_u8(
+ (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 4 + shift_inc : 3);
+ const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1);
+ const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2);
+ const uint8x16_t v_level1_threshold = vdupq_n_u8(4 + shift_inc);
+ const uint8x16_t v_level2_threshold = vdupq_n_u8(8);
+ const uint8x16_t v_level3_threshold = vdupq_n_u8(16);
+
+ const int b_height = (4 << b_height_log2_lookup[bs]) >> 1;
+
+ int8x16_t v_sum_diff_total = vdupq_n_s8(0);
+
+ for (r = 0; r < b_height; ++r) {
+ memcpy(sig_buffer[r], sig, width);
+ memcpy(sig_buffer[r] + width, sig + sig_stride, width);
+ memcpy(mc_running_buffer[r], mc_running_avg_y, width);
+ memcpy(mc_running_buffer[r] + width, mc_running_avg_y + mc_avg_y_stride,
+ width);
+ memcpy(running_buffer[r], running_avg_y, width);
+ memcpy(running_buffer[r] + width, running_avg_y + avg_y_stride, width);
+ v_sum_diff_total = denoiser_16x1_neon(
+ sig_buffer[r], mc_running_buffer[r], running_buffer[r],
+ v_level1_threshold, v_level2_threshold, v_level3_threshold,
+ v_level1_adjustment, v_delta_level_1_and_2, v_delta_level_2_and_3,
+ v_sum_diff_total);
+ {
+ const uint8x16_t v_running_buffer = vld1q_u8(running_buffer[r]);
+ const uint8x8_t v_running_buffer_high = vget_high_u8(v_running_buffer);
+ const uint8x8_t v_running_buffer_low = vget_low_u8(v_running_buffer);
+ vst1_u8(running_avg_y, v_running_buffer_low);
+ vst1_u8(running_avg_y + avg_y_stride, v_running_buffer_high);
+ }
+ // Update pointers for next iteration.
+ sig += (sig_stride << 1);
+ mc_running_avg_y += (mc_avg_y_stride << 1);
+ running_avg_y += (avg_y_stride << 1);
+ }
+
+ {
+ sum_diff = horizontal_add_s8x16(v_sum_diff_total);
+ sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
+ if (abs(sum_diff) > sum_diff_thresh) {
+      // Before returning to copy the block (i.e., apply no denoising),
+      // check if we can still apply some (weaker) temporal filtering to
+      // this block, which would otherwise not be denoised at all. The
+      // simplest option is to apply an additional adjustment to
+      // running_avg_y to bring it closer to sig. The adjustment is capped
+      // by a maximum delta and chosen such that in most cases the resulting
+      // sum_diff will be within the acceptable range given by
+      // sum_diff_thresh.
+
+ // The delta is set by the excess of absolute pixel diff over the
+ // threshold.
+ const int delta =
+ ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1;
+ // Only apply the adjustment for max delta up to 3.
+ if (delta < 4) {
+ const uint8x16_t k_delta = vmovq_n_u8(delta);
+ running_avg_y -= avg_y_stride * (b_height << 1);
+ for (r = 0; r < b_height; ++r) {
+ v_sum_diff_total = denoiser_adjust_16x1_neon(
+ sig_buffer[r], mc_running_buffer[r], running_buffer[r], k_delta,
+ v_sum_diff_total);
+ {
+ const uint8x16_t v_running_buffer = vld1q_u8(running_buffer[r]);
+ const uint8x8_t v_running_buffer_high =
+ vget_high_u8(v_running_buffer);
+ const uint8x8_t v_running_buffer_low =
+ vget_low_u8(v_running_buffer);
+ vst1_u8(running_avg_y, v_running_buffer_low);
+ vst1_u8(running_avg_y + avg_y_stride, v_running_buffer_high);
+ }
+ // Update pointers for next iteration.
+ running_avg_y += (avg_y_stride << 1);
+ }
+ sum_diff = horizontal_add_s8x16(v_sum_diff_total);
+ if (abs(sum_diff) > sum_diff_thresh) {
+ return COPY_BLOCK;
+ }
+ } else {
+ return COPY_BLOCK;
+ }
+ }
+ }
+
+ return FILTER_BLOCK;
+}
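+
+// Worked example of the delta computation above (illustrative): for an 8x16
+// block num_pels_log2_lookup[bs] is 7, so an excess of 300 of abs(sum_diff)
+// over sum_diff_thresh gives delta = (300 >> 7) + 1 = 3, i.e. every pixel of
+// running_avg_y may be pulled up to 3 steps back towards sig. The same
+// computation is repeated in vp9_denoiser_NxM_neon below.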
+
+// Denoise 16x8, 16x16, 16x32, 32x16, 32x32, 32x64, 64x32 and 64x64 blocks.
+static int vp9_denoiser_NxM_neon(const uint8_t *sig, int sig_stride,
+ const uint8_t *mc_running_avg_y,
+ int mc_avg_y_stride, uint8_t *running_avg_y,
+ int avg_y_stride, int increase_denoising,
+ BLOCK_SIZE bs, int motion_magnitude) {
+ const int shift_inc =
+ (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
+ ? 1
+ : 0;
+ const uint8x16_t v_level1_adjustment = vmovq_n_u8(
+ (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 4 + shift_inc : 3);
+ const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1);
+ const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2);
+ const uint8x16_t v_level1_threshold = vmovq_n_u8(4 + shift_inc);
+ const uint8x16_t v_level2_threshold = vdupq_n_u8(8);
+ const uint8x16_t v_level3_threshold = vdupq_n_u8(16);
+
+ const int b_width = (4 << b_width_log2_lookup[bs]);
+ const int b_height = (4 << b_height_log2_lookup[bs]);
+ const int b_width_shift4 = b_width >> 4;
+
+ int8x16_t v_sum_diff_total[4][4];
+ int r, c, sum_diff = 0;
+
+ for (r = 0; r < 4; ++r) {
+ for (c = 0; c < b_width_shift4; ++c) {
+ v_sum_diff_total[c][r] = vdupq_n_s8(0);
+ }
+ }
+
+ for (r = 0; r < b_height; ++r) {
+ for (c = 0; c < b_width_shift4; ++c) {
+ v_sum_diff_total[c][r >> 4] = denoiser_16x1_neon(
+ sig, mc_running_avg_y, running_avg_y, v_level1_threshold,
+ v_level2_threshold, v_level3_threshold, v_level1_adjustment,
+ v_delta_level_1_and_2, v_delta_level_2_and_3,
+ v_sum_diff_total[c][r >> 4]);
+
+ // Update pointers for next iteration.
+ sig += 16;
+ mc_running_avg_y += 16;
+ running_avg_y += 16;
+ }
+
+ if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) {
+ for (c = 0; c < b_width_shift4; ++c) {
+ sum_diff += horizontal_add_s8x16(v_sum_diff_total[c][r >> 4]);
+ }
+ }
+
+ // Update pointers for next iteration.
+ sig = sig - b_width + sig_stride;
+ mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride;
+ running_avg_y = running_avg_y - b_width + avg_y_stride;
+ }
+
+ {
+ const int sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
+ if (abs(sum_diff) > sum_diff_thresh) {
+ const int delta =
+ ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1;
+ // Only apply the adjustment for max delta up to 3.
+ if (delta < 4) {
+ const uint8x16_t k_delta = vdupq_n_u8(delta);
+ sig -= sig_stride * b_height;
+ mc_running_avg_y -= mc_avg_y_stride * b_height;
+ running_avg_y -= avg_y_stride * b_height;
+ sum_diff = 0;
+
+ for (r = 0; r < b_height; ++r) {
+ for (c = 0; c < b_width_shift4; ++c) {
+ v_sum_diff_total[c][r >> 4] =
+ denoiser_adjust_16x1_neon(sig, mc_running_avg_y, running_avg_y,
+ k_delta, v_sum_diff_total[c][r >> 4]);
+
+ // Update pointers for next iteration.
+ sig += 16;
+ mc_running_avg_y += 16;
+ running_avg_y += 16;
+ }
+ if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) {
+ for (c = 0; c < b_width_shift4; ++c) {
+ sum_diff += horizontal_add_s8x16(v_sum_diff_total[c][r >> 4]);
+ }
+ }
+
+ sig = sig - b_width + sig_stride;
+ mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride;
+ running_avg_y = running_avg_y - b_width + avg_y_stride;
+ }
+
+ if (abs(sum_diff) > sum_diff_thresh) {
+ return COPY_BLOCK;
+ }
+ } else {
+ return COPY_BLOCK;
+ }
+ }
+ }
+ return FILTER_BLOCK;
+}
+
+int vp9_denoiser_filter_neon(const uint8_t *sig, int sig_stride,
+ const uint8_t *mc_avg, int mc_avg_stride,
+ uint8_t *avg, int avg_stride,
+ int increase_denoising, BLOCK_SIZE bs,
+ int motion_magnitude) {
+  // Block sizes are ordered by how frequently they occur so that the most
+  // common cases are checked, and returned from, first.
+ if (bs == BLOCK_16X16 || bs == BLOCK_32X32 || bs == BLOCK_64X64 ||
+ bs == BLOCK_16X32 || bs == BLOCK_16X8 || bs == BLOCK_32X16 ||
+ bs == BLOCK_32X64 || bs == BLOCK_64X32) {
+ return vp9_denoiser_NxM_neon(sig, sig_stride, mc_avg, mc_avg_stride, avg,
+ avg_stride, increase_denoising, bs,
+ motion_magnitude);
+ } else if (bs == BLOCK_8X8 || bs == BLOCK_8X16) {
+ return vp9_denoiser_8xN_neon(sig, sig_stride, mc_avg, mc_avg_stride, avg,
+ avg_stride, increase_denoising, bs,
+ motion_magnitude, 8);
+ }
+ return COPY_BLOCK;
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c
new file mode 100644
index 0000000000..b82b3f9db5
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c
@@ -0,0 +1,296 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <arm_neon.h>
+
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vpx_ports/mem.h"
+
+#ifdef __GNUC__
+#define LIKELY(v) __builtin_expect(v, 1)
+#define UNLIKELY(v) __builtin_expect(v, 0)
+#else
+#define LIKELY(v) (v)
+#define UNLIKELY(v) (v)
+#endif
+
+static INLINE int_mv pack_int_mv(int16_t row, int16_t col) {
+ int_mv result;
+ result.as_mv.row = row;
+ result.as_mv.col = col;
+ return result;
+}
+
+/*****************************************************************************
+ * This function utilizes 3 properties of the cost function lookup tables, *
+ * constructed in using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in *
+ * vp9_encoder.c. *
+ * For the joint cost: *
+ * - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3] *
+ * For the component costs: *
+ * - For all i: mvsadcost[0][i] == mvsadcost[1][i] *
+ * (Equal costs for both components) *
+ * - For all i: mvsadcost[0][i] == mvsadcost[0][-i] *
+ * (Cost function is even) *
+ * If these do not hold, then this function cannot be used without *
+ * modification, in which case you can revert to using the C implementation, *
+ * which does not rely on these properties. *
+ *****************************************************************************/
+int vp9_diamond_search_sad_neon(const MACROBLOCK *x,
+ const search_site_config *cfg, MV *ref_mv,
+ uint32_t start_mv_sad, MV *best_mv,
+ int search_param, int sad_per_bit, int *num00,
+ const vp9_sad_fn_ptr_t *sad_fn_ptr,
+ const MV *center_mv) {
+ static const uint32_t data[4] = { 0, 1, 2, 3 };
+ const uint32x4_t v_idx_d = vld1q_u32((const uint32_t *)data);
+
+ const int32x4_t zero_s32 = vdupq_n_s32(0);
+ const int_mv maxmv = pack_int_mv(x->mv_limits.row_max, x->mv_limits.col_max);
+ const int16x8_t v_max_mv_w = vreinterpretq_s16_s32(vdupq_n_s32(maxmv.as_int));
+ const int_mv minmv = pack_int_mv(x->mv_limits.row_min, x->mv_limits.col_min);
+ const int16x8_t v_min_mv_w = vreinterpretq_s16_s32(vdupq_n_s32(minmv.as_int));
+
+ const int32x4_t v_spb_d = vdupq_n_s32(sad_per_bit);
+
+ const int32x4_t v_joint_cost_0_d = vdupq_n_s32(x->nmvjointsadcost[0]);
+ const int32x4_t v_joint_cost_1_d = vdupq_n_s32(x->nmvjointsadcost[1]);
+
+ // search_param determines the length of the initial step and hence the number
+ // of iterations.
+ // 0 = initial step (MAX_FIRST_STEP) pel
+ // 1 = (MAX_FIRST_STEP/2) pel,
+ // 2 = (MAX_FIRST_STEP/4) pel...
+ const MV *ss_mv = &cfg->ss_mv[cfg->searches_per_step * search_param];
+ const intptr_t *ss_os = &cfg->ss_os[cfg->searches_per_step * search_param];
+ const int tot_steps = cfg->total_steps - search_param;
+
+ const int_mv fcenter_mv =
+ pack_int_mv(center_mv->row >> 3, center_mv->col >> 3);
+ const int16x8_t vfcmv = vreinterpretq_s16_s32(vdupq_n_s32(fcenter_mv.as_int));
+
+ const int ref_row = ref_mv->row;
+ const int ref_col = ref_mv->col;
+
+ int_mv bmv = pack_int_mv(ref_row, ref_col);
+ int_mv new_bmv = bmv;
+ int16x8_t v_bmv_w = vreinterpretq_s16_s32(vdupq_n_s32(bmv.as_int));
+
+ const int what_stride = x->plane[0].src.stride;
+ const int in_what_stride = x->e_mbd.plane[0].pre[0].stride;
+ const uint8_t *const what = x->plane[0].src.buf;
+ const uint8_t *const in_what =
+ x->e_mbd.plane[0].pre[0].buf + ref_row * in_what_stride + ref_col;
+
+ // Work out the start point for the search
+ const uint8_t *best_address = in_what;
+ const uint8_t *new_best_address = best_address;
+#if VPX_ARCH_AARCH64
+ int64x2_t v_ba_q = vdupq_n_s64((intptr_t)best_address);
+#else
+ int32x4_t v_ba_d = vdupq_n_s32((intptr_t)best_address);
+#endif
+ // Starting position
+ unsigned int best_sad = start_mv_sad;
+ int i, j, step;
+
+ // Check the prerequisite cost function properties that are easy to check
+ // in an assert. See the function-level documentation for details on all
+ // prerequisites.
+ assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]);
+ assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]);
+
+ *num00 = 0;
+
+ for (i = 0, step = 0; step < tot_steps; step++) {
+ for (j = 0; j < cfg->searches_per_step; j += 4, i += 4) {
+ int16x8_t v_diff_mv_w;
+ int8x16_t v_inside_d;
+ uint32x4_t v_outside_d;
+ int32x4_t v_cost_d, v_sad_d;
+#if VPX_ARCH_AARCH64
+ int64x2_t v_blocka[2];
+#else
+ int32x4_t v_blocka[1];
+ uint32x2_t horiz_max_0, horiz_max_1;
+#endif
+
+ uint32_t horiz_max;
+ // Compute the candidate motion vectors
+ const int16x8_t v_ss_mv_w = vld1q_s16((const int16_t *)&ss_mv[i]);
+ const int16x8_t v_these_mv_w = vaddq_s16(v_bmv_w, v_ss_mv_w);
+ // Clamp them to the search bounds
+ int16x8_t v_these_mv_clamp_w = v_these_mv_w;
+ v_these_mv_clamp_w = vminq_s16(v_these_mv_clamp_w, v_max_mv_w);
+ v_these_mv_clamp_w = vmaxq_s16(v_these_mv_clamp_w, v_min_mv_w);
+ // The ones that did not change are inside the search area
+ v_inside_d = vreinterpretq_s8_u32(
+ vceqq_s32(vreinterpretq_s32_s16(v_these_mv_clamp_w),
+ vreinterpretq_s32_s16(v_these_mv_w)));
+
+ // If none of them are inside, then move on
+#if VPX_ARCH_AARCH64
+ horiz_max = vmaxvq_u32(vreinterpretq_u32_s8(v_inside_d));
+#else
+ horiz_max_0 = vmax_u32(vget_low_u32(vreinterpretq_u32_s8(v_inside_d)),
+ vget_high_u32(vreinterpretq_u32_s8(v_inside_d)));
+ horiz_max_1 = vpmax_u32(horiz_max_0, horiz_max_0);
+ vst1_lane_u32(&horiz_max, horiz_max_1, 0);
+#endif
+ if (LIKELY(horiz_max == 0)) {
+ continue;
+ }
+
+ // The inverse mask indicates which of the MVs are outside
+ v_outside_d =
+ vreinterpretq_u32_s8(veorq_s8(v_inside_d, vdupq_n_s8((int8_t)0xff)));
+      // Shift right to keep the sign bit clear; we will use this later
+      // to set the cost to the maximum value.
+ v_outside_d = vshrq_n_u32(v_outside_d, 1);
+
+ // Compute the difference MV
+ v_diff_mv_w = vsubq_s16(v_these_mv_clamp_w, vfcmv);
+      // We utilise the fact that the cost function is even, and use the
+      // absolute difference. This allows us to use unsigned indexes later
+      // and reduces cache pressure somewhat as only half of the table
+      // is ever referenced.
+ v_diff_mv_w = vabsq_s16(v_diff_mv_w);
+
+ // Compute the SIMD pointer offsets.
+ {
+#if VPX_ARCH_AARCH64 // sizeof(intptr_t) == 8
+ // Load the offsets
+ int64x2_t v_bo10_q = vld1q_s64((const int64_t *)&ss_os[i + 0]);
+ int64x2_t v_bo32_q = vld1q_s64((const int64_t *)&ss_os[i + 2]);
+ // Set the ones falling outside to zero
+ v_bo10_q = vandq_s64(
+ v_bo10_q,
+ vmovl_s32(vget_low_s32(vreinterpretq_s32_s8(v_inside_d))));
+ v_bo32_q = vandq_s64(
+ v_bo32_q,
+ vmovl_s32(vget_high_s32(vreinterpretq_s32_s8(v_inside_d))));
+ // Compute the candidate addresses
+ v_blocka[0] = vaddq_s64(v_ba_q, v_bo10_q);
+ v_blocka[1] = vaddq_s64(v_ba_q, v_bo32_q);
+#else // sizeof(intptr_t) == 4
+ int32x4_t v_bo_d = vld1q_s32((const int32_t *)&ss_os[i]);
+ v_bo_d = vandq_s32(v_bo_d, vreinterpretq_s32_s8(v_inside_d));
+ v_blocka[0] = vaddq_s32(v_ba_d, v_bo_d);
+#endif
+ }
+
+ sad_fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0],
+ in_what_stride, (uint32_t *)&v_sad_d);
+
+ // Look up the component cost of the residual motion vector
+ {
+ uint32_t cost[4];
+ DECLARE_ALIGNED(16, int16_t, rowcol[8]);
+ vst1q_s16(rowcol, v_diff_mv_w);
+
+        // Note: This is a use case for a gather instruction.
+ cost[0] = x->nmvsadcost[0][rowcol[0]] + x->nmvsadcost[0][rowcol[1]];
+ cost[1] = x->nmvsadcost[0][rowcol[2]] + x->nmvsadcost[0][rowcol[3]];
+ cost[2] = x->nmvsadcost[0][rowcol[4]] + x->nmvsadcost[0][rowcol[5]];
+ cost[3] = x->nmvsadcost[0][rowcol[6]] + x->nmvsadcost[0][rowcol[7]];
+
+ v_cost_d = vld1q_s32((int32_t *)cost);
+ }
+
+ // Now add in the joint cost
+ {
+ const uint32x4_t v_sel_d =
+ vceqq_s32(vreinterpretq_s32_s16(v_diff_mv_w), zero_s32);
+ const int32x4_t v_joint_cost_d = vreinterpretq_s32_u8(
+ vbslq_u8(vreinterpretq_u8_u32(v_sel_d),
+ vreinterpretq_u8_s32(v_joint_cost_0_d),
+ vreinterpretq_u8_s32(v_joint_cost_1_d)));
+ v_cost_d = vaddq_s32(v_cost_d, v_joint_cost_d);
+ }
+
+ // Multiply by sad_per_bit
+ v_cost_d = vmulq_s32(v_cost_d, v_spb_d);
+ // ROUND_POWER_OF_TWO(v_cost_d, VP9_PROB_COST_SHIFT)
+ v_cost_d =
+ vaddq_s32(v_cost_d, vdupq_n_s32(1 << (VP9_PROB_COST_SHIFT - 1)));
+ v_cost_d = vshrq_n_s32(v_cost_d, VP9_PROB_COST_SHIFT);
+ // Add the cost to the sad
+ v_sad_d = vaddq_s32(v_sad_d, v_cost_d);
+
+      // Make the motion vectors outside the search area have max cost
+      // by or'ing in the comparison mask; this way the minimum search won't
+      // pick them.
+ v_sad_d = vorrq_s32(v_sad_d, vreinterpretq_s32_u32(v_outside_d));
+
+ // Find the minimum value and index horizontally in v_sad_d
+ {
+ uint32_t local_best_sad;
+#if VPX_ARCH_AARCH64
+ local_best_sad = vminvq_u32(vreinterpretq_u32_s32(v_sad_d));
+#else
+ uint32x2_t horiz_min_0 =
+ vmin_u32(vget_low_u32(vreinterpretq_u32_s32(v_sad_d)),
+ vget_high_u32(vreinterpretq_u32_s32(v_sad_d)));
+ uint32x2_t horiz_min_1 = vpmin_u32(horiz_min_0, horiz_min_0);
+ vst1_lane_u32(&local_best_sad, horiz_min_1, 0);
+#endif
+
+ // Update the global minimum if the local minimum is smaller
+ if (LIKELY(local_best_sad < best_sad)) {
+#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#endif
+ uint32_t local_best_idx;
+ const uint32x4_t v_sel_d =
+ vceqq_s32(v_sad_d, vdupq_n_s32(local_best_sad));
+ uint32x4_t v_mask_d = vandq_u32(v_sel_d, v_idx_d);
+ v_mask_d = vbslq_u32(v_sel_d, v_mask_d, vdupq_n_u32(0xffffffff));
+
+#if VPX_ARCH_AARCH64
+ local_best_idx = vminvq_u32(v_mask_d);
+#else
+ horiz_min_0 =
+ vmin_u32(vget_low_u32(v_mask_d), vget_high_u32(v_mask_d));
+ horiz_min_1 = vpmin_u32(horiz_min_0, horiz_min_0);
+ vst1_lane_u32(&local_best_idx, horiz_min_1, 0);
+#endif
+
+ new_bmv = ((const int_mv *)&v_these_mv_w)[local_best_idx];
+#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
+ new_best_address = ((const uint8_t **)v_blocka)[local_best_idx];
+
+ best_sad = local_best_sad;
+ }
+ }
+ }
+
+ bmv = new_bmv;
+ best_address = new_best_address;
+
+ v_bmv_w = vreinterpretq_s16_s32(vdupq_n_s32(bmv.as_int));
+#if VPX_ARCH_AARCH64
+ v_ba_q = vdupq_n_s64((intptr_t)best_address);
+#else
+ v_ba_d = vdupq_n_s32((intptr_t)best_address);
+#endif
+
+ if (UNLIKELY(best_address == in_what)) {
+ (*num00)++;
+ }
+ }
+
+ *best_mv = bmv.as_mv;
+ return best_sad;
+}
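+
+// The three table properties listed above can be written as a scalar check
+// (an illustrative sketch only; mv_max is a hypothetical bound on the MV
+// component magnitude, not an encoder constant):
+static INLINE int diamond_search_cost_tables_ok(const MACROBLOCK *x,
+                                                int mv_max) {
+  int i;
+  if (x->nmvjointsadcost[1] != x->nmvjointsadcost[2] ||
+      x->nmvjointsadcost[1] != x->nmvjointsadcost[3]) {
+    return 0;
+  }
+  for (i = 0; i <= mv_max; ++i) {
+    if (x->nmvsadcost[0][i] != x->nmvsadcost[1][i]) return 0;   // same table
+    if (x->nmvsadcost[0][i] != x->nmvsadcost[0][-i]) return 0;  // even cost
+  }
+  return 1;
+}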
diff --git a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_error_neon.c b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_error_neon.c
new file mode 100644
index 0000000000..0cf0bf250e
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_error_neon.c
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vp9_rtcd.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+
+int64_t vp9_block_error_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff,
+ intptr_t block_size, int64_t *ssz) {
+ uint64x2_t err_u64 = vdupq_n_u64(0);
+ int64x2_t ssz_s64 = vdupq_n_s64(0);
+
+ assert(block_size >= 16);
+ assert((block_size % 16) == 0);
+
+ do {
+ uint32x4_t err;
+ int32x4_t ssz0, ssz1;
+
+ const int16x8_t c0 = load_tran_low_to_s16q(coeff);
+ const int16x8_t c1 = load_tran_low_to_s16q(coeff + 8);
+ const int16x8_t d0 = load_tran_low_to_s16q(dqcoeff);
+ const int16x8_t d1 = load_tran_low_to_s16q(dqcoeff + 8);
+
+ const uint16x8_t diff0 = vreinterpretq_u16_s16(vabdq_s16(c0, d0));
+ const uint16x8_t diff1 = vreinterpretq_u16_s16(vabdq_s16(c1, d1));
+
+ // diff is 15-bits, the squares 30, so we can store 4 in 32-bits before
+ // accumulating them in 64-bits.
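+    // Worst case per 32-bit lane: 4 * (2^15 - 1)^2 = 4294705156 < 2^32, so
+    // the accumulation below cannot overflow.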
+ err = vmull_u16(vget_low_u16(diff0), vget_low_u16(diff0));
+ err = vmlal_u16(err, vget_high_u16(diff0), vget_high_u16(diff0));
+ err = vmlal_u16(err, vget_low_u16(diff1), vget_low_u16(diff1));
+ err = vmlal_u16(err, vget_high_u16(diff1), vget_high_u16(diff1));
+ err_u64 = vpadalq_u32(err_u64, err);
+
+    // We can't do the same here as we're operating on signed integers: only
+    // 2 squares of the 15-bit coefficients fit in 32 bits
+    // (2 * (2^15 - 1)^2 < 2^31) before widening the accumulation to 64 bits.
+ ssz0 = vmull_s16(vget_low_s16(c0), vget_low_s16(c0));
+ ssz0 = vmlal_s16(ssz0, vget_high_s16(c0), vget_high_s16(c0));
+ ssz_s64 = vpadalq_s32(ssz_s64, ssz0);
+
+ ssz1 = vmull_s16(vget_low_s16(c1), vget_low_s16(c1));
+ ssz1 = vmlal_s16(ssz1, vget_high_s16(c1), vget_high_s16(c1));
+ ssz_s64 = vpadalq_s32(ssz_s64, ssz1);
+
+ coeff += 16;
+ dqcoeff += 16;
+ block_size -= 16;
+ } while (block_size != 0);
+
+ *ssz = horizontal_add_int64x2(ssz_s64);
+ return (int64_t)horizontal_add_uint64x2(err_u64);
+}
+
+int64_t vp9_block_error_fp_neon(const tran_low_t *coeff,
+ const tran_low_t *dqcoeff, int block_size) {
+ uint64x2_t err_u64[2] = { vdupq_n_u64(0), vdupq_n_u64(0) };
+
+ assert(block_size >= 16);
+ assert((block_size % 16) == 0);
+
+ do {
+ uint32x4_t err0, err1;
+
+ const int16x8_t c0 = load_tran_low_to_s16q(coeff);
+ const int16x8_t c1 = load_tran_low_to_s16q(coeff + 8);
+ const int16x8_t d0 = load_tran_low_to_s16q(dqcoeff);
+ const int16x8_t d1 = load_tran_low_to_s16q(dqcoeff + 8);
+
+ const uint16x8_t diff0 = vreinterpretq_u16_s16(vabdq_s16(c0, d0));
+ const uint16x8_t diff1 = vreinterpretq_u16_s16(vabdq_s16(c1, d1));
+
+ // diff is 15-bits, the squares 30, so in theory we can store 4 in 32-bits
+ // before accumulating them in 64-bits. However splitting into 2 mull, mlal
+ // pairs is beneficial since it allows us to use both Neon
+ // multiply-accumulate pipes - on CPUs that have them - rather than having
+ // a single chain of 4 instructions executing serially.
+ err0 = vmull_u16(vget_low_u16(diff0), vget_low_u16(diff0));
+ err0 = vmlal_u16(err0, vget_high_u16(diff0), vget_high_u16(diff0));
+ err_u64[0] = vpadalq_u32(err_u64[0], err0);
+
+ err1 = vmull_u16(vget_low_u16(diff1), vget_low_u16(diff1));
+ err1 = vmlal_u16(err1, vget_high_u16(diff1), vget_high_u16(diff1));
+ err_u64[1] = vpadalq_u32(err_u64[1], err1);
+
+ coeff += 16;
+ dqcoeff += 16;
+ block_size -= 16;
+ } while (block_size != 0);
+
+ return horizontal_add_uint64x2(vaddq_u64(err_u64[0], err_u64[1]));
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c
new file mode 100644
index 0000000000..bc8dd4a341
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c
@@ -0,0 +1,844 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_scale_rtcd.h"
+#include "vp9/common/vp9_blockd.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/vpx_convolve8_neon.h"
+#include "vpx_dsp/vpx_filter.h"
+#include "vpx_scale/yv12config.h"
+
+// Note: The scaling functions could write extra rows and columns in dst, which
+// exceed the right and bottom boundaries of the destination frame. We rely on
+// the following frame extension function to fix these rows and columns.
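+// For example, scale_plane_2_to_1_phase_0() below rounds the width up to a
+// multiple of 16 ((w + 15) & ~15), so up to 15 columns beyond the destination
+// width may be written in each row.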
+
+static INLINE void scale_plane_2_to_1_phase_0(const uint8_t *src,
+ const int src_stride,
+ uint8_t *dst,
+ const int dst_stride, const int w,
+ const int h) {
+ const int max_width = (w + 15) & ~15;
+ int y = h;
+
+ assert(w && h);
+
+ do {
+ int x = max_width;
+ do {
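+      // vld2q_u8 de-interleaves 32 bytes into even/odd lanes; keeping val[0]
+      // retains every other source pixel, i.e. phase-0 2:1 decimation.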
+ const uint8x16x2_t s = vld2q_u8(src);
+ vst1q_u8(dst, s.val[0]);
+ src += 32;
+ dst += 16;
+ x -= 16;
+ } while (x);
+ src += 2 * (src_stride - max_width);
+ dst += dst_stride - max_width;
+ } while (--y);
+}
+
+static INLINE void scale_plane_4_to_1_phase_0(const uint8_t *src,
+ const int src_stride,
+ uint8_t *dst,
+ const int dst_stride, const int w,
+ const int h) {
+ const int max_width = (w + 15) & ~15;
+ int y = h;
+
+ assert(w && h);
+
+ do {
+ int x = max_width;
+ do {
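+      // vld4q_u8 de-interleaves 64 bytes into 4 lanes; keeping val[0] retains
+      // every 4th source pixel, i.e. phase-0 4:1 decimation.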
+ const uint8x16x4_t s = vld4q_u8(src);
+ vst1q_u8(dst, s.val[0]);
+ src += 64;
+ dst += 16;
+ x -= 16;
+ } while (x);
+ src += 4 * (src_stride - max_width);
+ dst += dst_stride - max_width;
+ } while (--y);
+}
+
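+// Two-pass bilinear kernel: filter 16 pixels horizontally, round and narrow,
+// then filter the two intermediate rows vertically. The two bilinear taps of
+// each kernel sum to 128, so vrshrn_n_u16(x, 7) yields a correctly rounded
+// 8-bit result.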
+static INLINE void scale_plane_bilinear_kernel(
+ const uint8x16_t in0, const uint8x16_t in1, const uint8x16_t in2,
+ const uint8x16_t in3, const uint8x8_t coef0, const uint8x8_t coef1,
+ uint8_t *const dst) {
+ const uint16x8_t h0 = vmull_u8(vget_low_u8(in0), coef0);
+ const uint16x8_t h1 = vmull_u8(vget_high_u8(in0), coef0);
+ const uint16x8_t h2 = vmull_u8(vget_low_u8(in2), coef0);
+ const uint16x8_t h3 = vmull_u8(vget_high_u8(in2), coef0);
+ const uint16x8_t h4 = vmlal_u8(h0, vget_low_u8(in1), coef1);
+ const uint16x8_t h5 = vmlal_u8(h1, vget_high_u8(in1), coef1);
+ const uint16x8_t h6 = vmlal_u8(h2, vget_low_u8(in3), coef1);
+ const uint16x8_t h7 = vmlal_u8(h3, vget_high_u8(in3), coef1);
+
+ const uint8x8_t hor0 = vrshrn_n_u16(h4, 7); // temp: 00 01 02 03 04 05 06 07
+ const uint8x8_t hor1 = vrshrn_n_u16(h5, 7); // temp: 08 09 0A 0B 0C 0D 0E 0F
+ const uint8x8_t hor2 = vrshrn_n_u16(h6, 7); // temp: 10 11 12 13 14 15 16 17
+ const uint8x8_t hor3 = vrshrn_n_u16(h7, 7); // temp: 18 19 1A 1B 1C 1D 1E 1F
+ const uint16x8_t v0 = vmull_u8(hor0, coef0);
+ const uint16x8_t v1 = vmull_u8(hor1, coef0);
+ const uint16x8_t v2 = vmlal_u8(v0, hor2, coef1);
+ const uint16x8_t v3 = vmlal_u8(v1, hor3, coef1);
+ // dst: 0 1 2 3 4 5 6 7 8 9 A B C D E F
+ const uint8x16_t d = vcombine_u8(vrshrn_n_u16(v2, 7), vrshrn_n_u16(v3, 7));
+ vst1q_u8(dst, d);
+}
+
+static INLINE void scale_plane_2_to_1_bilinear(
+ const uint8_t *const src, const int src_stride, uint8_t *dst,
+ const int dst_stride, const int w, const int h, const int16_t c0,
+ const int16_t c1) {
+ const int max_width = (w + 15) & ~15;
+ const uint8_t *src0 = src;
+ const uint8_t *src1 = src + src_stride;
+ const uint8x8_t coef0 = vdup_n_u8(c0);
+ const uint8x8_t coef1 = vdup_n_u8(c1);
+ int y = h;
+
+ assert(w && h);
+
+ do {
+ int x = max_width;
+ do {
+ // 000 002 004 006 008 00A 00C 00E 010 012 014 016 018 01A 01C 01E
+ // 001 003 005 007 009 00B 00D 00F 011 013 015 017 019 01B 01D 01F
+ const uint8x16x2_t s0 = vld2q_u8(src0);
+ // 100 102 104 106 108 10A 10C 10E 110 112 114 116 118 11A 11C 11E
+ // 101 103 105 107 109 10B 10D 10F 111 113 115 117 119 11B 11D 11F
+ const uint8x16x2_t s1 = vld2q_u8(src1);
+ scale_plane_bilinear_kernel(s0.val[0], s0.val[1], s1.val[0], s1.val[1],
+ coef0, coef1, dst);
+ src0 += 32;
+ src1 += 32;
+ dst += 16;
+ x -= 16;
+ } while (x);
+ src0 += 2 * (src_stride - max_width);
+ src1 += 2 * (src_stride - max_width);
+ dst += dst_stride - max_width;
+ } while (--y);
+}
+
+static INLINE void scale_plane_4_to_1_bilinear(
+ const uint8_t *const src, const int src_stride, uint8_t *dst,
+ const int dst_stride, const int w, const int h, const int16_t c0,
+ const int16_t c1) {
+ const int max_width = (w + 15) & ~15;
+ const uint8_t *src0 = src;
+ const uint8_t *src1 = src + src_stride;
+ const uint8x8_t coef0 = vdup_n_u8(c0);
+ const uint8x8_t coef1 = vdup_n_u8(c1);
+ int y = h;
+
+ assert(w && h);
+
+ do {
+ int x = max_width;
+ do {
+ // (*) -- useless
+ // 000 004 008 00C 010 014 018 01C 020 024 028 02C 030 034 038 03C
+ // 001 005 009 00D 011 015 019 01D 021 025 029 02D 031 035 039 03D
+ // 002 006 00A 00E 012 016 01A 01E 022 026 02A 02E 032 036 03A 03E (*)
+ // 003 007 00B 00F 013 017 01B 01F 023 027 02B 02F 033 037 03B 03F (*)
+ const uint8x16x4_t s0 = vld4q_u8(src0);
+ // 100 104 108 10C 110 114 118 11C 120 124 128 12C 130 134 138 13C
+ // 101 105 109 10D 111 115 119 11D 121 125 129 12D 131 135 139 13D
+ // 102 106 10A 10E 112 116 11A 11E 122 126 12A 12E 132 136 13A 13E (*)
+ // 103 107 10B 10F 113 117 11B 11F 123 127 12B 12F 133 137 13B 13F (*)
+ const uint8x16x4_t s1 = vld4q_u8(src1);
+ scale_plane_bilinear_kernel(s0.val[0], s0.val[1], s1.val[0], s1.val[1],
+ coef0, coef1, dst);
+ src0 += 64;
+ src1 += 64;
+ dst += 16;
+ x -= 16;
+ } while (x);
+ src0 += 4 * (src_stride - max_width);
+ src1 += 4 * (src_stride - max_width);
+ dst += dst_stride - max_width;
+ } while (--y);
+}
+
+static INLINE uint8x8_t scale_filter_bilinear(const uint8x8_t *const s,
+ const uint8x8_t *const coef) {
+ const uint16x8_t h0 = vmull_u8(s[0], coef[0]);
+ const uint16x8_t h1 = vmlal_u8(h0, s[1], coef[1]);
+
+ return vrshrn_n_u16(h1, 7);
+}
+
+static void scale_plane_2_to_1_general(const uint8_t *src, const int src_stride,
+ uint8_t *dst, const int dst_stride,
+ const int w, const int h,
+ const int16_t *const coef,
+ uint8_t *const temp_buffer) {
+ const int width_hor = (w + 3) & ~3;
+ const int width_ver = (w + 7) & ~7;
+ const int height_hor = (2 * h + SUBPEL_TAPS - 2 + 7) & ~7;
+ const int height_ver = (h + 3) & ~3;
+ const int16x8_t filters = vld1q_s16(coef);
+ int x, y = height_hor;
+ uint8_t *t = temp_buffer;
+ uint8x8_t s[14], d[4];
+
+ assert(w && h);
+
+ src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 1;
+
+ // horizontal 4x8
+ // Note: processing 4x8 is about 20% faster than processing row by row using
+ // vld4_u8().
+ do {
+ load_u8_8x8(src + 2, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
+ &s[6], &s[7]);
+ transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
+ x = width_hor;
+
+ do {
+ src += 8;
+ load_u8_8x8(src, src_stride, &s[6], &s[7], &s[8], &s[9], &s[10], &s[11],
+ &s[12], &s[13]);
+ transpose_u8_8x8(&s[6], &s[7], &s[8], &s[9], &s[10], &s[11], &s[12],
+ &s[13]);
+
+ d[0] = scale_filter_8(&s[0], filters); // 00 10 20 30 40 50 60 70
+ d[1] = scale_filter_8(&s[2], filters); // 01 11 21 31 41 51 61 71
+ d[2] = scale_filter_8(&s[4], filters); // 02 12 22 32 42 52 62 72
+ d[3] = scale_filter_8(&s[6], filters); // 03 13 23 33 43 53 63 73
+ // 00 01 02 03 40 41 42 43
+ // 10 11 12 13 50 51 52 53
+ // 20 21 22 23 60 61 62 63
+ // 30 31 32 33 70 71 72 73
+ transpose_u8_8x4(&d[0], &d[1], &d[2], &d[3]);
+ vst1_lane_u32((uint32_t *)(t + 0 * width_hor), vreinterpret_u32_u8(d[0]),
+ 0);
+ vst1_lane_u32((uint32_t *)(t + 1 * width_hor), vreinterpret_u32_u8(d[1]),
+ 0);
+ vst1_lane_u32((uint32_t *)(t + 2 * width_hor), vreinterpret_u32_u8(d[2]),
+ 0);
+ vst1_lane_u32((uint32_t *)(t + 3 * width_hor), vreinterpret_u32_u8(d[3]),
+ 0);
+ vst1_lane_u32((uint32_t *)(t + 4 * width_hor), vreinterpret_u32_u8(d[0]),
+ 1);
+ vst1_lane_u32((uint32_t *)(t + 5 * width_hor), vreinterpret_u32_u8(d[1]),
+ 1);
+ vst1_lane_u32((uint32_t *)(t + 6 * width_hor), vreinterpret_u32_u8(d[2]),
+ 1);
+ vst1_lane_u32((uint32_t *)(t + 7 * width_hor), vreinterpret_u32_u8(d[3]),
+ 1);
+
+ s[0] = s[8];
+ s[1] = s[9];
+ s[2] = s[10];
+ s[3] = s[11];
+ s[4] = s[12];
+ s[5] = s[13];
+
+ t += 4;
+ x -= 4;
+ } while (x);
+ src += 8 * src_stride - 2 * width_hor;
+ t += 7 * width_hor;
+ y -= 8;
+ } while (y);
+
+ // vertical 8x4
+ x = width_ver;
+ t = temp_buffer;
+ do {
+ load_u8_8x8(t, width_hor, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
+ &s[7]);
+ t += 6 * width_hor;
+ y = height_ver;
+
+ do {
+ load_u8_8x8(t, width_hor, &s[6], &s[7], &s[8], &s[9], &s[10], &s[11],
+ &s[12], &s[13]);
+ t += 8 * width_hor;
+
+ d[0] = scale_filter_8(&s[0], filters); // 00 01 02 03 04 05 06 07
+ d[1] = scale_filter_8(&s[2], filters); // 10 11 12 13 14 15 16 17
+ d[2] = scale_filter_8(&s[4], filters); // 20 21 22 23 24 25 26 27
+ d[3] = scale_filter_8(&s[6], filters); // 30 31 32 33 34 35 36 37
+ vst1_u8(dst + 0 * dst_stride, d[0]);
+ vst1_u8(dst + 1 * dst_stride, d[1]);
+ vst1_u8(dst + 2 * dst_stride, d[2]);
+ vst1_u8(dst + 3 * dst_stride, d[3]);
+
+ s[0] = s[8];
+ s[1] = s[9];
+ s[2] = s[10];
+ s[3] = s[11];
+ s[4] = s[12];
+ s[5] = s[13];
+
+ dst += 4 * dst_stride;
+ y -= 4;
+ } while (y);
+ t -= width_hor * (2 * height_ver + 6);
+ t += 8;
+ dst -= height_ver * dst_stride;
+ dst += 8;
+ x -= 8;
+ } while (x);
+}
+
+static void scale_plane_4_to_1_general(const uint8_t *src, const int src_stride,
+ uint8_t *dst, const int dst_stride,
+ const int w, const int h,
+ const int16_t *const coef,
+ uint8_t *const temp_buffer) {
+ const int width_hor = (w + 1) & ~1;
+ const int width_ver = (w + 7) & ~7;
+ const int height_hor = (4 * h + SUBPEL_TAPS - 2 + 7) & ~7;
+ const int height_ver = (h + 1) & ~1;
+ const int16x8_t filters = vld1q_s16(coef);
+ int x, y = height_hor;
+ uint8_t *t = temp_buffer;
+ uint8x8_t s[12], d[2];
+
+ assert(w && h);
+
+ src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 3;
+
+ // horizontal 2x8
+ // Note: processing 2x8 is about 20% faster than processing row by row using
+ // vld4_u8().
+ do {
+ load_u8_8x8(src + 4, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
+ &s[6], &s[7]);
+ transpose_u8_4x8(&s[0], &s[1], &s[2], &s[3], s[4], s[5], s[6], s[7]);
+ x = width_hor;
+
+ do {
+ uint8x8x2_t dd;
+ src += 8;
+ load_u8_8x8(src, src_stride, &s[4], &s[5], &s[6], &s[7], &s[8], &s[9],
+ &s[10], &s[11]);
+ transpose_u8_8x8(&s[4], &s[5], &s[6], &s[7], &s[8], &s[9], &s[10],
+ &s[11]);
+
+ d[0] = scale_filter_8(&s[0], filters); // 00 10 20 30 40 50 60 70
+ d[1] = scale_filter_8(&s[4], filters); // 01 11 21 31 41 51 61 71
+ // dd.val[0]: 00 01 20 21 40 41 60 61
+ // dd.val[1]: 10 11 30 31 50 51 70 71
+ dd = vtrn_u8(d[0], d[1]);
+ vst1_lane_u16((uint16_t *)(t + 0 * width_hor),
+ vreinterpret_u16_u8(dd.val[0]), 0);
+ vst1_lane_u16((uint16_t *)(t + 1 * width_hor),
+ vreinterpret_u16_u8(dd.val[1]), 0);
+ vst1_lane_u16((uint16_t *)(t + 2 * width_hor),
+ vreinterpret_u16_u8(dd.val[0]), 1);
+ vst1_lane_u16((uint16_t *)(t + 3 * width_hor),
+ vreinterpret_u16_u8(dd.val[1]), 1);
+ vst1_lane_u16((uint16_t *)(t + 4 * width_hor),
+ vreinterpret_u16_u8(dd.val[0]), 2);
+ vst1_lane_u16((uint16_t *)(t + 5 * width_hor),
+ vreinterpret_u16_u8(dd.val[1]), 2);
+ vst1_lane_u16((uint16_t *)(t + 6 * width_hor),
+ vreinterpret_u16_u8(dd.val[0]), 3);
+ vst1_lane_u16((uint16_t *)(t + 7 * width_hor),
+ vreinterpret_u16_u8(dd.val[1]), 3);
+
+ s[0] = s[8];
+ s[1] = s[9];
+ s[2] = s[10];
+ s[3] = s[11];
+
+ t += 2;
+ x -= 2;
+ } while (x);
+ src += 8 * src_stride - 4 * width_hor;
+ t += 7 * width_hor;
+ y -= 8;
+ } while (y);
+
+ // vertical 8x2
+ x = width_ver;
+ t = temp_buffer;
+ do {
+ load_u8_8x4(t, width_hor, &s[0], &s[1], &s[2], &s[3]);
+ t += 4 * width_hor;
+ y = height_ver;
+
+ do {
+ load_u8_8x8(t, width_hor, &s[4], &s[5], &s[6], &s[7], &s[8], &s[9],
+ &s[10], &s[11]);
+ t += 8 * width_hor;
+
+ d[0] = scale_filter_8(&s[0], filters); // 00 01 02 03 04 05 06 07
+ d[1] = scale_filter_8(&s[4], filters); // 10 11 12 13 14 15 16 17
+ vst1_u8(dst + 0 * dst_stride, d[0]);
+ vst1_u8(dst + 1 * dst_stride, d[1]);
+
+ s[0] = s[8];
+ s[1] = s[9];
+ s[2] = s[10];
+ s[3] = s[11];
+
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ t -= width_hor * (4 * height_ver + 4);
+ t += 8;
+ dst -= height_ver * dst_stride;
+ dst += 8;
+ x -= 8;
+ } while (x);
+}
+
+// Notes for 4 to 3 scaling:
+//
+// 1. 6 rows are calculated in each horizontal inner loop, so width_hor must be
+// multiple of 6, and no less than w.
+//
+// 2. 8 rows are calculated in each vertical inner loop, so width_ver must be
+// multiple of 8, and no less than w.
+//
+// 3. 8 columns are calculated in each horizontal inner loop for further
+// vertical scaling, so height_hor must be multiple of 8, and no less than
+// 4 * h / 3.
+//
+// 4. 6 columns are calculated in each vertical inner loop, so height_ver must
+// be multiple of 6, and no less than h.
+//
+// 5. The physical location of the last row of the 4 to 3 scaled frame is
+// decided by phase_scaler, and is always less than 1 pixel below the last row
+// of the original image.
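+//
+// For example, with w = 100: width_hor = (100 + 5) - ((100 + 5) % 6) = 102,
+// a multiple of 6 that is no less than w.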
+
+static void scale_plane_4_to_3_bilinear(const uint8_t *src,
+ const int src_stride, uint8_t *dst,
+ const int dst_stride, const int w,
+ const int h, const int phase_scaler,
+ uint8_t *const temp_buffer) {
+ static const int step_q4 = 16 * 4 / 3;
+ const int width_hor = (w + 5) - ((w + 5) % 6);
+ const int stride_hor = width_hor + 2; // store 2 extra pixels
+ const int width_ver = (w + 7) & ~7;
+ // We only need 1 extra row below because there are only 2 bilinear
+ // coefficients.
+ const int height_hor = (4 * h / 3 + 1 + 7) & ~7;
+ const int height_ver = (h + 5) - ((h + 5) % 6);
+ int x, y = height_hor;
+ uint8_t *t = temp_buffer;
+ uint8x8_t s[9], d[8], c[6];
+
+ assert(w && h);
+
+ c[0] = vdup_n_u8((uint8_t)vp9_filter_kernels[BILINEAR][phase_scaler][3]);
+ c[1] = vdup_n_u8((uint8_t)vp9_filter_kernels[BILINEAR][phase_scaler][4]);
+ c[2] = vdup_n_u8(
+ (uint8_t)vp9_filter_kernels[BILINEAR][(phase_scaler + 1 * step_q4) &
+ SUBPEL_MASK][3]);
+ c[3] = vdup_n_u8(
+ (uint8_t)vp9_filter_kernels[BILINEAR][(phase_scaler + 1 * step_q4) &
+ SUBPEL_MASK][4]);
+ c[4] = vdup_n_u8(
+ (uint8_t)vp9_filter_kernels[BILINEAR][(phase_scaler + 2 * step_q4) &
+ SUBPEL_MASK][3]);
+ c[5] = vdup_n_u8(
+ (uint8_t)vp9_filter_kernels[BILINEAR][(phase_scaler + 2 * step_q4) &
+ SUBPEL_MASK][4]);
+
+ d[6] = vdup_n_u8(0);
+ d[7] = vdup_n_u8(0);
+
+ // horizontal 6x8
+ do {
+ load_u8_8x8(src, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
+ &s[6], &s[7]);
+ src += 1;
+ transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
+ x = width_hor;
+
+ do {
+ load_u8_8x8(src, src_stride, &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
+ &s[7], &s[8]);
+ src += 8;
+ transpose_u8_8x8(&s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7], &s[8]);
+
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ d[0] = scale_filter_bilinear(&s[0], &c[0]);
+ d[1] =
+ scale_filter_bilinear(&s[(phase_scaler + 1 * step_q4) >> 4], &c[2]);
+ d[2] =
+ scale_filter_bilinear(&s[(phase_scaler + 2 * step_q4) >> 4], &c[4]);
+ d[3] = scale_filter_bilinear(&s[4], &c[0]);
+ d[4] = scale_filter_bilinear(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)],
+ &c[2]);
+ d[5] = scale_filter_bilinear(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)],
+ &c[4]);
+
+ // 00 01 02 03 04 05 xx xx
+ // 10 11 12 13 14 15 xx xx
+ // 20 21 22 23 24 25 xx xx
+ // 30 31 32 33 34 35 xx xx
+ // 40 41 42 43 44 45 xx xx
+ // 50 51 52 53 54 55 xx xx
+ // 60 61 62 63 64 65 xx xx
+ // 70 71 72 73 74 75 xx xx
+ transpose_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
+ // store 2 extra pixels
+ vst1_u8(t + 0 * stride_hor, d[0]);
+ vst1_u8(t + 1 * stride_hor, d[1]);
+ vst1_u8(t + 2 * stride_hor, d[2]);
+ vst1_u8(t + 3 * stride_hor, d[3]);
+ vst1_u8(t + 4 * stride_hor, d[4]);
+ vst1_u8(t + 5 * stride_hor, d[5]);
+ vst1_u8(t + 6 * stride_hor, d[6]);
+ vst1_u8(t + 7 * stride_hor, d[7]);
+
+ s[0] = s[8];
+
+ t += 6;
+ x -= 6;
+ } while (x);
+ src += 8 * src_stride - 4 * width_hor / 3 - 1;
+ t += 7 * stride_hor + 2;
+ y -= 8;
+ } while (y);
+
+ // vertical 8x6
+ x = width_ver;
+ t = temp_buffer;
+ do {
+ load_u8_8x8(t, stride_hor, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
+ &s[7]);
+ t += stride_hor;
+ y = height_ver;
+
+ do {
+ load_u8_8x8(t, stride_hor, &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
+ &s[7], &s[8]);
+ t += 8 * stride_hor;
+
+ d[0] = scale_filter_bilinear(&s[0], &c[0]);
+ d[1] =
+ scale_filter_bilinear(&s[(phase_scaler + 1 * step_q4) >> 4], &c[2]);
+ d[2] =
+ scale_filter_bilinear(&s[(phase_scaler + 2 * step_q4) >> 4], &c[4]);
+ d[3] = scale_filter_bilinear(&s[4], &c[0]);
+ d[4] = scale_filter_bilinear(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)],
+ &c[2]);
+ d[5] = scale_filter_bilinear(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)],
+ &c[4]);
+ vst1_u8(dst + 0 * dst_stride, d[0]);
+ vst1_u8(dst + 1 * dst_stride, d[1]);
+ vst1_u8(dst + 2 * dst_stride, d[2]);
+ vst1_u8(dst + 3 * dst_stride, d[3]);
+ vst1_u8(dst + 4 * dst_stride, d[4]);
+ vst1_u8(dst + 5 * dst_stride, d[5]);
+
+ s[0] = s[8];
+
+ dst += 6 * dst_stride;
+ y -= 6;
+ } while (y);
+ t -= stride_hor * (4 * height_ver / 3 + 1);
+ t += 8;
+ dst -= height_ver * dst_stride;
+ dst += 8;
+ x -= 8;
+ } while (x);
+}
+
+static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride,
+ uint8_t *dst, const int dst_stride,
+ const int w, const int h,
+ const InterpKernel *const coef,
+ const int phase_scaler,
+ uint8_t *const temp_buffer) {
+ static const int step_q4 = 16 * 4 / 3;
+ const int width_hor = (w + 5) - ((w + 5) % 6);
+ const int stride_hor = width_hor + 2; // store 2 extra pixels
+ const int width_ver = (w + 7) & ~7;
+ // We need (SUBPEL_TAPS - 1) extra rows: (SUBPEL_TAPS / 2 - 1) extra rows
+ // above and (SUBPEL_TAPS / 2) extra rows below.
+ const int height_hor = (4 * h / 3 + SUBPEL_TAPS - 1 + 7) & ~7;
+ const int height_ver = (h + 5) - ((h + 5) % 6);
+ const int16x8_t filters0 =
+ vld1q_s16(coef[(phase_scaler + 0 * step_q4) & SUBPEL_MASK]);
+ const int16x8_t filters1 =
+ vld1q_s16(coef[(phase_scaler + 1 * step_q4) & SUBPEL_MASK]);
+ const int16x8_t filters2 =
+ vld1q_s16(coef[(phase_scaler + 2 * step_q4) & SUBPEL_MASK]);
+ int x, y = height_hor;
+ uint8_t *t = temp_buffer;
+ uint8x8_t s[15], d[8];
+
+ assert(w && h);
+
+ src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2;
+ d[6] = vdup_n_u8(0);
+ d[7] = vdup_n_u8(0);
+
+ // horizontal 6x8
+ do {
+ load_u8_8x8(src + 1, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5],
+ &s[6], &s[7]);
+ transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]);
+ x = width_hor;
+
+ do {
+ src += 8;
+ load_u8_8x8(src, src_stride, &s[7], &s[8], &s[9], &s[10], &s[11], &s[12],
+ &s[13], &s[14]);
+ transpose_u8_8x8(&s[7], &s[8], &s[9], &s[10], &s[11], &s[12], &s[13],
+ &s[14]);
+
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ d[0] = scale_filter_8(&s[0], filters0);
+ d[1] = scale_filter_8(&s[(phase_scaler + 1 * step_q4) >> 4], filters1);
+ d[2] = scale_filter_8(&s[(phase_scaler + 2 * step_q4) >> 4], filters2);
+ d[3] = scale_filter_8(&s[4], filters0);
+ d[4] =
+ scale_filter_8(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)], filters1);
+ d[5] =
+ scale_filter_8(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)], filters2);
+
+ // 00 01 02 03 04 05 xx xx
+ // 10 11 12 13 14 15 xx xx
+ // 20 21 22 23 24 25 xx xx
+ // 30 31 32 33 34 35 xx xx
+ // 40 41 42 43 44 45 xx xx
+ // 50 51 52 53 54 55 xx xx
+ // 60 61 62 63 64 65 xx xx
+ // 70 71 72 73 74 75 xx xx
+ transpose_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
+ // store 2 extra pixels
+ vst1_u8(t + 0 * stride_hor, d[0]);
+ vst1_u8(t + 1 * stride_hor, d[1]);
+ vst1_u8(t + 2 * stride_hor, d[2]);
+ vst1_u8(t + 3 * stride_hor, d[3]);
+ vst1_u8(t + 4 * stride_hor, d[4]);
+ vst1_u8(t + 5 * stride_hor, d[5]);
+ vst1_u8(t + 6 * stride_hor, d[6]);
+ vst1_u8(t + 7 * stride_hor, d[7]);
+
+ s[0] = s[8];
+ s[1] = s[9];
+ s[2] = s[10];
+ s[3] = s[11];
+ s[4] = s[12];
+ s[5] = s[13];
+ s[6] = s[14];
+
+ t += 6;
+ x -= 6;
+ } while (x);
+ src += 8 * src_stride - 4 * width_hor / 3;
+ t += 7 * stride_hor + 2;
+ y -= 8;
+ } while (y);
+
+ // vertical 8x6
+ x = width_ver;
+ t = temp_buffer;
+ do {
+ load_u8_8x8(t, stride_hor, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
+ &s[7]);
+ t += 7 * stride_hor;
+ y = height_ver;
+
+ do {
+ load_u8_8x8(t, stride_hor, &s[7], &s[8], &s[9], &s[10], &s[11], &s[12],
+ &s[13], &s[14]);
+ t += 8 * stride_hor;
+
+ d[0] = scale_filter_8(&s[0], filters0);
+ d[1] = scale_filter_8(&s[(phase_scaler + 1 * step_q4) >> 4], filters1);
+ d[2] = scale_filter_8(&s[(phase_scaler + 2 * step_q4) >> 4], filters2);
+ d[3] = scale_filter_8(&s[4], filters0);
+ d[4] =
+ scale_filter_8(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)], filters1);
+ d[5] =
+ scale_filter_8(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)], filters2);
+ vst1_u8(dst + 0 * dst_stride, d[0]);
+ vst1_u8(dst + 1 * dst_stride, d[1]);
+ vst1_u8(dst + 2 * dst_stride, d[2]);
+ vst1_u8(dst + 3 * dst_stride, d[3]);
+ vst1_u8(dst + 4 * dst_stride, d[4]);
+ vst1_u8(dst + 5 * dst_stride, d[5]);
+
+ s[0] = s[8];
+ s[1] = s[9];
+ s[2] = s[10];
+ s[3] = s[11];
+ s[4] = s[12];
+ s[5] = s[13];
+ s[6] = s[14];
+
+ dst += 6 * dst_stride;
+ y -= 6;
+ } while (y);
+ t -= stride_hor * (4 * height_ver / 3 + 7);
+ t += 8;
+ dst -= height_ver * dst_stride;
+ dst += 8;
+ x -= 8;
+ } while (x);
+}
+
+void vp9_scale_and_extend_frame_neon(const YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst,
+ INTERP_FILTER filter_type,
+ int phase_scaler) {
+ const int src_w = src->y_crop_width;
+ const int src_h = src->y_crop_height;
+ const int dst_w = dst->y_crop_width;
+ const int dst_h = dst->y_crop_height;
+ const int dst_uv_w = dst->uv_crop_width;
+ const int dst_uv_h = dst->uv_crop_height;
+ int scaled = 0;
+
+ // phase_scaler is usually 0 or 8.
+ assert(phase_scaler >= 0 && phase_scaler < 16);
+
+ if (2 * dst_w == src_w && 2 * dst_h == src_h) {
+ // 2 to 1
+ scaled = 1;
+ if (phase_scaler == 0) {
+ scale_plane_2_to_1_phase_0(src->y_buffer, src->y_stride, dst->y_buffer,
+ dst->y_stride, dst_w, dst_h);
+ scale_plane_2_to_1_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer,
+ dst->uv_stride, dst_uv_w, dst_uv_h);
+ scale_plane_2_to_1_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer,
+ dst->uv_stride, dst_uv_w, dst_uv_h);
+ } else if (filter_type == BILINEAR) {
+ const int16_t c0 = vp9_filter_kernels[BILINEAR][phase_scaler][3];
+ const int16_t c1 = vp9_filter_kernels[BILINEAR][phase_scaler][4];
+ scale_plane_2_to_1_bilinear(src->y_buffer, src->y_stride, dst->y_buffer,
+ dst->y_stride, dst_w, dst_h, c0, c1);
+ scale_plane_2_to_1_bilinear(src->u_buffer, src->uv_stride, dst->u_buffer,
+ dst->uv_stride, dst_uv_w, dst_uv_h, c0, c1);
+ scale_plane_2_to_1_bilinear(src->v_buffer, src->uv_stride, dst->v_buffer,
+ dst->uv_stride, dst_uv_w, dst_uv_h, c0, c1);
+ } else {
+ const int buffer_stride = (dst_w + 3) & ~3;
+ const int buffer_height = (2 * dst_h + SUBPEL_TAPS - 2 + 7) & ~7;
+ uint8_t *const temp_buffer =
+ (uint8_t *)malloc(buffer_stride * buffer_height);
+ if (temp_buffer) {
+ scale_plane_2_to_1_general(
+ src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w,
+ dst_h, vp9_filter_kernels[filter_type][phase_scaler], temp_buffer);
+ scale_plane_2_to_1_general(
+ src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride,
+ dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler],
+ temp_buffer);
+ scale_plane_2_to_1_general(
+ src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride,
+ dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler],
+ temp_buffer);
+ free(temp_buffer);
+ } else {
+ scaled = 0;
+ }
+ }
+ } else if (4 * dst_w == src_w && 4 * dst_h == src_h) {
+ // 4 to 1
+ scaled = 1;
+ if (phase_scaler == 0) {
+ scale_plane_4_to_1_phase_0(src->y_buffer, src->y_stride, dst->y_buffer,
+ dst->y_stride, dst_w, dst_h);
+ scale_plane_4_to_1_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer,
+ dst->uv_stride, dst_uv_w, dst_uv_h);
+ scale_plane_4_to_1_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer,
+ dst->uv_stride, dst_uv_w, dst_uv_h);
+ } else if (filter_type == BILINEAR) {
+ const int16_t c0 = vp9_filter_kernels[BILINEAR][phase_scaler][3];
+ const int16_t c1 = vp9_filter_kernels[BILINEAR][phase_scaler][4];
+ scale_plane_4_to_1_bilinear(src->y_buffer, src->y_stride, dst->y_buffer,
+ dst->y_stride, dst_w, dst_h, c0, c1);
+ scale_plane_4_to_1_bilinear(src->u_buffer, src->uv_stride, dst->u_buffer,
+ dst->uv_stride, dst_uv_w, dst_uv_h, c0, c1);
+ scale_plane_4_to_1_bilinear(src->v_buffer, src->uv_stride, dst->v_buffer,
+ dst->uv_stride, dst_uv_w, dst_uv_h, c0, c1);
+ } else {
+ const int buffer_stride = (dst_w + 1) & ~1;
+ const int buffer_height = (4 * dst_h + SUBPEL_TAPS - 2 + 7) & ~7;
+ uint8_t *const temp_buffer =
+ (uint8_t *)malloc(buffer_stride * buffer_height);
+ if (temp_buffer) {
+ scale_plane_4_to_1_general(
+ src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w,
+ dst_h, vp9_filter_kernels[filter_type][phase_scaler], temp_buffer);
+ scale_plane_4_to_1_general(
+ src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride,
+ dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler],
+ temp_buffer);
+ scale_plane_4_to_1_general(
+ src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride,
+ dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler],
+ temp_buffer);
+ free(temp_buffer);
+ } else {
+ scaled = 0;
+ }
+ }
+ } else if (4 * dst_w == 3 * src_w && 4 * dst_h == 3 * src_h) {
+ // 4 to 3
+ const int buffer_stride = (dst_w + 5) - ((dst_w + 5) % 6) + 2;
+ const int buffer_height = (4 * dst_h / 3 + SUBPEL_TAPS - 1 + 7) & ~7;
+ uint8_t *const temp_buffer =
+ (uint8_t *)malloc(buffer_stride * buffer_height);
+ if (temp_buffer) {
+ scaled = 1;
+ if (filter_type == BILINEAR) {
+ scale_plane_4_to_3_bilinear(src->y_buffer, src->y_stride, dst->y_buffer,
+ dst->y_stride, dst_w, dst_h, phase_scaler,
+ temp_buffer);
+ scale_plane_4_to_3_bilinear(src->u_buffer, src->uv_stride,
+ dst->u_buffer, dst->uv_stride, dst_uv_w,
+ dst_uv_h, phase_scaler, temp_buffer);
+ scale_plane_4_to_3_bilinear(src->v_buffer, src->uv_stride,
+ dst->v_buffer, dst->uv_stride, dst_uv_w,
+ dst_uv_h, phase_scaler, temp_buffer);
+ } else {
+ scale_plane_4_to_3_general(
+ src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w,
+ dst_h, vp9_filter_kernels[filter_type], phase_scaler, temp_buffer);
+ scale_plane_4_to_3_general(src->u_buffer, src->uv_stride, dst->u_buffer,
+ dst->uv_stride, dst_uv_w, dst_uv_h,
+ vp9_filter_kernels[filter_type],
+ phase_scaler, temp_buffer);
+ scale_plane_4_to_3_general(src->v_buffer, src->uv_stride, dst->v_buffer,
+ dst->uv_stride, dst_uv_w, dst_uv_h,
+ vp9_filter_kernels[filter_type],
+ phase_scaler, temp_buffer);
+ }
+ free(temp_buffer);
+ }
+ }
+
+ if (scaled) {
+ vpx_extend_frame_borders(dst);
+ } else {
+ // Call c version for all other scaling ratios.
+ vp9_scale_and_extend_frame_c(src, dst, filter_type, phase_scaler);
+ }
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_highbd_error_neon.c b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_highbd_error_neon.c
new file mode 100644
index 0000000000..d9b183472d
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_highbd_error_neon.c
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vp9_rtcd.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+
+int64_t vp9_highbd_block_error_neon(const tran_low_t *coeff,
+ const tran_low_t *dqcoeff,
+ intptr_t block_size, int64_t *ssz, int bd) {
+ uint64x2_t err_u64 = vdupq_n_u64(0);
+ int64x2_t ssz_s64 = vdupq_n_s64(0);
+
+ const int shift = 2 * (bd - 8);
+ const int rounding = shift > 0 ? 1 << (shift - 1) : 0;
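+  // e.g. bd == 10 gives shift == 4 and rounding == 8; bd == 8 gives shift == 0
+  // so 8-bit input is returned unscaled.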
+
+ assert(block_size >= 16);
+ assert((block_size % 16) == 0);
+
+ do {
+ const int32x4_t c = load_tran_low_to_s32q(coeff);
+ const int32x4_t d = load_tran_low_to_s32q(dqcoeff);
+
+ const uint32x4_t diff = vreinterpretq_u32_s32(vabdq_s32(c, d));
+
+ err_u64 = vmlal_u32(err_u64, vget_low_u32(diff), vget_low_u32(diff));
+ err_u64 = vmlal_u32(err_u64, vget_high_u32(diff), vget_high_u32(diff));
+
+ ssz_s64 = vmlal_s32(ssz_s64, vget_low_s32(c), vget_low_s32(c));
+ ssz_s64 = vmlal_s32(ssz_s64, vget_high_s32(c), vget_high_s32(c));
+
+ coeff += 4;
+ dqcoeff += 4;
+ block_size -= 4;
+ } while (block_size != 0);
+
+ *ssz = (horizontal_add_int64x2(ssz_s64) + rounding) >> shift;
+ return ((int64_t)horizontal_add_uint64x2(err_u64) + rounding) >> shift;
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_highbd_temporal_filter_neon.c b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_highbd_temporal_filter_neon.c
new file mode 100644
index 0000000000..c3aef3c865
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_highbd_temporal_filter_neon.c
@@ -0,0 +1,872 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <arm_neon.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_temporal_filter.h"
+#include "vp9/encoder/vp9_temporal_filter_constants.h"
+
+// Compute (a-b)**2 for 8 pixels with size 16-bit
+static INLINE void highbd_store_dist_8(const uint16_t *a, const uint16_t *b,
+ uint32_t *dst) {
+ const uint16x8_t a_reg = vld1q_u16(a);
+ const uint16x8_t b_reg = vld1q_u16(b);
+
+ uint16x8_t dist = vabdq_u16(a_reg, b_reg);
+ uint32x4_t dist_first = vmull_u16(vget_low_u16(dist), vget_low_u16(dist));
+ uint32x4_t dist_second = vmull_u16(vget_high_u16(dist), vget_high_u16(dist));
+
+ vst1q_u32(dst, dist_first);
+ vst1q_u32(dst + 4, dist_second);
+}
+
+// Sum up three neighboring distortions for the pixels
+static INLINE void highbd_get_sum_4(const uint32_t *dist, uint32x4_t *sum) {
+ uint32x4_t dist_reg, dist_left, dist_right;
+
+ dist_reg = vld1q_u32(dist);
+ dist_left = vld1q_u32(dist - 1);
+ dist_right = vld1q_u32(dist + 1);
+
+ *sum = vaddq_u32(dist_reg, dist_left);
+ *sum = vaddq_u32(*sum, dist_right);
+}
+
+static INLINE void highbd_get_sum_8(const uint32_t *dist, uint32x4_t *sum_first,
+ uint32x4_t *sum_second) {
+ highbd_get_sum_4(dist, sum_first);
+ highbd_get_sum_4(dist + 4, sum_second);
+}
+
+// Average the value based on the number of values summed (9 for pixels away
+// from the border, 4 for pixels in corners, and 6 for other edge values, plus
+// however many values from the other (y or uv) planes are added in).
+//
+// Add in the rounding factor and shift, clamp to 16, invert and shift. Multiply
+// by weight.
+static INLINE void highbd_average_4(uint32x4_t *output, const uint32x4_t sum,
+ const uint32x4_t *mul_constants,
+ const int strength, const int rounding,
+ const int weight) {
+ const int64x2_t strength_s64 = vdupq_n_s64(-strength - 32);
+ const uint64x2_t rounding_u64 = vdupq_n_u64((uint64_t)rounding << 32);
+ const uint32x4_t weight_u32 = vdupq_n_u32(weight);
+ const uint32x4_t sixteen = vdupq_n_u32(16);
+ uint32x4_t sum2;
+
+ // modifier * 3 / index;
+ uint64x2_t sum_lo =
+ vmlal_u32(rounding_u64, vget_low_u32(sum), vget_low_u32(*mul_constants));
+ uint64x2_t sum_hi = vmlal_u32(rounding_u64, vget_high_u32(sum),
+ vget_high_u32(*mul_constants));
+
+ // we cannot use vshrn_n_u64 as strength is not known at compile time.
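+  // vshlq_u64 with a negative shift count shifts right, so this is a right
+  // shift by (strength + 32); the rounding term was pre-shifted left by 32
+  // above so that it lines up with the 32x32->64-bit products.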
+ sum_lo = vshlq_u64(sum_lo, strength_s64);
+ sum_hi = vshlq_u64(sum_hi, strength_s64);
+
+ sum2 = vcombine_u32(vmovn_u64(sum_lo), vmovn_u64(sum_hi));
+
+ // Multiply with the weight
+ sum2 = vminq_u32(sum2, sixteen);
+ sum2 = vsubq_u32(sixteen, sum2);
+ *output = vmulq_u32(sum2, weight_u32);
+}
+
+static INLINE void highbd_average_8(uint32x4_t *output_0, uint32x4_t *output_1,
+ const uint32x4_t sum_0_u32,
+ const uint32x4_t sum_1_u32,
+ const uint32x4_t *mul_constants_0,
+ const uint32x4_t *mul_constants_1,
+ const int strength, const int rounding,
+ const int weight) {
+ highbd_average_4(output_0, sum_0_u32, mul_constants_0, strength, rounding,
+ weight);
+ highbd_average_4(output_1, sum_1_u32, mul_constants_1, strength, rounding,
+ weight);
+}
+
+// Add 'sum_u32' to 'count'. Multiply by 'pred' and add to 'accumulator.'
+static INLINE void highbd_accumulate_and_store_8(
+ const uint32x4_t sum_first_u32, const uint32x4_t sum_second_u32,
+ const uint16_t *pred, uint16_t *count, uint32_t *accumulator) {
+ const uint16x8_t sum_u16 =
+ vcombine_u16(vqmovn_u32(sum_first_u32), vqmovn_u32(sum_second_u32));
+ uint16x8_t pred_u16 = vld1q_u16(pred);
+ uint16x8_t count_u16 = vld1q_u16(count);
+ uint32x4_t pred_0_u32, pred_1_u32;
+ uint32x4_t accum_0_u32, accum_1_u32;
+
+ count_u16 = vqaddq_u16(count_u16, sum_u16);
+ vst1q_u16(count, count_u16);
+
+ accum_0_u32 = vld1q_u32(accumulator);
+ accum_1_u32 = vld1q_u32(accumulator + 4);
+
+ pred_0_u32 = vmovl_u16(vget_low_u16(pred_u16));
+ pred_1_u32 = vmovl_u16(vget_high_u16(pred_u16));
+
+ // Don't use sum_u16 as that produces different results to the C version
+ accum_0_u32 = vmlaq_u32(accum_0_u32, sum_first_u32, pred_0_u32);
+ accum_1_u32 = vmlaq_u32(accum_1_u32, sum_second_u32, pred_1_u32);
+
+ vst1q_u32(accumulator, accum_0_u32);
+ vst1q_u32(accumulator + 4, accum_1_u32);
+}
+
+static INLINE void highbd_read_dist_4(const uint32_t *dist,
+ uint32x4_t *dist_reg) {
+ *dist_reg = vld1q_u32(dist);
+}
+
+static INLINE void highbd_read_dist_8(const uint32_t *dist,
+ uint32x4_t *reg_first,
+ uint32x4_t *reg_second) {
+ highbd_read_dist_4(dist, reg_first);
+ highbd_read_dist_4(dist + 4, reg_second);
+}
+
+static INLINE void highbd_read_chroma_dist_row_8(
+ int ss_x, const uint32_t *u_dist, const uint32_t *v_dist,
+ uint32x4_t *u_first, uint32x4_t *u_second, uint32x4_t *v_first,
+ uint32x4_t *v_second) {
+ if (!ss_x) {
+ // If there is no chroma subsampling in the horizontal direction, then we
+ // need to load 8 entries from chroma.
+ highbd_read_dist_8(u_dist, u_first, u_second);
+ highbd_read_dist_8(v_dist, v_first, v_second);
+ } else { // ss_x == 1
+    // Otherwise, we only need to load 4 entries and duplicate each of them
+    // horizontally.
+ uint32x4_t u_reg, v_reg;
+ uint32x4x2_t pair;
+
+ highbd_read_dist_4(u_dist, &u_reg);
+
+ pair = vzipq_u32(u_reg, u_reg);
+ *u_first = pair.val[0];
+ *u_second = pair.val[1];
+
+ highbd_read_dist_4(v_dist, &v_reg);
+
+ pair = vzipq_u32(v_reg, v_reg);
+ *v_first = pair.val[0];
+ *v_second = pair.val[1];
+ }
+}
+
+static void highbd_apply_temporal_filter_luma_8(
+ const uint16_t *y_pre, int y_pre_stride, unsigned int block_width,
+ unsigned int block_height, int ss_x, int ss_y, int strength,
+ int use_whole_blk, uint32_t *y_accum, uint16_t *y_count,
+ const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist,
+ const uint32_t *const *neighbors_first,
+ const uint32_t *const *neighbors_second, int top_weight,
+ int bottom_weight) {
+ const int rounding = (1 << strength) >> 1;
+ int weight = top_weight;
+
+ uint32x4_t mul_first, mul_second;
+
+ uint32x4_t sum_row_1_first, sum_row_1_second;
+ uint32x4_t sum_row_2_first, sum_row_2_second;
+ uint32x4_t sum_row_3_first, sum_row_3_second;
+
+ uint32x4_t u_first, u_second;
+ uint32x4_t v_first, v_second;
+
+ uint32x4_t sum_row_first;
+ uint32x4_t sum_row_second;
+
+ // Loop variables
+ unsigned int h;
+
+ assert(strength >= 4 && strength <= 14 &&
+ "invalid adjusted temporal filter strength");
+ assert(block_width == 8);
+
+ (void)block_width;
+
+ // First row
+ mul_first = vld1q_u32(neighbors_first[0]);
+ mul_second = vld1q_u32(neighbors_second[0]);
+
+ // Add luma values
+ highbd_get_sum_8(y_dist, &sum_row_2_first, &sum_row_2_second);
+ highbd_get_sum_8(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second);
+
+ // We don't need to saturate here because the maximum value is UINT12_MAX ** 2
+ // * 9 ~= 2**24 * 9 < 2 ** 28 < INT32_MAX
+ sum_row_first = vaddq_u32(sum_row_2_first, sum_row_3_first);
+ sum_row_second = vaddq_u32(sum_row_2_second, sum_row_3_second);
+
+ // Add chroma values
+ highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second,
+ &v_first, &v_second);
+
+ // Max value here is 2 ** 24 * (9 + 2), so no saturation is needed
+ sum_row_first = vaddq_u32(sum_row_first, u_first);
+ sum_row_second = vaddq_u32(sum_row_second, u_second);
+
+ sum_row_first = vaddq_u32(sum_row_first, v_first);
+ sum_row_second = vaddq_u32(sum_row_second, v_second);
+
+ // Get modifier and store result
+ highbd_average_8(&sum_row_first, &sum_row_second, sum_row_first,
+ sum_row_second, &mul_first, &mul_second, strength, rounding,
+ weight);
+
+ highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count,
+ y_accum);
+
+ y_pre += y_pre_stride;
+ y_count += y_pre_stride;
+ y_accum += y_pre_stride;
+ y_dist += DIST_STRIDE;
+
+ u_dist += DIST_STRIDE;
+ v_dist += DIST_STRIDE;
+
+ // Then all the rows except the last one
+ mul_first = vld1q_u32(neighbors_first[1]);
+ mul_second = vld1q_u32(neighbors_second[1]);
+
+ for (h = 1; h < block_height - 1; ++h) {
+ // Move the weight to bottom half
+ if (!use_whole_blk && h == block_height / 2) {
+ weight = bottom_weight;
+ }
+ // Shift the rows up
+ sum_row_1_first = sum_row_2_first;
+ sum_row_1_second = sum_row_2_second;
+ sum_row_2_first = sum_row_3_first;
+ sum_row_2_second = sum_row_3_second;
+
+ // Add luma values to the modifier
+ sum_row_first = vaddq_u32(sum_row_1_first, sum_row_2_first);
+ sum_row_second = vaddq_u32(sum_row_1_second, sum_row_2_second);
+
+ highbd_get_sum_8(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second);
+
+ sum_row_first = vaddq_u32(sum_row_first, sum_row_3_first);
+ sum_row_second = vaddq_u32(sum_row_second, sum_row_3_second);
+
+ // Add chroma values to the modifier
+ if (ss_y == 0 || h % 2 == 0) {
+ // Only calculate the new chroma distortion if we are at a pixel that
+ // corresponds to a new chroma row
+ highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second,
+ &v_first, &v_second);
+
+ u_dist += DIST_STRIDE;
+ v_dist += DIST_STRIDE;
+ }
+
+ sum_row_first = vaddq_u32(sum_row_first, u_first);
+ sum_row_second = vaddq_u32(sum_row_second, u_second);
+ sum_row_first = vaddq_u32(sum_row_first, v_first);
+ sum_row_second = vaddq_u32(sum_row_second, v_second);
+
+ // Get modifier and store result
+ highbd_average_8(&sum_row_first, &sum_row_second, sum_row_first,
+ sum_row_second, &mul_first, &mul_second, strength,
+ rounding, weight);
+ highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count,
+ y_accum);
+
+ y_pre += y_pre_stride;
+ y_count += y_pre_stride;
+ y_accum += y_pre_stride;
+ y_dist += DIST_STRIDE;
+ }
+
+ // The last row
+ mul_first = vld1q_u32(neighbors_first[0]);
+ mul_second = vld1q_u32(neighbors_second[0]);
+
+ // Shift the rows up
+ sum_row_1_first = sum_row_2_first;
+ sum_row_1_second = sum_row_2_second;
+ sum_row_2_first = sum_row_3_first;
+ sum_row_2_second = sum_row_3_second;
+
+ // Add luma values to the modifier
+ sum_row_first = vaddq_u32(sum_row_1_first, sum_row_2_first);
+ sum_row_second = vaddq_u32(sum_row_1_second, sum_row_2_second);
+
+ // Add chroma values to the modifier
+ if (ss_y == 0) {
+ // Only calculate the new chroma distortion if we are at a pixel that
+ // corresponds to a new chroma row
+ highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second,
+ &v_first, &v_second);
+ }
+
+ sum_row_first = vaddq_u32(sum_row_first, u_first);
+ sum_row_second = vaddq_u32(sum_row_second, u_second);
+ sum_row_first = vaddq_u32(sum_row_first, v_first);
+ sum_row_second = vaddq_u32(sum_row_second, v_second);
+
+ // Get modifier and store result
+ highbd_average_8(&sum_row_first, &sum_row_second, sum_row_first,
+ sum_row_second, &mul_first, &mul_second, strength, rounding,
+ weight);
+ highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count,
+ y_accum);
+}
+
+// Perform temporal filter for the luma component.
+static void highbd_apply_temporal_filter_luma(
+ const uint16_t *y_pre, int y_pre_stride, unsigned int block_width,
+ unsigned int block_height, int ss_x, int ss_y, int strength,
+ const int *blk_fw, int use_whole_blk, uint32_t *y_accum, uint16_t *y_count,
+ const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist) {
+ unsigned int blk_col = 0, uv_blk_col = 0;
+ const unsigned int blk_col_step = 8, uv_blk_col_step = 8 >> ss_x;
+ const unsigned int mid_width = block_width >> 1,
+ last_width = block_width - blk_col_step;
+ int top_weight = blk_fw[0],
+ bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2];
+ const uint32_t *const *neighbors_first;
+ const uint32_t *const *neighbors_second;
+
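+  // Process the block in 8-pixel-wide columns. The left-most and right-most
+  // columns use different neighbor-count tables because border pixels have
+  // fewer neighboring distortions contributing to the sum (see the averaging
+  // note above).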
+ // Left
+ neighbors_first = HIGHBD_LUMA_LEFT_COLUMN_NEIGHBORS;
+ neighbors_second = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS;
+ highbd_apply_temporal_filter_luma_8(
+ y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y,
+ strength, use_whole_blk, y_accum + blk_col, y_count + blk_col,
+ y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+ neighbors_first, neighbors_second, top_weight, bottom_weight);
+
+ blk_col += blk_col_step;
+ uv_blk_col += uv_blk_col_step;
+
+ // Middle First
+ neighbors_first = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS;
+ for (; blk_col < mid_width;
+ blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+ highbd_apply_temporal_filter_luma_8(
+ y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y,
+ strength, use_whole_blk, y_accum + blk_col, y_count + blk_col,
+ y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+ neighbors_first, neighbors_second, top_weight, bottom_weight);
+ }
+
+ if (!use_whole_blk) {
+ top_weight = blk_fw[1];
+ bottom_weight = blk_fw[3];
+ }
+
+ // Middle Second
+ for (; blk_col < last_width;
+ blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+ highbd_apply_temporal_filter_luma_8(
+ y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y,
+ strength, use_whole_blk, y_accum + blk_col, y_count + blk_col,
+ y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+ neighbors_first, neighbors_second, top_weight, bottom_weight);
+ }
+
+ // Right
+ neighbors_second = HIGHBD_LUMA_RIGHT_COLUMN_NEIGHBORS;
+ highbd_apply_temporal_filter_luma_8(
+ y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y,
+ strength, use_whole_blk, y_accum + blk_col, y_count + blk_col,
+ y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+ neighbors_first, neighbors_second, top_weight, bottom_weight);
+}
+
+// Add a row of luma distortion that corresponds to 8 chroma mods. If we are
+// subsampling in the x direction, then we have 16 luma values, else we have 8.
+static INLINE void highbd_add_luma_dist_to_8_chroma_mod(
+ const uint32_t *y_dist, int ss_x, int ss_y, uint32x4_t *u_mod_fst,
+ uint32x4_t *u_mod_snd, uint32x4_t *v_mod_fst, uint32x4_t *v_mod_snd) {
+ uint32x4_t y_reg_fst, y_reg_snd;
+ if (!ss_x) {
+ highbd_read_dist_8(y_dist, &y_reg_fst, &y_reg_snd);
+ if (ss_y == 1) {
+ uint32x4_t y_tmp_fst, y_tmp_snd;
+ highbd_read_dist_8(y_dist + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd);
+ y_reg_fst = vaddq_u32(y_reg_fst, y_tmp_fst);
+ y_reg_snd = vaddq_u32(y_reg_snd, y_tmp_snd);
+ }
+ } else {
+ // Temporary
+ uint32x4_t y_fst, y_snd;
+ uint64x2_t y_fst64, y_snd64;
+
+ // First 8
+ highbd_read_dist_8(y_dist, &y_fst, &y_snd);
+ if (ss_y == 1) {
+ uint32x4_t y_tmp_fst, y_tmp_snd;
+ highbd_read_dist_8(y_dist + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd);
+
+ y_fst = vaddq_u32(y_fst, y_tmp_fst);
+ y_snd = vaddq_u32(y_snd, y_tmp_snd);
+ }
+
+ y_fst64 = vpaddlq_u32(y_fst);
+ y_snd64 = vpaddlq_u32(y_snd);
+ y_reg_fst = vcombine_u32(vqmovn_u64(y_fst64), vqmovn_u64(y_snd64));
+
+ // Second 8
+ highbd_read_dist_8(y_dist + 8, &y_fst, &y_snd);
+ if (ss_y == 1) {
+ uint32x4_t y_tmp_fst, y_tmp_snd;
+ highbd_read_dist_8(y_dist + 8 + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd);
+
+ y_fst = vaddq_u32(y_fst, y_tmp_fst);
+ y_snd = vaddq_u32(y_snd, y_tmp_snd);
+ }
+
+ y_fst64 = vpaddlq_u32(y_fst);
+ y_snd64 = vpaddlq_u32(y_snd);
+ y_reg_snd = vcombine_u32(vqmovn_u64(y_fst64), vqmovn_u64(y_snd64));
+ }
+
+ *u_mod_fst = vaddq_u32(*u_mod_fst, y_reg_fst);
+ *u_mod_snd = vaddq_u32(*u_mod_snd, y_reg_snd);
+ *v_mod_fst = vaddq_u32(*v_mod_fst, y_reg_fst);
+ *v_mod_snd = vaddq_u32(*v_mod_snd, y_reg_snd);
+}
+
+// Apply temporal filter to the chroma components. This performs temporal
+// filtering on a chroma block of 8 X uv_height. If blk_fw is not NULL, use
+// blk_fw as an array of size 4 for the weights for each of the 4 subblocks,
+// else use top_weight for top half, and bottom weight for bottom half.
+static void highbd_apply_temporal_filter_chroma_8(
+ const uint16_t *u_pre, const uint16_t *v_pre, int uv_pre_stride,
+ unsigned int uv_block_width, unsigned int uv_block_height, int ss_x,
+ int ss_y, int strength, uint32_t *u_accum, uint16_t *u_count,
+ uint32_t *v_accum, uint16_t *v_count, const uint32_t *y_dist,
+ const uint32_t *u_dist, const uint32_t *v_dist,
+ const uint32_t *const *neighbors_fst, const uint32_t *const *neighbors_snd,
+ int top_weight, int bottom_weight, const int *blk_fw) {
+ const int rounding = (1 << strength) >> 1;
+ int weight = top_weight;
+
+ uint32x4_t mul_fst, mul_snd;
+
+ uint32x4_t u_sum_row_1_fst, u_sum_row_2_fst, u_sum_row_3_fst;
+ uint32x4_t v_sum_row_1_fst, v_sum_row_2_fst, v_sum_row_3_fst;
+ uint32x4_t u_sum_row_1_snd, u_sum_row_2_snd, u_sum_row_3_snd;
+ uint32x4_t v_sum_row_1_snd, v_sum_row_2_snd, v_sum_row_3_snd;
+
+ uint32x4_t u_sum_row_fst, v_sum_row_fst;
+ uint32x4_t u_sum_row_snd, v_sum_row_snd;
+
+ // Loop variable
+ unsigned int h;
+
+ (void)uv_block_width;
+
+ // First row
+ mul_fst = vld1q_u32(neighbors_fst[0]);
+ mul_snd = vld1q_u32(neighbors_snd[0]);
+
+ // Add chroma values
+ highbd_get_sum_8(u_dist, &u_sum_row_2_fst, &u_sum_row_2_snd);
+ highbd_get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3_fst, &u_sum_row_3_snd);
+
+ u_sum_row_fst = vaddq_u32(u_sum_row_2_fst, u_sum_row_3_fst);
+ u_sum_row_snd = vaddq_u32(u_sum_row_2_snd, u_sum_row_3_snd);
+
+ highbd_get_sum_8(v_dist, &v_sum_row_2_fst, &v_sum_row_2_snd);
+ highbd_get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3_fst, &v_sum_row_3_snd);
+
+ v_sum_row_fst = vaddq_u32(v_sum_row_2_fst, v_sum_row_3_fst);
+ v_sum_row_snd = vaddq_u32(v_sum_row_2_snd, v_sum_row_3_snd);
+
+ // Add luma values
+ highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst,
+ &u_sum_row_snd, &v_sum_row_fst,
+ &v_sum_row_snd);
+
+ // Get modifier and store result
+ if (blk_fw) {
+ highbd_average_4(&u_sum_row_fst, u_sum_row_fst, &mul_fst, strength,
+ rounding, blk_fw[0]);
+ highbd_average_4(&u_sum_row_snd, u_sum_row_snd, &mul_snd, strength,
+ rounding, blk_fw[1]);
+
+ highbd_average_4(&v_sum_row_fst, v_sum_row_fst, &mul_fst, strength,
+ rounding, blk_fw[0]);
+ highbd_average_4(&v_sum_row_snd, v_sum_row_snd, &mul_snd, strength,
+ rounding, blk_fw[1]);
+
+ } else {
+ highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, u_sum_row_fst,
+ u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+ weight);
+ highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, v_sum_row_fst,
+ v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+ weight);
+ }
+ highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count,
+ u_accum);
+ highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count,
+ v_accum);
+
+ u_pre += uv_pre_stride;
+ u_dist += DIST_STRIDE;
+ v_pre += uv_pre_stride;
+ v_dist += DIST_STRIDE;
+ u_count += uv_pre_stride;
+ u_accum += uv_pre_stride;
+ v_count += uv_pre_stride;
+ v_accum += uv_pre_stride;
+
+ y_dist += DIST_STRIDE * (1 + ss_y);
+
+ // Then all the rows except the last one
+ mul_fst = vld1q_u32(neighbors_fst[1]);
+ mul_snd = vld1q_u32(neighbors_snd[1]);
+
+ for (h = 1; h < uv_block_height - 1; ++h) {
+ // Move the weight pointer to the bottom half of the blocks
+ if (h == uv_block_height / 2) {
+ if (blk_fw) {
+ blk_fw += 2;
+ } else {
+ weight = bottom_weight;
+ }
+ }
+
+ // Shift the rows up
+ u_sum_row_1_fst = u_sum_row_2_fst;
+ u_sum_row_2_fst = u_sum_row_3_fst;
+ u_sum_row_1_snd = u_sum_row_2_snd;
+ u_sum_row_2_snd = u_sum_row_3_snd;
+
+ v_sum_row_1_fst = v_sum_row_2_fst;
+ v_sum_row_2_fst = v_sum_row_3_fst;
+ v_sum_row_1_snd = v_sum_row_2_snd;
+ v_sum_row_2_snd = v_sum_row_3_snd;
+
+ // Add chroma values
+ u_sum_row_fst = vaddq_u32(u_sum_row_1_fst, u_sum_row_2_fst);
+ u_sum_row_snd = vaddq_u32(u_sum_row_1_snd, u_sum_row_2_snd);
+ highbd_get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3_fst, &u_sum_row_3_snd);
+ u_sum_row_fst = vaddq_u32(u_sum_row_fst, u_sum_row_3_fst);
+ u_sum_row_snd = vaddq_u32(u_sum_row_snd, u_sum_row_3_snd);
+
+ v_sum_row_fst = vaddq_u32(v_sum_row_1_fst, v_sum_row_2_fst);
+ v_sum_row_snd = vaddq_u32(v_sum_row_1_snd, v_sum_row_2_snd);
+ highbd_get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3_fst, &v_sum_row_3_snd);
+ v_sum_row_fst = vaddq_u32(v_sum_row_fst, v_sum_row_3_fst);
+ v_sum_row_snd = vaddq_u32(v_sum_row_snd, v_sum_row_3_snd);
+
+ // Add luma values
+ highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst,
+ &u_sum_row_snd, &v_sum_row_fst,
+ &v_sum_row_snd);
+
+ // Get modifier and store result
+ if (blk_fw) {
+ highbd_average_4(&u_sum_row_fst, u_sum_row_fst, &mul_fst, strength,
+ rounding, blk_fw[0]);
+ highbd_average_4(&u_sum_row_snd, u_sum_row_snd, &mul_snd, strength,
+ rounding, blk_fw[1]);
+
+ highbd_average_4(&v_sum_row_fst, v_sum_row_fst, &mul_fst, strength,
+ rounding, blk_fw[0]);
+ highbd_average_4(&v_sum_row_snd, v_sum_row_snd, &mul_snd, strength,
+ rounding, blk_fw[1]);
+
+ } else {
+ highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, u_sum_row_fst,
+ u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+ weight);
+ highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, v_sum_row_fst,
+ v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+ weight);
+ }
+
+ highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count,
+ u_accum);
+ highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count,
+ v_accum);
+
+ u_pre += uv_pre_stride;
+ u_dist += DIST_STRIDE;
+ v_pre += uv_pre_stride;
+ v_dist += DIST_STRIDE;
+ u_count += uv_pre_stride;
+ u_accum += uv_pre_stride;
+ v_count += uv_pre_stride;
+ v_accum += uv_pre_stride;
+
+ y_dist += DIST_STRIDE * (1 + ss_y);
+ }
+
+ // The last row
+ mul_fst = vld1q_u32(neighbors_fst[0]);
+ mul_snd = vld1q_u32(neighbors_snd[0]);
+
+ // Shift the rows up
+ u_sum_row_1_fst = u_sum_row_2_fst;
+ u_sum_row_2_fst = u_sum_row_3_fst;
+ u_sum_row_1_snd = u_sum_row_2_snd;
+ u_sum_row_2_snd = u_sum_row_3_snd;
+
+ v_sum_row_1_fst = v_sum_row_2_fst;
+ v_sum_row_2_fst = v_sum_row_3_fst;
+ v_sum_row_1_snd = v_sum_row_2_snd;
+ v_sum_row_2_snd = v_sum_row_3_snd;
+
+ // Add chroma values
+ u_sum_row_fst = vaddq_u32(u_sum_row_1_fst, u_sum_row_2_fst);
+ v_sum_row_fst = vaddq_u32(v_sum_row_1_fst, v_sum_row_2_fst);
+ u_sum_row_snd = vaddq_u32(u_sum_row_1_snd, u_sum_row_2_snd);
+ v_sum_row_snd = vaddq_u32(v_sum_row_1_snd, v_sum_row_2_snd);
+
+ // Add luma values
+ highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst,
+ &u_sum_row_snd, &v_sum_row_fst,
+ &v_sum_row_snd);
+
+ // Get modifier and store result
+ if (blk_fw) {
+ highbd_average_4(&u_sum_row_fst, u_sum_row_fst, &mul_fst, strength,
+ rounding, blk_fw[0]);
+ highbd_average_4(&u_sum_row_snd, u_sum_row_snd, &mul_snd, strength,
+ rounding, blk_fw[1]);
+
+ highbd_average_4(&v_sum_row_fst, v_sum_row_fst, &mul_fst, strength,
+ rounding, blk_fw[0]);
+ highbd_average_4(&v_sum_row_snd, v_sum_row_snd, &mul_snd, strength,
+ rounding, blk_fw[1]);
+
+ } else {
+ highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, u_sum_row_fst,
+ u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+ weight);
+ highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, v_sum_row_fst,
+ v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+ weight);
+ }
+
+ highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count,
+ u_accum);
+ highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count,
+ v_accum);
+}
+
+// Perform temporal filter for the chroma components.
+static void highbd_apply_temporal_filter_chroma(
+ const uint16_t *u_pre, const uint16_t *v_pre, int uv_pre_stride,
+ unsigned int block_width, unsigned int block_height, int ss_x, int ss_y,
+ int strength, const int *blk_fw, int use_whole_blk, uint32_t *u_accum,
+ uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
+ const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist) {
+ const unsigned int uv_width = block_width >> ss_x,
+ uv_height = block_height >> ss_y;
+
+ unsigned int blk_col = 0, uv_blk_col = 0;
+ const unsigned int uv_blk_col_step = 8, blk_col_step = 8 << ss_x;
+ const unsigned int uv_mid_width = uv_width >> 1,
+ uv_last_width = uv_width - uv_blk_col_step;
+ int top_weight = blk_fw[0],
+ bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2];
+ const uint32_t *const *neighbors_fst;
+ const uint32_t *const *neighbors_snd;
+
+ if (uv_width == 8) {
+ // Special Case: We are subsampling in x direction on a 16x16 block. Since
+ // we are operating on a row of 8 chroma pixels, we can't use the usual
+ // left-middle-right pattern.
+ assert(ss_x);
+
+ if (ss_y) {
+ neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS;
+ neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS;
+ } else {
+ neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS;
+ neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS;
+ }
+
+ if (use_whole_blk) {
+ highbd_apply_temporal_filter_chroma_8(
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+ uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+ u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+ y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+ neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL);
+ } else {
+ highbd_apply_temporal_filter_chroma_8(
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+ uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+ u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+ y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+ neighbors_fst, neighbors_snd, 0, 0, blk_fw);
+ }
+
+ return;
+ }
+
+ // Left
+ if (ss_x && ss_y) {
+ neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS;
+ neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+ } else if (ss_x || ss_y) {
+ neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS;
+ neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+ } else {
+ neighbors_fst = HIGHBD_CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS;
+ neighbors_snd = HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS;
+ }
+
+ highbd_apply_temporal_filter_chroma_8(
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+ uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+ u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+ y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst,
+ neighbors_snd, top_weight, bottom_weight, NULL);
+
+ blk_col += blk_col_step;
+ uv_blk_col += uv_blk_col_step;
+
+ // Middle First
+ if (ss_x && ss_y) {
+ neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+ } else if (ss_x || ss_y) {
+ neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+ } else {
+ neighbors_fst = HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS;
+ }
+
+ for (; uv_blk_col < uv_mid_width;
+ blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+ highbd_apply_temporal_filter_chroma_8(
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+ uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+ u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+ y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+ neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL);
+ }
+
+ if (!use_whole_blk) {
+ top_weight = blk_fw[1];
+ bottom_weight = blk_fw[3];
+ }
+
+ // Middle Second
+ for (; uv_blk_col < uv_last_width;
+ blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+ highbd_apply_temporal_filter_chroma_8(
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+ uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+ u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+ y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+ neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL);
+ }
+
+ // Right
+ if (ss_x && ss_y) {
+ neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS;
+ } else if (ss_x || ss_y) {
+ neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS;
+ } else {
+ neighbors_snd = HIGHBD_CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS;
+ }
+
+ highbd_apply_temporal_filter_chroma_8(
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+ uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+ u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+ y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst,
+ neighbors_snd, top_weight, bottom_weight, NULL);
+}
+
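+// Entry point for the high-bit-depth temporal filter: the squared Y/U/V
+// differences are precomputed into padded dist buffers, then the luma and
+// chroma planes are filtered and accumulated into the *_accum / *_count
+// buffers.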
+void vp9_highbd_apply_temporal_filter_neon(
+ const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,
+ int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,
+ int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre,
+ int uv_pre_stride, unsigned int block_width, unsigned int block_height,
+ int ss_x, int ss_y, int strength, const int *const blk_fw,
+ int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum,
+ uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count) {
+ const unsigned int chroma_height = block_height >> ss_y,
+ chroma_width = block_width >> ss_x;
+
+ DECLARE_ALIGNED(16, uint32_t, y_dist[BH * DIST_STRIDE]) = { 0 };
+ DECLARE_ALIGNED(16, uint32_t, u_dist[BH * DIST_STRIDE]) = { 0 };
+ DECLARE_ALIGNED(16, uint32_t, v_dist[BH * DIST_STRIDE]) = { 0 };
+
+ uint32_t *y_dist_ptr = y_dist + 1, *u_dist_ptr = u_dist + 1,
+ *v_dist_ptr = v_dist + 1;
+ const uint16_t *y_src_ptr = y_src, *u_src_ptr = u_src, *v_src_ptr = v_src;
+ const uint16_t *y_pre_ptr = y_pre, *u_pre_ptr = u_pre, *v_pre_ptr = v_pre;
+
+ // Loop variables
+ unsigned int row, blk_col;
+
+ assert(block_width <= BW && "block width too large");
+ assert(block_height <= BH && "block height too large");
+ assert(block_width % 16 == 0 && "block width must be multiple of 16");
+ assert(block_height % 2 == 0 && "block height must be even");
+ assert((ss_x == 0 || ss_x == 1) && (ss_y == 0 || ss_y == 1) &&
+ "invalid chroma subsampling");
+ assert(strength >= 4 && strength <= 14 &&
+ "invalid adjusted temporal filter strength");
+  assert(blk_fw[0] >= 0 && "filter weight must be non-negative");
+  assert(
+      (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) &&
+      "subblock filter weight must be non-negative");
+  assert(blk_fw[0] <= 2 && "filter weight must not exceed 2");
+  assert(
+      (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) &&
+      "subblock filter weight must not exceed 2");
+
+ // Precompute the difference squared
+ for (row = 0; row < block_height; row++) {
+ for (blk_col = 0; blk_col < block_width; blk_col += 8) {
+ highbd_store_dist_8(y_src_ptr + blk_col, y_pre_ptr + blk_col,
+ y_dist_ptr + blk_col);
+ }
+ y_src_ptr += y_src_stride;
+ y_pre_ptr += y_pre_stride;
+ y_dist_ptr += DIST_STRIDE;
+ }
+
+ for (row = 0; row < chroma_height; row++) {
+ for (blk_col = 0; blk_col < chroma_width; blk_col += 8) {
+ highbd_store_dist_8(u_src_ptr + blk_col, u_pre_ptr + blk_col,
+ u_dist_ptr + blk_col);
+ highbd_store_dist_8(v_src_ptr + blk_col, v_pre_ptr + blk_col,
+ v_dist_ptr + blk_col);
+ }
+
+ u_src_ptr += uv_src_stride;
+ u_pre_ptr += uv_pre_stride;
+ u_dist_ptr += DIST_STRIDE;
+ v_src_ptr += uv_src_stride;
+ v_pre_ptr += uv_pre_stride;
+ v_dist_ptr += DIST_STRIDE;
+ }
+
+ y_dist_ptr = y_dist + 1;
+ u_dist_ptr = u_dist + 1;
+ v_dist_ptr = v_dist + 1;
+
+ highbd_apply_temporal_filter_luma(y_pre, y_pre_stride, block_width,
+ block_height, ss_x, ss_y, strength, blk_fw,
+ use_whole_blk, y_accum, y_count, y_dist_ptr,
+ u_dist_ptr, v_dist_ptr);
+
+ highbd_apply_temporal_filter_chroma(
+ u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y,
+ strength, blk_fw, use_whole_blk, u_accum, u_count, v_accum, v_count,
+ y_dist_ptr, u_dist_ptr, v_dist_ptr);
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c
new file mode 100644
index 0000000000..97ab13628e
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c
@@ -0,0 +1,408 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <math.h>
+
+#include "./vpx_config.h"
+#include "vpx_mem/vpx_mem.h"
+
+#include "vp9/common/vp9_quant_common.h"
+#include "vp9/common/vp9_seg_common.h"
+
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_quantize.h"
+#include "vp9/encoder/vp9_rd.h"
+
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
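+// Compute dqcoeff = qcoeff * dequant in 32 bits; store the 32-bit products
+// directly for the high-bit-depth build and narrow them to 16 bits otherwise.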
+static VPX_FORCE_INLINE void calculate_dqcoeff_and_store(
+ const int16x8_t qcoeff, const int16x8_t dequant, tran_low_t *dqcoeff) {
+ const int32x4_t dqcoeff_0 =
+ vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant));
+ const int32x4_t dqcoeff_1 =
+ vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant));
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ vst1q_s32(dqcoeff, dqcoeff_0);
+ vst1q_s32(dqcoeff + 4, dqcoeff_1);
+#else
+ vst1q_s16(dqcoeff, vcombine_s16(vmovn_s32(dqcoeff_0), vmovn_s32(dqcoeff_1)));
+#endif // CONFIG_VP9_HIGHBITDEPTH
+}
+
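+// Update the running per-lane maximum of inverse-scan positions. Lanes whose
+// coefficient quantized to zero (v_nz_mask set) contribute 0 instead of their
+// scan position.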
+static VPX_FORCE_INLINE int16x8_t get_max_lane_eob(const int16_t *iscan_ptr,
+ int16x8_t v_eobmax,
+ uint16x8_t v_nz_mask) {
+ const int16x8_t v_iscan = vld1q_s16(&iscan_ptr[0]);
+ const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, vdupq_n_s16(0), v_iscan);
+ return vmaxq_s16(v_eobmax, v_nz_iscan);
+}
+
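+// Reduce the 8 per-lane maxima to a single eob value. AArch64 has a
+// horizontal max instruction; on 32-bit Arm the vector is folded with
+// repeated vmax and shift operations.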
+static VPX_FORCE_INLINE uint16_t get_max_eob(int16x8_t v_eobmax) {
+#if VPX_ARCH_AARCH64
+ return (uint16_t)vmaxvq_s16(v_eobmax);
+#else
+ const int16x4_t v_eobmax_3210 =
+ vmax_s16(vget_low_s16(v_eobmax), vget_high_s16(v_eobmax));
+ const int64x1_t v_eobmax_xx32 =
+ vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32);
+ const int16x4_t v_eobmax_tmp =
+ vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32));
+ const int64x1_t v_eobmax_xxx3 =
+ vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
+ const int16x4_t v_eobmax_final =
+ vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
+
+ return (uint16_t)vget_lane_s16(v_eobmax_final, 0);
+#endif // VPX_ARCH_AARCH64
+}
+
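+// Load the 8-entry round/quant/dequant tables. Element 0 holds the DC
+// constant and elements 1-7 hold the AC constant.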
+static VPX_FORCE_INLINE void load_fp_values(const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *dequant_ptr,
+ int16x8_t *round, int16x8_t *quant,
+ int16x8_t *dequant) {
+ *round = vld1q_s16(round_ptr);
+ *quant = vld1q_s16(quant_ptr);
+ *dequant = vld1q_s16(dequant_ptr);
+}
+
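+// Broadcast the AC constant (lane 1) to every lane once the DC coefficient
+// has been processed.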
+static VPX_FORCE_INLINE void update_fp_values(int16x8_t *v_round,
+ int16x8_t *v_quant,
+ int16x8_t *v_dequant) {
+#if VPX_ARCH_AARCH64
+ *v_round = vdupq_laneq_s16(*v_round, 1);
+ *v_quant = vdupq_laneq_s16(*v_quant, 1);
+ *v_dequant = vdupq_laneq_s16(*v_dequant, 1);
+#else
+ *v_round = vdupq_lane_s16(vget_low_s16(*v_round), 1);
+ *v_quant = vdupq_lane_s16(vget_low_s16(*v_quant), 1);
+ *v_dequant = vdupq_lane_s16(vget_low_s16(*v_dequant), 1);
+#endif
+}
+
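+// Quantize 8 coefficients with the fast-path rule
+// qcoeff = ((|coeff| + round) * quant) >> 16, restore the sign, store
+// qcoeff/dqcoeff and update the running eob maximum.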
+static VPX_FORCE_INLINE void quantize_fp_8(
+ const int16x8_t *v_round, const int16x8_t *v_quant,
+ const int16x8_t *v_dequant, const tran_low_t *coeff_ptr,
+ const int16_t *iscan_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ int16x8_t *v_eobmax) {
+ const int16x8_t v_zero = vdupq_n_s16(0);
+ const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr);
+ const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+ const int16x8_t v_abs = vabsq_s16(v_coeff);
+ const int16x8_t v_tmp = vqaddq_s16(v_abs, *v_round);
+ const int32x4_t v_tmp_lo =
+ vmull_s16(vget_low_s16(v_tmp), vget_low_s16(*v_quant));
+ const int32x4_t v_tmp_hi =
+ vmull_s16(vget_high_s16(v_tmp), vget_high_s16(*v_quant));
+ const int16x8_t v_tmp2 =
+ vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16));
+ const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);
+ const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
+ const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
+ calculate_dqcoeff_and_store(v_qcoeff, *v_dequant, dqcoeff_ptr);
+ store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff);
+
+ *v_eobmax = get_max_lane_eob(iscan_ptr, *v_eobmax, v_nz_mask);
+}
+
+void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+  // Quantization pass: the first block of 8 uses the DC constant in lane 0
+  // and the AC constant in the remaining lanes; all later blocks use only the
+  // AC constants.
+ int i;
+ int16x8_t v_eobmax = vdupq_n_s16(-1);
+ int16x8_t v_round, v_quant, v_dequant;
+ (void)scan;
+
+ load_fp_values(round_ptr, quant_ptr, dequant_ptr, &v_round, &v_quant,
+ &v_dequant);
+ // process dc and the first seven ac coeffs
+ quantize_fp_8(&v_round, &v_quant, &v_dequant, coeff_ptr, iscan, qcoeff_ptr,
+ dqcoeff_ptr, &v_eobmax);
+
+ // now process the rest of the ac coeffs
+ update_fp_values(&v_round, &v_quant, &v_dequant);
+ for (i = 8; i < count; i += 8) {
+ quantize_fp_8(&v_round, &v_quant, &v_dequant, coeff_ptr + i, iscan + i,
+ qcoeff_ptr + i, dqcoeff_ptr + i, &v_eobmax);
+ }
+
+ *eob_ptr = get_max_eob(v_eobmax);
+}
+
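+// Extract the sign bit of each lane as 0 or 1. Adding it to a negative
+// product makes the subsequent arithmetic right shift round toward zero.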
+static INLINE int32x4_t extract_sign_bit(int32x4_t a) {
+ return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 31));
+}
+
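+// Quantize 8 coefficients for the 32x32 transform: coefficients below the
+// dequant >> 2 threshold are zeroed, vqdmulhq_s16 implements
+// (tmp * quant) >> 15, and dqcoeff is halved (rounding toward zero) to match
+// the C reference.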
+static VPX_FORCE_INLINE void quantize_fp_32x32_8(
+ const int16x8_t *v_round, const int16x8_t *v_quant,
+ const int16x8_t *v_dequant, const int16x8_t *dequant_thresh,
+ const tran_low_t *coeff_ptr, const int16_t *iscan_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, int16x8_t *v_eobmax) {
+ const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr);
+ const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+ const int16x8_t v_coeff_abs = vabsq_s16(v_coeff);
+ const int16x8_t v_thr_mask =
+ vreinterpretq_s16_u16(vcgeq_s16(v_coeff_abs, *dequant_thresh));
+ const int16x8_t v_tmp_rnd =
+ vandq_s16(vqaddq_s16(v_coeff_abs, *v_round), v_thr_mask);
+ const int16x8_t v_abs_qcoeff = vqdmulhq_s16(v_tmp_rnd, *v_quant);
+ const int16x8_t v_qcoeff =
+ vsubq_s16(veorq_s16(v_abs_qcoeff, v_coeff_sign), v_coeff_sign);
+ const uint16x8_t v_nz_mask = vceqq_s16(v_abs_qcoeff, vdupq_n_s16(0));
+
+ int32x4_t dqcoeff_0, dqcoeff_1;
+ dqcoeff_0 = vmull_s16(vget_low_s16(v_qcoeff), vget_low_s16(*v_dequant));
+ dqcoeff_1 = vmull_s16(vget_high_s16(v_qcoeff), vget_high_s16(*v_dequant));
+ // Add 1 if negative to round towards zero because the C uses division.
+ dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0));
+ dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1));
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ vst1q_s32(dqcoeff_ptr, vshrq_n_s32(dqcoeff_0, 1));
+ vst1q_s32(dqcoeff_ptr + 4, vshrq_n_s32(dqcoeff_1, 1));
+#else
+ store_s16q_to_tran_low(dqcoeff_ptr, vcombine_s16(vshrn_n_s32(dqcoeff_0, 1),
+ vshrn_n_s32(dqcoeff_1, 1)));
+#endif
+
+ store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff);
+
+ *v_eobmax = get_max_lane_eob(iscan_ptr, *v_eobmax, v_nz_mask);
+}
+
+void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ int16x8_t eob_max = vdupq_n_s16(-1);
+ // ROUND_POWER_OF_TWO(round_ptr[], 1)
+ int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1);
+ int16x8_t quant = vld1q_s16(quant_ptr);
+ int16x8_t dequant = vld1q_s16(dequant_ptr);
+  // dequant >> 2 is used as a threshold, similar to zbin.
+ int16x8_t dequant_thresh = vshrq_n_s16(vld1q_s16(dequant_ptr), 2);
+ int i;
+
+ (void)scan;
+ (void)count;
+
+ // Process dc and the first seven ac coeffs.
+ quantize_fp_32x32_8(&round, &quant, &dequant, &dequant_thresh, coeff_ptr,
+ iscan, qcoeff_ptr, dqcoeff_ptr, &eob_max);
+
+ update_fp_values(&round, &quant, &dequant);
+ dequant_thresh = vdupq_lane_s16(vget_low_s16(dequant_thresh), 1);
+
+ iscan += 8;
+ coeff_ptr += 8;
+ qcoeff_ptr += 8;
+ dqcoeff_ptr += 8;
+
+ // Process the rest of the ac coeffs.
+ for (i = 8; i < 32 * 32; i += 8) {
+ quantize_fp_32x32_8(&round, &quant, &dequant, &dequant_thresh, coeff_ptr,
+ iscan, qcoeff_ptr, dqcoeff_ptr, &eob_max);
+
+ iscan += 8;
+ coeff_ptr += 8;
+ qcoeff_ptr += 8;
+ dqcoeff_ptr += 8;
+ }
+
+ *eob_ptr = get_max_eob(eob_max);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
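+// High-bit-depth coefficients are 32-bit. The caller pre-shifts quant left by
+// 15 so that vqdmulhq_s32, which computes (2 * a * b) >> 32, yields
+// (tmp * quant) >> 16.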
+static VPX_FORCE_INLINE uint16x4_t
+highbd_quantize_fp_4(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, int32x4_t v_quant_s32,
+ int32x4_t v_dequant_s32, int32x4_t v_round_s32) {
+ const int32x4_t v_coeff = vld1q_s32(coeff_ptr);
+ const int32x4_t v_coeff_sign =
+ vreinterpretq_s32_u32(vcltq_s32(v_coeff, vdupq_n_s32(0)));
+ const int32x4_t v_abs_coeff = vabsq_s32(v_coeff);
+ const int32x4_t v_tmp = vaddq_s32(v_abs_coeff, v_round_s32);
+ // const int abs_qcoeff = (int)((tmp * quant) >> 16);
+ const int32x4_t v_abs_qcoeff = vqdmulhq_s32(v_tmp, v_quant_s32);
+ // qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ const int32x4_t v_qcoeff =
+ vsubq_s32(veorq_s32(v_abs_qcoeff, v_coeff_sign), v_coeff_sign);
+ const int32x4_t v_abs_dqcoeff = vmulq_s32(v_abs_qcoeff, v_dequant_s32);
+ // dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
+ const int32x4_t v_dqcoeff =
+ vsubq_s32(veorq_s32(v_abs_dqcoeff, v_coeff_sign), v_coeff_sign);
+
+ vst1q_s32(qcoeff_ptr, v_qcoeff);
+ vst1q_s32(dqcoeff_ptr, v_dqcoeff);
+
+ // Packed nz_qcoeff_mask. Used to find eob.
+ return vmovn_u32(vceqq_s32(v_abs_qcoeff, vdupq_n_s32(0)));
+}
+
+void vp9_highbd_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ const int16x4_t v_zero = vdup_n_s16(0);
+ const int16x4_t v_quant = vld1_s16(quant_ptr);
+ const int16x4_t v_dequant = vld1_s16(dequant_ptr);
+ const int16x4_t v_round = vld1_s16(round_ptr);
+ int32x4_t v_round_s32 = vaddl_s16(v_round, v_zero);
+ int32x4_t v_quant_s32 = vshlq_n_s32(vaddl_s16(v_quant, v_zero), 15);
+ int32x4_t v_dequant_s32 = vaddl_s16(v_dequant, v_zero);
+ uint16x4_t v_mask_lo, v_mask_hi;
+ int16x8_t v_eobmax = vdupq_n_s16(-1);
+
+ (void)scan;
+
+ // DC and first 3 AC
+ v_mask_lo = highbd_quantize_fp_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr,
+ v_quant_s32, v_dequant_s32, v_round_s32);
+
+ // overwrite the DC constants with AC constants
+ v_round_s32 = vdupq_lane_s32(vget_low_s32(v_round_s32), 1);
+ v_quant_s32 = vdupq_lane_s32(vget_low_s32(v_quant_s32), 1);
+ v_dequant_s32 = vdupq_lane_s32(vget_low_s32(v_dequant_s32), 1);
+
+ // 4 more AC
+ v_mask_hi =
+ highbd_quantize_fp_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4,
+ v_quant_s32, v_dequant_s32, v_round_s32);
+
+ // Find the max lane eob for the first 8 coeffs.
+ v_eobmax =
+ get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi));
+
+ n_coeffs -= 8;
+ do {
+ coeff_ptr += 8;
+ qcoeff_ptr += 8;
+ dqcoeff_ptr += 8;
+ iscan += 8;
+ v_mask_lo = highbd_quantize_fp_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr,
+ v_quant_s32, v_dequant_s32, v_round_s32);
+ v_mask_hi =
+ highbd_quantize_fp_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4,
+ v_quant_s32, v_dequant_s32, v_round_s32);
+ // Find the max lane eob for 8 coeffs.
+ v_eobmax =
+ get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi));
+ n_coeffs -= 8;
+ } while (n_coeffs);
+
+ *eob_ptr = get_max_eob(v_eobmax);
+}
+
+static VPX_FORCE_INLINE uint16x4_t
+highbd_quantize_fp_32x32_4(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, int32x4_t v_quant_s32,
+ int32x4_t v_dequant_s32, int32x4_t v_round_s32) {
+ const int32x4_t v_coeff = vld1q_s32(coeff_ptr);
+ const int32x4_t v_coeff_sign =
+ vreinterpretq_s32_u32(vcltq_s32(v_coeff, vdupq_n_s32(0)));
+ const int32x4_t v_abs_coeff = vabsq_s32(v_coeff);
+ // ((abs_coeff << (1 + log_scale)) >= dequant_ptr[rc01])
+ const int32x4_t v_abs_coeff_scaled = vshlq_n_s32(v_abs_coeff, 2);
+ const uint32x4_t v_mask = vcgeq_s32(v_abs_coeff_scaled, v_dequant_s32);
+ // const int64_t tmp = vmask ? (int64_t)abs_coeff + log_scaled_round : 0
+ const int32x4_t v_tmp = vandq_s32(vaddq_s32(v_abs_coeff, v_round_s32),
+ vreinterpretq_s32_u32(v_mask));
+ // const int abs_qcoeff = (int)((tmp * quant) >> (16 - log_scale));
+ const int32x4_t v_abs_qcoeff =
+ vqdmulhq_s32(vshlq_n_s32(v_tmp, 1), v_quant_s32);
+ // qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ const int32x4_t v_qcoeff =
+ vsubq_s32(veorq_s32(v_abs_qcoeff, v_coeff_sign), v_coeff_sign);
+  // abs_qcoeff * dequant is non-negative, so a plain arithmetic right shift
+  // by 1 implements the >> log_scale of the C reference (log_scale == 1).
+ const int32x4_t v_abs_dqcoeff =
+ vshrq_n_s32(vmulq_s32(v_abs_qcoeff, v_dequant_s32), 1);
+ // dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
+ const int32x4_t v_dqcoeff =
+ vsubq_s32(veorq_s32(v_abs_dqcoeff, v_coeff_sign), v_coeff_sign);
+
+ vst1q_s32(qcoeff_ptr, v_qcoeff);
+ vst1q_s32(dqcoeff_ptr, v_dqcoeff);
+
+ // Packed nz_qcoeff_mask. Used to find eob.
+ return vmovn_u32(vceqq_s32(v_abs_qcoeff, vdupq_n_s32(0)));
+}
+
+void vp9_highbd_quantize_fp_32x32_neon(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr,
+ const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan,
+ const int16_t *iscan) {
+ const int16x4_t v_quant = vld1_s16(quant_ptr);
+ const int16x4_t v_dequant = vld1_s16(dequant_ptr);
+ const int16x4_t v_zero = vdup_n_s16(0);
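+  // vqrdmulh by 1 << 14 computes ROUND_POWER_OF_TWO(round_ptr[i], 1) in each
+  // lane.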
+ const int16x4_t v_round =
+ vqrdmulh_n_s16(vld1_s16(round_ptr), (int16_t)(1 << 14));
+ int32x4_t v_round_s32 = vaddl_s16(v_round, v_zero);
+ int32x4_t v_quant_s32 = vshlq_n_s32(vaddl_s16(v_quant, v_zero), 15);
+ int32x4_t v_dequant_s32 = vaddl_s16(v_dequant, v_zero);
+ uint16x4_t v_mask_lo, v_mask_hi;
+ int16x8_t v_eobmax = vdupq_n_s16(-1);
+
+ (void)scan;
+
+ // DC and first 3 AC
+ v_mask_lo =
+ highbd_quantize_fp_32x32_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr,
+ v_quant_s32, v_dequant_s32, v_round_s32);
+
+ // overwrite the DC constants with AC constants
+ v_round_s32 = vdupq_lane_s32(vget_low_s32(v_round_s32), 1);
+ v_quant_s32 = vdupq_lane_s32(vget_low_s32(v_quant_s32), 1);
+ v_dequant_s32 = vdupq_lane_s32(vget_low_s32(v_dequant_s32), 1);
+
+ // 4 more AC
+ v_mask_hi =
+ highbd_quantize_fp_32x32_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4,
+ v_quant_s32, v_dequant_s32, v_round_s32);
+
+ // Find the max lane eob for the first 8 coeffs.
+ v_eobmax =
+ get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi));
+
+ n_coeffs -= 8;
+ do {
+ coeff_ptr += 8;
+ qcoeff_ptr += 8;
+ dqcoeff_ptr += 8;
+ iscan += 8;
+ v_mask_lo =
+ highbd_quantize_fp_32x32_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr,
+ v_quant_s32, v_dequant_s32, v_round_s32);
+ v_mask_hi = highbd_quantize_fp_32x32_4(coeff_ptr + 4, qcoeff_ptr + 4,
+ dqcoeff_ptr + 4, v_quant_s32,
+ v_dequant_s32, v_round_s32);
+ // Find the max lane eob for 8 coeffs.
+ v_eobmax =
+ get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi));
+ n_coeffs -= 8;
+ } while (n_coeffs);
+
+ *eob_ptr = get_max_eob(v_eobmax);
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_temporal_filter_neon.c b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_temporal_filter_neon.c
new file mode 100644
index 0000000000..a651a15d90
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_temporal_filter_neon.c
@@ -0,0 +1,849 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <arm_neon.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_temporal_filter.h"
+#include "vp9/encoder/vp9_temporal_filter_constants.h"
+
+// Read in 8 pixels from a and b as 8-bit unsigned integers, compute the
+// difference squared, and store as unsigned 16-bit integer to dst.
+static INLINE void store_dist_8(const uint8_t *a, const uint8_t *b,
+ uint16_t *dst) {
+ const uint8x8_t a_reg = vld1_u8(a);
+ const uint8x8_t b_reg = vld1_u8(b);
+
+ uint16x8_t dist_first = vabdl_u8(a_reg, b_reg);
+ dist_first = vmulq_u16(dist_first, dist_first);
+
+ vst1q_u16(dst, dist_first);
+}
+
+static INLINE void store_dist_16(const uint8_t *a, const uint8_t *b,
+ uint16_t *dst) {
+ const uint8x16_t a_reg = vld1q_u8(a);
+ const uint8x16_t b_reg = vld1q_u8(b);
+
+ uint16x8_t dist_first = vabdl_u8(vget_low_u8(a_reg), vget_low_u8(b_reg));
+ uint16x8_t dist_second = vabdl_u8(vget_high_u8(a_reg), vget_high_u8(b_reg));
+ dist_first = vmulq_u16(dist_first, dist_first);
+ dist_second = vmulq_u16(dist_second, dist_second);
+
+ vst1q_u16(dst, dist_first);
+ vst1q_u16(dst + 8, dist_second);
+}
+
+static INLINE void read_dist_8(const uint16_t *dist, uint16x8_t *dist_reg) {
+ *dist_reg = vld1q_u16(dist);
+}
+
+static INLINE void read_dist_16(const uint16_t *dist, uint16x8_t *reg_first,
+ uint16x8_t *reg_second) {
+ read_dist_8(dist, reg_first);
+ read_dist_8(dist + 8, reg_second);
+}
+
+// Average the value based on the number of values summed (9 for pixels away
+// from the border, 4 for pixels in corners, and 6 for other edge values).
+//
+// Add in the rounding factor, shift down by strength, clamp to 16, then
+// compute (16 - sum) and multiply by the weight.
+static INLINE uint16x8_t average_8(uint16x8_t sum,
+ const uint16x8_t *mul_constants,
+ const int strength, const int rounding,
+ const uint16x8_t *weight) {
+ const uint32x4_t rounding_u32 = vdupq_n_u32(rounding << 16);
+ const uint16x8_t weight_u16 = *weight;
+ const uint16x8_t sixteen = vdupq_n_u16(16);
+ const int32x4_t strength_u32 = vdupq_n_s32(-strength - 16);
+
+ // modifier * 3 / index;
+  uint32x4_t sum_lo =
+      vmull_u16(vget_low_u16(sum), vget_low_u16(*mul_constants));
+  uint32x4_t sum_hi =
+      vmull_u16(vget_high_u16(sum), vget_high_u16(*mul_constants));
+
+  sum_lo = vqaddq_u32(sum_lo, rounding_u32);
+  sum_hi = vqaddq_u32(sum_hi, rounding_u32);
+
+  // we cannot use vshrn_n_u32 as strength is not known at compile time.
+  sum_lo = vshlq_u32(sum_lo, strength_u32);
+  sum_hi = vshlq_u32(sum_hi, strength_u32);
+
+  sum = vcombine_u16(vmovn_u32(sum_lo), vmovn_u32(sum_hi));
+
+  // The maximum input to this min is UINT16_MAX * NEIGHBOR_CONSTANT_4 >> 16
+  // (49151 / 0xbfff, which would be negative as a signed 16-bit value), so an
+  // unsigned 16-bit minimum is required; vminq_u16 provides this directly.
+ sum = vminq_u16(sum, sixteen);
+ sum = vsubq_u16(sixteen, sum);
+ return vmulq_u16(sum, weight_u16);
+}
+
+// Add 'sum_u16' to 'count'. Multiply by 'pred' and add to 'accumulator.'
+static void accumulate_and_store_8(const uint16x8_t sum_u16,
+ const uint8_t *pred, uint16_t *count,
+ uint32_t *accumulator) {
+ uint16x8_t pred_u16 = vmovl_u8(vld1_u8(pred));
+ uint16x8_t count_u16 = vld1q_u16(count);
+ uint32x4_t accum_0_u32, accum_1_u32;
+
+ count_u16 = vqaddq_u16(count_u16, sum_u16);
+ vst1q_u16(count, count_u16);
+
+ accum_0_u32 = vld1q_u32(accumulator);
+ accum_1_u32 = vld1q_u32(accumulator + 4);
+
+ accum_0_u32 =
+ vmlal_u16(accum_0_u32, vget_low_u16(sum_u16), vget_low_u16(pred_u16));
+ accum_1_u32 =
+ vmlal_u16(accum_1_u32, vget_high_u16(sum_u16), vget_high_u16(pred_u16));
+
+ vst1q_u32(accumulator, accum_0_u32);
+ vst1q_u32(accumulator + 4, accum_1_u32);
+}
+
+static INLINE void accumulate_and_store_16(const uint16x8_t sum_0_u16,
+ const uint16x8_t sum_1_u16,
+ const uint8_t *pred, uint16_t *count,
+ uint32_t *accumulator) {
+ uint8x16_t pred_u8 = vld1q_u8(pred);
+ uint16x8_t pred_0_u16 = vmovl_u8(vget_low_u8(pred_u8));
+ uint16x8_t pred_1_u16 = vmovl_u8(vget_high_u8(pred_u8));
+ uint16x8_t count_0_u16 = vld1q_u16(count);
+ uint16x8_t count_1_u16 = vld1q_u16(count + 8);
+ uint32x4_t accum_0_u32, accum_1_u32, accum_2_u32, accum_3_u32;
+
+ count_0_u16 = vqaddq_u16(count_0_u16, sum_0_u16);
+ vst1q_u16(count, count_0_u16);
+ count_1_u16 = vqaddq_u16(count_1_u16, sum_1_u16);
+ vst1q_u16(count + 8, count_1_u16);
+
+ accum_0_u32 = vld1q_u32(accumulator);
+ accum_1_u32 = vld1q_u32(accumulator + 4);
+ accum_2_u32 = vld1q_u32(accumulator + 8);
+ accum_3_u32 = vld1q_u32(accumulator + 12);
+
+ accum_0_u32 =
+ vmlal_u16(accum_0_u32, vget_low_u16(sum_0_u16), vget_low_u16(pred_0_u16));
+ accum_1_u32 = vmlal_u16(accum_1_u32, vget_high_u16(sum_0_u16),
+ vget_high_u16(pred_0_u16));
+ accum_2_u32 =
+ vmlal_u16(accum_2_u32, vget_low_u16(sum_1_u16), vget_low_u16(pred_1_u16));
+ accum_3_u32 = vmlal_u16(accum_3_u32, vget_high_u16(sum_1_u16),
+ vget_high_u16(pred_1_u16));
+
+ vst1q_u32(accumulator, accum_0_u32);
+ vst1q_u32(accumulator + 4, accum_1_u32);
+ vst1q_u32(accumulator + 8, accum_2_u32);
+ vst1q_u32(accumulator + 12, accum_3_u32);
+}
+
+// Read in 8 pixels from y_dist. For each index i, compute y_dist[i-1] +
+// y_dist[i] + y_dist[i+1] and store in sum as 16-bit unsigned int.
+static INLINE void get_sum_8(const uint16_t *y_dist, uint16x8_t *sum) {
+ uint16x8_t dist_reg, dist_left, dist_right;
+
+ dist_reg = vld1q_u16(y_dist);
+ dist_left = vld1q_u16(y_dist - 1);
+ dist_right = vld1q_u16(y_dist + 1);
+
+ *sum = vqaddq_u16(dist_reg, dist_left);
+ *sum = vqaddq_u16(*sum, dist_right);
+}
+
+// Read in 16 pixels from y_dist. For each index i, compute y_dist[i-1] +
+// y_dist[i] + y_dist[i+1]. Store the result for first 8 pixels in sum_first and
+// the rest in sum_second.
+static INLINE void get_sum_16(const uint16_t *y_dist, uint16x8_t *sum_first,
+ uint16x8_t *sum_second) {
+ get_sum_8(y_dist, sum_first);
+ get_sum_8(y_dist + 8, sum_second);
+}
+
+// Read in the row of chroma values that corresponds to a row of 16 luma
+// values.
+static INLINE void read_chroma_dist_row_16(int ss_x, const uint16_t *u_dist,
+ const uint16_t *v_dist,
+ uint16x8_t *u_first,
+ uint16x8_t *u_second,
+ uint16x8_t *v_first,
+ uint16x8_t *v_second) {
+ if (!ss_x) {
+ // If there is no chroma subsampling in the horizontal direction, then we
+ // need to load 16 entries from chroma.
+ read_dist_16(u_dist, u_first, u_second);
+ read_dist_16(v_dist, v_first, v_second);
+ } else { // ss_x == 1
+ // Otherwise, we only need to load 8 entries
+ uint16x8_t u_reg, v_reg;
+ uint16x8x2_t pair;
+
+ read_dist_8(u_dist, &u_reg);
+
+ pair = vzipq_u16(u_reg, u_reg);
+ *u_first = pair.val[0];
+ *u_second = pair.val[1];
+
+ read_dist_8(v_dist, &v_reg);
+
+ pair = vzipq_u16(v_reg, v_reg);
+ *v_first = pair.val[0];
+ *v_second = pair.val[1];
+ }
+}
+
+// Add a row of luma distortion to 8 corresponding chroma mods.
+static INLINE void add_luma_dist_to_8_chroma_mod(const uint16_t *y_dist,
+ int ss_x, int ss_y,
+ uint16x8_t *u_mod,
+ uint16x8_t *v_mod) {
+ uint16x8_t y_reg;
+ if (!ss_x) {
+ read_dist_8(y_dist, &y_reg);
+ if (ss_y == 1) {
+ uint16x8_t y_tmp;
+ read_dist_8(y_dist + DIST_STRIDE, &y_tmp);
+
+ y_reg = vqaddq_u16(y_reg, y_tmp);
+ }
+ } else {
+ uint16x8_t y_first, y_second;
+ uint32x4_t y_first32, y_second32;
+
+ read_dist_16(y_dist, &y_first, &y_second);
+ if (ss_y == 1) {
+ uint16x8_t y_tmp_0, y_tmp_1;
+ read_dist_16(y_dist + DIST_STRIDE, &y_tmp_0, &y_tmp_1);
+
+ y_first = vqaddq_u16(y_first, y_tmp_0);
+ y_second = vqaddq_u16(y_second, y_tmp_1);
+ }
+
+ y_first32 = vpaddlq_u16(y_first);
+ y_second32 = vpaddlq_u16(y_second);
+
+ y_reg = vcombine_u16(vqmovn_u32(y_first32), vqmovn_u32(y_second32));
+ }
+
+ *u_mod = vqaddq_u16(*u_mod, y_reg);
+ *v_mod = vqaddq_u16(*v_mod, y_reg);
+}
+
+// Apply temporal filter to the luma components. This performs temporal
+// filtering on a luma block of 16 X block_height. Use blk_fw as an array of
+// size 4 for the weights for each of the 4 subblocks if blk_fw is not NULL,
+// else use top_weight for top half, and bottom weight for bottom half.
+static void apply_temporal_filter_luma_16(
+ const uint8_t *y_pre, int y_pre_stride, unsigned int block_width,
+ unsigned int block_height, int ss_x, int ss_y, int strength,
+ int use_whole_blk, uint32_t *y_accum, uint16_t *y_count,
+ const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist,
+ const int16_t *const *neighbors_first,
+ const int16_t *const *neighbors_second, int top_weight, int bottom_weight,
+ const int *blk_fw) {
+ const int rounding = (1 << strength) >> 1;
+ uint16x8_t weight_first, weight_second;
+
+ uint16x8_t mul_first, mul_second;
+
+ uint16x8_t sum_row_1_first, sum_row_1_second;
+ uint16x8_t sum_row_2_first, sum_row_2_second;
+ uint16x8_t sum_row_3_first, sum_row_3_second;
+
+ uint16x8_t u_first, u_second;
+ uint16x8_t v_first, v_second;
+
+ uint16x8_t sum_row_first;
+ uint16x8_t sum_row_second;
+
+ // Loop variables
+ unsigned int h;
+
+ assert(strength >= 0);
+ assert(strength <= 6);
+
+ assert(block_width == 16);
+ (void)block_width;
+
+ // Initialize the weights
+ if (blk_fw) {
+ weight_first = vdupq_n_u16(blk_fw[0]);
+ weight_second = vdupq_n_u16(blk_fw[1]);
+ } else {
+ weight_first = vdupq_n_u16(top_weight);
+ weight_second = weight_first;
+ }
+
+ // First row
+ mul_first = vld1q_u16((const uint16_t *)neighbors_first[0]);
+ mul_second = vld1q_u16((const uint16_t *)neighbors_second[0]);
+
+ // Add luma values
+ get_sum_16(y_dist, &sum_row_2_first, &sum_row_2_second);
+ get_sum_16(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second);
+
+ sum_row_first = vqaddq_u16(sum_row_2_first, sum_row_3_first);
+ sum_row_second = vqaddq_u16(sum_row_2_second, sum_row_3_second);
+
+ // Add chroma values
+ read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first,
+ &v_second);
+
+ sum_row_first = vqaddq_u16(sum_row_first, u_first);
+ sum_row_second = vqaddq_u16(sum_row_second, u_second);
+
+ sum_row_first = vqaddq_u16(sum_row_first, v_first);
+ sum_row_second = vqaddq_u16(sum_row_second, v_second);
+
+ // Get modifier and store result
+ sum_row_first =
+ average_8(sum_row_first, &mul_first, strength, rounding, &weight_first);
+
+ sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding,
+ &weight_second);
+
+ accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count,
+ y_accum);
+
+ y_pre += y_pre_stride;
+ y_count += y_pre_stride;
+ y_accum += y_pre_stride;
+ y_dist += DIST_STRIDE;
+
+ u_dist += DIST_STRIDE;
+ v_dist += DIST_STRIDE;
+
+ // Then all the rows except the last one
+ mul_first = vld1q_u16((const uint16_t *)neighbors_first[1]);
+ mul_second = vld1q_u16((const uint16_t *)neighbors_second[1]);
+
+ for (h = 1; h < block_height - 1; ++h) {
+ // Move the weight to bottom half
+ if (!use_whole_blk && h == block_height / 2) {
+ if (blk_fw) {
+ weight_first = vdupq_n_u16(blk_fw[2]);
+ weight_second = vdupq_n_u16(blk_fw[3]);
+ } else {
+ weight_first = vdupq_n_u16(bottom_weight);
+ weight_second = weight_first;
+ }
+ }
+ // Shift the rows up
+ sum_row_1_first = sum_row_2_first;
+ sum_row_1_second = sum_row_2_second;
+ sum_row_2_first = sum_row_3_first;
+ sum_row_2_second = sum_row_3_second;
+
+ // Add luma values to the modifier
+ sum_row_first = vqaddq_u16(sum_row_1_first, sum_row_2_first);
+ sum_row_second = vqaddq_u16(sum_row_1_second, sum_row_2_second);
+
+ get_sum_16(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second);
+
+ sum_row_first = vqaddq_u16(sum_row_first, sum_row_3_first);
+ sum_row_second = vqaddq_u16(sum_row_second, sum_row_3_second);
+
+ // Add chroma values to the modifier
+ if (ss_y == 0 || h % 2 == 0) {
+ // Only calculate the new chroma distortion if we are at a pixel that
+ // corresponds to a new chroma row
+ read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second,
+ &v_first, &v_second);
+ u_dist += DIST_STRIDE;
+ v_dist += DIST_STRIDE;
+ }
+
+ sum_row_first = vqaddq_u16(sum_row_first, u_first);
+ sum_row_second = vqaddq_u16(sum_row_second, u_second);
+ sum_row_first = vqaddq_u16(sum_row_first, v_first);
+ sum_row_second = vqaddq_u16(sum_row_second, v_second);
+
+ // Get modifier and store result
+ sum_row_first =
+ average_8(sum_row_first, &mul_first, strength, rounding, &weight_first);
+ sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding,
+ &weight_second);
+ accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count,
+ y_accum);
+ y_pre += y_pre_stride;
+ y_count += y_pre_stride;
+ y_accum += y_pre_stride;
+ y_dist += DIST_STRIDE;
+ }
+
+ // The last row
+ mul_first = vld1q_u16((const uint16_t *)neighbors_first[0]);
+ mul_second = vld1q_u16((const uint16_t *)neighbors_second[0]);
+
+ // Shift the rows up
+ sum_row_1_first = sum_row_2_first;
+ sum_row_1_second = sum_row_2_second;
+ sum_row_2_first = sum_row_3_first;
+ sum_row_2_second = sum_row_3_second;
+
+ // Add luma values to the modifier
+ sum_row_first = vqaddq_u16(sum_row_1_first, sum_row_2_first);
+ sum_row_second = vqaddq_u16(sum_row_1_second, sum_row_2_second);
+
+ // Add chroma values to the modifier
+ if (ss_y == 0) {
+ // Only calculate the new chroma distortion if we are at a pixel that
+ // corresponds to a new chroma row
+ read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first,
+ &v_second);
+ }
+
+ sum_row_first = vqaddq_u16(sum_row_first, u_first);
+ sum_row_second = vqaddq_u16(sum_row_second, u_second);
+ sum_row_first = vqaddq_u16(sum_row_first, v_first);
+ sum_row_second = vqaddq_u16(sum_row_second, v_second);
+
+ // Get modifier and store result
+ sum_row_first =
+ average_8(sum_row_first, &mul_first, strength, rounding, &weight_first);
+ sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding,
+ &weight_second);
+ accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count,
+ y_accum);
+}
+
+// Perform temporal filter for the luma component.
+static void apply_temporal_filter_luma(
+ const uint8_t *y_pre, int y_pre_stride, unsigned int block_width,
+ unsigned int block_height, int ss_x, int ss_y, int strength,
+ const int *blk_fw, int use_whole_blk, uint32_t *y_accum, uint16_t *y_count,
+ const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist) {
+ unsigned int blk_col = 0, uv_blk_col = 0;
+ const unsigned int blk_col_step = 16, uv_blk_col_step = 16 >> ss_x;
+ const unsigned int mid_width = block_width >> 1,
+ last_width = block_width - blk_col_step;
+ int top_weight = blk_fw[0],
+ bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2];
+ const int16_t *const *neighbors_first;
+ const int16_t *const *neighbors_second;
+
+ if (block_width == 16) {
+    // Special Case: The block width is 16 and we are operating on a row of 16
+    // luma pixels. In this case, we can't use the usual left-middle-right
+    // pattern. We also don't support splitting now.
+ neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS;
+ neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS;
+ if (use_whole_blk) {
+ apply_temporal_filter_luma_16(
+ y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength,
+ use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
+ u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
+ neighbors_second, top_weight, bottom_weight, NULL);
+ } else {
+ apply_temporal_filter_luma_16(
+ y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength,
+ use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
+ u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
+ neighbors_second, 0, 0, blk_fw);
+ }
+
+ return;
+ }
+
+ // Left
+ neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS;
+ neighbors_second = LUMA_MIDDLE_COLUMN_NEIGHBORS;
+ apply_temporal_filter_luma_16(
+ y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength,
+ use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
+ u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
+ neighbors_second, top_weight, bottom_weight, NULL);
+
+ blk_col += blk_col_step;
+ uv_blk_col += uv_blk_col_step;
+
+ // Middle First
+ neighbors_first = LUMA_MIDDLE_COLUMN_NEIGHBORS;
+ for (; blk_col < mid_width;
+ blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+ apply_temporal_filter_luma_16(
+ y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength,
+ use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
+ u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
+ neighbors_second, top_weight, bottom_weight, NULL);
+ }
+
+ if (!use_whole_blk) {
+ top_weight = blk_fw[1];
+ bottom_weight = blk_fw[3];
+ }
+
+ // Middle Second
+ for (; blk_col < last_width;
+ blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+ apply_temporal_filter_luma_16(
+ y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength,
+ use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
+ u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
+ neighbors_second, top_weight, bottom_weight, NULL);
+ }
+
+ // Right
+ neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS;
+ apply_temporal_filter_luma_16(
+ y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength,
+ use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
+ u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
+ neighbors_second, top_weight, bottom_weight, NULL);
+}
+
+// Apply temporal filter to the chroma components. This performs temporal
+// filtering on a chroma block of 8 X uv_height. If blk_fw is not NULL, use
+// blk_fw as an array of size 4 for the weights for each of the 4 subblocks,
+// else use top_weight for top half, and bottom weight for bottom half.
+static void apply_temporal_filter_chroma_8(
+ const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride,
+ unsigned int uv_block_height, int ss_x, int ss_y, int strength,
+ uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
+ const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist,
+ const int16_t *const *neighbors, int top_weight, int bottom_weight,
+ const int *blk_fw) {
+ const int rounding = (1 << strength) >> 1;
+
+ uint16x8_t weight;
+
+ uint16x8_t mul;
+
+ uint16x8_t u_sum_row_1, u_sum_row_2, u_sum_row_3;
+ uint16x8_t v_sum_row_1, v_sum_row_2, v_sum_row_3;
+
+ uint16x8_t u_sum_row, v_sum_row;
+
+ // Loop variable
+ unsigned int h;
+
+ // Initialize weight
+ if (blk_fw) {
+ weight = vcombine_u16(vdup_n_u16(blk_fw[0]), vdup_n_u16(blk_fw[1]));
+ } else {
+ weight = vdupq_n_u16(top_weight);
+ }
+
+ // First row
+ mul = vld1q_u16((const uint16_t *)neighbors[0]);
+
+ // Add chroma values
+ get_sum_8(u_dist, &u_sum_row_2);
+ get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3);
+
+ u_sum_row = vqaddq_u16(u_sum_row_2, u_sum_row_3);
+
+ get_sum_8(v_dist, &v_sum_row_2);
+ get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3);
+
+ v_sum_row = vqaddq_u16(v_sum_row_2, v_sum_row_3);
+
+ // Add luma values
+ add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row);
+
+ // Get modifier and store result
+ u_sum_row = average_8(u_sum_row, &mul, strength, rounding, &weight);
+ v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight);
+
+ accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum);
+ accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum);
+
+ u_pre += uv_pre_stride;
+ u_dist += DIST_STRIDE;
+ v_pre += uv_pre_stride;
+ v_dist += DIST_STRIDE;
+ u_count += uv_pre_stride;
+ u_accum += uv_pre_stride;
+ v_count += uv_pre_stride;
+ v_accum += uv_pre_stride;
+
+ y_dist += DIST_STRIDE * (1 + ss_y);
+
+ // Then all the rows except the last one
+ mul = vld1q_u16((const uint16_t *)neighbors[1]);
+
+ for (h = 1; h < uv_block_height - 1; ++h) {
+    // Move the weight to the bottom half of the block
+ if (h == uv_block_height / 2) {
+ if (blk_fw) {
+ weight = vcombine_u16(vdup_n_u16(blk_fw[2]), vdup_n_u16(blk_fw[3]));
+ } else {
+ weight = vdupq_n_u16(bottom_weight);
+ }
+ }
+
+ // Shift the rows up
+ u_sum_row_1 = u_sum_row_2;
+ u_sum_row_2 = u_sum_row_3;
+
+ v_sum_row_1 = v_sum_row_2;
+ v_sum_row_2 = v_sum_row_3;
+
+ // Add chroma values
+ u_sum_row = vqaddq_u16(u_sum_row_1, u_sum_row_2);
+ get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3);
+ u_sum_row = vqaddq_u16(u_sum_row, u_sum_row_3);
+
+ v_sum_row = vqaddq_u16(v_sum_row_1, v_sum_row_2);
+ get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3);
+ v_sum_row = vqaddq_u16(v_sum_row, v_sum_row_3);
+
+ // Add luma values
+ add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row);
+
+ // Get modifier and store result
+ u_sum_row = average_8(u_sum_row, &mul, strength, rounding, &weight);
+ v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight);
+
+ accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum);
+ accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum);
+
+ u_pre += uv_pre_stride;
+ u_dist += DIST_STRIDE;
+ v_pre += uv_pre_stride;
+ v_dist += DIST_STRIDE;
+ u_count += uv_pre_stride;
+ u_accum += uv_pre_stride;
+ v_count += uv_pre_stride;
+ v_accum += uv_pre_stride;
+
+ y_dist += DIST_STRIDE * (1 + ss_y);
+ }
+
+ // The last row
+ mul = vld1q_u16((const uint16_t *)neighbors[0]);
+
+ // Shift the rows up
+ u_sum_row_1 = u_sum_row_2;
+ u_sum_row_2 = u_sum_row_3;
+
+ v_sum_row_1 = v_sum_row_2;
+ v_sum_row_2 = v_sum_row_3;
+
+ // Add chroma values
+ u_sum_row = vqaddq_u16(u_sum_row_1, u_sum_row_2);
+ v_sum_row = vqaddq_u16(v_sum_row_1, v_sum_row_2);
+
+ // Add luma values
+ add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row);
+
+ // Get modifier and store result
+ u_sum_row = average_8(u_sum_row, &mul, strength, rounding, &weight);
+ v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight);
+
+ accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum);
+ accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum);
+}
+
+// Perform temporal filter for the chroma components.
+static void apply_temporal_filter_chroma(
+ const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride,
+ unsigned int block_width, unsigned int block_height, int ss_x, int ss_y,
+ int strength, const int *blk_fw, int use_whole_blk, uint32_t *u_accum,
+ uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
+ const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist) {
+ const unsigned int uv_width = block_width >> ss_x,
+ uv_height = block_height >> ss_y;
+
+ unsigned int blk_col = 0, uv_blk_col = 0;
+ const unsigned int uv_blk_col_step = 8, blk_col_step = 8 << ss_x;
+ const unsigned int uv_mid_width = uv_width >> 1,
+ uv_last_width = uv_width - uv_blk_col_step;
+ int top_weight = blk_fw[0],
+ bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2];
+ const int16_t *const *neighbors;
+
+ if (uv_width == 8) {
+ // Special Case: We are subsampling in x direction on a 16x16 block. Since
+ // we are operating on a row of 8 chroma pixels, we can't use the usual
+ // left-middle-right pattern.
+ assert(ss_x);
+
+ if (ss_y) {
+ neighbors = CHROMA_DOUBLE_SS_SINGLE_COLUMN_NEIGHBORS;
+ } else {
+ neighbors = CHROMA_SINGLE_SS_SINGLE_COLUMN_NEIGHBORS;
+ }
+
+ if (use_whole_blk) {
+ apply_temporal_filter_chroma_8(
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height,
+ ss_x, ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+ v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+ u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight,
+ bottom_weight, NULL);
+ } else {
+ apply_temporal_filter_chroma_8(
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height,
+ ss_x, ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+ v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+ u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, 0, 0, blk_fw);
+ }
+
+ return;
+ }
+
+ // Left
+ if (ss_x && ss_y) {
+ neighbors = CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS;
+ } else if (ss_x || ss_y) {
+ neighbors = CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS;
+ } else {
+ neighbors = CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS;
+ }
+
+ apply_temporal_filter_chroma_8(
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x,
+ ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+ v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+ u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight,
+ bottom_weight, NULL);
+
+ blk_col += blk_col_step;
+ uv_blk_col += uv_blk_col_step;
+
+ // Middle First
+ if (ss_x && ss_y) {
+ neighbors = CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+ } else if (ss_x || ss_y) {
+ neighbors = CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+ } else {
+ neighbors = CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS;
+ }
+
+ for (; uv_blk_col < uv_mid_width;
+ blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+ apply_temporal_filter_chroma_8(
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x,
+ ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+ v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+ u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight,
+ bottom_weight, NULL);
+ }
+
+ if (!use_whole_blk) {
+ top_weight = blk_fw[1];
+ bottom_weight = blk_fw[3];
+ }
+
+ // Middle Second
+ for (; uv_blk_col < uv_last_width;
+ blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+ apply_temporal_filter_chroma_8(
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x,
+ ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+ v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+ u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight,
+ bottom_weight, NULL);
+ }
+
+ // Right
+ if (ss_x && ss_y) {
+ neighbors = CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS;
+ } else if (ss_x || ss_y) {
+ neighbors = CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS;
+ } else {
+ neighbors = CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS;
+ }
+
+ apply_temporal_filter_chroma_8(
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x,
+ ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+ v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+ u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight,
+ bottom_weight, NULL);
+}
+
+void vp9_apply_temporal_filter_neon(
+ const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
+ int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
+ int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
+ int uv_pre_stride, unsigned int block_width, unsigned int block_height,
+ int ss_x, int ss_y, int strength, const int *const blk_fw,
+ int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum,
+ uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count) {
+ const unsigned int chroma_height = block_height >> ss_y,
+ chroma_width = block_width >> ss_x;
+
+ DECLARE_ALIGNED(16, uint16_t, y_dist[BH * DIST_STRIDE]) = { 0 };
+ DECLARE_ALIGNED(16, uint16_t, u_dist[BH * DIST_STRIDE]) = { 0 };
+ DECLARE_ALIGNED(16, uint16_t, v_dist[BH * DIST_STRIDE]) = { 0 };
+ const int *blk_fw_ptr = blk_fw;
+
+ uint16_t *y_dist_ptr = y_dist + 1, *u_dist_ptr = u_dist + 1,
+ *v_dist_ptr = v_dist + 1;
+ const uint8_t *y_src_ptr = y_src, *u_src_ptr = u_src, *v_src_ptr = v_src;
+ const uint8_t *y_pre_ptr = y_pre, *u_pre_ptr = u_pre, *v_pre_ptr = v_pre;
+
+ // Loop variables
+ unsigned int row, blk_col;
+
+ assert(block_width <= BW && "block width too large");
+ assert(block_height <= BH && "block height too large");
+ assert(block_width % 16 == 0 && "block width must be multiple of 16");
+ assert(block_height % 2 == 0 && "block height must be even");
+ assert((ss_x == 0 || ss_x == 1) && (ss_y == 0 || ss_y == 1) &&
+ "invalid chroma subsampling");
+ assert(strength >= 0 && strength <= 6 && "invalid temporal filter strength");
+  assert(blk_fw[0] >= 0 && "filter weight must be non-negative");
+  assert(
+      (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) &&
+      "subblock filter weight must be non-negative");
+  assert(blk_fw[0] <= 2 && "filter weight must not exceed 2");
+  assert(
+      (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) &&
+      "subblock filter weight must not exceed 2");
+
+ // Precompute the difference squared
+ for (row = 0; row < block_height; row++) {
+ for (blk_col = 0; blk_col < block_width; blk_col += 16) {
+ store_dist_16(y_src_ptr + blk_col, y_pre_ptr + blk_col,
+ y_dist_ptr + blk_col);
+ }
+ y_src_ptr += y_src_stride;
+ y_pre_ptr += y_pre_stride;
+ y_dist_ptr += DIST_STRIDE;
+ }
+
+ for (row = 0; row < chroma_height; row++) {
+ for (blk_col = 0; blk_col < chroma_width; blk_col += 8) {
+ store_dist_8(u_src_ptr + blk_col, u_pre_ptr + blk_col,
+ u_dist_ptr + blk_col);
+ store_dist_8(v_src_ptr + blk_col, v_pre_ptr + blk_col,
+ v_dist_ptr + blk_col);
+ }
+
+ u_src_ptr += uv_src_stride;
+ u_pre_ptr += uv_pre_stride;
+ u_dist_ptr += DIST_STRIDE;
+ v_src_ptr += uv_src_stride;
+ v_pre_ptr += uv_pre_stride;
+ v_dist_ptr += DIST_STRIDE;
+ }
+
+ y_dist_ptr = y_dist + 1;
+ u_dist_ptr = u_dist + 1;
+ v_dist_ptr = v_dist + 1;
+
+ apply_temporal_filter_luma(y_pre, y_pre_stride, block_width, block_height,
+ ss_x, ss_y, strength, blk_fw_ptr, use_whole_blk,
+ y_accum, y_count, y_dist_ptr, u_dist_ptr,
+ v_dist_ptr);
+
+ apply_temporal_filter_chroma(u_pre, v_pre, uv_pre_stride, block_width,
+ block_height, ss_x, ss_y, strength, blk_fw_ptr,
+ use_whole_blk, u_accum, u_count, v_accum,
+ v_count, y_dist_ptr, u_dist_ptr, v_dist_ptr);
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_error_msa.c b/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_error_msa.c
new file mode 100644
index 0000000000..61786d8f66
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_error_msa.c
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vpx_dsp/mips/macros_msa.h"
+
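+// Expands to a block-error kernel for one block size: accumulates the sum of
+// squared source coefficients into *ssz and returns the sum of squared
+// differences between coeff and dqcoeff, processing 16 coefficients per step.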
+#define BLOCK_ERROR_BLOCKSIZE_MSA(BSize) \
+ static int64_t block_error_##BSize##size_msa( \
+ const int16_t *coeff_ptr, const int16_t *dq_coeff_ptr, int64_t *ssz) { \
+ int64_t err = 0; \
+ uint32_t loop_cnt; \
+ v8i16 coeff, dq_coeff, coeff_r_h, coeff_l_h; \
+ v4i32 diff_r, diff_l, coeff_r_w, coeff_l_w; \
+ v2i64 sq_coeff_r, sq_coeff_l; \
+ v2i64 err0, err_dup0, err1, err_dup1; \
+ \
+ coeff = LD_SH(coeff_ptr); \
+ dq_coeff = LD_SH(dq_coeff_ptr); \
+ UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \
+ ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \
+ HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); \
+ DOTP_SW2_SD(coeff_r_w, coeff_l_w, coeff_r_w, coeff_l_w, sq_coeff_r, \
+ sq_coeff_l); \
+ DOTP_SW2_SD(diff_r, diff_l, diff_r, diff_l, err0, err1); \
+ \
+ coeff = LD_SH(coeff_ptr + 8); \
+ dq_coeff = LD_SH(dq_coeff_ptr + 8); \
+ UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \
+ ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \
+ HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); \
+ DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l); \
+ DPADD_SD2_SD(diff_r, diff_l, err0, err1); \
+ \
+ coeff_ptr += 16; \
+ dq_coeff_ptr += 16; \
+ \
+ for (loop_cnt = ((BSize >> 4) - 1); loop_cnt--;) { \
+ coeff = LD_SH(coeff_ptr); \
+ dq_coeff = LD_SH(dq_coeff_ptr); \
+ UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \
+ ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \
+ HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); \
+ DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l); \
+ DPADD_SD2_SD(diff_r, diff_l, err0, err1); \
+ \
+ coeff = LD_SH(coeff_ptr + 8); \
+ dq_coeff = LD_SH(dq_coeff_ptr + 8); \
+ UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \
+ ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \
+ HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); \
+ DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l); \
+ DPADD_SD2_SD(diff_r, diff_l, err0, err1); \
+ \
+ coeff_ptr += 16; \
+ dq_coeff_ptr += 16; \
+ } \
+ \
+ err_dup0 = __msa_splati_d(sq_coeff_r, 1); \
+ err_dup1 = __msa_splati_d(sq_coeff_l, 1); \
+ sq_coeff_r += err_dup0; \
+ sq_coeff_l += err_dup1; \
+ *ssz = __msa_copy_s_d(sq_coeff_r, 0); \
+ *ssz += __msa_copy_s_d(sq_coeff_l, 0); \
+ \
+ err_dup0 = __msa_splati_d(err0, 1); \
+ err_dup1 = __msa_splati_d(err1, 1); \
+ err0 += err_dup0; \
+ err1 += err_dup1; \
+ err = __msa_copy_s_d(err0, 0); \
+ err += __msa_copy_s_d(err1, 0); \
+ \
+ return err; \
+ }
+
+#if !CONFIG_VP9_HIGHBITDEPTH
+BLOCK_ERROR_BLOCKSIZE_MSA(16);
+BLOCK_ERROR_BLOCKSIZE_MSA(64);
+BLOCK_ERROR_BLOCKSIZE_MSA(256);
+BLOCK_ERROR_BLOCKSIZE_MSA(1024);
+
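+/* For reference, the scalar computation these kernels reproduce is sketched
+ * below (an illustration only, not the actual vp9_block_error_c source): the
+ * return value accumulates the squared quantization error, and *ssz the
+ * squared source coefficients.
+ *
+ *   int64_t err = 0, sqcoeff = 0;
+ *   for (i = 0; i < block_size; ++i) {
+ *     const int diff = coeff[i] - dq_coeff[i];
+ *     err += (int64_t)diff * diff;
+ *     sqcoeff += (int64_t)coeff[i] * coeff[i];
+ *   }
+ *   *ssz = sqcoeff;
+ *   return err;
+ */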
+int64_t vp9_block_error_msa(const tran_low_t *coeff_ptr,
+ const tran_low_t *dq_coeff_ptr, intptr_t blk_size,
+ int64_t *ssz) {
+ int64_t err;
+ const int16_t *coeff = (const int16_t *)coeff_ptr;
+ const int16_t *dq_coeff = (const int16_t *)dq_coeff_ptr;
+
+ switch (blk_size) {
+ case 16: err = block_error_16size_msa(coeff, dq_coeff, ssz); break;
+ case 64: err = block_error_64size_msa(coeff, dq_coeff, ssz); break;
+ case 256: err = block_error_256size_msa(coeff, dq_coeff, ssz); break;
+ case 1024: err = block_error_1024size_msa(coeff, dq_coeff, ssz); break;
+ default:
+ err = vp9_block_error_c(coeff_ptr, dq_coeff_ptr, blk_size, ssz);
+ break;
+ }
+
+ return err;
+}
+#endif // !CONFIG_VP9_HIGHBITDEPTH
diff --git a/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c b/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c
new file mode 100644
index 0000000000..efbbe830db
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c
@@ -0,0 +1,501 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_enums.h"
+#include "vp9/encoder/mips/msa/vp9_fdct_msa.h"
+#include "vpx_dsp/mips/fwd_txfm_msa.h"
+
+static void fadst16_cols_step1_msa(const int16_t *input, int32_t stride,
+ const int32_t *const0, int16_t *int_buf) {
+ v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
+ v8i16 tp0, tp1, tp2, tp3, g0, g1, g2, g3, g8, g9, g10, g11, h0, h1, h2, h3;
+ v4i32 k0, k1, k2, k3;
+
+ /* load input data */
+ r0 = LD_SH(input);
+ r15 = LD_SH(input + 15 * stride);
+ r7 = LD_SH(input + 7 * stride);
+ r8 = LD_SH(input + 8 * stride);
+ SLLI_4V(r0, r15, r7, r8, 2);
+
+ /* stage 1 */
+ LD_SW2(const0, 4, k0, k1);
+ LD_SW2(const0 + 8, 4, k2, k3);
+ MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3);
+
+ r3 = LD_SH(input + 3 * stride);
+ r4 = LD_SH(input + 4 * stride);
+ r11 = LD_SH(input + 11 * stride);
+ r12 = LD_SH(input + 12 * stride);
+ SLLI_4V(r3, r4, r11, r12, 2);
+
+ LD_SW2(const0 + 4 * 4, 4, k0, k1);
+ LD_SW2(const0 + 4 * 6, 4, k2, k3);
+ MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11);
+
+ /* stage 2 */
+ BUTTERFLY_4(g0, g2, g10, g8, tp0, tp2, tp3, tp1);
+ ST_SH2(tp0, tp2, int_buf, 8);
+ ST_SH2(tp1, tp3, int_buf + 4 * 8, 8);
+
+ LD_SW2(const0 + 4 * 8, 4, k0, k1);
+ k2 = LD_SW(const0 + 4 * 10);
+ MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3);
+
+ ST_SH2(h0, h1, int_buf + 8 * 8, 8);
+ ST_SH2(h3, h2, int_buf + 12 * 8, 8);
+
+ r9 = LD_SH(input + 9 * stride);
+ r6 = LD_SH(input + 6 * stride);
+ r1 = LD_SH(input + stride);
+ r14 = LD_SH(input + 14 * stride);
+ SLLI_4V(r9, r6, r1, r14, 2);
+
+ LD_SW2(const0 + 4 * 11, 4, k0, k1);
+ LD_SW2(const0 + 4 * 13, 4, k2, k3);
+ MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g0, g1, g2, g3);
+
+ ST_SH2(g1, g3, int_buf + 3 * 8, 4 * 8);
+
+ r13 = LD_SH(input + 13 * stride);
+ r2 = LD_SH(input + 2 * stride);
+ r5 = LD_SH(input + 5 * stride);
+ r10 = LD_SH(input + 10 * stride);
+ SLLI_4V(r13, r2, r5, r10, 2);
+
+ LD_SW2(const0 + 4 * 15, 4, k0, k1);
+ LD_SW2(const0 + 4 * 17, 4, k2, k3);
+ MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, h0, h1, h2, h3);
+
+ ST_SH2(h1, h3, int_buf + 11 * 8, 4 * 8);
+
+ BUTTERFLY_4(h0, h2, g2, g0, tp0, tp1, tp2, tp3);
+ ST_SH4(tp0, tp1, tp2, tp3, int_buf + 2 * 8, 4 * 8);
+}
+
+static void fadst16_cols_step2_msa(int16_t *int_buf, const int32_t *const0,
+ int16_t *out) {
+ int16_t *out_ptr = out + 128;
+ v8i16 tp0, tp1, tp2, tp3, g5, g7, g13, g15;
+ v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h10, h11;
+ v8i16 out0, out1, out2, out3, out4, out5, out6, out7;
+ v8i16 out8, out9, out10, out11, out12, out13, out14, out15;
+ v4i32 k0, k1, k2, k3;
+
+ LD_SH2(int_buf + 3 * 8, 4 * 8, g13, g15);
+ LD_SH2(int_buf + 11 * 8, 4 * 8, g5, g7);
+ LD_SW2(const0 + 4 * 19, 4, k0, k1);
+ k2 = LD_SW(const0 + 4 * 21);
+ MADD_BF(g7, g5, g15, g13, k0, k1, k2, k0, h4, h5, h6, h7);
+
+ tp0 = LD_SH(int_buf + 4 * 8);
+ tp1 = LD_SH(int_buf + 5 * 8);
+ tp3 = LD_SH(int_buf + 10 * 8);
+ tp2 = LD_SH(int_buf + 14 * 8);
+ LD_SW2(const0 + 4 * 22, 4, k0, k1);
+ k2 = LD_SW(const0 + 4 * 24);
+ MADD_BF(tp0, tp1, tp2, tp3, k0, k1, k2, k0, out4, out6, out5, out7);
+ out4 = -out4;
+ ST_SH(out4, (out + 3 * 16));
+ ST_SH(out5, (out_ptr + 4 * 16));
+
+ h1 = LD_SH(int_buf + 9 * 8);
+ h3 = LD_SH(int_buf + 12 * 8);
+ MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15);
+ out13 = -out13;
+ ST_SH(out12, (out + 2 * 16));
+ ST_SH(out13, (out_ptr + 5 * 16));
+
+ tp0 = LD_SH(int_buf);
+ tp1 = LD_SH(int_buf + 8);
+ tp2 = LD_SH(int_buf + 2 * 8);
+ tp3 = LD_SH(int_buf + 6 * 8);
+
+ BUTTERFLY_4(tp0, tp1, tp3, tp2, out0, out1, h11, h10);
+ out1 = -out1;
+ ST_SH(out0, (out));
+ ST_SH(out1, (out_ptr + 7 * 16));
+
+ h0 = LD_SH(int_buf + 8 * 8);
+ h2 = LD_SH(int_buf + 13 * 8);
+
+ BUTTERFLY_4(h0, h2, h6, h4, out8, out9, out11, out10);
+ out8 = -out8;
+ ST_SH(out8, (out + 16));
+ ST_SH(out9, (out_ptr + 6 * 16));
+
+ /* stage 4 */
+ LD_SW2(const0 + 4 * 25, 4, k0, k1);
+ LD_SW2(const0 + 4 * 27, 4, k2, k3);
+ MADD_SHORT(h10, h11, k1, k2, out2, out3);
+ ST_SH(out2, (out + 7 * 16));
+ ST_SH(out3, (out_ptr));
+
+ MADD_SHORT(out6, out7, k0, k3, out6, out7);
+ ST_SH(out6, (out + 4 * 16));
+ ST_SH(out7, (out_ptr + 3 * 16));
+
+ MADD_SHORT(out10, out11, k0, k3, out10, out11);
+ ST_SH(out10, (out + 6 * 16));
+ ST_SH(out11, (out_ptr + 16));
+
+ MADD_SHORT(out14, out15, k1, k2, out14, out15);
+ ST_SH(out14, (out + 5 * 16));
+ ST_SH(out15, (out_ptr + 2 * 16));
+}
+
+static void fadst16_transpose_postproc_msa(int16_t *input, int16_t *out) {
+ v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
+ v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15;
+
+ /* load input data */
+ LD_SH8(input, 16, l0, l1, l2, l3, l4, l5, l6, l7);
+ TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, r0, r1, r2, r3, r4, r5, r6,
+ r7);
+ FDCT_POSTPROC_2V_NEG_H(r0, r1);
+ FDCT_POSTPROC_2V_NEG_H(r2, r3);
+ FDCT_POSTPROC_2V_NEG_H(r4, r5);
+ FDCT_POSTPROC_2V_NEG_H(r6, r7);
+ ST_SH8(r0, r1, r2, r3, r4, r5, r6, r7, out, 8);
+ out += 64;
+
+ LD_SH8(input + 8, 16, l8, l9, l10, l11, l12, l13, l14, l15);
+ TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, r8, r9, r10, r11,
+ r12, r13, r14, r15);
+ FDCT_POSTPROC_2V_NEG_H(r8, r9);
+ FDCT_POSTPROC_2V_NEG_H(r10, r11);
+ FDCT_POSTPROC_2V_NEG_H(r12, r13);
+ FDCT_POSTPROC_2V_NEG_H(r14, r15);
+ ST_SH8(r8, r9, r10, r11, r12, r13, r14, r15, out, 8);
+ out += 64;
+
+ /* load input data */
+ input += 128;
+ LD_SH8(input, 16, l0, l1, l2, l3, l4, l5, l6, l7);
+ TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, r0, r1, r2, r3, r4, r5, r6,
+ r7);
+ FDCT_POSTPROC_2V_NEG_H(r0, r1);
+ FDCT_POSTPROC_2V_NEG_H(r2, r3);
+ FDCT_POSTPROC_2V_NEG_H(r4, r5);
+ FDCT_POSTPROC_2V_NEG_H(r6, r7);
+ ST_SH8(r0, r1, r2, r3, r4, r5, r6, r7, out, 8);
+ out += 64;
+
+ LD_SH8(input + 8, 16, l8, l9, l10, l11, l12, l13, l14, l15);
+ TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, r8, r9, r10, r11,
+ r12, r13, r14, r15);
+ FDCT_POSTPROC_2V_NEG_H(r8, r9);
+ FDCT_POSTPROC_2V_NEG_H(r10, r11);
+ FDCT_POSTPROC_2V_NEG_H(r12, r13);
+ FDCT_POSTPROC_2V_NEG_H(r14, r15);
+ ST_SH8(r8, r9, r10, r11, r12, r13, r14, r15, out, 8);
+}
+
+static void fadst16_rows_step1_msa(int16_t *input, const int32_t *const0,
+ int16_t *int_buf) {
+ v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
+ v8i16 tp0, tp1, tp2, tp3, g0, g1, g2, g3, g8, g9, g10, g11, h0, h1, h2, h3;
+ v4i32 k0, k1, k2, k3;
+
+ /* load input data */
+ r0 = LD_SH(input);
+ r7 = LD_SH(input + 7 * 8);
+ r8 = LD_SH(input + 8 * 8);
+ r15 = LD_SH(input + 15 * 8);
+
+ /* stage 1 */
+ LD_SW2(const0, 4, k0, k1);
+ LD_SW2(const0 + 4 * 2, 4, k2, k3);
+ MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3);
+
+ r3 = LD_SH(input + 3 * 8);
+ r4 = LD_SH(input + 4 * 8);
+ r11 = LD_SH(input + 11 * 8);
+ r12 = LD_SH(input + 12 * 8);
+
+ LD_SW2(const0 + 4 * 4, 4, k0, k1);
+ LD_SW2(const0 + 4 * 6, 4, k2, k3);
+ MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11);
+
+ /* stage 2 */
+ BUTTERFLY_4(g0, g2, g10, g8, tp0, tp2, tp3, tp1);
+ ST_SH2(tp0, tp1, int_buf, 4 * 8);
+ ST_SH2(tp2, tp3, int_buf + 8, 4 * 8);
+
+ LD_SW2(const0 + 4 * 8, 4, k0, k1);
+ k2 = LD_SW(const0 + 4 * 10);
+ MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3);
+ ST_SH2(h0, h3, int_buf + 8 * 8, 4 * 8);
+ ST_SH2(h1, h2, int_buf + 9 * 8, 4 * 8);
+
+ r1 = LD_SH(input + 8);
+ r6 = LD_SH(input + 6 * 8);
+ r9 = LD_SH(input + 9 * 8);
+ r14 = LD_SH(input + 14 * 8);
+
+ LD_SW2(const0 + 4 * 11, 4, k0, k1);
+ LD_SW2(const0 + 4 * 13, 4, k2, k3);
+ MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g0, g1, g2, g3);
+ ST_SH2(g1, g3, int_buf + 3 * 8, 4 * 8);
+
+ r2 = LD_SH(input + 2 * 8);
+ r5 = LD_SH(input + 5 * 8);
+ r10 = LD_SH(input + 10 * 8);
+ r13 = LD_SH(input + 13 * 8);
+
+ LD_SW2(const0 + 4 * 15, 4, k0, k1);
+ LD_SW2(const0 + 4 * 17, 4, k2, k3);
+ MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, h0, h1, h2, h3);
+ ST_SH2(h1, h3, int_buf + 11 * 8, 4 * 8);
+ BUTTERFLY_4(h0, h2, g2, g0, tp0, tp1, tp2, tp3);
+ ST_SH4(tp0, tp1, tp2, tp3, int_buf + 2 * 8, 4 * 8);
+}
+
+static void fadst16_rows_step2_msa(int16_t *int_buf, const int32_t *const0,
+ int16_t *out) {
+ int16_t *out_ptr = out + 8;
+ v8i16 tp0, tp1, tp2, tp3, g5, g7, g13, g15;
+ v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h10, h11;
+ v8i16 out0, out1, out2, out3, out4, out5, out6, out7;
+ v8i16 out8, out9, out10, out11, out12, out13, out14, out15;
+ v4i32 k0, k1, k2, k3;
+
+ g13 = LD_SH(int_buf + 3 * 8);
+ g15 = LD_SH(int_buf + 7 * 8);
+ g5 = LD_SH(int_buf + 11 * 8);
+ g7 = LD_SH(int_buf + 15 * 8);
+
+ LD_SW2(const0 + 4 * 19, 4, k0, k1);
+ k2 = LD_SW(const0 + 4 * 21);
+ MADD_BF(g7, g5, g15, g13, k0, k1, k2, k0, h4, h5, h6, h7);
+
+ tp0 = LD_SH(int_buf + 4 * 8);
+ tp1 = LD_SH(int_buf + 5 * 8);
+ tp3 = LD_SH(int_buf + 10 * 8);
+ tp2 = LD_SH(int_buf + 14 * 8);
+
+ LD_SW2(const0 + 4 * 22, 4, k0, k1);
+ k2 = LD_SW(const0 + 4 * 24);
+ MADD_BF(tp0, tp1, tp2, tp3, k0, k1, k2, k0, out4, out6, out5, out7);
+ out4 = -out4;
+ ST_SH(out4, (out + 3 * 16));
+ ST_SH(out5, (out_ptr + 4 * 16));
+
+ h1 = LD_SH(int_buf + 9 * 8);
+ h3 = LD_SH(int_buf + 12 * 8);
+ MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15);
+ out13 = -out13;
+ ST_SH(out12, (out + 2 * 16));
+ ST_SH(out13, (out_ptr + 5 * 16));
+
+ tp0 = LD_SH(int_buf);
+ tp1 = LD_SH(int_buf + 8);
+ tp2 = LD_SH(int_buf + 2 * 8);
+ tp3 = LD_SH(int_buf + 6 * 8);
+
+ BUTTERFLY_4(tp0, tp1, tp3, tp2, out0, out1, h11, h10);
+ out1 = -out1;
+ ST_SH(out0, (out));
+ ST_SH(out1, (out_ptr + 7 * 16));
+
+ h0 = LD_SH(int_buf + 8 * 8);
+ h2 = LD_SH(int_buf + 13 * 8);
+ BUTTERFLY_4(h0, h2, h6, h4, out8, out9, out11, out10);
+ out8 = -out8;
+ ST_SH(out8, (out + 16));
+ ST_SH(out9, (out_ptr + 6 * 16));
+
+ /* stage 4 */
+ LD_SW2(const0 + 4 * 25, 4, k0, k1);
+ LD_SW2(const0 + 4 * 27, 4, k2, k3);
+ MADD_SHORT(h10, h11, k1, k2, out2, out3);
+ ST_SH(out2, (out + 7 * 16));
+ ST_SH(out3, (out_ptr));
+
+ MADD_SHORT(out6, out7, k0, k3, out6, out7);
+ ST_SH(out6, (out + 4 * 16));
+ ST_SH(out7, (out_ptr + 3 * 16));
+
+ MADD_SHORT(out10, out11, k0, k3, out10, out11);
+ ST_SH(out10, (out + 6 * 16));
+ ST_SH(out11, (out_ptr + 16));
+
+ MADD_SHORT(out14, out15, k1, k2, out14, out15);
+ ST_SH(out14, (out + 5 * 16));
+ ST_SH(out15, (out_ptr + 2 * 16));
+}
+
+static void fadst16_transpose_msa(int16_t *input, int16_t *out) {
+ v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
+ v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15;
+
+ /* load input data */
+ LD_SH16(input, 8, l0, l8, l1, l9, l2, l10, l3, l11, l4, l12, l5, l13, l6, l14,
+ l7, l15);
+ TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, r0, r1, r2, r3, r4, r5, r6,
+ r7);
+ TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, r8, r9, r10, r11,
+ r12, r13, r14, r15);
+ ST_SH8(r0, r8, r1, r9, r2, r10, r3, r11, out, 8);
+ ST_SH8(r4, r12, r5, r13, r6, r14, r7, r15, (out + 64), 8);
+ out += 16 * 8;
+
+ /* load input data */
+ input += 128;
+ LD_SH16(input, 8, l0, l8, l1, l9, l2, l10, l3, l11, l4, l12, l5, l13, l6, l14,
+ l7, l15);
+ TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, r0, r1, r2, r3, r4, r5, r6,
+ r7);
+ TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, r8, r9, r10, r11,
+ r12, r13, r14, r15);
+ ST_SH8(r0, r8, r1, r9, r2, r10, r3, r11, out, 8);
+ ST_SH8(r4, r12, r5, r13, r6, r14, r7, r15, (out + 64), 8);
+}
+
+static void postproc_fdct16x8_1d_row(int16_t *intermediate, int16_t *output) {
+ int16_t *temp = intermediate;
+ int16_t *out = output;
+ v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11;
+ v8i16 in12, in13, in14, in15;
+
+ LD_SH8(temp, 16, in0, in1, in2, in3, in4, in5, in6, in7);
+ temp = intermediate + 8;
+ LD_SH8(temp, 16, in8, in9, in10, in11, in12, in13, in14, in15);
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
+ in10, in11, in12, in13, in14, in15);
+ FDCT_POSTPROC_2V_NEG_H(in0, in1);
+ FDCT_POSTPROC_2V_NEG_H(in2, in3);
+ FDCT_POSTPROC_2V_NEG_H(in4, in5);
+ FDCT_POSTPROC_2V_NEG_H(in6, in7);
+ FDCT_POSTPROC_2V_NEG_H(in8, in9);
+ FDCT_POSTPROC_2V_NEG_H(in10, in11);
+ FDCT_POSTPROC_2V_NEG_H(in12, in13);
+ FDCT_POSTPROC_2V_NEG_H(in14, in15);
+ BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11,
+ in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6,
+ tmp7, in8, in9, in10, in11, in12, in13, in14, in15);
+ temp = intermediate;
+ ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, temp, 16);
+ FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1,
+ tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
+ temp = intermediate;
+ LD_SH8(temp, 16, in8, in9, in10, in11, in12, in13, in14, in15);
+ FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ TRANSPOSE8x8_SH_SH(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, tmp0, in0,
+ tmp1, in1, tmp2, in2, tmp3, in3);
+ ST_SH8(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, out, 16);
+ TRANSPOSE8x8_SH_SH(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, tmp4, in4,
+ tmp5, in5, tmp6, in6, tmp7, in7);
+ out = output + 8;
+ ST_SH8(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, out, 16);
+}
+
+void vp9_fht16x16_msa(const int16_t *input, int16_t *output, int32_t stride,
+ int32_t tx_type) {
+ DECLARE_ALIGNED(32, int16_t, tmp[256]);
+ DECLARE_ALIGNED(32, int16_t, trans_buf[256]);
+ DECLARE_ALIGNED(32, int16_t, tmp_buf[128]);
+ int32_t i;
+ int16_t *ptmpbuf = &tmp_buf[0];
+ int16_t *trans = &trans_buf[0];
+ const int32_t const_arr[29 * 4] = {
+ 52707308, 52707308, 52707308, 52707308, -1072430300,
+ -1072430300, -1072430300, -1072430300, 795618043, 795618043,
+ 795618043, 795618043, -721080468, -721080468, -721080468,
+ -721080468, 459094491, 459094491, 459094491, 459094491,
+ -970646691, -970646691, -970646691, -970646691, 1010963856,
+ 1010963856, 1010963856, 1010963856, -361743294, -361743294,
+ -361743294, -361743294, 209469125, 209469125, 209469125,
+ 209469125, -1053094788, -1053094788, -1053094788, -1053094788,
+ 1053160324, 1053160324, 1053160324, 1053160324, 639644520,
+ 639644520, 639644520, 639644520, -862444000, -862444000,
+ -862444000, -862444000, 1062144356, 1062144356, 1062144356,
+ 1062144356, -157532337, -157532337, -157532337, -157532337,
+ 260914709, 260914709, 260914709, 260914709, -1041559667,
+ -1041559667, -1041559667, -1041559667, 920985831, 920985831,
+ 920985831, 920985831, -551995675, -551995675, -551995675,
+ -551995675, 596522295, 596522295, 596522295, 596522295,
+ 892853362, 892853362, 892853362, 892853362, -892787826,
+ -892787826, -892787826, -892787826, 410925857, 410925857,
+ 410925857, 410925857, -992012162, -992012162, -992012162,
+ -992012162, 992077698, 992077698, 992077698, 992077698,
+ 759246145, 759246145, 759246145, 759246145, -759180609,
+ -759180609, -759180609, -759180609, -759222975, -759222975,
+ -759222975, -759222975, 759288511, 759288511, 759288511,
+ 759288511
+ };
+
+ switch (tx_type) {
+ case DCT_DCT:
+ /* column transform */
+ for (i = 0; i < 2; ++i) {
+ fdct8x16_1d_column(input + 8 * i, tmp + 8 * i, stride);
+ }
+
+ /* row transform */
+ for (i = 0; i < 2; ++i) {
+ fdct16x8_1d_row(tmp + (128 * i), output + (128 * i));
+ }
+ break;
+ case ADST_DCT:
+ /* column transform */
+ for (i = 0; i < 2; ++i) {
+ fadst16_cols_step1_msa(input + (i << 3), stride, const_arr, ptmpbuf);
+ fadst16_cols_step2_msa(ptmpbuf, const_arr, tmp + (i << 3));
+ }
+
+ /* row transform */
+ for (i = 0; i < 2; ++i) {
+ postproc_fdct16x8_1d_row(tmp + (128 * i), output + (128 * i));
+ }
+ break;
+ case DCT_ADST:
+ /* column transform */
+ for (i = 0; i < 2; ++i) {
+ fdct8x16_1d_column(input + 8 * i, tmp + 8 * i, stride);
+ }
+
+ fadst16_transpose_postproc_msa(tmp, trans);
+
+ /* row transform */
+ for (i = 0; i < 2; ++i) {
+ fadst16_rows_step1_msa(trans + (i << 7), const_arr, ptmpbuf);
+ fadst16_rows_step2_msa(ptmpbuf, const_arr, tmp + (i << 7));
+ }
+
+ fadst16_transpose_msa(tmp, output);
+ break;
+ case ADST_ADST:
+ /* column transform */
+ for (i = 0; i < 2; ++i) {
+ fadst16_cols_step1_msa(input + (i << 3), stride, const_arr, ptmpbuf);
+ fadst16_cols_step2_msa(ptmpbuf, const_arr, tmp + (i << 3));
+ }
+
+ fadst16_transpose_postproc_msa(tmp, trans);
+
+ /* row transform */
+ for (i = 0; i < 2; ++i) {
+ fadst16_rows_step1_msa(trans + (i << 7), const_arr, ptmpbuf);
+ fadst16_rows_step2_msa(ptmpbuf, const_arr, tmp + (i << 7));
+ }
+
+ fadst16_transpose_msa(tmp, output);
+ break;
+ default: assert(0); break;
+ }
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct4x4_msa.c b/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct4x4_msa.c
new file mode 100644
index 0000000000..9c5cc12ef0
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct4x4_msa.c
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_enums.h"
+#include "vp9/encoder/mips/msa/vp9_fdct_msa.h"
+
+void vp9_fwht4x4_msa(const int16_t *input, int16_t *output,
+ int32_t src_stride) {
+ v8i16 in0, in1, in2, in3, in4;
+
+ LD_SH4(input, src_stride, in0, in1, in2, in3);
+
+ in0 += in1;
+ in3 -= in2;
+ in4 = (in0 - in3) >> 1;
+ SUB2(in4, in1, in4, in2, in1, in2);
+ in0 -= in2;
+ in3 += in1;
+
+ TRANSPOSE4x4_SH_SH(in0, in2, in3, in1, in0, in2, in3, in1);
+
+ in0 += in2;
+ in1 -= in3;
+ in4 = (in0 - in1) >> 1;
+ SUB2(in4, in2, in4, in3, in2, in3);
+ in0 -= in3;
+ in1 += in2;
+
+ SLLI_4V(in0, in1, in2, in3, 2);
+
+ TRANSPOSE4x4_SH_SH(in0, in3, in1, in2, in0, in3, in1, in2);
+
+ ST4x2_UB(in0, output, 4);
+ ST4x2_UB(in3, output + 4, 4);
+ ST4x2_UB(in1, output + 8, 4);
+ ST4x2_UB(in2, output + 12, 4);
+}
+
+void vp9_fht4x4_msa(const int16_t *input, int16_t *output, int32_t stride,
+ int32_t tx_type) {
+ v8i16 in0, in1, in2, in3;
+
+ LD_SH4(input, stride, in0, in1, in2, in3);
+
+ /* fdct4 pre-process */
+ {
+ v8i16 temp, mask;
+ v16i8 zero = { 0 };
+ v16i8 one = __msa_ldi_b(1);
+
+ mask = (v8i16)__msa_sldi_b(zero, one, 15);
+ SLLI_4V(in0, in1, in2, in3, 4);
+ temp = __msa_ceqi_h(in0, 0);
+ temp = (v8i16)__msa_xori_b((v16u8)temp, 255);
+ temp = mask & temp;
+ in0 += temp;
+ }
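+ // The block above is the vector form of the scalar 4x4 pre-processing
+ // (a sketch for illustration, not the exact C source):
+ //
+ //   for (i = 0; i < 16; ++i) in[i] *= 16;
+ //   if (in[0]) in[0] += 1;
+ //
+ // i.e. every input is scaled by 16 and the DC input gets a +1 bias when it
+ // is nonzero; the byte-shifted mask limits the +1 to lane 0 of in0.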
+
+ switch (tx_type) {
+ case DCT_DCT:
+ VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
+ TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+ VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
+ break;
+ case ADST_DCT:
+ VP9_FADST4(in0, in1, in2, in3, in0, in1, in2, in3);
+ TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+ VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
+ break;
+ case DCT_ADST:
+ VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
+ TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+ VP9_FADST4(in0, in1, in2, in3, in0, in1, in2, in3);
+ break;
+ case ADST_ADST:
+ VP9_FADST4(in0, in1, in2, in3, in0, in1, in2, in3);
+ TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+ VP9_FADST4(in0, in1, in2, in3, in0, in1, in2, in3);
+ break;
+ default: assert(0); break;
+ }
+
+ TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);
+ ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3);
+ SRA_4V(in0, in1, in2, in3, 2);
+ PCKEV_D2_SH(in1, in0, in3, in2, in0, in2);
+ ST_SH2(in0, in2, output, 8);
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct8x8_msa.c b/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct8x8_msa.c
new file mode 100644
index 0000000000..26d81aa9ef
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct8x8_msa.c
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_enums.h"
+#include "vp9/encoder/mips/msa/vp9_fdct_msa.h"
+
+void vp9_fht8x8_msa(const int16_t *input, int16_t *output, int32_t stride,
+ int32_t tx_type) {
+ v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
+
+ LD_SH8(input, stride, in0, in1, in2, in3, in4, in5, in6, in7);
+ SLLI_4V(in0, in1, in2, in3, 2);
+ SLLI_4V(in4, in5, in6, in7, 2);
+
+ switch (tx_type) {
+ case DCT_DCT:
+ VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
+ in5, in6, in7);
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2,
+ in3, in4, in5, in6, in7);
+ VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
+ in5, in6, in7);
+ break;
+ case ADST_DCT:
+ VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
+ in5, in6, in7);
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2,
+ in3, in4, in5, in6, in7);
+ VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
+ in5, in6, in7);
+ break;
+ case DCT_ADST:
+ VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
+ in5, in6, in7);
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2,
+ in3, in4, in5, in6, in7);
+ VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
+ in5, in6, in7);
+ break;
+ case ADST_ADST:
+ VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
+ in5, in6, in7);
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2,
+ in3, in4, in5, in6, in7);
+ VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
+ in5, in6, in7);
+ break;
+ default: assert(0); break;
+ }
+
+ TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7);
+ ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 8);
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct_msa.h b/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct_msa.h
new file mode 100644
index 0000000000..fa1af2fc57
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/mips/msa/vp9_fdct_msa.h
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_MIPS_MSA_VP9_FDCT_MSA_H_
+#define VPX_VP9_ENCODER_MIPS_MSA_VP9_FDCT_MSA_H_
+
+#include "vpx_dsp/mips/fwd_txfm_msa.h"
+#include "vpx_dsp/mips/txfm_macros_msa.h"
+#include "vpx_ports/mem.h"
+
+#define VP9_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
+ out3, out4, out5, out6, out7) \
+ { \
+ v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst4_m; \
+ v8i16 vec0_m, vec1_m, vec2_m, vec3_m, s0_m, s1_m; \
+ v8i16 coeff0_m = { cospi_2_64, cospi_6_64, cospi_10_64, cospi_14_64, \
+ cospi_18_64, cospi_22_64, cospi_26_64, cospi_30_64 }; \
+ v8i16 coeff1_m = { cospi_8_64, -cospi_8_64, cospi_16_64, -cospi_16_64, \
+ cospi_24_64, -cospi_24_64, 0, 0 }; \
+ \
+ SPLATI_H2_SH(coeff0_m, 0, 7, cnst0_m, cnst1_m); \
+ cnst2_m = -cnst0_m; \
+ ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \
+ SPLATI_H2_SH(coeff0_m, 4, 3, cnst2_m, cnst3_m); \
+ cnst4_m = -cnst2_m; \
+ ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \
+ \
+ ILVRL_H2_SH(in0, in7, vec1_m, vec0_m); \
+ ILVRL_H2_SH(in4, in3, vec3_m, vec2_m); \
+ DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst1_m, \
+ cnst2_m, cnst3_m, in7, in0, in4, in3); \
+ \
+ SPLATI_H2_SH(coeff0_m, 2, 5, cnst0_m, cnst1_m); \
+ cnst2_m = -cnst0_m; \
+ ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \
+ SPLATI_H2_SH(coeff0_m, 6, 1, cnst2_m, cnst3_m); \
+ cnst4_m = -cnst2_m; \
+ ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \
+ \
+ ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \
+ ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \
+ \
+ DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst1_m, \
+ cnst2_m, cnst3_m, in5, in2, in6, in1); \
+ BUTTERFLY_4(in7, in0, in2, in5, s1_m, s0_m, in2, in5); \
+ out7 = -s0_m; \
+ out0 = s1_m; \
+ \
+ SPLATI_H4_SH(coeff1_m, 0, 4, 1, 5, cnst0_m, cnst1_m, cnst2_m, cnst3_m); \
+ \
+ ILVEV_H2_SH(cnst3_m, cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst2_m); \
+ cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
+ cnst1_m = cnst0_m; \
+ \
+ ILVRL_H2_SH(in4, in3, vec1_m, vec0_m); \
+ ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \
+ DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst2_m, \
+ cnst3_m, cnst1_m, out1, out6, s0_m, s1_m); \
+ \
+ SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m); \
+ cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \
+ \
+ ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \
+ ILVRL_H2_SH(s0_m, s1_m, vec3_m, vec2_m); \
+ out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \
+ out4 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \
+ out2 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m); \
+ out5 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m); \
+ \
+ out1 = -out1; \
+ out3 = -out3; \
+ out5 = -out5; \
+ }
+
+#define VP9_FADST4(in0, in1, in2, in3, out0, out1, out2, out3) \
+ { \
+ v4i32 s0_m, s1_m, s2_m, s3_m, constant_m; \
+ v4i32 in0_r_m, in1_r_m, in2_r_m, in3_r_m; \
+ \
+ UNPCK_R_SH_SW(in0, in0_r_m); \
+ UNPCK_R_SH_SW(in1, in1_r_m); \
+ UNPCK_R_SH_SW(in2, in2_r_m); \
+ UNPCK_R_SH_SW(in3, in3_r_m); \
+ \
+ constant_m = __msa_fill_w(sinpi_4_9); \
+ MUL2(in0_r_m, constant_m, in3_r_m, constant_m, s1_m, s0_m); \
+ \
+ constant_m = __msa_fill_w(sinpi_1_9); \
+ s0_m += in0_r_m * constant_m; \
+ s1_m -= in1_r_m * constant_m; \
+ \
+ constant_m = __msa_fill_w(sinpi_2_9); \
+ s0_m += in1_r_m * constant_m; \
+ s1_m += in3_r_m * constant_m; \
+ \
+ s2_m = in0_r_m + in1_r_m - in3_r_m; \
+ \
+ constant_m = __msa_fill_w(sinpi_3_9); \
+ MUL2(in2_r_m, constant_m, s2_m, constant_m, s3_m, in1_r_m); \
+ \
+ in0_r_m = s0_m + s3_m; \
+ s2_m = s1_m - s3_m; \
+ s3_m = s1_m - s0_m + s3_m; \
+ \
+ SRARI_W4_SW(in0_r_m, in1_r_m, s2_m, s3_m, DCT_CONST_BITS); \
+ PCKEV_H4_SH(in0_r_m, in0_r_m, in1_r_m, in1_r_m, s2_m, s2_m, s3_m, s3_m, \
+ out0, out1, out2, out3); \
+ }
+#endif // VPX_VP9_ENCODER_MIPS_MSA_VP9_FDCT_MSA_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/ppc/vp9_quantize_vsx.c b/media/libvpx/libvpx/vp9/encoder/ppc/vp9_quantize_vsx.c
new file mode 100644
index 0000000000..4d31558471
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/ppc/vp9_quantize_vsx.c
@@ -0,0 +1,287 @@
+/*
+ * Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+
+#include "./vp9_rtcd.h"
+#include "vpx_dsp/ppc/types_vsx.h"
+
+// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit
+// integers, and return the high 16 bits of the intermediate integers.
+// (a * b) >> 16
+// Note: because this is done in 2 operations, a and b cannot both be INT16_MIN.
+static INLINE int16x8_t vec_mulhi(int16x8_t a, int16x8_t b) {
+ // madds does ((A * B) >> 15) + C, we need >> 16, so we perform an extra right
+ // shift.
+ return vec_sra(vec_madds(a, b, vec_zeros_s16), vec_ones_u16);
+}
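+// A quick check of the shift arithmetic (illustrative values only): with
+// a = b = 16384 (0x4000), a * b = 1 << 28, so (a * b) >> 16 = 4096.
+// vec_madds(a, b, 0) yields (1 << 28) >> 15 = 8192, and the extra arithmetic
+// shift right by one brings that back to 4096, matching the intended >> 16.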
+
+// Negate 16-bit integers in a when the corresponding signed 16-bit
+// integer in b is negative.
+static INLINE int16x8_t vec_sign(int16x8_t a, int16x8_t b) {
+ const int16x8_t mask = vec_sra(b, vec_shift_sign_s16);
+ return vec_xor(vec_add(a, mask), mask);
+}
+
+// Compare packed 16-bit integers across a and return a vector in which every
+// element holds the maximum value found across a.
+static INLINE int16x8_t vec_max_across(int16x8_t a) {
+ a = vec_max(a, vec_perm(a, a, vec_perm64));
+ a = vec_max(a, vec_perm(a, a, vec_perm32));
+ return vec_max(a, vec_perm(a, a, vec_perm16));
+}
+
+void vp9_quantize_fp_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ int16x8_t qcoeff0, qcoeff1, dqcoeff0, dqcoeff1, eob;
+ bool16x8_t zero_coeff0, zero_coeff1;
+
+ int16x8_t round = vec_vsx_ld(0, round_ptr);
+ int16x8_t quant = vec_vsx_ld(0, quant_ptr);
+ int16x8_t dequant = vec_vsx_ld(0, dequant_ptr);
+ int16x8_t coeff0 = vec_vsx_ld(0, coeff_ptr);
+ int16x8_t coeff1 = vec_vsx_ld(16, coeff_ptr);
+ int16x8_t scan0 = vec_vsx_ld(0, iscan);
+ int16x8_t scan1 = vec_vsx_ld(16, iscan);
+
+ (void)scan;
+
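+ // The vector code below mirrors the per-coefficient scalar logic of the
+ // fast-path quantizer (a sketch for illustration, not the exact C source):
+ //
+ //   tmp = saturate_int16(abs(coeff) + round);
+ //   qcoeff = sign(coeff) * ((tmp * quant) >> 16);
+ //   dqcoeff = qcoeff * dequant;
+ //
+ // where the DC entries of round/quant/dequant apply only to the first
+ // coefficient and the AC entries to all the others.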
+ // First set of 8 coeff starts with DC + 7 AC
+ qcoeff0 = vec_mulhi(vec_vaddshs(vec_abs(coeff0), round), quant);
+ zero_coeff0 = vec_cmpeq(qcoeff0, vec_zeros_s16);
+ qcoeff0 = vec_sign(qcoeff0, coeff0);
+ vec_vsx_st(qcoeff0, 0, qcoeff_ptr);
+
+ dqcoeff0 = vec_mladd(qcoeff0, dequant, vec_zeros_s16);
+ vec_vsx_st(dqcoeff0, 0, dqcoeff_ptr);
+
+ // Remove DC value from round and quant
+ round = vec_splat(round, 1);
+ quant = vec_splat(quant, 1);
+
+ // Remove DC value from dequant
+ dequant = vec_splat(dequant, 1);
+
+ // Second set of 8 coeff (all AC)
+ qcoeff1 = vec_mulhi(vec_vaddshs(vec_abs(coeff1), round), quant);
+ zero_coeff1 = vec_cmpeq(qcoeff1, vec_zeros_s16);
+ qcoeff1 = vec_sign(qcoeff1, coeff1);
+ vec_vsx_st(qcoeff1, 16, qcoeff_ptr);
+
+ dqcoeff1 = vec_mladd(qcoeff1, dequant, vec_zeros_s16);
+ vec_vsx_st(dqcoeff1, 16, dqcoeff_ptr);
+
+ eob = vec_max(vec_or(scan0, zero_coeff0), vec_or(scan1, zero_coeff1));
+
+ // We quantize 16 coeff up front (enough for a 4x4) and process 24 coeff per
+ // loop iteration.
+ // for 8x8: 16 + 2 x 24 = 64
+ // for 16x16: 16 + 10 x 24 = 256
+ if (n_coeffs > 16) {
+ int16x8_t coeff2, qcoeff2, dqcoeff2, eob2, scan2;
+ bool16x8_t zero_coeff2;
+
+ int index = 16;
+ int off0 = 32;
+ int off1 = 48;
+ int off2 = 64;
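+ // off0/off1/off2 are byte offsets into arrays of 16-bit coefficients, so
+ // 32/48/64 bytes address coefficients 16, 24 and 32; each iteration below
+ // advances all three offsets by 48 bytes (24 coefficients).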
+
+ do {
+ coeff0 = vec_vsx_ld(off0, coeff_ptr);
+ coeff1 = vec_vsx_ld(off1, coeff_ptr);
+ coeff2 = vec_vsx_ld(off2, coeff_ptr);
+ scan0 = vec_vsx_ld(off0, iscan);
+ scan1 = vec_vsx_ld(off1, iscan);
+ scan2 = vec_vsx_ld(off2, iscan);
+
+ qcoeff0 = vec_mulhi(vec_vaddshs(vec_abs(coeff0), round), quant);
+ zero_coeff0 = vec_cmpeq(qcoeff0, vec_zeros_s16);
+ qcoeff0 = vec_sign(qcoeff0, coeff0);
+ vec_vsx_st(qcoeff0, off0, qcoeff_ptr);
+ dqcoeff0 = vec_mladd(qcoeff0, dequant, vec_zeros_s16);
+ vec_vsx_st(dqcoeff0, off0, dqcoeff_ptr);
+
+ qcoeff1 = vec_mulhi(vec_vaddshs(vec_abs(coeff1), round), quant);
+ zero_coeff1 = vec_cmpeq(qcoeff1, vec_zeros_s16);
+ qcoeff1 = vec_sign(qcoeff1, coeff1);
+ vec_vsx_st(qcoeff1, off1, qcoeff_ptr);
+ dqcoeff1 = vec_mladd(qcoeff1, dequant, vec_zeros_s16);
+ vec_vsx_st(dqcoeff1, off1, dqcoeff_ptr);
+
+ qcoeff2 = vec_mulhi(vec_vaddshs(vec_abs(coeff2), round), quant);
+ zero_coeff2 = vec_cmpeq(qcoeff2, vec_zeros_s16);
+ qcoeff2 = vec_sign(qcoeff2, coeff2);
+ vec_vsx_st(qcoeff2, off2, qcoeff_ptr);
+ dqcoeff2 = vec_mladd(qcoeff2, dequant, vec_zeros_s16);
+ vec_vsx_st(dqcoeff2, off2, dqcoeff_ptr);
+
+ eob = vec_max(eob, vec_or(scan0, zero_coeff0));
+ eob2 = vec_max(vec_or(scan1, zero_coeff1), vec_or(scan2, zero_coeff2));
+ eob = vec_max(eob, eob2);
+
+ index += 24;
+ off0 += 48;
+ off1 += 48;
+ off2 += 48;
+ } while (index < n_coeffs);
+ }
+
+ eob = vec_max_across(eob);
+ *eob_ptr = eob[0] + 1;
+}
+
+// Sets the value of each 32-bit integer to 1 when the corresponding value in a
+// is negative, and to 0 otherwise.
+static INLINE int32x4_t vec_is_neg(int32x4_t a) {
+ return vec_sr(a, vec_shift_sign_s32);
+}
+
+// DeQuantization function used for 32x32 blocks. Quantized coeff of 32x32
+// blocks are twice as big as for other block sizes. As such, using
+// vec_mladd results in overflow.
+static INLINE int16x8_t dequantize_coeff_32(int16x8_t qcoeff,
+ int16x8_t dequant) {
+ int32x4_t dqcoeffe = vec_mule(qcoeff, dequant);
+ int32x4_t dqcoeffo = vec_mulo(qcoeff, dequant);
+ // Add 1 if negative to round towards zero because the C code uses division.
+ dqcoeffe = vec_add(dqcoeffe, vec_is_neg(dqcoeffe));
+ dqcoeffo = vec_add(dqcoeffo, vec_is_neg(dqcoeffo));
+ dqcoeffe = vec_sra(dqcoeffe, vec_ones_u32);
+ dqcoeffo = vec_sra(dqcoeffo, vec_ones_u32);
+ return (int16x8_t)vec_perm(dqcoeffe, dqcoeffo, vec_perm_odd_even_pack);
+}
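+// Worked example of the round-toward-zero correction (illustrative values
+// only): for a product of -15, the C code's integer division gives
+// -15 / 2 = -7, while a plain arithmetic shift gives -15 >> 1 = -8; adding 1
+// to the negative product first (-15 + 1 = -14, then >> 1) restores -7.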
+
+void vp9_quantize_fp_32x32_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ // In stage 1, we quantize 16 coeffs (DC + 15 AC)
+ // In stage 2, we loop 42 times and quantize 24 coeffs per iteration
+ // (32 * 32 - 16) / 24 = 42
+ int num_itr = 42;
+ // Offsets are in bytes, 16 coeffs = 32 bytes
+ int off0 = 32;
+ int off1 = 48;
+ int off2 = 64;
+
+ int16x8_t qcoeff0, qcoeff1, dqcoeff0, dqcoeff1, eob;
+ bool16x8_t mask0, mask1, zero_coeff0, zero_coeff1;
+
+ int16x8_t round = vec_vsx_ld(0, round_ptr);
+ int16x8_t quant = vec_vsx_ld(0, quant_ptr);
+ int16x8_t dequant = vec_vsx_ld(0, dequant_ptr);
+ int16x8_t coeff0 = vec_vsx_ld(0, coeff_ptr);
+ int16x8_t coeff1 = vec_vsx_ld(16, coeff_ptr);
+ int16x8_t scan0 = vec_vsx_ld(0, iscan);
+ int16x8_t scan1 = vec_vsx_ld(16, iscan);
+ int16x8_t thres = vec_sra(dequant, vec_splats((uint16_t)2));
+ int16x8_t abs_coeff0 = vec_abs(coeff0);
+ int16x8_t abs_coeff1 = vec_abs(coeff1);
+
+ (void)scan;
+ (void)n_coeffs;
+
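+ // The 32x32 path differs from the regular fast-path quantizer in three ways
+ // (a sketch of the matching scalar logic, for illustration only):
+ // coefficients with abs(coeff) < dequant / 4 are zeroed outright (the
+ // vec_cmpge masks), the rounding term is halved (round = (round + 1) >> 1),
+ // and the multiply keeps the >> 15 result of vec_madds because 32x32
+ // quantized values are scaled up by a factor of 2.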
+ mask0 = vec_cmpge(abs_coeff0, thres);
+ round = vec_sra(vec_add(round, vec_ones_s16), vec_ones_u16);
+ // First set of 8 coeff starts with DC + 7 AC
+ qcoeff0 = vec_madds(vec_vaddshs(abs_coeff0, round), quant, vec_zeros_s16);
+ qcoeff0 = vec_and(qcoeff0, mask0);
+ zero_coeff0 = vec_cmpeq(qcoeff0, vec_zeros_s16);
+ qcoeff0 = vec_sign(qcoeff0, coeff0);
+ vec_vsx_st(qcoeff0, 0, qcoeff_ptr);
+
+ dqcoeff0 = dequantize_coeff_32(qcoeff0, dequant);
+ vec_vsx_st(dqcoeff0, 0, dqcoeff_ptr);
+
+ // Remove DC value from thres, round, quant and dequant
+ thres = vec_splat(thres, 1);
+ round = vec_splat(round, 1);
+ quant = vec_splat(quant, 1);
+ dequant = vec_splat(dequant, 1);
+
+ mask1 = vec_cmpge(abs_coeff1, thres);
+
+ // Second set of 8 coeff (all AC)
+ qcoeff1 =
+ vec_madds(vec_vaddshs(vec_abs(coeff1), round), quant, vec_zeros_s16);
+ qcoeff1 = vec_and(qcoeff1, mask1);
+ zero_coeff1 = vec_cmpeq(qcoeff1, vec_zeros_s16);
+ qcoeff1 = vec_sign(qcoeff1, coeff1);
+ vec_vsx_st(qcoeff1, 16, qcoeff_ptr);
+
+ dqcoeff1 = dequantize_coeff_32(qcoeff1, dequant);
+ vec_vsx_st(dqcoeff1, 16, dqcoeff_ptr);
+
+ eob = vec_max(vec_or(scan0, zero_coeff0), vec_or(scan1, zero_coeff1));
+
+ do {
+ int16x8_t coeff2, abs_coeff2, qcoeff2, dqcoeff2, eob2, scan2;
+ bool16x8_t zero_coeff2, mask2;
+ coeff0 = vec_vsx_ld(off0, coeff_ptr);
+ coeff1 = vec_vsx_ld(off1, coeff_ptr);
+ coeff2 = vec_vsx_ld(off2, coeff_ptr);
+ scan0 = vec_vsx_ld(off0, iscan);
+ scan1 = vec_vsx_ld(off1, iscan);
+ scan2 = vec_vsx_ld(off2, iscan);
+
+ abs_coeff0 = vec_abs(coeff0);
+ abs_coeff1 = vec_abs(coeff1);
+ abs_coeff2 = vec_abs(coeff2);
+
+ qcoeff0 = vec_madds(vec_vaddshs(abs_coeff0, round), quant, vec_zeros_s16);
+ qcoeff1 = vec_madds(vec_vaddshs(abs_coeff1, round), quant, vec_zeros_s16);
+ qcoeff2 = vec_madds(vec_vaddshs(abs_coeff2, round), quant, vec_zeros_s16);
+
+ mask0 = vec_cmpge(abs_coeff0, thres);
+ mask1 = vec_cmpge(abs_coeff1, thres);
+ mask2 = vec_cmpge(abs_coeff2, thres);
+
+ qcoeff0 = vec_and(qcoeff0, mask0);
+ qcoeff1 = vec_and(qcoeff1, mask1);
+ qcoeff2 = vec_and(qcoeff2, mask2);
+
+ zero_coeff0 = vec_cmpeq(qcoeff0, vec_zeros_s16);
+ zero_coeff1 = vec_cmpeq(qcoeff1, vec_zeros_s16);
+ zero_coeff2 = vec_cmpeq(qcoeff2, vec_zeros_s16);
+
+ qcoeff0 = vec_sign(qcoeff0, coeff0);
+ qcoeff1 = vec_sign(qcoeff1, coeff1);
+ qcoeff2 = vec_sign(qcoeff2, coeff2);
+
+ vec_vsx_st(qcoeff0, off0, qcoeff_ptr);
+ vec_vsx_st(qcoeff1, off1, qcoeff_ptr);
+ vec_vsx_st(qcoeff2, off2, qcoeff_ptr);
+
+ dqcoeff0 = dequantize_coeff_32(qcoeff0, dequant);
+ dqcoeff1 = dequantize_coeff_32(qcoeff1, dequant);
+ dqcoeff2 = dequantize_coeff_32(qcoeff2, dequant);
+
+ vec_vsx_st(dqcoeff0, off0, dqcoeff_ptr);
+ vec_vsx_st(dqcoeff1, off1, dqcoeff_ptr);
+ vec_vsx_st(dqcoeff2, off2, dqcoeff_ptr);
+
+ eob = vec_max(eob, vec_or(scan0, zero_coeff0));
+ eob2 = vec_max(vec_or(scan1, zero_coeff1), vec_or(scan2, zero_coeff2));
+ eob = vec_max(eob, eob2);
+
+ off0 += 48;
+ off1 += 48;
+ off2 += 48;
+ num_itr--;
+ } while (num_itr != 0);
+
+ eob = vec_max_across(eob);
+ *eob_ptr = eob[0] + 1;
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_alt_ref_aq.c b/media/libvpx/libvpx/vp9/encoder/vp9_alt_ref_aq.c
new file mode 100644
index 0000000000..acc3764c7a
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_alt_ref_aq.c
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file in the root of the source tree. An additional
+ * intellectual property rights grant can be found in the file PATENTS.
+ * All contributing project authors may be found in the AUTHORS file in
+ * the root of the source tree.
+ */
+
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_alt_ref_aq.h"
+
+struct ALT_REF_AQ {
+ int dummy;
+};
+
+struct ALT_REF_AQ *vp9_alt_ref_aq_create(void) {
+ return (struct ALT_REF_AQ *)vpx_malloc(sizeof(struct ALT_REF_AQ));
+}
+
+void vp9_alt_ref_aq_destroy(struct ALT_REF_AQ *const self) { vpx_free(self); }
+
+void vp9_alt_ref_aq_upload_map(struct ALT_REF_AQ *const self,
+ const struct MATX_8U *segmentation_map) {
+ (void)self;
+ (void)segmentation_map;
+}
+
+void vp9_alt_ref_aq_set_nsegments(struct ALT_REF_AQ *const self,
+ int nsegments) {
+ (void)self;
+ (void)nsegments;
+}
+
+void vp9_alt_ref_aq_setup_mode(struct ALT_REF_AQ *const self,
+ struct VP9_COMP *const cpi) {
+ (void)cpi;
+ (void)self;
+}
+
+// Set the basic segmentation to the altref frame's segmentation map.
+void vp9_alt_ref_aq_setup_map(struct ALT_REF_AQ *const self,
+ struct VP9_COMP *const cpi) {
+ (void)cpi;
+ (void)self;
+}
+
+// restore cpi->aq_mode
+void vp9_alt_ref_aq_unset_all(struct ALT_REF_AQ *const self,
+ struct VP9_COMP *const cpi) {
+ (void)cpi;
+ (void)self;
+}
+
+int vp9_alt_ref_aq_disable_if(const struct ALT_REF_AQ *self,
+ int segmentation_overhead, int bandwidth) {
+ (void)bandwidth;
+ (void)self;
+ (void)segmentation_overhead;
+
+ return 0;
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_alt_ref_aq.h b/media/libvpx/libvpx/vp9/encoder/vp9_alt_ref_aq.h
new file mode 100644
index 0000000000..22a657e035
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_alt_ref_aq.h
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file in the root of the source tree. An additional
+ * intellectual property rights grant can be found in the file PATENTS.
+ * All contributing project authors may be found in the AUTHORS file in
+ * the root of the source tree.
+ */
+
+/*
+ * \file vp9_alt_ref_aq.h
+ *
+ * This file contains the public interface for setting up adaptive segmentation
+ * for altref frames. See alt_ref_aq_private.h for implementation details.
+ */
+
+#ifndef VPX_VP9_ENCODER_VP9_ALT_REF_AQ_H_
+#define VPX_VP9_ENCODER_VP9_ALT_REF_AQ_H_
+
+#include "vpx/vpx_integer.h"
+
+// Bitrate boundary for disabling segmentation
+#define ALT_REF_AQ_LOW_BITRATE_BOUNDARY 150
+
+// The last frame always has overall quality = 0,
+// so it is questionable whether it can be processed.
+#define ALT_REF_AQ_APPLY_TO_LAST_FRAME 1
+
+// Whether to compare the gain
+// against the segmentation overhead
+#define ALT_REF_AQ_PROTECT_GAIN 0
+
+// Threshold to disable segmentation
+#define ALT_REF_AQ_PROTECT_GAIN_THRESH 0.5
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Simple structure for storing images
+struct MATX_8U {
+ int rows;
+ int cols;
+ int stride;
+
+ uint8_t *data;
+};
+
+struct VP9_COMP;
+struct ALT_REF_AQ;
+
+/*!\brief Constructor
+ *
+ * \return Instance of the class
+ */
+struct ALT_REF_AQ *vp9_alt_ref_aq_create(void);
+
+/*!\brief Upload segmentation_map to self object
+ *
+ * \param self Instance of the class
+ * \param segmentation_map Segmentation map to upload
+ */
+void vp9_alt_ref_aq_upload_map(struct ALT_REF_AQ *const self,
+ const struct MATX_8U *segmentation_map);
+
+/*!\brief Return pointer to the altref segmentation map
+ *
+ * \param self Instance of the class
+ * \param segmentation_overhead Segmentation overhead in bytes
+ * \param bandwidth Current frame bandwidth in bytes
+ *
+ * \return Boolean value to disable segmentation
+ */
+int vp9_alt_ref_aq_disable_if(const struct ALT_REF_AQ *self,
+ int segmentation_overhead, int bandwidth);
+
+/*!\brief Set number of segments
+ *
+ * It is used for delta quantizer computations
+ * and thus it can be larger than
+ * maximum value of the segmentation map
+ *
+ * \param self Instance of the class
+ * \param nsegments Maximum number of segments
+ */
+void vp9_alt_ref_aq_set_nsegments(struct ALT_REF_AQ *const self, int nsegments);
+
+/*!\brief Set up LOOKAHEAD_AQ segmentation mode
+ *
+ * Set up segmentation mode to LOOKAHEAD_AQ
+ * (expected future frames prediction
+ * quality referring to the current frame).
+ *
+ * \param self Instance of the class
+ * \param cpi Encoder context
+ */
+void vp9_alt_ref_aq_setup_mode(struct ALT_REF_AQ *const self,
+ struct VP9_COMP *const cpi);
+
+/*!\brief Set up LOOKAHEAD_AQ segmentation map and delta quantizers
+ *
+ * \param self Instance of the class
+ * \param cpi Encoder context
+ */
+void vp9_alt_ref_aq_setup_map(struct ALT_REF_AQ *const self,
+ struct VP9_COMP *const cpi);
+
+/*!\brief Restore main segmentation map mode and reset the class variables
+ *
+ * \param self Instance of the class
+ * \param cpi Encoder context
+ */
+void vp9_alt_ref_aq_unset_all(struct ALT_REF_AQ *const self,
+ struct VP9_COMP *const cpi);
+
+/*!\brief Destructor
+ *
+ * \param self Instance of the class
+ */
+void vp9_alt_ref_aq_destroy(struct ALT_REF_AQ *const self);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP9_ENCODER_VP9_ALT_REF_AQ_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_aq_360.c b/media/libvpx/libvpx/vp9/encoder/vp9_aq_360.c
new file mode 100644
index 0000000000..dba017ffcc
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_aq_360.c
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+
+#include "vpx_ports/mem.h"
+#include "vpx_ports/system_state.h"
+
+#include "vp9/encoder/vp9_aq_360.h"
+#include "vp9/encoder/vp9_aq_variance.h"
+
+#include "vp9/common/vp9_seg_common.h"
+
+#include "vp9/encoder/vp9_ratectrl.h"
+#include "vp9/encoder/vp9_rd.h"
+#include "vp9/encoder/vp9_segmentation.h"
+
+static const double rate_ratio[MAX_SEGMENTS] = { 1.0, 0.75, 0.6, 0.5,
+ 0.4, 0.3, 0.25 };
+
+// Sets segment id 0 for the equatorial region, 1 for the temperate regions
+// and 2 for the polar regions.
+unsigned int vp9_360aq_segment_id(int mi_row, int mi_rows) {
+ if (mi_row < mi_rows / 8 || mi_row > mi_rows - mi_rows / 8)
+ return 2;
+ else if (mi_row < mi_rows / 4 || mi_row > mi_rows - mi_rows / 4)
+ return 1;
+ else
+ return 0;
+}
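+// As a rough illustration: for a 1080p frame mi_rows is about 135 (8x8
+// mode-info units), so rows [0, 15] and [120, 134] land in the polar
+// segment (2), rows [16, 32] and [103, 119] in the temperate segment (1),
+// and the remaining equatorial band in segment 0.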
+
+void vp9_360aq_frame_setup(VP9_COMP *cpi) {
+ VP9_COMMON *cm = &cpi->common;
+ struct segmentation *seg = &cm->seg;
+ int i;
+
+ if (frame_is_intra_only(cm) || cpi->force_update_segmentation ||
+ cm->error_resilient_mode) {
+ vp9_enable_segmentation(seg);
+ vp9_clearall_segfeatures(seg);
+
+ seg->abs_delta = SEGMENT_DELTADATA;
+
+ vpx_clear_system_state();
+
+ for (i = 0; i < MAX_SEGMENTS; ++i) {
+ int qindex_delta =
+ vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, cm->base_qindex,
+ rate_ratio[i], cm->bit_depth);
+
+ // We don't allow qindex 0 in a segment if the base value is not 0.
+ // Q index 0 (lossless) implies 4x4 encoding only and in AQ mode a segment
+ // Q delta is sometimes applied without going back around the rd loop.
+ // This could lead to an illegal combination of partition size and q.
+ if ((cm->base_qindex != 0) && ((cm->base_qindex + qindex_delta) == 0)) {
+ qindex_delta = -cm->base_qindex + 1;
+ }
+
+ // No need to enable SEG_LVL_ALT_Q for this segment.
+ if (rate_ratio[i] == 1.0) {
+ continue;
+ }
+
+ vp9_set_segdata(seg, i, SEG_LVL_ALT_Q, qindex_delta);
+ vp9_enable_segfeature(seg, i, SEG_LVL_ALT_Q);
+ }
+ }
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_aq_360.h b/media/libvpx/libvpx/vp9/encoder/vp9_aq_360.h
new file mode 100644
index 0000000000..749d3c198a
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_aq_360.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_VP9_AQ_360_H_
+#define VPX_VP9_ENCODER_VP9_AQ_360_H_
+
+#include "vp9/encoder/vp9_encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+unsigned int vp9_360aq_segment_id(int mi_row, int mi_rows);
+void vp9_360aq_frame_setup(VP9_COMP *cpi);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP9_ENCODER_VP9_AQ_360_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_aq_complexity.c b/media/libvpx/libvpx/vp9/encoder/vp9_aq_complexity.c
new file mode 100644
index 0000000000..bd3812036c
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_aq_complexity.c
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <limits.h>
+#include <math.h>
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_ports/system_state.h"
+
+#include "vp9/encoder/vp9_aq_complexity.h"
+#include "vp9/encoder/vp9_aq_variance.h"
+#include "vp9/encoder/vp9_encodeframe.h"
+#include "vp9/common/vp9_seg_common.h"
+#include "vp9/encoder/vp9_segmentation.h"
+
+#define AQ_C_SEGMENTS 5
+#define DEFAULT_AQ2_SEG 3 // Neutral Q segment
+#define AQ_C_STRENGTHS 3
+static const double aq_c_q_adj_factor[AQ_C_STRENGTHS][AQ_C_SEGMENTS] = {
+ { 1.75, 1.25, 1.05, 1.00, 0.90 },
+ { 2.00, 1.50, 1.15, 1.00, 0.85 },
+ { 2.50, 1.75, 1.25, 1.00, 0.80 }
+};
+static const double aq_c_transitions[AQ_C_STRENGTHS][AQ_C_SEGMENTS] = {
+ { 0.15, 0.30, 0.55, 2.00, 100.0 },
+ { 0.20, 0.40, 0.65, 2.00, 100.0 },
+ { 0.25, 0.50, 0.75, 2.00, 100.0 }
+};
+static const double aq_c_var_thresholds[AQ_C_STRENGTHS][AQ_C_SEGMENTS] = {
+ { -4.0, -3.0, -2.0, 100.00, 100.0 },
+ { -3.5, -2.5, -1.5, 100.00, 100.0 },
+ { -3.0, -2.0, -1.0, 100.00, 100.0 }
+};
+
+static int get_aq_c_strength(int q_index, vpx_bit_depth_t bit_depth) {
+ // Approximate base quatizer (truncated to int)
+ const int base_quant = vp9_ac_quant(q_index, 0, bit_depth) / 4;
+ return (base_quant > 10) + (base_quant > 25);
+}
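+// Illustration of the mapping (example values only): a truncated base
+// quantizer of 8 gives strength 0, 20 gives strength 1 and 30 gives
+// strength 2, selecting progressively stronger rows of the tables above.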
+
+void vp9_setup_in_frame_q_adj(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ struct segmentation *const seg = &cm->seg;
+
+ // Make SURE use of floating point in this function is safe.
+ vpx_clear_system_state();
+
+ if (frame_is_intra_only(cm) || cm->error_resilient_mode ||
+ cpi->refresh_alt_ref_frame || cpi->force_update_segmentation ||
+ (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
+ int segment;
+ const int aq_strength = get_aq_c_strength(cm->base_qindex, cm->bit_depth);
+
+ // Clear down the segment map.
+ memset(cpi->segmentation_map, DEFAULT_AQ2_SEG, cm->mi_rows * cm->mi_cols);
+
+ vp9_clearall_segfeatures(seg);
+
+ // Segmentation only makes sense if the target bits per SB is above a
+ // threshold. Below this the overheads will usually outweigh any benefit.
+ if (cpi->rc.sb64_target_rate < 256) {
+ vp9_disable_segmentation(seg);
+ return;
+ }
+
+ vp9_enable_segmentation(seg);
+
+ // Select delta coding method.
+ seg->abs_delta = SEGMENT_DELTADATA;
+
+ // Default segment "Q" feature is disabled so it defaults to the baseline Q.
+ vp9_disable_segfeature(seg, DEFAULT_AQ2_SEG, SEG_LVL_ALT_Q);
+
+ // Use some of the segments for in frame Q adjustment.
+ for (segment = 0; segment < AQ_C_SEGMENTS; ++segment) {
+ int qindex_delta;
+
+ if (segment == DEFAULT_AQ2_SEG) continue;
+
+ qindex_delta = vp9_compute_qdelta_by_rate(
+ &cpi->rc, cm->frame_type, cm->base_qindex,
+ aq_c_q_adj_factor[aq_strength][segment], cm->bit_depth);
+
+ // For AQ complexity mode, we don't allow Q0 in a segment if the base
+ // Q is not 0. Q0 (lossless) implies 4x4 only and in AQ mode 2 a segment
+ // Q delta is sometimes applied without going back around the rd loop.
+ // This could lead to an illegal combination of partition size and q.
+ if ((cm->base_qindex != 0) && ((cm->base_qindex + qindex_delta) == 0)) {
+ qindex_delta = -cm->base_qindex + 1;
+ }
+ if ((cm->base_qindex + qindex_delta) > 0) {
+ vp9_enable_segfeature(seg, segment, SEG_LVL_ALT_Q);
+ vp9_set_segdata(seg, segment, SEG_LVL_ALT_Q, qindex_delta);
+ }
+ }
+ }
+}
+
+#define DEFAULT_LV_THRESH 10.0
+#define MIN_DEFAULT_LV_THRESH 8.0
+// Select a segment for the current block.
+// The choice of segment for a block depends on the ratio of the projected
+// bits for the block vs a target average and its spatial complexity.
+void vp9_caq_select_segment(VP9_COMP *cpi, MACROBLOCK *mb, BLOCK_SIZE bs,
+ int mi_row, int mi_col, int projected_rate) {
+ VP9_COMMON *const cm = &cpi->common;
+
+ const int mi_offset = mi_row * cm->mi_cols + mi_col;
+ const int bw = num_8x8_blocks_wide_lookup[BLOCK_64X64];
+ const int bh = num_8x8_blocks_high_lookup[BLOCK_64X64];
+ const int xmis = VPXMIN(cm->mi_cols - mi_col, num_8x8_blocks_wide_lookup[bs]);
+ const int ymis = VPXMIN(cm->mi_rows - mi_row, num_8x8_blocks_high_lookup[bs]);
+ int x, y;
+ int i;
+ unsigned char segment;
+
+ if (0) {
+ segment = DEFAULT_AQ2_SEG;
+ } else {
+ // Rate depends on fraction of a SB64 in frame ((xmis * ymis) / (bw * bh)).
+ // It is converted to bits * 256 units.
+ const int target_rate =
+ (cpi->rc.sb64_target_rate * xmis * ymis * 256) / (bw * bh);
+ double logvar;
+ double low_var_thresh;
+ const int aq_strength = get_aq_c_strength(cm->base_qindex, cm->bit_depth);
+
+ vpx_clear_system_state();
+ low_var_thresh = (cpi->oxcf.pass == 2) ? VPXMAX(cpi->twopass.mb_av_energy,
+ MIN_DEFAULT_LV_THRESH)
+ : DEFAULT_LV_THRESH;
+
+ vp9_setup_src_planes(mb, cpi->Source, mi_row, mi_col);
+ logvar = vp9_log_block_var(cpi, mb, bs);
+
+ segment = AQ_C_SEGMENTS - 1; // Just in case no break out below.
+ for (i = 0; i < AQ_C_SEGMENTS; ++i) {
+ // Test rate against a threshold value and variance against a threshold.
+ // Increasing segment number (higher variance and complexity) = higher Q.
+ if ((projected_rate < target_rate * aq_c_transitions[aq_strength][i]) &&
+ (logvar < (low_var_thresh + aq_c_var_thresholds[aq_strength][i]))) {
+ segment = i;
+ break;
+ }
+ }
+ }
+
+ // Fill in the entries in the segment map corresponding to this SB64.
+ for (y = 0; y < ymis; y++) {
+ for (x = 0; x < xmis; x++) {
+ cpi->segmentation_map[mi_offset + y * cm->mi_cols + x] = segment;
+ }
+ }
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_aq_complexity.h b/media/libvpx/libvpx/vp9/encoder/vp9_aq_complexity.h
new file mode 100644
index 0000000000..d3cb34c013
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_aq_complexity.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_VP9_AQ_COMPLEXITY_H_
+#define VPX_VP9_ENCODER_VP9_AQ_COMPLEXITY_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "vp9/common/vp9_enums.h"
+
+struct VP9_COMP;
+struct macroblock;
+
+// Select a segment for the current Block.
+void vp9_caq_select_segment(struct VP9_COMP *cpi, struct macroblock *,
+ BLOCK_SIZE bs, int mi_row, int mi_col,
+ int projected_rate);
+
+// This function sets up a set of segments with delta Q values around
+// the baseline frame quantizer.
+void vp9_setup_in_frame_q_adj(struct VP9_COMP *cpi);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP9_ENCODER_VP9_AQ_COMPLEXITY_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c b/media/libvpx/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c
new file mode 100644
index 0000000000..28ab10a13b
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c
@@ -0,0 +1,702 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <limits.h>
+#include <math.h>
+
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_ports/system_state.h"
+
+#include "vp9/encoder/vp9_aq_cyclicrefresh.h"
+
+#include "vp9/common/vp9_seg_common.h"
+
+#include "vp9/encoder/vp9_ratectrl.h"
+#include "vp9/encoder/vp9_segmentation.h"
+
+static const uint8_t VP9_VAR_OFFS[64] = {
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128
+};
+
+CYCLIC_REFRESH *vp9_cyclic_refresh_alloc(int mi_rows, int mi_cols) {
+ size_t last_coded_q_map_size;
+ CYCLIC_REFRESH *const cr = vpx_calloc(1, sizeof(*cr));
+ if (cr == NULL) return NULL;
+
+ cr->map = vpx_calloc(mi_rows * mi_cols, sizeof(*cr->map));
+ if (cr->map == NULL) {
+ vp9_cyclic_refresh_free(cr);
+ return NULL;
+ }
+ last_coded_q_map_size = mi_rows * mi_cols * sizeof(*cr->last_coded_q_map);
+ cr->last_coded_q_map = vpx_malloc(last_coded_q_map_size);
+ if (cr->last_coded_q_map == NULL) {
+ vp9_cyclic_refresh_free(cr);
+ return NULL;
+ }
+ assert(MAXQ <= 255);
+ memset(cr->last_coded_q_map, MAXQ, last_coded_q_map_size);
+ cr->counter_encode_maxq_scene_change = 0;
+ cr->content_mode = 1;
+ return cr;
+}
+
+void vp9_cyclic_refresh_free(CYCLIC_REFRESH *cr) {
+ if (cr != NULL) {
+ vpx_free(cr->map);
+ vpx_free(cr->last_coded_q_map);
+ vpx_free(cr);
+ }
+}
+
+// Check if this coding block, of size bsize, should be considered for refresh
+// (lower-qp coding). Decision can be based on various factors, such as
+// size of the coding block (i.e., blocks below the min size are rejected),
+// coding mode, and rate/distortion.
+static int candidate_refresh_aq(const CYCLIC_REFRESH *cr, const MODE_INFO *mi,
+ int64_t rate, int64_t dist, int bsize) {
+ MV mv = mi->mv[0].as_mv;
+ // Reject the block for lower-qp coding if projected distortion
+ // is above the threshold, and any of the following is true:
+ // 1) mode uses large mv
+ // 2) mode is an intra-mode
+ // Otherwise accept for refresh.
+ if (dist > cr->thresh_dist_sb &&
+ (mv.row > cr->motion_thresh || mv.row < -cr->motion_thresh ||
+ mv.col > cr->motion_thresh || mv.col < -cr->motion_thresh ||
+ !is_inter_block(mi)))
+ return CR_SEGMENT_ID_BASE;
+ else if (bsize >= BLOCK_16X16 && rate < cr->thresh_rate_sb &&
+ is_inter_block(mi) && mi->mv[0].as_int == 0 &&
+ cr->rate_boost_fac > 10)
+ // More aggressive delta-q for bigger blocks with zero motion.
+ return CR_SEGMENT_ID_BOOST2;
+ else
+ return CR_SEGMENT_ID_BOOST1;
+}
+
+// Compute delta-q for the segment.
+static int compute_deltaq(const VP9_COMP *cpi, int q, double rate_factor) {
+ const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ int deltaq = vp9_compute_qdelta_by_rate(rc, cpi->common.frame_type, q,
+ rate_factor, cpi->common.bit_depth);
+ if ((-deltaq) > cr->max_qdelta_perc * q / 100) {
+ deltaq = -cr->max_qdelta_perc * q / 100;
+ }
+ return deltaq;
+}
+
+// For the just encoded frame, estimate the bits, incorporating the delta-q
+// from non-base segment. For now ignore effect of multiple segments
+// (with different delta-q). Note this function is called in the postencode
+// (called from rc_update_rate_correction_factors()).
+int vp9_cyclic_refresh_estimate_bits_at_q(const VP9_COMP *cpi,
+ double correction_factor) {
+ const VP9_COMMON *const cm = &cpi->common;
+ const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ int estimated_bits;
+ int mbs = cm->MBs;
+ int num8x8bl = mbs << 2;
+ // Weight for non-base segments: use actual number of blocks refreshed in
+ // previous/just encoded frame. Note number of blocks here is in 8x8 units.
+ double weight_segment1 = (double)cr->actual_num_seg1_blocks / num8x8bl;
+ double weight_segment2 = (double)cr->actual_num_seg2_blocks / num8x8bl;
+ // Take segment weighted average for estimated bits.
+ estimated_bits =
+ (int)((1.0 - weight_segment1 - weight_segment2) *
+ vp9_estimate_bits_at_q(cm->frame_type, cm->base_qindex, mbs,
+ correction_factor, cm->bit_depth) +
+ weight_segment1 *
+ vp9_estimate_bits_at_q(cm->frame_type,
+ cm->base_qindex + cr->qindex_delta[1],
+ mbs, correction_factor, cm->bit_depth) +
+ weight_segment2 *
+ vp9_estimate_bits_at_q(cm->frame_type,
+ cm->base_qindex + cr->qindex_delta[2],
+ mbs, correction_factor, cm->bit_depth));
+ return estimated_bits;
+}
+
+// Prior to encoding the frame, estimate the bits per mb, for a given q = i and
+// a corresponding delta-q (for segment 1). This function is called in the
+// rc_regulate_q() to set the base qp index.
+// Note: the segment map is set to either 0/CR_SEGMENT_ID_BASE (no refresh) or
+// to 1/CR_SEGMENT_ID_BOOST1 (refresh) for each superblock, prior to encoding.
+int vp9_cyclic_refresh_rc_bits_per_mb(const VP9_COMP *cpi, int i,
+ double correction_factor) {
+ const VP9_COMMON *const cm = &cpi->common;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ int bits_per_mb;
+ int deltaq = 0;
+ if (cpi->oxcf.speed < 8)
+ deltaq = compute_deltaq(cpi, i, cr->rate_ratio_qdelta);
+ else
+ deltaq = -(cr->max_qdelta_perc * i) / 200;
+ // Take segment weighted average for bits per mb.
+ bits_per_mb = (int)((1.0 - cr->weight_segment) *
+ vp9_rc_bits_per_mb(cm->frame_type, i,
+ correction_factor, cm->bit_depth) +
+ cr->weight_segment *
+ vp9_rc_bits_per_mb(cm->frame_type, i + deltaq,
+ correction_factor, cm->bit_depth));
+ return bits_per_mb;
+}
+
+// Prior to coding a given prediction block, of size bsize at (mi_row, mi_col),
+// check if we should reset the segment_id, and update the cyclic_refresh map
+// and segmentation map.
+void vp9_cyclic_refresh_update_segment(VP9_COMP *const cpi, MODE_INFO *const mi,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ int64_t rate, int64_t dist, int skip,
+ struct macroblock_plane *const p) {
+ const VP9_COMMON *const cm = &cpi->common;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ const int bw = num_8x8_blocks_wide_lookup[bsize];
+ const int bh = num_8x8_blocks_high_lookup[bsize];
+ const int xmis = VPXMIN(cm->mi_cols - mi_col, bw);
+ const int ymis = VPXMIN(cm->mi_rows - mi_row, bh);
+ const int block_index = mi_row * cm->mi_cols + mi_col;
+ int refresh_this_block = candidate_refresh_aq(cr, mi, rate, dist, bsize);
+ // Default is to not update the refresh map.
+ int new_map_value = cr->map[block_index];
+ int x = 0;
+ int y = 0;
+
+ int is_skin = 0;
+ if (refresh_this_block == 0 && bsize <= BLOCK_16X16 &&
+ cpi->use_skin_detection) {
+ is_skin =
+ vp9_compute_skin_block(p[0].src.buf, p[1].src.buf, p[2].src.buf,
+ p[0].src.stride, p[1].src.stride, bsize, 0, 0);
+ if (is_skin) refresh_this_block = 1;
+ }
+
+ if (cpi->oxcf.rc_mode == VPX_VBR && mi->ref_frame[0] == GOLDEN_FRAME)
+ refresh_this_block = 0;
+
+ // If this block is labeled for refresh, check if we should reset the
+ // segment_id.
+ if (cpi->sf.use_nonrd_pick_mode &&
+ cyclic_refresh_segment_id_boosted(mi->segment_id)) {
+ mi->segment_id = refresh_this_block;
+ // Reset segment_id if it will be skipped.
+ if (skip) mi->segment_id = CR_SEGMENT_ID_BASE;
+ }
+
+ // Update the cyclic refresh map, to be used for setting segmentation map
+ // for the next frame. If the block will be refreshed this frame, mark it
+  // as clean. The magnitude of the negative value influences how long it
+  // will be before we consider it for refresh again.
+ if (cyclic_refresh_segment_id_boosted(mi->segment_id)) {
+ new_map_value = -cr->time_for_refresh;
+ } else if (refresh_this_block) {
+    // Else, if it is accepted as a candidate for refresh and has not
+    // already been refreshed (marked as 1), mark it as a candidate for
+    // cleanup at a future time (marked as 0); otherwise don't update it.
+ if (cr->map[block_index] == 1) new_map_value = 0;
+ } else {
+ // Leave it marked as block that is not candidate for refresh.
+ new_map_value = 1;
+ }
+
+ // Update entries in the cyclic refresh map with new_map_value, and
+ // copy mbmi->segment_id into global segmentation map.
+ for (y = 0; y < ymis; y++)
+ for (x = 0; x < xmis; x++) {
+ int map_offset = block_index + y * cm->mi_cols + x;
+ cr->map[map_offset] = new_map_value;
+ cpi->segmentation_map[map_offset] = mi->segment_id;
+ }
+}
+
+void vp9_cyclic_refresh_update_sb_postencode(VP9_COMP *const cpi,
+ const MODE_INFO *const mi,
+ int mi_row, int mi_col,
+ BLOCK_SIZE bsize) {
+ const VP9_COMMON *const cm = &cpi->common;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ const int bw = num_8x8_blocks_wide_lookup[bsize];
+ const int bh = num_8x8_blocks_high_lookup[bsize];
+ const int xmis = VPXMIN(cm->mi_cols - mi_col, bw);
+ const int ymis = VPXMIN(cm->mi_rows - mi_row, bh);
+ const int block_index = mi_row * cm->mi_cols + mi_col;
+ int x, y;
+ for (y = 0; y < ymis; y++)
+ for (x = 0; x < xmis; x++) {
+ int map_offset = block_index + y * cm->mi_cols + x;
+ // Inter skip blocks were clearly not coded at the current qindex, so
+ // don't update the map for them. For cases where motion is non-zero or
+ // the reference frame isn't the previous frame, the previous value in
+ // the map for this spatial location is not entirely correct.
+ if ((!is_inter_block(mi) || !mi->skip) &&
+ mi->segment_id <= CR_SEGMENT_ID_BOOST2) {
+ cr->last_coded_q_map[map_offset] =
+ clamp(cm->base_qindex + cr->qindex_delta[mi->segment_id], 0, MAXQ);
+ } else if (is_inter_block(mi) && mi->skip &&
+ mi->segment_id <= CR_SEGMENT_ID_BOOST2) {
+ cr->last_coded_q_map[map_offset] = VPXMIN(
+ clamp(cm->base_qindex + cr->qindex_delta[mi->segment_id], 0, MAXQ),
+ cr->last_coded_q_map[map_offset]);
+ }
+ }
+}
+
+// From the just encoded frame: update the actual number of blocks to which
+// the segment delta q was applied, and the amount of low motion in the
+// frame. Also check conditions for forcing a golden update, or preventing a
+// golden update if the period is up.
+void vp9_cyclic_refresh_postencode(VP9_COMP *const cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ MODE_INFO **mi = cm->mi_grid_visible;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ RATE_CONTROL *const rc = &cpi->rc;
+ unsigned char *const seg_map = cpi->segmentation_map;
+ double fraction_low = 0.0;
+ int force_gf_refresh = 0;
+ int low_content_frame = 0;
+ int mi_row, mi_col;
+ cr->actual_num_seg1_blocks = 0;
+ cr->actual_num_seg2_blocks = 0;
+ for (mi_row = 0; mi_row < cm->mi_rows; mi_row++) {
+ for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) {
+ MV mv = mi[0]->mv[0].as_mv;
+ int map_index = mi_row * cm->mi_cols + mi_col;
+ if (cyclic_refresh_segment_id(seg_map[map_index]) == CR_SEGMENT_ID_BOOST1)
+ cr->actual_num_seg1_blocks++;
+ else if (cyclic_refresh_segment_id(seg_map[map_index]) ==
+ CR_SEGMENT_ID_BOOST2)
+ cr->actual_num_seg2_blocks++;
+ // Accumulate low_content_frame.
+ if (is_inter_block(mi[0]) && abs(mv.row) < 16 && abs(mv.col) < 16)
+ low_content_frame++;
+ mi++;
+ }
+ mi += 8;
+ }
+ // Check for golden frame update: only for non-SVC and non-golden boost.
+ if (!cpi->use_svc && cpi->ext_refresh_frame_flags_pending == 0 &&
+ !cpi->oxcf.gf_cbr_boost_pct) {
+ // Force this frame as a golden update frame if this frame changes the
+ // resolution (resize_pending != 0).
+ if (cpi->resize_pending != 0) {
+ vp9_cyclic_refresh_set_golden_update(cpi);
+ rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+ if (rc->frames_till_gf_update_due > rc->frames_to_key)
+ rc->frames_till_gf_update_due = rc->frames_to_key;
+ cpi->refresh_golden_frame = 1;
+ force_gf_refresh = 1;
+ }
+ // Update average of low content/motion in the frame.
+ fraction_low = (double)low_content_frame / (cm->mi_rows * cm->mi_cols);
+ cr->low_content_avg = (fraction_low + 3 * cr->low_content_avg) / 4;
+ if (!force_gf_refresh && cpi->refresh_golden_frame == 1 &&
+ rc->frames_since_key > rc->frames_since_golden + 1) {
+ // Don't update golden reference if the amount of low_content for the
+ // current encoded frame is small, or if the recursive average of the
+ // low_content over the update interval window falls below threshold.
+ if (fraction_low < 0.65 || cr->low_content_avg < 0.6) {
+ cpi->refresh_golden_frame = 0;
+ }
+      // Reset for next interval.
+ cr->low_content_avg = fraction_low;
+ }
+ }
+}
+
+// Set golden frame update interval, for non-svc 1 pass CBR mode.
+void vp9_cyclic_refresh_set_golden_update(VP9_COMP *const cpi) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ // Set minimum gf_interval for GF update to a multiple of the refresh period,
+ // with some max limit. Depending on past encoding stats, GF flag may be
+ // reset and update may not occur until next baseline_gf_interval.
+ if (cr->percent_refresh > 0)
+ rc->baseline_gf_interval = VPXMIN(4 * (100 / cr->percent_refresh), 40);
+ else
+ rc->baseline_gf_interval = 40;
+ if (cpi->oxcf.rc_mode == VPX_VBR) rc->baseline_gf_interval = 20;
+ if (rc->avg_frame_low_motion < 50 && rc->frames_since_key > 40 &&
+ cr->content_mode)
+ rc->baseline_gf_interval = 10;
+}
+
+static int is_superblock_flat_static(VP9_COMP *const cpi, int sb_row_index,
+ int sb_col_index) {
+ unsigned int source_variance;
+ const uint8_t *src_y = cpi->Source->y_buffer;
+ const int ystride = cpi->Source->y_stride;
+ unsigned int sse;
+ const BLOCK_SIZE bsize = BLOCK_64X64;
+ src_y += (sb_row_index << 6) * ystride + (sb_col_index << 6);
+ source_variance =
+ cpi->fn_ptr[bsize].vf(src_y, ystride, VP9_VAR_OFFS, 0, &sse);
+ if (source_variance == 0) {
+ uint64_t block_sad;
+ const uint8_t *last_src_y = cpi->Last_Source->y_buffer;
+ const int last_ystride = cpi->Last_Source->y_stride;
+ last_src_y += (sb_row_index << 6) * ystride + (sb_col_index << 6);
+ block_sad =
+ cpi->fn_ptr[bsize].sdf(src_y, ystride, last_src_y, last_ystride);
+ if (block_sad == 0) return 1;
+ }
+ return 0;
+}
+
+// Update the segmentation map, and related quantities: cyclic refresh map,
+// refresh sb_index, and target number of blocks to be refreshed.
+// The map is set to either 0/CR_SEGMENT_ID_BASE (no refresh) or to
+// 1/CR_SEGMENT_ID_BOOST1 (refresh) for each superblock.
+// Blocks labeled as BOOST1 may later get set to BOOST2 (during the
+// encoding of the superblock).
+static void cyclic_refresh_update_map(VP9_COMP *const cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ unsigned char *const seg_map = cpi->segmentation_map;
+ int i, block_count, bl_index, sb_rows, sb_cols, sbs_in_frame;
+ int xmis, ymis, x, y;
+ int consec_zero_mv_thresh = 0;
+ int qindex_thresh = 0;
+ int count_sel = 0;
+ int count_tot = 0;
+ memset(seg_map, CR_SEGMENT_ID_BASE, cm->mi_rows * cm->mi_cols);
+ sb_cols = (cm->mi_cols + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE;
+ sb_rows = (cm->mi_rows + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE;
+ sbs_in_frame = sb_cols * sb_rows;
+ // Number of target blocks to get the q delta (segment 1).
+ block_count = cr->percent_refresh * cm->mi_rows * cm->mi_cols / 100;
+  // Set the segmentation map: cycle through the superblocks, starting at
+  // cr->sb_index, and stopping when either block_count blocks have been
+  // found to be refreshed, or we have passed through the whole frame.
+ assert(cr->sb_index < sbs_in_frame);
+ i = cr->sb_index;
+ cr->target_num_seg_blocks = 0;
+ if (cpi->oxcf.content != VP9E_CONTENT_SCREEN) {
+ consec_zero_mv_thresh = 100;
+ }
+ qindex_thresh =
+ cpi->oxcf.content == VP9E_CONTENT_SCREEN
+ ? vp9_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST2, cm->base_qindex)
+ : vp9_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST1, cm->base_qindex);
+ // More aggressive settings for noisy content.
+ if (cpi->noise_estimate.enabled && cpi->noise_estimate.level >= kMedium &&
+ cr->content_mode) {
+ consec_zero_mv_thresh = 60;
+ qindex_thresh =
+ VPXMAX(vp9_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST1, cm->base_qindex),
+ cm->base_qindex);
+ }
+ do {
+ int sum_map = 0;
+ int consec_zero_mv_thresh_block = consec_zero_mv_thresh;
+ // Get the mi_row/mi_col corresponding to superblock index i.
+ int sb_row_index = (i / sb_cols);
+ int sb_col_index = i - sb_row_index * sb_cols;
+ int mi_row = sb_row_index * MI_BLOCK_SIZE;
+ int mi_col = sb_col_index * MI_BLOCK_SIZE;
+ int flat_static_blocks = 0;
+ int compute_content = 1;
+ assert(mi_row >= 0 && mi_row < cm->mi_rows);
+ assert(mi_col >= 0 && mi_col < cm->mi_cols);
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cpi->common.use_highbitdepth) compute_content = 0;
+#endif
+ if (cr->content_mode == 0 || cpi->Last_Source == NULL ||
+ cpi->Last_Source->y_width != cpi->Source->y_width ||
+ cpi->Last_Source->y_height != cpi->Source->y_height)
+ compute_content = 0;
+ bl_index = mi_row * cm->mi_cols + mi_col;
+ // Loop through all 8x8 blocks in superblock and update map.
+ xmis =
+ VPXMIN(cm->mi_cols - mi_col, num_8x8_blocks_wide_lookup[BLOCK_64X64]);
+ ymis =
+ VPXMIN(cm->mi_rows - mi_row, num_8x8_blocks_high_lookup[BLOCK_64X64]);
+ if (cpi->noise_estimate.enabled && cpi->noise_estimate.level >= kMedium &&
+ (xmis <= 2 || ymis <= 2))
+ consec_zero_mv_thresh_block = 4;
+ for (y = 0; y < ymis; y++) {
+ for (x = 0; x < xmis; x++) {
+ const int bl_index2 = bl_index + y * cm->mi_cols + x;
+        // If the block is a candidate for cleanup, then mark it
+ // for possible boost/refresh (segment 1). The segment id may get
+ // reset to 0 later depending on the coding mode.
+ if (cr->map[bl_index2] == 0) {
+ count_tot++;
+ if (cr->content_mode == 0 ||
+ cr->last_coded_q_map[bl_index2] > qindex_thresh ||
+ cpi->consec_zero_mv[bl_index2] < consec_zero_mv_thresh_block) {
+ sum_map++;
+ count_sel++;
+ }
+ } else if (cr->map[bl_index2] < 0) {
+ cr->map[bl_index2]++;
+ }
+ }
+ }
+ // Enforce constant segment over superblock.
+ // If segment is at least half of superblock, set to 1.
+ if (sum_map >= xmis * ymis / 2) {
+ // This superblock is a candidate for refresh:
+ // compute spatial variance and exclude blocks that are spatially flat
+      // and stationary. Note: this is currently only done for screen content
+ // mode.
+ if (compute_content && cr->skip_flat_static_blocks)
+ flat_static_blocks =
+ is_superblock_flat_static(cpi, sb_row_index, sb_col_index);
+ if (!flat_static_blocks) {
+ // Label this superblock as segment 1.
+ for (y = 0; y < ymis; y++)
+ for (x = 0; x < xmis; x++) {
+ seg_map[bl_index + y * cm->mi_cols + x] = CR_SEGMENT_ID_BOOST1;
+ }
+ cr->target_num_seg_blocks += xmis * ymis;
+ }
+ }
+ i++;
+ if (i == sbs_in_frame) {
+ i = 0;
+ }
+ } while (cr->target_num_seg_blocks < block_count && i != cr->sb_index);
+ cr->sb_index = i;
+ cr->reduce_refresh = 0;
+ if (cpi->oxcf.content != VP9E_CONTENT_SCREEN)
+ if (count_sel < (3 * count_tot) >> 2) cr->reduce_refresh = 1;
+}
+
+// Set cyclic refresh parameters.
+void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) {
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const VP9_COMMON *const cm = &cpi->common;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ int num8x8bl = cm->MBs << 2;
+ int target_refresh = 0;
+ double weight_segment_target = 0;
+ double weight_segment = 0;
+ int thresh_low_motion = 20;
+ int qp_thresh = VPXMIN((cpi->oxcf.content == VP9E_CONTENT_SCREEN) ? 35 : 20,
+ rc->best_quality << 1);
+ int qp_max_thresh = 117 * MAXQ >> 7;
+ cr->apply_cyclic_refresh = 1;
+ if (frame_is_intra_only(cm) || cpi->svc.temporal_layer_id > 0 ||
+ is_lossless_requested(&cpi->oxcf) ||
+ rc->avg_frame_qindex[INTER_FRAME] < qp_thresh ||
+ (cpi->use_svc &&
+ cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame) ||
+ (!cpi->use_svc && cr->content_mode &&
+ rc->avg_frame_low_motion < thresh_low_motion &&
+ rc->frames_since_key > 40) ||
+ (!cpi->use_svc && rc->avg_frame_qindex[INTER_FRAME] > qp_max_thresh &&
+ rc->frames_since_key > 20) ||
+ (cpi->roi.enabled && cpi->roi.skip[BACKGROUND_SEG_SKIP_ID] &&
+ rc->frames_since_key > FRAMES_NO_SKIPPING_AFTER_KEY)) {
+ cr->apply_cyclic_refresh = 0;
+ return;
+ }
+ cr->percent_refresh = 10;
+ if (cr->reduce_refresh) cr->percent_refresh = 5;
+ cr->max_qdelta_perc = 60;
+ cr->time_for_refresh = 0;
+ cr->motion_thresh = 32;
+ cr->rate_boost_fac = 15;
+  // Use a larger delta-qp (increase rate_ratio_qdelta) for the first few
+  // (~4) periods of the refresh cycle, after a key frame.
+ // Account for larger interval on base layer for temporal layers.
+ if (cr->percent_refresh > 0 &&
+ rc->frames_since_key <
+ (4 * cpi->svc.number_temporal_layers) * (100 / cr->percent_refresh)) {
+ cr->rate_ratio_qdelta = 3.0;
+ } else {
+ cr->rate_ratio_qdelta = 2.0;
+ if (cr->content_mode && cpi->noise_estimate.enabled &&
+ cpi->noise_estimate.level >= kMedium) {
+ // Reduce the delta-qp if the estimated source noise is above threshold.
+ cr->rate_ratio_qdelta = 1.7;
+ cr->rate_boost_fac = 13;
+ }
+ }
+  // For screen-content: keep rate_ratio_qdelta at 2.0 (segment#1 boost) and
+  // percent_refresh (refresh rate) at 10. But reduce rate boost for segment#2
+ // (rate_boost_fac = 10 disables segment#2).
+ if (cpi->oxcf.content == VP9E_CONTENT_SCREEN) {
+ // Only enable feature of skipping flat_static blocks for top layer
+ // under screen content mode.
+ if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)
+ cr->skip_flat_static_blocks = 1;
+ cr->percent_refresh = (cr->skip_flat_static_blocks) ? 5 : 10;
+    // Increase the amount of refresh after a scene change that is encoded at
+    // max Q, for a few cycles of the refresh period (~100 / percent_refresh).
+ if (cr->content_mode && cr->counter_encode_maxq_scene_change < 30)
+ cr->percent_refresh = (cr->skip_flat_static_blocks) ? 10 : 15;
+ cr->rate_ratio_qdelta = 2.0;
+ cr->rate_boost_fac = 10;
+ }
+ // Adjust some parameters for low resolutions.
+ if (cm->width * cm->height <= 352 * 288) {
+ if (rc->avg_frame_bandwidth < 3000) {
+ cr->motion_thresh = 64;
+ cr->rate_boost_fac = 13;
+ } else {
+ cr->max_qdelta_perc = 70;
+ cr->rate_ratio_qdelta = VPXMAX(cr->rate_ratio_qdelta, 2.5);
+ }
+ }
+ if (cpi->oxcf.rc_mode == VPX_VBR) {
+ // To be adjusted for VBR mode, e.g., based on gf period and boost.
+    // For now use a smaller qp-delta (than CBR), no second boosted seg, and
+    // turn off refresh on golden-refresh frames (since they are already
+    // boosted).
+ cr->percent_refresh = 10;
+ cr->rate_ratio_qdelta = 1.5;
+ cr->rate_boost_fac = 10;
+ if (cpi->refresh_golden_frame == 1 && !cpi->use_svc) {
+ cr->percent_refresh = 0;
+ cr->rate_ratio_qdelta = 1.0;
+ }
+ }
+ // Weight for segment prior to encoding: take the average of the target
+  // number for the frame to be encoded and the actual number from the
+  // previous frame. Use the target if it's less. This is used for setting
+  // the base qp for the frame in vp9_rc_regulate_q.
+ target_refresh = cr->percent_refresh * cm->mi_rows * cm->mi_cols / 100;
+ weight_segment_target = (double)(target_refresh) / num8x8bl;
+ weight_segment = (double)((target_refresh + cr->actual_num_seg1_blocks +
+ cr->actual_num_seg2_blocks) >>
+ 1) /
+ num8x8bl;
+ if (weight_segment_target < 7 * weight_segment / 8)
+ weight_segment = weight_segment_target;
+  // For screen-content: don't include the target for the weight segment,
+  // since for all flat areas the segment is reset, so it's more accurate
+ // to just use the previous actual number of seg blocks for the weight.
+ if (cpi->oxcf.content == VP9E_CONTENT_SCREEN)
+ weight_segment =
+ (double)(cr->actual_num_seg1_blocks + cr->actual_num_seg2_blocks) /
+ num8x8bl;
+ cr->weight_segment = weight_segment;
+ if (cr->content_mode == 0) {
+ cr->actual_num_seg1_blocks =
+ cr->percent_refresh * cm->mi_rows * cm->mi_cols / 100;
+ cr->actual_num_seg2_blocks = 0;
+ cr->weight_segment = (double)(cr->actual_num_seg1_blocks) / num8x8bl;
+ }
+}
+
+// Setup cyclic background refresh: set delta q and segmentation map.
+void vp9_cyclic_refresh_setup(VP9_COMP *const cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ struct segmentation *const seg = &cm->seg;
+ int scene_change_detected =
+ cpi->rc.high_source_sad ||
+ (cpi->use_svc && cpi->svc.high_source_sad_superframe);
+ if (cm->current_video_frame == 0) cr->low_content_avg = 0.0;
+  // Reset if a resolution change has occurred.
+ if (cpi->resize_pending != 0) vp9_cyclic_refresh_reset_resize(cpi);
+ if (!cr->apply_cyclic_refresh || (cpi->force_update_segmentation) ||
+ scene_change_detected) {
+ // Set segmentation map to 0 and disable.
+ unsigned char *const seg_map = cpi->segmentation_map;
+ memset(seg_map, 0, cm->mi_rows * cm->mi_cols);
+ vp9_disable_segmentation(&cm->seg);
+ if (cm->frame_type == KEY_FRAME || scene_change_detected) {
+ memset(cr->last_coded_q_map, MAXQ,
+ cm->mi_rows * cm->mi_cols * sizeof(*cr->last_coded_q_map));
+ cr->sb_index = 0;
+ cr->reduce_refresh = 0;
+ cr->counter_encode_maxq_scene_change = 0;
+ }
+ return;
+ } else {
+ int qindex_delta = 0;
+ int qindex2;
+ const double q = vp9_convert_qindex_to_q(cm->base_qindex, cm->bit_depth);
+ cr->counter_encode_maxq_scene_change++;
+ vpx_clear_system_state();
+    // Set rate threshold to some multiple (set to 4 for now) of the target
+    // rate (target is given by sb64_target_rate and scaled by 256).
+ cr->thresh_rate_sb = ((int64_t)(rc->sb64_target_rate) << 8) << 2;
+ // Distortion threshold, quadratic in Q, scale factor to be adjusted.
+ // q will not exceed 457, so (q * q) is within 32bit; see:
+ // vp9_convert_qindex_to_q(), vp9_ac_quant(), ac_qlookup*[].
+ cr->thresh_dist_sb = ((int64_t)(q * q)) << 2;
+
+ // Set up segmentation.
+ // Clear down the segment map.
+ vp9_enable_segmentation(&cm->seg);
+ vp9_clearall_segfeatures(seg);
+ // Select delta coding method.
+ seg->abs_delta = SEGMENT_DELTADATA;
+
+ // Note: setting temporal_update has no effect, as the seg-map coding method
+ // (temporal or spatial) is determined in vp9_choose_segmap_coding_method(),
+    // based on the coding cost of each method. When error_resilient mode is
+    // on, the last_frame_seg_map is set to 0, so if temporal coding is used,
+    // it is relative to an all-zero previous map.
+ // seg->temporal_update = 0;
+
+ // Segment BASE "Q" feature is disabled so it defaults to the baseline Q.
+ vp9_disable_segfeature(seg, CR_SEGMENT_ID_BASE, SEG_LVL_ALT_Q);
+ // Use segment BOOST1 for in-frame Q adjustment.
+ vp9_enable_segfeature(seg, CR_SEGMENT_ID_BOOST1, SEG_LVL_ALT_Q);
+ // Use segment BOOST2 for more aggressive in-frame Q adjustment.
+ vp9_enable_segfeature(seg, CR_SEGMENT_ID_BOOST2, SEG_LVL_ALT_Q);
+
+ // Set the q delta for segment BOOST1.
+ qindex_delta = compute_deltaq(cpi, cm->base_qindex, cr->rate_ratio_qdelta);
+ cr->qindex_delta[1] = qindex_delta;
+
+ // Compute rd-mult for segment BOOST1.
+ qindex2 = clamp(cm->base_qindex + cm->y_dc_delta_q + qindex_delta, 0, MAXQ);
+
+ cr->rdmult = vp9_compute_rd_mult(cpi, qindex2);
+
+ vp9_set_segdata(seg, CR_SEGMENT_ID_BOOST1, SEG_LVL_ALT_Q, qindex_delta);
+
+ // Set a more aggressive (higher) q delta for segment BOOST2.
+ qindex_delta = compute_deltaq(
+ cpi, cm->base_qindex,
+ VPXMIN(CR_MAX_RATE_TARGET_RATIO,
+ 0.1 * cr->rate_boost_fac * cr->rate_ratio_qdelta));
+ cr->qindex_delta[2] = qindex_delta;
+ vp9_set_segdata(seg, CR_SEGMENT_ID_BOOST2, SEG_LVL_ALT_Q, qindex_delta);
+
+ // Update the segmentation and refresh map.
+ cyclic_refresh_update_map(cpi);
+ }
+}
+
+int vp9_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr) {
+ return cr->rdmult;
+}
+
+void vp9_cyclic_refresh_reset_resize(VP9_COMP *const cpi) {
+ const VP9_COMMON *const cm = &cpi->common;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ memset(cr->map, 0, cm->mi_rows * cm->mi_cols);
+ memset(cr->last_coded_q_map, MAXQ,
+ cm->mi_rows * cm->mi_cols * sizeof(*cr->last_coded_q_map));
+ cr->sb_index = 0;
+ cpi->refresh_golden_frame = 1;
+ cpi->refresh_alt_ref_frame = 1;
+ cr->counter_encode_maxq_scene_change = 0;
+}
+
+void vp9_cyclic_refresh_limit_q(const VP9_COMP *cpi, int *q) {
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ // For now apply hard limit to frame-level decrease in q, if the cyclic
+ // refresh is active (percent_refresh > 0).
+ if (cr->percent_refresh > 0 && cpi->rc.q_1_frame - *q > 8) {
+ *q = cpi->rc.q_1_frame - 8;
+ }
+}
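A small worked example (illustrative only, not part of the patch) of the clamp in compute_deltaq() above: with max_qdelta_perc = 60 and a base q of 100, a rate-driven delta of -80 is limited to -60 (60% of q). The numeric values are assumptions chosen for illustration.

/* Sketch only: reproduces the clamp logic of compute_deltaq() with assumed
   numbers; the real delta comes from vp9_compute_qdelta_by_rate(). */
static int deltaq_clamp_example(void) {
  const int q = 100;               /* assumed base q index */
  const int max_qdelta_perc = 60;  /* as set in update_parameters() */
  int deltaq = -80;                /* assumed rate-driven delta */
  if ((-deltaq) > max_qdelta_perc * q / 100)  /* 80 > 60 */
    deltaq = -max_qdelta_perc * q / 100;      /* clamp to -60 */
  return deltaq;
}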
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h b/media/libvpx/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h
new file mode 100644
index 0000000000..c74cee4743
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_VP9_AQ_CYCLICREFRESH_H_
+#define VPX_VP9_ENCODER_VP9_AQ_CYCLICREFRESH_H_
+
+#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/encoder/vp9_block.h"
+#include "vp9/encoder/vp9_skin_detection.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// The segment ids used in cyclic refresh: from base (no boost) to increasing
+// boost (higher delta-qp).
+#define CR_SEGMENT_ID_BASE 0
+#define CR_SEGMENT_ID_BOOST1 1
+#define CR_SEGMENT_ID_BOOST2 2
+
+// Maximum rate target ratio for setting segment delta-qp.
+#define CR_MAX_RATE_TARGET_RATIO 4.0
+
+struct CYCLIC_REFRESH {
+ // Percentage of blocks per frame that are targeted as candidates
+ // for cyclic refresh.
+ int percent_refresh;
+ // Maximum q-delta as percentage of base q.
+ int max_qdelta_perc;
+ // Superblock starting index for cycling through the frame.
+ int sb_index;
+  // Controls how long a block will need to wait to be refreshed again, in
+  // excess of the cycle time, i.e., in the case of all zero motion, a block
+  // will be refreshed every (100/percent_refresh + time_for_refresh) frames.
+ int time_for_refresh;
+ // Target number of (8x8) blocks that are set for delta-q.
+ int target_num_seg_blocks;
+  // Actual number of (8x8) blocks to which delta-q was applied.
+ int actual_num_seg1_blocks;
+ int actual_num_seg2_blocks;
+ // RD mult. parameters for segment 1.
+ int rdmult;
+ // Cyclic refresh map.
+ signed char *map;
+ // Map of the last q a block was coded at.
+ uint8_t *last_coded_q_map;
+ // Thresholds applied to the projected rate/distortion of the coding block,
+  // when deciding whether the block should be refreshed.
+ int64_t thresh_rate_sb;
+ int64_t thresh_dist_sb;
+ // Threshold applied to the motion vector (in units of 1/8 pel) of the
+  // coding block, when deciding whether the block should be refreshed.
+ int16_t motion_thresh;
+ // Rate target ratio to set q delta.
+ double rate_ratio_qdelta;
+ // Boost factor for rate target ratio, for segment CR_SEGMENT_ID_BOOST2.
+ int rate_boost_fac;
+ double low_content_avg;
+ int qindex_delta[3];
+ int reduce_refresh;
+ double weight_segment;
+ int apply_cyclic_refresh;
+ int counter_encode_maxq_scene_change;
+ int skip_flat_static_blocks;
+ int content_mode;
+};
+
+struct VP9_COMP;
+
+typedef struct CYCLIC_REFRESH CYCLIC_REFRESH;
+
+CYCLIC_REFRESH *vp9_cyclic_refresh_alloc(int mi_rows, int mi_cols);
+
+void vp9_cyclic_refresh_free(CYCLIC_REFRESH *cr);
+
+// Estimate the bits, incorporating the delta-q from segment 1, after encoding
+// the frame.
+int vp9_cyclic_refresh_estimate_bits_at_q(const struct VP9_COMP *cpi,
+ double correction_factor);
+
+// Estimate the bits per mb, for a given q = i and a corresponding delta-q
+// (for segment 1), prior to encoding the frame.
+int vp9_cyclic_refresh_rc_bits_per_mb(const struct VP9_COMP *cpi, int i,
+ double correction_factor);
+
+// Prior to coding a given prediction block, of size bsize at (mi_row, mi_col),
+// check if we should reset the segment_id, and update the cyclic_refresh map
+// and segmentation map.
+void vp9_cyclic_refresh_update_segment(struct VP9_COMP *const cpi,
+ MODE_INFO *const mi, int mi_row,
+ int mi_col, BLOCK_SIZE bsize,
+ int64_t rate, int64_t dist, int skip,
+ struct macroblock_plane *const p);
+
+void vp9_cyclic_refresh_update_sb_postencode(struct VP9_COMP *const cpi,
+ const MODE_INFO *const mi,
+ int mi_row, int mi_col,
+ BLOCK_SIZE bsize);
+
+// From the just encoded frame: update the actual number of blocks to which
+// the segment delta q was applied, and the amount of low motion in the
+// frame. Also check conditions for forcing a golden update, or preventing a
+// golden update if the period is up.
+void vp9_cyclic_refresh_postencode(struct VP9_COMP *const cpi);
+
+// Set golden frame update interval, for non-svc 1 pass CBR mode.
+void vp9_cyclic_refresh_set_golden_update(struct VP9_COMP *const cpi);
+
+// Set/update global/frame level refresh parameters.
+void vp9_cyclic_refresh_update_parameters(struct VP9_COMP *const cpi);
+
+// Setup cyclic background refresh: set delta q and segmentation map.
+void vp9_cyclic_refresh_setup(struct VP9_COMP *const cpi);
+
+int vp9_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr);
+
+void vp9_cyclic_refresh_reset_resize(struct VP9_COMP *const cpi);
+
+static INLINE int cyclic_refresh_segment_id_boosted(int segment_id) {
+ return segment_id == CR_SEGMENT_ID_BOOST1 ||
+ segment_id == CR_SEGMENT_ID_BOOST2;
+}
+
+static INLINE int cyclic_refresh_segment_id(int segment_id) {
+ if (segment_id == CR_SEGMENT_ID_BOOST1)
+ return CR_SEGMENT_ID_BOOST1;
+ else if (segment_id == CR_SEGMENT_ID_BOOST2)
+ return CR_SEGMENT_ID_BOOST2;
+ else
+ return CR_SEGMENT_ID_BASE;
+}
+
+void vp9_cyclic_refresh_limit_q(const struct VP9_COMP *cpi, int *q);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP9_ENCODER_VP9_AQ_CYCLICREFRESH_H_
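A minimal lifetime sketch (illustrative only, not part of the patch) for the cyclic-refresh state declared above: allocate per encoder instance, drive the per-frame hooks, and free on teardown. The per-frame calls are shown as comments because they need the encoder context (cpi), which is assumed here.

/* Sketch only: allocation and release of CYCLIC_REFRESH state.
   mi_rows and mi_cols are assumed to come from VP9_COMMON. */
#include "vp9/encoder/vp9_aq_cyclicrefresh.h"

static int cr_lifetime_example(int mi_rows, int mi_cols) {
  CYCLIC_REFRESH *cr = vp9_cyclic_refresh_alloc(mi_rows, mi_cols);
  if (cr == NULL) return -1;  /* allocation failure */
  /* Per frame (needs a VP9_COMP *cpi, assumed):
     vp9_cyclic_refresh_update_parameters(cpi);
     vp9_cyclic_refresh_setup(cpi);
     ... encode the frame ...
     vp9_cyclic_refresh_postencode(cpi); */
  vp9_cyclic_refresh_free(cr);
  return 0;
}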
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_aq_variance.c b/media/libvpx/libvpx/vp9/encoder/vp9_aq_variance.c
new file mode 100644
index 0000000000..9e5f3bfb28
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_aq_variance.c
@@ -0,0 +1,247 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+
+#include "vpx_ports/mem.h"
+#include "vpx_ports/system_state.h"
+
+#include "vp9/encoder/vp9_aq_variance.h"
+
+#include "vp9/common/vp9_seg_common.h"
+
+#include "vp9/encoder/vp9_ratectrl.h"
+#include "vp9/encoder/vp9_rd.h"
+#include "vp9/encoder/vp9_encodeframe.h"
+#include "vp9/encoder/vp9_segmentation.h"
+
+#define ENERGY_MIN (-4)
+#define ENERGY_MAX (1)
+#define ENERGY_SPAN (ENERGY_MAX - ENERGY_MIN + 1)
+#define ENERGY_IN_BOUNDS(energy) \
+ assert((energy) >= ENERGY_MIN && (energy) <= ENERGY_MAX)
+
+static const double rate_ratio[MAX_SEGMENTS] = { 2.5, 2.0, 1.5, 1.0,
+ 0.75, 1.0, 1.0, 1.0 };
+static const int segment_id[ENERGY_SPAN] = { 0, 1, 1, 2, 3, 4 };
+
+#define SEGMENT_ID(i) segment_id[(i)-ENERGY_MIN]
+
+DECLARE_ALIGNED(16, static const uint8_t, vp9_64_zeros[64]) = { 0 };
+#if CONFIG_VP9_HIGHBITDEPTH
+DECLARE_ALIGNED(16, static const uint16_t, vp9_highbd_64_zeros[64]) = { 0 };
+#endif
+
+unsigned int vp9_vaq_segment_id(int energy) {
+ ENERGY_IN_BOUNDS(energy);
+ return SEGMENT_ID(energy);
+}
+
+void vp9_vaq_frame_setup(VP9_COMP *cpi) {
+ VP9_COMMON *cm = &cpi->common;
+ struct segmentation *seg = &cm->seg;
+ int i;
+
+ if (frame_is_intra_only(cm) || cm->error_resilient_mode ||
+ cpi->refresh_alt_ref_frame || cpi->force_update_segmentation ||
+ (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
+ vp9_enable_segmentation(seg);
+ vp9_clearall_segfeatures(seg);
+
+ seg->abs_delta = SEGMENT_DELTADATA;
+
+ vpx_clear_system_state();
+
+ for (i = 0; i < MAX_SEGMENTS; ++i) {
+ int qindex_delta =
+ vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, cm->base_qindex,
+ rate_ratio[i], cm->bit_depth);
+
+ // We don't allow qindex 0 in a segment if the base value is not 0.
+ // Q index 0 (lossless) implies 4x4 encoding only and in AQ mode a segment
+ // Q delta is sometimes applied without going back around the rd loop.
+ // This could lead to an illegal combination of partition size and q.
+ if ((cm->base_qindex != 0) && ((cm->base_qindex + qindex_delta) == 0)) {
+ qindex_delta = -cm->base_qindex + 1;
+ }
+
+ // No need to enable SEG_LVL_ALT_Q for this segment.
+ if (rate_ratio[i] == 1.0) {
+ continue;
+ }
+
+ vp9_set_segdata(seg, i, SEG_LVL_ALT_Q, qindex_delta);
+ vp9_enable_segfeature(seg, i, SEG_LVL_ALT_Q);
+ }
+ }
+}
+
+/* TODO(agrange, paulwilkins): block_variance() calls the unoptimized versions
+ * of variance() and highbd_8_variance(). It should not.
+ */
+static void aq_variance(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int w, int h, unsigned int *sse,
+ int *sum) {
+ int i, j;
+
+ *sum = 0;
+ *sse = 0;
+
+ for (i = 0; i < h; i++) {
+ for (j = 0; j < w; j++) {
+ const int diff = a[j] - b[j];
+ *sum += diff;
+ *sse += diff * diff;
+ }
+
+ a += a_stride;
+ b += b_stride;
+ }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static void aq_highbd_variance64(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, int w, int h,
+ uint64_t *sse, int64_t *sum) {
+ int i, j;
+
+ uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ *sum = 0;
+ *sse = 0;
+
+ for (i = 0; i < h; i++) {
+ for (j = 0; j < w; j++) {
+ const int diff = a[j] - b[j];
+ *sum += diff;
+ *sse += diff * diff;
+ }
+ a += a_stride;
+ b += b_stride;
+ }
+}
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+static unsigned int block_variance(VP9_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bs) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ unsigned int var, sse;
+ int right_overflow =
+ (xd->mb_to_right_edge < 0) ? ((-xd->mb_to_right_edge) >> 3) : 0;
+ int bottom_overflow =
+ (xd->mb_to_bottom_edge < 0) ? ((-xd->mb_to_bottom_edge) >> 3) : 0;
+
+ if (right_overflow || bottom_overflow) {
+ const int bw = 8 * num_8x8_blocks_wide_lookup[bs] - right_overflow;
+ const int bh = 8 * num_8x8_blocks_high_lookup[bs] - bottom_overflow;
+ int avg;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ uint64_t sse64 = 0;
+ int64_t sum64 = 0;
+ aq_highbd_variance64(x->plane[0].src.buf, x->plane[0].src.stride,
+ CONVERT_TO_BYTEPTR(vp9_highbd_64_zeros), 0, bw, bh,
+ &sse64, &sum64);
+ sse = (unsigned int)(sse64 >> (2 * (xd->bd - 8)));
+ avg = (int)(sum64 >> (xd->bd - 8));
+ } else {
+ aq_variance(x->plane[0].src.buf, x->plane[0].src.stride, vp9_64_zeros, 0,
+ bw, bh, &sse, &avg);
+ }
+#else
+ aq_variance(x->plane[0].src.buf, x->plane[0].src.stride, vp9_64_zeros, 0,
+ bw, bh, &sse, &avg);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ var = sse - (unsigned int)(((int64_t)avg * avg) / (bw * bh));
+ return (unsigned int)(((uint64_t)256 * var) / (bw * bh));
+ } else {
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ var =
+ cpi->fn_ptr[bs].vf(x->plane[0].src.buf, x->plane[0].src.stride,
+ CONVERT_TO_BYTEPTR(vp9_highbd_64_zeros), 0, &sse);
+ } else {
+ var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf, x->plane[0].src.stride,
+ vp9_64_zeros, 0, &sse);
+ }
+#else
+ var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf, x->plane[0].src.stride,
+ vp9_64_zeros, 0, &sse);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ return (unsigned int)(((uint64_t)256 * var) >> num_pels_log2_lookup[bs]);
+ }
+}
+
+double vp9_log_block_var(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) {
+ unsigned int var = block_variance(cpi, x, bs);
+ vpx_clear_system_state();
+ return log(var + 1.0);
+}
+
+#define DEFAULT_E_MIDPOINT 10.0
+static int scale_block_energy(VP9_COMP *cpi, unsigned int block_var) {
+ double energy;
+ double energy_midpoint;
+ energy_midpoint =
+ (cpi->oxcf.pass == 2) ? cpi->twopass.mb_av_energy : DEFAULT_E_MIDPOINT;
+ energy = log(block_var + 1.0) - energy_midpoint;
+ return clamp((int)round(energy), ENERGY_MIN, ENERGY_MAX);
+}
+#undef DEFAULT_E_MIDPOINT
+
+// Get the range of sub-block energy values.
+void vp9_get_sub_block_energy(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row,
+ int mi_col, BLOCK_SIZE bsize, int *min_e,
+ int *max_e) {
+ VP9_COMMON *const cm = &cpi->common;
+ const int bw = num_8x8_blocks_wide_lookup[bsize];
+ const int bh = num_8x8_blocks_high_lookup[bsize];
+ const int xmis = VPXMIN(cm->mi_cols - mi_col, bw);
+ const int ymis = VPXMIN(cm->mi_rows - mi_row, bh);
+ int x, y;
+
+ if (xmis < bw || ymis < bh) {
+ vp9_setup_src_planes(mb, cpi->Source, mi_row, mi_col);
+ *min_e = vp9_block_energy(cpi, mb, bsize);
+ *max_e = *min_e;
+ } else {
+ unsigned int var;
+ // Because scale_block_energy is non-decreasing, we can find the min/max
+ // block variance and scale afterwards. This avoids a costly scaling at
+ // every iteration.
+ unsigned int min_var = UINT_MAX;
+ unsigned int max_var = 0;
+
+ for (y = 0; y < ymis; ++y) {
+ for (x = 0; x < xmis; ++x) {
+ vp9_setup_src_planes(mb, cpi->Source, mi_row + y, mi_col + x);
+ vpx_clear_system_state();
+ var = block_variance(cpi, mb, BLOCK_8X8);
+ vpx_clear_system_state();
+ min_var = VPXMIN(min_var, var);
+ max_var = VPXMAX(max_var, var);
+ }
+ }
+ *min_e = scale_block_energy(cpi, min_var);
+ *max_e = scale_block_energy(cpi, max_var);
+ }
+
+  // Reinstate the source pointers to what they should have been on entry.
+ vp9_setup_src_planes(mb, cpi->Source, mi_row, mi_col);
+}
+
+int vp9_block_energy(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) {
+ unsigned int var;
+ vpx_clear_system_state();
+ var = block_variance(cpi, x, bs);
+ vpx_clear_system_state();
+ return scale_block_energy(cpi, var);
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_aq_variance.h b/media/libvpx/libvpx/vp9/encoder/vp9_aq_variance.h
new file mode 100644
index 0000000000..a4f872879d
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_aq_variance.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_VP9_AQ_VARIANCE_H_
+#define VPX_VP9_ENCODER_VP9_AQ_VARIANCE_H_
+
+#include "vp9/encoder/vp9_encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+unsigned int vp9_vaq_segment_id(int energy);
+void vp9_vaq_frame_setup(VP9_COMP *cpi);
+
+void vp9_get_sub_block_energy(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row,
+ int mi_col, BLOCK_SIZE bsize, int *min_e,
+ int *max_e);
+int vp9_block_energy(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs);
+
+double vp9_log_block_var(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP9_ENCODER_VP9_AQ_VARIANCE_H_
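A short sketch (illustrative only, not part of the patch) of the energy-to-segment mapping used by variance AQ: block energy is clamped to [-4, 1] and indexes the segment_id[] and rate_ratio[] tables in vp9_aq_variance.c, so flat blocks land in boosted segments while busy blocks get a lower rate ratio.

/* Sketch only: the end points of the energy mapping, following the
   segment_id[] = {0, 1, 1, 2, 3, 4} and rate_ratio[] tables above. */
#include "vp9/encoder/vp9_aq_variance.h"

static void vaq_mapping_example(void) {
  const unsigned int flat = vp9_vaq_segment_id(-4); /* segment 0, ratio 2.5 */
  const unsigned int mid = vp9_vaq_segment_id(0);   /* segment 3, ratio 1.0 */
  const unsigned int busy = vp9_vaq_segment_id(1);  /* segment 4, ratio 0.75 */
  (void)flat;
  (void)mid;
  (void)busy;
}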
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_bitstream.c b/media/libvpx/libvpx/vp9/encoder/vp9_bitstream.c
new file mode 100644
index 0000000000..ca56d14aa1
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_bitstream.c
@@ -0,0 +1,1387 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <limits.h>
+
+#include "vpx/vpx_encoder.h"
+#include "vpx_dsp/bitwriter_buffer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem_ops.h"
+#include "vpx_ports/system_state.h"
+#if CONFIG_BITSTREAM_DEBUG
+#include "vpx_util/vpx_debug_util.h"
+#endif // CONFIG_BITSTREAM_DEBUG
+
+#include "vp9/common/vp9_entropy.h"
+#include "vp9/common/vp9_entropymode.h"
+#include "vp9/common/vp9_entropymv.h"
+#include "vp9/common/vp9_mvref_common.h"
+#include "vp9/common/vp9_pred_common.h"
+#include "vp9/common/vp9_seg_common.h"
+#include "vp9/common/vp9_tile_common.h"
+
+#include "vp9/encoder/vp9_cost.h"
+#include "vp9/encoder/vp9_bitstream.h"
+#include "vp9/encoder/vp9_encodemv.h"
+#include "vp9/encoder/vp9_mcomp.h"
+#include "vp9/encoder/vp9_segmentation.h"
+#include "vp9/encoder/vp9_subexp.h"
+#include "vp9/encoder/vp9_tokenize.h"
+
+static const struct vp9_token intra_mode_encodings[INTRA_MODES] = {
+ { 0, 1 }, { 6, 3 }, { 28, 5 }, { 30, 5 }, { 58, 6 },
+ { 59, 6 }, { 126, 7 }, { 127, 7 }, { 62, 6 }, { 2, 2 }
+};
+static const struct vp9_token
+ switchable_interp_encodings[SWITCHABLE_FILTERS] = { { 0, 1 },
+ { 2, 2 },
+ { 3, 2 } };
+static const struct vp9_token partition_encodings[PARTITION_TYPES] = {
+ { 0, 1 }, { 2, 2 }, { 6, 3 }, { 7, 3 }
+};
+static const struct vp9_token inter_mode_encodings[INTER_MODES] = {
+ { 2, 2 }, { 6, 3 }, { 0, 1 }, { 7, 3 }
+};
+
+static void write_intra_mode(vpx_writer *w, PREDICTION_MODE mode,
+ const vpx_prob *probs) {
+ vp9_write_token(w, vp9_intra_mode_tree, probs, &intra_mode_encodings[mode]);
+}
+
+static void write_inter_mode(vpx_writer *w, PREDICTION_MODE mode,
+ const vpx_prob *probs) {
+ assert(is_inter_mode(mode));
+ vp9_write_token(w, vp9_inter_mode_tree, probs,
+ &inter_mode_encodings[INTER_OFFSET(mode)]);
+}
+
+static void encode_unsigned_max(struct vpx_write_bit_buffer *wb, int data,
+ int max) {
+ vpx_wb_write_literal(wb, data, get_unsigned_bits(max));
+}
+
+static void prob_diff_update(const vpx_tree_index *tree,
+ vpx_prob probs[/*n - 1*/],
+ const unsigned int counts[/*n - 1*/], int n,
+ vpx_writer *w) {
+ int i;
+ unsigned int branch_ct[32][2];
+
+ // Assuming max number of probabilities <= 32
+ assert(n <= 32);
+
+ vp9_tree_probs_from_distribution(tree, branch_ct, counts);
+ for (i = 0; i < n - 1; ++i)
+ vp9_cond_prob_diff_update(w, &probs[i], branch_ct[i]);
+}
+
+static void write_selected_tx_size(const VP9_COMMON *cm,
+ const MACROBLOCKD *const xd, vpx_writer *w) {
+ TX_SIZE tx_size = xd->mi[0]->tx_size;
+ BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+ const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
+ const vpx_prob *const tx_probs =
+ get_tx_probs(max_tx_size, get_tx_size_context(xd), &cm->fc->tx_probs);
+ vpx_write(w, tx_size != TX_4X4, tx_probs[0]);
+ if (tx_size != TX_4X4 && max_tx_size >= TX_16X16) {
+ vpx_write(w, tx_size != TX_8X8, tx_probs[1]);
+ if (tx_size != TX_8X8 && max_tx_size >= TX_32X32)
+ vpx_write(w, tx_size != TX_16X16, tx_probs[2]);
+ }
+}
+
+static int write_skip(const VP9_COMMON *cm, const MACROBLOCKD *const xd,
+ int segment_id, const MODE_INFO *mi, vpx_writer *w) {
+ if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
+ return 1;
+ } else {
+ const int skip = mi->skip;
+ vpx_write(w, skip, vp9_get_skip_prob(cm, xd));
+ return skip;
+ }
+}
+
+static void update_skip_probs(VP9_COMMON *cm, vpx_writer *w,
+ FRAME_COUNTS *counts) {
+ int k;
+
+ for (k = 0; k < SKIP_CONTEXTS; ++k)
+ vp9_cond_prob_diff_update(w, &cm->fc->skip_probs[k], counts->skip[k]);
+}
+
+static void update_switchable_interp_probs(VP9_COMMON *cm, vpx_writer *w,
+ FRAME_COUNTS *counts) {
+ int j;
+ for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j)
+ prob_diff_update(vp9_switchable_interp_tree,
+ cm->fc->switchable_interp_prob[j],
+ counts->switchable_interp[j], SWITCHABLE_FILTERS, w);
+}
+
+static void pack_mb_tokens(vpx_writer *w, TOKENEXTRA **tp,
+ const TOKENEXTRA *const stop,
+ vpx_bit_depth_t bit_depth) {
+ const TOKENEXTRA *p;
+ const vp9_extra_bit *const extra_bits =
+#if CONFIG_VP9_HIGHBITDEPTH
+ (bit_depth == VPX_BITS_12) ? vp9_extra_bits_high12
+ : (bit_depth == VPX_BITS_10) ? vp9_extra_bits_high10
+ : vp9_extra_bits;
+#else
+ vp9_extra_bits;
+ (void)bit_depth;
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ for (p = *tp; p < stop && p->token != EOSB_TOKEN; ++p) {
+ if (p->token == EOB_TOKEN) {
+ vpx_write(w, 0, p->context_tree[0]);
+ continue;
+ }
+ vpx_write(w, 1, p->context_tree[0]);
+ while (p->token == ZERO_TOKEN) {
+ vpx_write(w, 0, p->context_tree[1]);
+ ++p;
+ if (p == stop || p->token == EOSB_TOKEN) {
+ *tp = (TOKENEXTRA *)(uintptr_t)p + (p->token == EOSB_TOKEN);
+ return;
+ }
+ }
+
+ {
+ const int t = p->token;
+ const vpx_prob *const context_tree = p->context_tree;
+ assert(t != ZERO_TOKEN);
+ assert(t != EOB_TOKEN);
+ assert(t != EOSB_TOKEN);
+ vpx_write(w, 1, context_tree[1]);
+ if (t == ONE_TOKEN) {
+ vpx_write(w, 0, context_tree[2]);
+ vpx_write_bit(w, p->extra & 1);
+ } else { // t >= TWO_TOKEN && t < EOB_TOKEN
+ const struct vp9_token *const a = &vp9_coef_encodings[t];
+ int v = a->value;
+ int n = a->len;
+ const int e = p->extra;
+ vpx_write(w, 1, context_tree[2]);
+ vp9_write_tree(w, vp9_coef_con_tree,
+ vp9_pareto8_full[context_tree[PIVOT_NODE] - 1], v,
+ n - UNCONSTRAINED_NODES, 0);
+ if (t >= CATEGORY1_TOKEN) {
+ const vp9_extra_bit *const b = &extra_bits[t];
+ const unsigned char *pb = b->prob;
+ v = e >> 1;
+ n = b->len; // number of bits in v, assumed nonzero
+ do {
+ const int bb = (v >> --n) & 1;
+ vpx_write(w, bb, *pb++);
+ } while (n);
+ }
+ vpx_write_bit(w, e & 1);
+ }
+ }
+ }
+ *tp = (TOKENEXTRA *)(uintptr_t)p + (p->token == EOSB_TOKEN);
+}
+
+static void write_segment_id(vpx_writer *w, const struct segmentation *seg,
+ int segment_id) {
+ if (seg->enabled && seg->update_map)
+ vp9_write_tree(w, vp9_segment_tree, seg->tree_probs, segment_id, 3, 0);
+}
+
+// This function encodes the reference frame.
+static void write_ref_frames(const VP9_COMMON *cm, const MACROBLOCKD *const xd,
+ vpx_writer *w) {
+ const MODE_INFO *const mi = xd->mi[0];
+ const int is_compound = has_second_ref(mi);
+ const int segment_id = mi->segment_id;
+
+ // If segment level coding of this signal is disabled...
+ // or the segment allows multiple reference frame options
+ if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
+ assert(!is_compound);
+ assert(mi->ref_frame[0] ==
+ get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME));
+ } else {
+ // does the feature use compound prediction or not
+ // (if not specified at the frame/segment level)
+ if (cm->reference_mode == REFERENCE_MODE_SELECT) {
+ vpx_write(w, is_compound, vp9_get_reference_mode_prob(cm, xd));
+ } else {
+ assert((!is_compound) == (cm->reference_mode == SINGLE_REFERENCE));
+ }
+
+ if (is_compound) {
+ const int idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref];
+ vpx_write(w, mi->ref_frame[!idx] == cm->comp_var_ref[1],
+ vp9_get_pred_prob_comp_ref_p(cm, xd));
+ } else {
+ const int bit0 = mi->ref_frame[0] != LAST_FRAME;
+ vpx_write(w, bit0, vp9_get_pred_prob_single_ref_p1(cm, xd));
+ if (bit0) {
+ const int bit1 = mi->ref_frame[0] != GOLDEN_FRAME;
+ vpx_write(w, bit1, vp9_get_pred_prob_single_ref_p2(cm, xd));
+ }
+ }
+ }
+}
+
+static void pack_inter_mode_mvs(VP9_COMP *cpi, const MACROBLOCKD *const xd,
+ const MB_MODE_INFO_EXT *const mbmi_ext,
+ vpx_writer *w,
+ unsigned int *const max_mv_magnitude,
+ int interp_filter_selected[][SWITCHABLE]) {
+ VP9_COMMON *const cm = &cpi->common;
+ const nmv_context *nmvc = &cm->fc->nmvc;
+ const struct segmentation *const seg = &cm->seg;
+ const MODE_INFO *const mi = xd->mi[0];
+ const PREDICTION_MODE mode = mi->mode;
+ const int segment_id = mi->segment_id;
+ const BLOCK_SIZE bsize = mi->sb_type;
+ const int allow_hp = cm->allow_high_precision_mv;
+ const int is_inter = is_inter_block(mi);
+ const int is_compound = has_second_ref(mi);
+ int skip, ref;
+
+ if (seg->update_map) {
+ if (seg->temporal_update) {
+ const int pred_flag = mi->seg_id_predicted;
+ vpx_prob pred_prob = vp9_get_pred_prob_seg_id(seg, xd);
+ vpx_write(w, pred_flag, pred_prob);
+ if (!pred_flag) write_segment_id(w, seg, segment_id);
+ } else {
+ write_segment_id(w, seg, segment_id);
+ }
+ }
+
+ skip = write_skip(cm, xd, segment_id, mi, w);
+
+ if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
+ vpx_write(w, is_inter, vp9_get_intra_inter_prob(cm, xd));
+
+ if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT &&
+ !(is_inter && skip)) {
+ write_selected_tx_size(cm, xd, w);
+ }
+
+ if (!is_inter) {
+ if (bsize >= BLOCK_8X8) {
+ write_intra_mode(w, mode, cm->fc->y_mode_prob[size_group_lookup[bsize]]);
+ } else {
+ int idx, idy;
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
+ for (idy = 0; idy < 2; idy += num_4x4_h) {
+ for (idx = 0; idx < 2; idx += num_4x4_w) {
+ const PREDICTION_MODE b_mode = mi->bmi[idy * 2 + idx].as_mode;
+ write_intra_mode(w, b_mode, cm->fc->y_mode_prob[0]);
+ }
+ }
+ }
+ write_intra_mode(w, mi->uv_mode, cm->fc->uv_mode_prob[mode]);
+ } else {
+ const int mode_ctx = mbmi_ext->mode_context[mi->ref_frame[0]];
+ const vpx_prob *const inter_probs = cm->fc->inter_mode_probs[mode_ctx];
+ write_ref_frames(cm, xd, w);
+
+    // If segment skip is not enabled, code the mode.
+ if (!segfeature_active(seg, segment_id, SEG_LVL_SKIP)) {
+ if (bsize >= BLOCK_8X8) {
+ write_inter_mode(w, mode, inter_probs);
+ }
+ }
+
+ if (cm->interp_filter == SWITCHABLE) {
+ const int ctx = get_pred_context_switchable_interp(xd);
+ vp9_write_token(w, vp9_switchable_interp_tree,
+ cm->fc->switchable_interp_prob[ctx],
+ &switchable_interp_encodings[mi->interp_filter]);
+ ++interp_filter_selected[0][mi->interp_filter];
+ } else {
+ assert(mi->interp_filter == cm->interp_filter);
+ }
+
+ if (bsize < BLOCK_8X8) {
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
+ int idx, idy;
+ for (idy = 0; idy < 2; idy += num_4x4_h) {
+ for (idx = 0; idx < 2; idx += num_4x4_w) {
+ const int j = idy * 2 + idx;
+ const PREDICTION_MODE b_mode = mi->bmi[j].as_mode;
+ write_inter_mode(w, b_mode, inter_probs);
+ if (b_mode == NEWMV) {
+ for (ref = 0; ref < 1 + is_compound; ++ref)
+ vp9_encode_mv(cpi, w, &mi->bmi[j].as_mv[ref].as_mv,
+ &mbmi_ext->ref_mvs[mi->ref_frame[ref]][0].as_mv,
+ nmvc, allow_hp, max_mv_magnitude);
+ }
+ }
+ }
+ } else {
+ if (mode == NEWMV) {
+ for (ref = 0; ref < 1 + is_compound; ++ref)
+ vp9_encode_mv(cpi, w, &mi->mv[ref].as_mv,
+ &mbmi_ext->ref_mvs[mi->ref_frame[ref]][0].as_mv, nmvc,
+ allow_hp, max_mv_magnitude);
+ }
+ }
+ }
+}
+
+static void write_mb_modes_kf(const VP9_COMMON *cm, const MACROBLOCKD *xd,
+ vpx_writer *w) {
+ const struct segmentation *const seg = &cm->seg;
+ const MODE_INFO *const mi = xd->mi[0];
+ const MODE_INFO *const above_mi = xd->above_mi;
+ const MODE_INFO *const left_mi = xd->left_mi;
+ const BLOCK_SIZE bsize = mi->sb_type;
+
+ if (seg->update_map) write_segment_id(w, seg, mi->segment_id);
+
+ write_skip(cm, xd, mi->segment_id, mi, w);
+
+ if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT)
+ write_selected_tx_size(cm, xd, w);
+
+ if (bsize >= BLOCK_8X8) {
+ write_intra_mode(w, mi->mode, get_y_mode_probs(mi, above_mi, left_mi, 0));
+ } else {
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
+ int idx, idy;
+
+ for (idy = 0; idy < 2; idy += num_4x4_h) {
+ for (idx = 0; idx < 2; idx += num_4x4_w) {
+ const int block = idy * 2 + idx;
+ write_intra_mode(w, mi->bmi[block].as_mode,
+ get_y_mode_probs(mi, above_mi, left_mi, block));
+ }
+ }
+ }
+
+ write_intra_mode(w, mi->uv_mode, vp9_kf_uv_mode_prob[mi->mode]);
+}
+
+static void write_modes_b(VP9_COMP *cpi, MACROBLOCKD *const xd,
+ const TileInfo *const tile, vpx_writer *w,
+ TOKENEXTRA **tok, const TOKENEXTRA *const tok_end,
+ int mi_row, int mi_col,
+ unsigned int *const max_mv_magnitude,
+ int interp_filter_selected[][SWITCHABLE]) {
+ const VP9_COMMON *const cm = &cpi->common;
+ const MB_MODE_INFO_EXT *const mbmi_ext =
+ cpi->td.mb.mbmi_ext_base + (mi_row * cm->mi_cols + mi_col);
+ MODE_INFO *m;
+
+ xd->mi = cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col);
+ m = xd->mi[0];
+
+ set_mi_row_col(xd, tile, mi_row, num_8x8_blocks_high_lookup[m->sb_type],
+ mi_col, num_8x8_blocks_wide_lookup[m->sb_type], cm->mi_rows,
+ cm->mi_cols);
+ if (frame_is_intra_only(cm)) {
+ write_mb_modes_kf(cm, xd, w);
+ } else {
+ pack_inter_mode_mvs(cpi, xd, mbmi_ext, w, max_mv_magnitude,
+ interp_filter_selected);
+ }
+
+ assert(*tok < tok_end);
+ pack_mb_tokens(w, tok, tok_end, cm->bit_depth);
+}
+
+static void write_partition(const VP9_COMMON *const cm,
+ const MACROBLOCKD *const xd, int hbs, int mi_row,
+ int mi_col, PARTITION_TYPE p, BLOCK_SIZE bsize,
+ vpx_writer *w) {
+ const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
+ const vpx_prob *const probs = xd->partition_probs[ctx];
+ const int has_rows = (mi_row + hbs) < cm->mi_rows;
+ const int has_cols = (mi_col + hbs) < cm->mi_cols;
+
+ if (has_rows && has_cols) {
+ vp9_write_token(w, vp9_partition_tree, probs, &partition_encodings[p]);
+ } else if (!has_rows && has_cols) {
+ assert(p == PARTITION_SPLIT || p == PARTITION_HORZ);
+ vpx_write(w, p == PARTITION_SPLIT, probs[1]);
+ } else if (has_rows && !has_cols) {
+ assert(p == PARTITION_SPLIT || p == PARTITION_VERT);
+ vpx_write(w, p == PARTITION_SPLIT, probs[2]);
+ } else {
+ assert(p == PARTITION_SPLIT);
+ }
+}
+
+static void write_modes_sb(VP9_COMP *cpi, MACROBLOCKD *const xd,
+ const TileInfo *const tile, vpx_writer *w,
+ TOKENEXTRA **tok, const TOKENEXTRA *const tok_end,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ unsigned int *const max_mv_magnitude,
+ int interp_filter_selected[][SWITCHABLE]) {
+ const VP9_COMMON *const cm = &cpi->common;
+ const int bsl = b_width_log2_lookup[bsize];
+ const int bs = (1 << bsl) / 4;
+ PARTITION_TYPE partition;
+ BLOCK_SIZE subsize;
+ const MODE_INFO *m = NULL;
+
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
+ m = cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col];
+
+ partition = partition_lookup[bsl][m->sb_type];
+ write_partition(cm, xd, bs, mi_row, mi_col, partition, bsize, w);
+ subsize = get_subsize(bsize, partition);
+ if (subsize < BLOCK_8X8) {
+ write_modes_b(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col,
+ max_mv_magnitude, interp_filter_selected);
+ } else {
+ switch (partition) {
+ case PARTITION_NONE:
+ write_modes_b(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col,
+ max_mv_magnitude, interp_filter_selected);
+ break;
+ case PARTITION_HORZ:
+ write_modes_b(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col,
+ max_mv_magnitude, interp_filter_selected);
+ if (mi_row + bs < cm->mi_rows)
+ write_modes_b(cpi, xd, tile, w, tok, tok_end, mi_row + bs, mi_col,
+ max_mv_magnitude, interp_filter_selected);
+ break;
+ case PARTITION_VERT:
+ write_modes_b(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col,
+ max_mv_magnitude, interp_filter_selected);
+ if (mi_col + bs < cm->mi_cols)
+ write_modes_b(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col + bs,
+ max_mv_magnitude, interp_filter_selected);
+ break;
+ default:
+ assert(partition == PARTITION_SPLIT);
+ write_modes_sb(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col, subsize,
+ max_mv_magnitude, interp_filter_selected);
+ write_modes_sb(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col + bs,
+ subsize, max_mv_magnitude, interp_filter_selected);
+ write_modes_sb(cpi, xd, tile, w, tok, tok_end, mi_row + bs, mi_col,
+ subsize, max_mv_magnitude, interp_filter_selected);
+ write_modes_sb(cpi, xd, tile, w, tok, tok_end, mi_row + bs, mi_col + bs,
+ subsize, max_mv_magnitude, interp_filter_selected);
+ break;
+ }
+ }
+
+ // update partition context
+ if (bsize >= BLOCK_8X8 &&
+ (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
+ update_partition_context(xd, mi_row, mi_col, subsize, bsize);
+}
+
+static void write_modes(VP9_COMP *cpi, MACROBLOCKD *const xd,
+ const TileInfo *const tile, vpx_writer *w, int tile_row,
+ int tile_col, unsigned int *const max_mv_magnitude,
+ int interp_filter_selected[][SWITCHABLE]) {
+ const VP9_COMMON *const cm = &cpi->common;
+ int mi_row, mi_col, tile_sb_row;
+ TOKENEXTRA *tok = NULL;
+ TOKENEXTRA *tok_end = NULL;
+
+ set_partition_probs(cm, xd);
+
+ for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
+ mi_row += MI_BLOCK_SIZE) {
+ tile_sb_row = mi_cols_aligned_to_sb(mi_row - tile->mi_row_start) >>
+ MI_BLOCK_SIZE_LOG2;
+ tok = cpi->tplist[tile_row][tile_col][tile_sb_row].start;
+ tok_end = tok + cpi->tplist[tile_row][tile_col][tile_sb_row].count;
+
+ vp9_zero(xd->left_seg_context);
+ for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
+ mi_col += MI_BLOCK_SIZE)
+ write_modes_sb(cpi, xd, tile, w, &tok, tok_end, mi_row, mi_col,
+ BLOCK_64X64, max_mv_magnitude, interp_filter_selected);
+
+ assert(tok == cpi->tplist[tile_row][tile_col][tile_sb_row].stop);
+ }
+}
+
+static void build_tree_distribution(VP9_COMP *cpi, TX_SIZE tx_size,
+ vp9_coeff_stats *coef_branch_ct,
+ vp9_coeff_probs_model *coef_probs) {
+ vp9_coeff_count *coef_counts = cpi->td.rd_counts.coef_counts[tx_size];
+ unsigned int(*eob_branch_ct)[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS] =
+ cpi->common.counts.eob_branch[tx_size];
+ int i, j, k, l, m;
+
+ for (i = 0; i < PLANE_TYPES; ++i) {
+ for (j = 0; j < REF_TYPES; ++j) {
+ for (k = 0; k < COEF_BANDS; ++k) {
+ for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
+ vp9_tree_probs_from_distribution(vp9_coef_tree,
+ coef_branch_ct[i][j][k][l],
+ coef_counts[i][j][k][l]);
+ coef_branch_ct[i][j][k][l][0][1] =
+ eob_branch_ct[i][j][k][l] - coef_branch_ct[i][j][k][l][0][0];
+ for (m = 0; m < UNCONSTRAINED_NODES; ++m)
+ coef_probs[i][j][k][l][m] =
+ get_binary_prob(coef_branch_ct[i][j][k][l][m][0],
+ coef_branch_ct[i][j][k][l][m][1]);
+ }
+ }
+ }
+ }
+}
+
+static void update_coef_probs_common(vpx_writer *const bc, VP9_COMP *cpi,
+ TX_SIZE tx_size,
+ vp9_coeff_stats *frame_branch_ct,
+ vp9_coeff_probs_model *new_coef_probs) {
+ vp9_coeff_probs_model *old_coef_probs = cpi->common.fc->coef_probs[tx_size];
+ const vpx_prob upd = DIFF_UPDATE_PROB;
+ const int entropy_nodes_update = UNCONSTRAINED_NODES;
+ int i, j, k, l, t;
+ int stepsize = cpi->sf.coeff_prob_appx_step;
+
+ switch (cpi->sf.use_fast_coef_updates) {
+ case TWO_LOOP: {
+ /* dry run to see if there is any update at all needed */
+ int64_t savings = 0;
+ int update[2] = { 0, 0 };
+ for (i = 0; i < PLANE_TYPES; ++i) {
+ for (j = 0; j < REF_TYPES; ++j) {
+ for (k = 0; k < COEF_BANDS; ++k) {
+ for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
+ for (t = 0; t < entropy_nodes_update; ++t) {
+ vpx_prob newp = new_coef_probs[i][j][k][l][t];
+ const vpx_prob oldp = old_coef_probs[i][j][k][l][t];
+ int64_t s;
+ int u = 0;
+ if (t == PIVOT_NODE)
+ s = vp9_prob_diff_update_savings_search_model(
+ frame_branch_ct[i][j][k][l][0], oldp, &newp, upd,
+ stepsize);
+ else
+ s = vp9_prob_diff_update_savings_search(
+ frame_branch_ct[i][j][k][l][t], oldp, &newp, upd);
+ if (s > 0 && newp != oldp) u = 1;
+ if (u)
+ savings += s - (int)(vp9_cost_zero(upd));
+ else
+ savings -= (int)(vp9_cost_zero(upd));
+ update[u]++;
+ }
+ }
+ }
+ }
+ }
+
+ // printf("Update %d %d, savings %d\n", update[0], update[1], savings);
+ /* Is coef updated at all */
+ if (update[1] == 0 || savings < 0) {
+ vpx_write_bit(bc, 0);
+ return;
+ }
+ vpx_write_bit(bc, 1);
+ for (i = 0; i < PLANE_TYPES; ++i) {
+ for (j = 0; j < REF_TYPES; ++j) {
+ for (k = 0; k < COEF_BANDS; ++k) {
+ for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
+ // calc probs and branch cts for this frame only
+ for (t = 0; t < entropy_nodes_update; ++t) {
+ vpx_prob newp = new_coef_probs[i][j][k][l][t];
+ vpx_prob *oldp = old_coef_probs[i][j][k][l] + t;
+ int64_t s;
+ int u = 0;
+ if (t == PIVOT_NODE)
+ s = vp9_prob_diff_update_savings_search_model(
+ frame_branch_ct[i][j][k][l][0], *oldp, &newp, upd,
+ stepsize);
+ else
+ s = vp9_prob_diff_update_savings_search(
+ frame_branch_ct[i][j][k][l][t], *oldp, &newp, upd);
+ if (s > 0 && newp != *oldp) u = 1;
+ vpx_write(bc, u, upd);
+ if (u) {
+ /* send/use new probability */
+ vp9_write_prob_diff_update(bc, newp, *oldp);
+ *oldp = newp;
+ }
+ }
+ }
+ }
+ }
+ }
+ return;
+ }
+
+ default: {
+ int updates = 0;
+ int noupdates_before_first = 0;
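+      // In this reduced one-loop scheme the leading "no update" flags are not
+      // written immediately: they are counted in noupdates_before_first and
+      // flushed (after the single "some updates" bit) once the first real
+      // update is found. If no update is ever found, a single 0 bit at the
+      // end covers the whole set.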
+ assert(cpi->sf.use_fast_coef_updates == ONE_LOOP_REDUCED);
+ for (i = 0; i < PLANE_TYPES; ++i) {
+ for (j = 0; j < REF_TYPES; ++j) {
+ for (k = 0; k < COEF_BANDS; ++k) {
+ for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
+ // calc probs and branch cts for this frame only
+ for (t = 0; t < entropy_nodes_update; ++t) {
+ vpx_prob newp = new_coef_probs[i][j][k][l][t];
+ vpx_prob *oldp = old_coef_probs[i][j][k][l] + t;
+ int64_t s;
+ int u = 0;
+
+ if (t == PIVOT_NODE) {
+ s = vp9_prob_diff_update_savings_search_model(
+ frame_branch_ct[i][j][k][l][0], *oldp, &newp, upd,
+ stepsize);
+ } else {
+ s = vp9_prob_diff_update_savings_search(
+ frame_branch_ct[i][j][k][l][t], *oldp, &newp, upd);
+ }
+
+ if (s > 0 && newp != *oldp) u = 1;
+ updates += u;
+ if (u == 0 && updates == 0) {
+ noupdates_before_first++;
+ continue;
+ }
+ if (u == 1 && updates == 1) {
+ int v;
+ // first update
+ vpx_write_bit(bc, 1);
+ for (v = 0; v < noupdates_before_first; ++v)
+ vpx_write(bc, 0, upd);
+ }
+ vpx_write(bc, u, upd);
+ if (u) {
+ /* send/use new probability */
+ vp9_write_prob_diff_update(bc, newp, *oldp);
+ *oldp = newp;
+ }
+ }
+ }
+ }
+ }
+ }
+ if (updates == 0) {
+ vpx_write_bit(bc, 0); // no updates
+ }
+ return;
+ }
+ }
+}
+
+static void update_coef_probs(VP9_COMP *cpi, vpx_writer *w) {
+ const TX_MODE tx_mode = cpi->common.tx_mode;
+ const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode];
+ TX_SIZE tx_size;
+ for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size) {
+ vp9_coeff_stats frame_branch_ct[PLANE_TYPES];
+ vp9_coeff_probs_model frame_coef_probs[PLANE_TYPES];
+ if (cpi->td.counts->tx.tx_totals[tx_size] <= 20 ||
+ (tx_size >= TX_16X16 && cpi->sf.tx_size_search_method == USE_TX_8X8)) {
+ vpx_write_bit(w, 0);
+ } else {
+ build_tree_distribution(cpi, tx_size, frame_branch_ct, frame_coef_probs);
+ update_coef_probs_common(w, cpi, tx_size, frame_branch_ct,
+ frame_coef_probs);
+ }
+ }
+}
+
+static void encode_loopfilter(struct loopfilter *lf,
+ struct vpx_write_bit_buffer *wb) {
+ int i;
+
+ // Encode the loop filter level and type
+ vpx_wb_write_literal(wb, lf->filter_level, 6);
+ vpx_wb_write_literal(wb, lf->sharpness_level, 3);
+
+ // Write out loop filter deltas applied at the MB level based on mode or
+ // ref frame (if they are enabled).
+ vpx_wb_write_bit(wb, lf->mode_ref_delta_enabled);
+
+ if (lf->mode_ref_delta_enabled) {
+ vpx_wb_write_bit(wb, lf->mode_ref_delta_update);
+ if (lf->mode_ref_delta_update) {
+ for (i = 0; i < MAX_REF_LF_DELTAS; i++) {
+ const int delta = lf->ref_deltas[i];
+ const int changed = delta != lf->last_ref_deltas[i];
+ vpx_wb_write_bit(wb, changed);
+ if (changed) {
+ lf->last_ref_deltas[i] = delta;
+ vpx_wb_write_literal(wb, abs(delta) & 0x3F, 6);
+ vpx_wb_write_bit(wb, delta < 0);
+ }
+ }
+
+ for (i = 0; i < MAX_MODE_LF_DELTAS; i++) {
+ const int delta = lf->mode_deltas[i];
+ const int changed = delta != lf->last_mode_deltas[i];
+ vpx_wb_write_bit(wb, changed);
+ if (changed) {
+ lf->last_mode_deltas[i] = delta;
+ vpx_wb_write_literal(wb, abs(delta) & 0x3F, 6);
+ vpx_wb_write_bit(wb, delta < 0);
+ }
+ }
+ }
+ }
+}
+
+static void write_delta_q(struct vpx_write_bit_buffer *wb, int delta_q) {
+ if (delta_q != 0) {
+ vpx_wb_write_bit(wb, 1);
+ vpx_wb_write_literal(wb, abs(delta_q), 4);
+ vpx_wb_write_bit(wb, delta_q < 0);
+ } else {
+ vpx_wb_write_bit(wb, 0);
+ }
+}
+
+static void encode_quantization(const VP9_COMMON *const cm,
+ struct vpx_write_bit_buffer *wb) {
+ vpx_wb_write_literal(wb, cm->base_qindex, QINDEX_BITS);
+ write_delta_q(wb, cm->y_dc_delta_q);
+ write_delta_q(wb, cm->uv_dc_delta_q);
+ write_delta_q(wb, cm->uv_ac_delta_q);
+}
+
+static void encode_segmentation(VP9_COMMON *cm, MACROBLOCKD *xd,
+ struct vpx_write_bit_buffer *wb) {
+ int i, j;
+
+ const struct segmentation *seg = &cm->seg;
+
+ vpx_wb_write_bit(wb, seg->enabled);
+ if (!seg->enabled) return;
+
+ // Segmentation map
+ vpx_wb_write_bit(wb, seg->update_map);
+ if (seg->update_map) {
+ // Select the coding strategy (temporal or spatial)
+ vp9_choose_segmap_coding_method(cm, xd);
+ // Write out probabilities used to decode unpredicted macro-block segments
+ for (i = 0; i < SEG_TREE_PROBS; i++) {
+ const int prob = seg->tree_probs[i];
+ const int update = prob != MAX_PROB;
+ vpx_wb_write_bit(wb, update);
+ if (update) vpx_wb_write_literal(wb, prob, 8);
+ }
+
+ // Write out the chosen coding method.
+ vpx_wb_write_bit(wb, seg->temporal_update);
+ if (seg->temporal_update) {
+ for (i = 0; i < PREDICTION_PROBS; i++) {
+ const int prob = seg->pred_probs[i];
+ const int update = prob != MAX_PROB;
+ vpx_wb_write_bit(wb, update);
+ if (update) vpx_wb_write_literal(wb, prob, 8);
+ }
+ }
+ }
+
+ // Segmentation data
+ vpx_wb_write_bit(wb, seg->update_data);
+ if (seg->update_data) {
+ vpx_wb_write_bit(wb, seg->abs_delta);
+
+ for (i = 0; i < MAX_SEGMENTS; i++) {
+ for (j = 0; j < SEG_LVL_MAX; j++) {
+ const int active = segfeature_active(seg, i, j);
+ vpx_wb_write_bit(wb, active);
+ if (active) {
+ const int data = get_segdata(seg, i, j);
+ const int data_max = vp9_seg_feature_data_max(j);
+
+ if (vp9_is_segfeature_signed(j)) {
+ encode_unsigned_max(wb, abs(data), data_max);
+ vpx_wb_write_bit(wb, data < 0);
+ } else {
+ encode_unsigned_max(wb, data, data_max);
+ }
+ }
+ }
+ }
+ }
+}
+
+static void encode_txfm_probs(VP9_COMMON *cm, vpx_writer *w,
+ FRAME_COUNTS *counts) {
+ // Mode
+ vpx_write_literal(w, VPXMIN(cm->tx_mode, ALLOW_32X32), 2);
+ if (cm->tx_mode >= ALLOW_32X32)
+ vpx_write_bit(w, cm->tx_mode == TX_MODE_SELECT);
+
+ // Probabilities
+ if (cm->tx_mode == TX_MODE_SELECT) {
+ int i, j;
+ unsigned int ct_8x8p[TX_SIZES - 3][2];
+ unsigned int ct_16x16p[TX_SIZES - 2][2];
+ unsigned int ct_32x32p[TX_SIZES - 1][2];
+
+ for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
+ tx_counts_to_branch_counts_8x8(counts->tx.p8x8[i], ct_8x8p);
+ for (j = 0; j < TX_SIZES - 3; j++)
+ vp9_cond_prob_diff_update(w, &cm->fc->tx_probs.p8x8[i][j], ct_8x8p[j]);
+ }
+
+ for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
+ tx_counts_to_branch_counts_16x16(counts->tx.p16x16[i], ct_16x16p);
+ for (j = 0; j < TX_SIZES - 2; j++)
+ vp9_cond_prob_diff_update(w, &cm->fc->tx_probs.p16x16[i][j],
+ ct_16x16p[j]);
+ }
+
+ for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
+ tx_counts_to_branch_counts_32x32(counts->tx.p32x32[i], ct_32x32p);
+ for (j = 0; j < TX_SIZES - 1; j++)
+ vp9_cond_prob_diff_update(w, &cm->fc->tx_probs.p32x32[i][j],
+ ct_32x32p[j]);
+ }
+ }
+}
+
+static void write_interp_filter(INTERP_FILTER filter,
+ struct vpx_write_bit_buffer *wb) {
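+  // filter_to_literal maps the encoder's INTERP_FILTER enum order (EIGHTTAP,
+  // EIGHTTAP_SMOOTH, EIGHTTAP_SHARP, BILINEAR) onto the bitstream literals,
+  // in which EIGHTTAP_SMOOTH is coded as 0 and EIGHTTAP as 1.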
+ const int filter_to_literal[] = { 1, 0, 2, 3 };
+
+ vpx_wb_write_bit(wb, filter == SWITCHABLE);
+ if (filter != SWITCHABLE)
+ vpx_wb_write_literal(wb, filter_to_literal[filter], 2);
+}
+
+static void fix_interp_filter(VP9_COMMON *cm, FRAME_COUNTS *counts) {
+ if (cm->interp_filter == SWITCHABLE) {
+ // Check to see if only one of the filters is actually used
+ int count[SWITCHABLE_FILTERS];
+ int i, j, c = 0;
+ for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
+ count[i] = 0;
+ for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j)
+ count[i] += counts->switchable_interp[j][i];
+ c += (count[i] > 0);
+ }
+ if (c == 1) {
+      // Only one filter is used, so set that filter at the frame level.
+ for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
+ if (count[i]) {
+ cm->interp_filter = i;
+ break;
+ }
+ }
+ }
+ }
+}
+
+static void write_tile_info(const VP9_COMMON *const cm,
+ struct vpx_write_bit_buffer *wb) {
+ int min_log2_tile_cols, max_log2_tile_cols, ones;
+ vp9_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols);
+
+ // columns
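+  // The number of tile columns is coded as a unary offset from the minimum:
+  // one 1 bit per doubling above min_log2_tile_cols, terminated by a 0 bit
+  // unless the maximum has been reached (e.g. min 0, log2_tile_cols 2,
+  // max 6 is coded as "110").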
+ ones = cm->log2_tile_cols - min_log2_tile_cols;
+ while (ones--) vpx_wb_write_bit(wb, 1);
+
+ if (cm->log2_tile_cols < max_log2_tile_cols) vpx_wb_write_bit(wb, 0);
+
+ // rows
+ vpx_wb_write_bit(wb, cm->log2_tile_rows != 0);
+ if (cm->log2_tile_rows != 0) vpx_wb_write_bit(wb, cm->log2_tile_rows != 1);
+}
+
+int vp9_get_refresh_mask(VP9_COMP *cpi) {
+ if (vp9_preserve_existing_gf(cpi)) {
+ // We have decided to preserve the previously existing golden frame as our
+ // new ARF frame. However, in the short term we leave it in the GF slot and,
+ // if we're updating the GF with the current decoded frame, we save it
+ // instead to the ARF slot.
+ // Later, in the function vp9_encoder.c:vp9_update_reference_frames() we
+ // will swap gld_fb_idx and alt_fb_idx to achieve our objective. We do it
+ // there so that it can be done outside of the recode loop.
+ // Note: This is highly specific to the use of ARF as a forward reference,
+ // and this needs to be generalized as other uses are implemented
+ // (like RTC/temporal scalability).
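+    // For example (illustrative indices), with lst_fb_idx == 0 and
+    // alt_fb_idx == 6 and both refresh flags set, the returned mask is
+    // (1 << 0) | (1 << 6) == 0x41.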
+ return (cpi->refresh_last_frame << cpi->lst_fb_idx) |
+ (cpi->refresh_golden_frame << cpi->alt_fb_idx);
+ } else {
+ int arf_idx = cpi->alt_fb_idx;
+ GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+
+ if (cpi->multi_layer_arf) {
+ for (arf_idx = 0; arf_idx < REF_FRAMES; ++arf_idx) {
+ if (arf_idx != cpi->alt_fb_idx && arf_idx != cpi->lst_fb_idx &&
+ arf_idx != cpi->gld_fb_idx) {
+ int idx;
+ for (idx = 0; idx < gf_group->stack_size; ++idx)
+ if (arf_idx == gf_group->arf_index_stack[idx]) break;
+ if (idx == gf_group->stack_size) break;
+ }
+ }
+ }
+ cpi->twopass.gf_group.top_arf_idx = arf_idx;
+
+ if (cpi->use_svc && cpi->svc.use_set_ref_frame_config &&
+ cpi->svc.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS)
+ return cpi->svc.update_buffer_slot[cpi->svc.spatial_layer_id];
+ return (cpi->refresh_last_frame << cpi->lst_fb_idx) |
+ (cpi->refresh_golden_frame << cpi->gld_fb_idx) |
+ (cpi->refresh_alt_ref_frame << arf_idx);
+ }
+}
+
+static int encode_tile_worker(void *arg1, void *arg2) {
+ VP9_COMP *cpi = (VP9_COMP *)arg1;
+ VP9BitstreamWorkerData *data = (VP9BitstreamWorkerData *)arg2;
+ MACROBLOCKD *const xd = &data->xd;
+ const int tile_row = 0;
+ vpx_start_encode(&data->bit_writer, data->dest);
+ write_modes(cpi, xd, &cpi->tile_data[data->tile_idx].tile_info,
+ &data->bit_writer, tile_row, data->tile_idx,
+ &data->max_mv_magnitude, data->interp_filter_selected);
+ vpx_stop_encode(&data->bit_writer);
+ return 1;
+}
+
+void vp9_bitstream_encode_tiles_buffer_dealloc(VP9_COMP *const cpi) {
+ if (cpi->vp9_bitstream_worker_data) {
+ int i;
+ for (i = 1; i < cpi->num_workers; ++i) {
+ vpx_free(cpi->vp9_bitstream_worker_data[i].dest);
+ }
+ vpx_free(cpi->vp9_bitstream_worker_data);
+ cpi->vp9_bitstream_worker_data = NULL;
+ }
+}
+
+static void encode_tiles_buffer_alloc(VP9_COMP *const cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ int i;
+ const size_t worker_data_size =
+ cpi->num_workers * sizeof(*cpi->vp9_bitstream_worker_data);
+ CHECK_MEM_ERROR(&cm->error, cpi->vp9_bitstream_worker_data,
+ vpx_memalign(16, worker_data_size));
+ memset(cpi->vp9_bitstream_worker_data, 0, worker_data_size);
+ for (i = 1; i < cpi->num_workers; ++i) {
+ cpi->vp9_bitstream_worker_data[i].dest_size =
+ cpi->oxcf.width * cpi->oxcf.height;
+ CHECK_MEM_ERROR(&cm->error, cpi->vp9_bitstream_worker_data[i].dest,
+ vpx_malloc(cpi->vp9_bitstream_worker_data[i].dest_size));
+ }
+}
+
+static size_t encode_tiles_mt(VP9_COMP *cpi, uint8_t *data_ptr) {
+ const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
+ VP9_COMMON *const cm = &cpi->common;
+ const int tile_cols = 1 << cm->log2_tile_cols;
+ const int num_workers = cpi->num_workers;
+ size_t total_size = 0;
+ int tile_col = 0;
+
+ if (!cpi->vp9_bitstream_worker_data ||
+ cpi->vp9_bitstream_worker_data[1].dest_size >
+ (cpi->oxcf.width * cpi->oxcf.height)) {
+ vp9_bitstream_encode_tiles_buffer_dealloc(cpi);
+ encode_tiles_buffer_alloc(cpi);
+ }
+
+ while (tile_col < tile_cols) {
+ int i, j;
+ for (i = 0; i < num_workers && tile_col < tile_cols; ++i) {
+ VPxWorker *const worker = &cpi->workers[i];
+ VP9BitstreamWorkerData *const data = &cpi->vp9_bitstream_worker_data[i];
+
+ // Populate the worker data.
+ data->xd = cpi->td.mb.e_mbd;
+ data->tile_idx = tile_col;
+ data->max_mv_magnitude = cpi->max_mv_magnitude;
+ memset(data->interp_filter_selected, 0,
+ sizeof(data->interp_filter_selected[0][0]) * SWITCHABLE);
+
+ // First thread can directly write into the output buffer.
+ if (i == 0) {
+ // If this worker happens to be for the last tile, then do not offset it
+ // by 4 for the tile size.
+ data->dest =
+ data_ptr + total_size + (tile_col == tile_cols - 1 ? 0 : 4);
+ }
+ worker->data1 = cpi;
+ worker->data2 = data;
+ worker->hook = encode_tile_worker;
+ worker->had_error = 0;
+
+ if (i < num_workers - 1) {
+ winterface->launch(worker);
+ } else {
+ winterface->execute(worker);
+ }
+ ++tile_col;
+ }
+ for (j = 0; j < i; ++j) {
+ VPxWorker *const worker = &cpi->workers[j];
+ VP9BitstreamWorkerData *const data =
+ (VP9BitstreamWorkerData *)worker->data2;
+ uint32_t tile_size;
+ int k;
+
+ if (!winterface->sync(worker)) return 0;
+ tile_size = data->bit_writer.pos;
+
+ // Aggregate per-thread bitstream stats.
+ cpi->max_mv_magnitude =
+ VPXMAX(cpi->max_mv_magnitude, data->max_mv_magnitude);
+ for (k = 0; k < SWITCHABLE; ++k) {
+ cpi->interp_filter_selected[0][k] += data->interp_filter_selected[0][k];
+ }
+
+ // Prefix the size of the tile on all but the last.
+ if (tile_col != tile_cols || j < i - 1) {
+ mem_put_be32(data_ptr + total_size, tile_size);
+ total_size += 4;
+ }
+ if (j > 0) {
+ memcpy(data_ptr + total_size, data->dest, tile_size);
+ }
+ total_size += tile_size;
+ }
+ }
+ return total_size;
+}
+
+static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) {
+ VP9_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+ vpx_writer residual_bc;
+ int tile_row, tile_col;
+ size_t total_size = 0;
+ const int tile_cols = 1 << cm->log2_tile_cols;
+ const int tile_rows = 1 << cm->log2_tile_rows;
+
+ memset(cm->above_seg_context, 0,
+ sizeof(*cm->above_seg_context) * mi_cols_aligned_to_sb(cm->mi_cols));
+
+  // Tiles are encoded in parallel only in realtime mode for now. In other
+  // modes the speed-up is insignificant and requires further testing to
+  // ensure that it does not make the overall process worse in any case.
+ if (cpi->oxcf.mode == REALTIME && cpi->num_workers > 1 && tile_rows == 1 &&
+ tile_cols > 1) {
+ return encode_tiles_mt(cpi, data_ptr);
+ }
+
+ for (tile_row = 0; tile_row < tile_rows; tile_row++) {
+ for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+ int tile_idx = tile_row * tile_cols + tile_col;
+
+ if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1)
+ vpx_start_encode(&residual_bc, data_ptr + total_size + 4);
+ else
+ vpx_start_encode(&residual_bc, data_ptr + total_size);
+
+ write_modes(cpi, xd, &cpi->tile_data[tile_idx].tile_info, &residual_bc,
+ tile_row, tile_col, &cpi->max_mv_magnitude,
+ cpi->interp_filter_selected);
+
+ vpx_stop_encode(&residual_bc);
+ if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1) {
+ // size of this tile
+ mem_put_be32(data_ptr + total_size, residual_bc.pos);
+ total_size += 4;
+ }
+
+ total_size += residual_bc.pos;
+ }
+ }
+ return total_size;
+}
+
+static void write_render_size(const VP9_COMMON *cm,
+ struct vpx_write_bit_buffer *wb) {
+ const int scaling_active =
+ cm->width != cm->render_width || cm->height != cm->render_height;
+ vpx_wb_write_bit(wb, scaling_active);
+ if (scaling_active) {
+ vpx_wb_write_literal(wb, cm->render_width - 1, 16);
+ vpx_wb_write_literal(wb, cm->render_height - 1, 16);
+ }
+}
+
+static void write_frame_size(const VP9_COMMON *cm,
+ struct vpx_write_bit_buffer *wb) {
+ vpx_wb_write_literal(wb, cm->width - 1, 16);
+ vpx_wb_write_literal(wb, cm->height - 1, 16);
+
+ write_render_size(cm, wb);
+}
+
+static void write_frame_size_with_refs(VP9_COMP *cpi,
+ struct vpx_write_bit_buffer *wb) {
+ VP9_COMMON *const cm = &cpi->common;
+ int found = 0;
+
+ MV_REFERENCE_FRAME ref_frame;
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, ref_frame);
+
+ // Set "found" to 0 for temporal svc and for spatial svc key frame
+ if (cpi->use_svc &&
+ ((cpi->svc.number_temporal_layers > 1 &&
+ cpi->oxcf.rc_mode == VPX_CBR) ||
+ (cpi->svc.number_spatial_layers > 1 &&
+ cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame))) {
+ found = 0;
+ } else if (cfg != NULL) {
+ found =
+ cm->width == cfg->y_crop_width && cm->height == cfg->y_crop_height;
+ }
+ vpx_wb_write_bit(wb, found);
+ if (found) {
+ break;
+ }
+ }
+
+ if (!found) {
+ vpx_wb_write_literal(wb, cm->width - 1, 16);
+ vpx_wb_write_literal(wb, cm->height - 1, 16);
+ }
+
+ write_render_size(cm, wb);
+}
+
+static void write_sync_code(struct vpx_write_bit_buffer *wb) {
+ vpx_wb_write_literal(wb, VP9_SYNC_CODE_0, 8);
+ vpx_wb_write_literal(wb, VP9_SYNC_CODE_1, 8);
+ vpx_wb_write_literal(wb, VP9_SYNC_CODE_2, 8);
+}
+
+static void write_profile(BITSTREAM_PROFILE profile,
+ struct vpx_write_bit_buffer *wb) {
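+  // The profile is signalled low bit first, then high bit, plus a reserved 0
+  // bit for profile 3. vpx_wb_write_literal() emits the most significant bit
+  // first, hence the literals 0 ("00"), 2 ("10"), 1 ("01") and 6 ("110").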
+ switch (profile) {
+ case PROFILE_0: vpx_wb_write_literal(wb, 0, 2); break;
+ case PROFILE_1: vpx_wb_write_literal(wb, 2, 2); break;
+ case PROFILE_2: vpx_wb_write_literal(wb, 1, 2); break;
+ default:
+ assert(profile == PROFILE_3);
+ vpx_wb_write_literal(wb, 6, 3);
+ break;
+ }
+}
+
+static void write_bitdepth_colorspace_sampling(
+ VP9_COMMON *const cm, struct vpx_write_bit_buffer *wb) {
+ if (cm->profile >= PROFILE_2) {
+ assert(cm->bit_depth > VPX_BITS_8);
+ vpx_wb_write_bit(wb, cm->bit_depth == VPX_BITS_10 ? 0 : 1);
+ }
+ vpx_wb_write_literal(wb, cm->color_space, 3);
+ if (cm->color_space != VPX_CS_SRGB) {
+ // 0: [16, 235] (i.e. xvYCC), 1: [0, 255]
+ vpx_wb_write_bit(wb, cm->color_range);
+ if (cm->profile == PROFILE_1 || cm->profile == PROFILE_3) {
+ assert(cm->subsampling_x != 1 || cm->subsampling_y != 1);
+ vpx_wb_write_bit(wb, cm->subsampling_x);
+ vpx_wb_write_bit(wb, cm->subsampling_y);
+ vpx_wb_write_bit(wb, 0); // unused
+ } else {
+ assert(cm->subsampling_x == 1 && cm->subsampling_y == 1);
+ }
+ } else {
+ assert(cm->profile == PROFILE_1 || cm->profile == PROFILE_3);
+ vpx_wb_write_bit(wb, 0); // unused
+ }
+}
+
+static void write_uncompressed_header(VP9_COMP *cpi,
+ struct vpx_write_bit_buffer *wb) {
+ VP9_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+
+ vpx_wb_write_literal(wb, VP9_FRAME_MARKER, 2);
+
+ write_profile(cm->profile, wb);
+
+  // Whether to use show_existing_frame.
+ vpx_wb_write_bit(wb, cm->show_existing_frame);
+ if (cm->show_existing_frame) {
+ vpx_wb_write_literal(wb, cpi->alt_fb_idx, 3);
+ return;
+ }
+
+ vpx_wb_write_bit(wb, cm->frame_type);
+ vpx_wb_write_bit(wb, cm->show_frame);
+ vpx_wb_write_bit(wb, cm->error_resilient_mode);
+
+ if (cm->frame_type == KEY_FRAME) {
+ write_sync_code(wb);
+ write_bitdepth_colorspace_sampling(cm, wb);
+ write_frame_size(cm, wb);
+ } else {
+ if (!cm->show_frame) vpx_wb_write_bit(wb, cm->intra_only);
+
+ if (!cm->error_resilient_mode)
+ vpx_wb_write_literal(wb, cm->reset_frame_context, 2);
+
+ if (cm->intra_only) {
+ write_sync_code(wb);
+
+      // Note: for profile 0, 4:2:0 8bpp is assumed.
+ if (cm->profile > PROFILE_0) {
+ write_bitdepth_colorspace_sampling(cm, wb);
+ }
+
+ vpx_wb_write_literal(wb, vp9_get_refresh_mask(cpi), REF_FRAMES);
+ write_frame_size(cm, wb);
+ } else {
+ MV_REFERENCE_FRAME ref_frame;
+ vpx_wb_write_literal(wb, vp9_get_refresh_mask(cpi), REF_FRAMES);
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ assert(get_ref_frame_map_idx(cpi, ref_frame) != INVALID_IDX);
+ vpx_wb_write_literal(wb, get_ref_frame_map_idx(cpi, ref_frame),
+ REF_FRAMES_LOG2);
+ vpx_wb_write_bit(wb, cm->ref_frame_sign_bias[ref_frame]);
+ }
+
+ write_frame_size_with_refs(cpi, wb);
+
+ vpx_wb_write_bit(wb, cm->allow_high_precision_mv);
+
+ fix_interp_filter(cm, cpi->td.counts);
+ write_interp_filter(cm->interp_filter, wb);
+ }
+ }
+
+ if (!cm->error_resilient_mode) {
+ vpx_wb_write_bit(wb, cm->refresh_frame_context);
+ vpx_wb_write_bit(wb, cm->frame_parallel_decoding_mode);
+ }
+
+ vpx_wb_write_literal(wb, cm->frame_context_idx, FRAME_CONTEXTS_LOG2);
+
+ encode_loopfilter(&cm->lf, wb);
+ encode_quantization(cm, wb);
+ encode_segmentation(cm, xd, wb);
+
+ write_tile_info(cm, wb);
+}
+
+static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
+ VP9_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+ FRAME_CONTEXT *const fc = cm->fc;
+ FRAME_COUNTS *counts = cpi->td.counts;
+ vpx_writer header_bc;
+
+ vpx_start_encode(&header_bc, data);
+
+ if (xd->lossless)
+ cm->tx_mode = ONLY_4X4;
+ else
+ encode_txfm_probs(cm, &header_bc, counts);
+
+ update_coef_probs(cpi, &header_bc);
+ update_skip_probs(cm, &header_bc, counts);
+
+ if (!frame_is_intra_only(cm)) {
+ int i;
+
+ for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
+ prob_diff_update(vp9_inter_mode_tree, cm->fc->inter_mode_probs[i],
+ counts->inter_mode[i], INTER_MODES, &header_bc);
+
+ if (cm->interp_filter == SWITCHABLE)
+ update_switchable_interp_probs(cm, &header_bc, counts);
+
+ for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
+ vp9_cond_prob_diff_update(&header_bc, &fc->intra_inter_prob[i],
+ counts->intra_inter[i]);
+
+ if (cpi->allow_comp_inter_inter) {
+ const int use_compound_pred = cm->reference_mode != SINGLE_REFERENCE;
+ const int use_hybrid_pred = cm->reference_mode == REFERENCE_MODE_SELECT;
+
+ vpx_write_bit(&header_bc, use_compound_pred);
+ if (use_compound_pred) {
+ vpx_write_bit(&header_bc, use_hybrid_pred);
+ if (use_hybrid_pred)
+ for (i = 0; i < COMP_INTER_CONTEXTS; i++)
+ vp9_cond_prob_diff_update(&header_bc, &fc->comp_inter_prob[i],
+ counts->comp_inter[i]);
+ }
+ }
+
+ if (cm->reference_mode != COMPOUND_REFERENCE) {
+ for (i = 0; i < REF_CONTEXTS; i++) {
+ vp9_cond_prob_diff_update(&header_bc, &fc->single_ref_prob[i][0],
+ counts->single_ref[i][0]);
+ vp9_cond_prob_diff_update(&header_bc, &fc->single_ref_prob[i][1],
+ counts->single_ref[i][1]);
+ }
+ }
+
+ if (cm->reference_mode != SINGLE_REFERENCE)
+ for (i = 0; i < REF_CONTEXTS; i++)
+ vp9_cond_prob_diff_update(&header_bc, &fc->comp_ref_prob[i],
+ counts->comp_ref[i]);
+
+ for (i = 0; i < BLOCK_SIZE_GROUPS; ++i)
+ prob_diff_update(vp9_intra_mode_tree, cm->fc->y_mode_prob[i],
+ counts->y_mode[i], INTRA_MODES, &header_bc);
+
+ for (i = 0; i < PARTITION_CONTEXTS; ++i)
+ prob_diff_update(vp9_partition_tree, fc->partition_prob[i],
+ counts->partition[i], PARTITION_TYPES, &header_bc);
+
+ vp9_write_nmv_probs(cm, cm->allow_high_precision_mv, &header_bc,
+ &counts->mv);
+ }
+
+ vpx_stop_encode(&header_bc);
+ assert(header_bc.pos <= 0xffff);
+
+ return header_bc.pos;
+}
+
+void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t *size) {
+ uint8_t *data = dest;
+ size_t first_part_size, uncompressed_hdr_size;
+ struct vpx_write_bit_buffer wb = { data, 0 };
+ struct vpx_write_bit_buffer saved_wb;
+
+#if CONFIG_BITSTREAM_DEBUG
+ bitstream_queue_reset_write();
+#endif
+
+ write_uncompressed_header(cpi, &wb);
+
+  // Skip the rest of the coding process when showing an existing frame.
+ if (cpi->common.show_existing_frame) {
+ uncompressed_hdr_size = vpx_wb_bytes_written(&wb);
+ data += uncompressed_hdr_size;
+ *size = data - dest;
+ return;
+ }
+
+ saved_wb = wb;
+  vpx_wb_write_literal(&wb, 0, 16);  // placeholder: first part size not yet known
+
+ uncompressed_hdr_size = vpx_wb_bytes_written(&wb);
+ data += uncompressed_hdr_size;
+
+ vpx_clear_system_state();
+
+ first_part_size = write_compressed_header(cpi, data);
+ data += first_part_size;
+ // TODO(jbb): Figure out what to do if first_part_size > 16 bits.
+ vpx_wb_write_literal(&saved_wb, (int)first_part_size, 16);
+
+ data += encode_tiles(cpi, data);
+
+ *size = data - dest;
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_bitstream.h b/media/libvpx/libvpx/vp9/encoder/vp9_bitstream.h
new file mode 100644
index 0000000000..208651dc22
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_bitstream.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_VP9_BITSTREAM_H_
+#define VPX_VP9_ENCODER_VP9_BITSTREAM_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "vp9/encoder/vp9_encoder.h"
+
+typedef struct VP9BitstreamWorkerData {
+ uint8_t *dest;
+ int dest_size;
+ vpx_writer bit_writer;
+ int tile_idx;
+ unsigned int max_mv_magnitude;
+ // The size of interp_filter_selected in VP9_COMP is actually
+ // MAX_REFERENCE_FRAMES x SWITCHABLE. But when encoding tiles, all we ever do
+ // is increment the very first index (index 0) for the first dimension. Hence
+ // this is sufficient.
+ int interp_filter_selected[1][SWITCHABLE];
+ DECLARE_ALIGNED(16, MACROBLOCKD, xd);
+} VP9BitstreamWorkerData;
+
+int vp9_get_refresh_mask(VP9_COMP *cpi);
+
+void vp9_bitstream_encode_tiles_buffer_dealloc(VP9_COMP *const cpi);
+
+void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t *size);
+
+static INLINE int vp9_preserve_existing_gf(VP9_COMP *cpi) {
+ return cpi->refresh_golden_frame && cpi->rc.is_src_frame_alt_ref &&
+ !cpi->use_svc;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP9_ENCODER_VP9_BITSTREAM_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_block.h b/media/libvpx/libvpx/vp9/encoder/vp9_block.h
new file mode 100644
index 0000000000..4d336f2a42
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_block.h
@@ -0,0 +1,225 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_VP9_BLOCK_H_
+#define VPX_VP9_ENCODER_VP9_BLOCK_H_
+
+#include "vpx_util/vpx_thread.h"
+
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_entropymv.h"
+#include "vp9/common/vp9_entropy.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+ unsigned int sse;
+ int sum;
+ unsigned int var;
+} Diff;
+
+struct macroblock_plane {
+ DECLARE_ALIGNED(16, int16_t, src_diff[64 * 64]);
+ tran_low_t *qcoeff;
+ tran_low_t *coeff;
+ uint16_t *eobs;
+ struct buf_2d src;
+
+  // Quantizer settings
+ int16_t *round_fp;
+ int16_t *quant_fp;
+ int16_t *quant;
+ int16_t *quant_shift;
+ int16_t *zbin;
+ int16_t *round;
+
+ int64_t quant_thred[2];
+};
+
+/* The [2] dimension is for whether we skip the EOB node (i.e. if previous
+ * coefficient in this block was zero) or not. */
+typedef unsigned int vp9_coeff_cost[PLANE_TYPES][REF_TYPES][COEF_BANDS][2]
+ [COEFF_CONTEXTS][ENTROPY_TOKENS];
+
+typedef struct {
+ int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
+ uint8_t mode_context[MAX_REF_FRAMES];
+} MB_MODE_INFO_EXT;
+
+typedef struct {
+ int col_min;
+ int col_max;
+ int row_min;
+ int row_max;
+} MvLimits;
+
+typedef struct macroblock MACROBLOCK;
+struct macroblock {
+// cf. https://bugs.chromium.org/p/webm/issues/detail?id=1054
+#if defined(_MSC_VER) && _MSC_VER < 1900
+ int64_t bsse[MAX_MB_PLANE << 2];
+#endif
+
+ struct macroblock_plane plane[MAX_MB_PLANE];
+
+ MACROBLOCKD e_mbd;
+ MB_MODE_INFO_EXT *mbmi_ext;
+ MB_MODE_INFO_EXT *mbmi_ext_base;
+ int skip_block;
+ int select_tx_size;
+ int skip_recode;
+ int skip_optimize;
+ int q_index;
+ double log_block_src_var;
+ int block_tx_domain;
+
+ // The equivalent error at the current rdmult of one whole bit (not one
+ // bitcost unit).
+ int errorperbit;
+  // The equivalent SAD error of one (whole) bit at the current quantizer
+ // for large blocks.
+ int sadperbit16;
+  // The equivalent SAD error of one (whole) bit at the current quantizer
+ // for sub-8x8 blocks.
+ int sadperbit4;
+ int rddiv;
+ int rdmult;
+ int cb_rdmult;
+ int segment_id;
+ int mb_energy;
+
+ // These are set to their default values at the beginning, and then adjusted
+ // further in the encoding process.
+ BLOCK_SIZE min_partition_size;
+ BLOCK_SIZE max_partition_size;
+
+ int mv_best_ref_index[MAX_REF_FRAMES];
+ unsigned int max_mv_context[MAX_REF_FRAMES];
+ unsigned int source_variance;
+ unsigned int pred_sse[MAX_REF_FRAMES];
+ int pred_mv_sad[MAX_REF_FRAMES];
+
+ int nmvjointcost[MV_JOINTS];
+ int *nmvcost[2];
+ int *nmvcost_hp[2];
+ int **mvcost;
+
+ int nmvjointsadcost[MV_JOINTS];
+ int *nmvsadcost[2];
+ int *nmvsadcost_hp[2];
+ int **mvsadcost;
+
+ // sharpness is used to disable skip mode and change rd_mult
+ int sharpness;
+
+ // aq mode is used to adjust rd based on segment.
+ int adjust_rdmult_by_segment;
+
+ // These define limits to motion vector components to prevent them
+ // from extending outside the UMV borders
+ MvLimits mv_limits;
+
+  // Marks transform blocks where no coefficients are coded.
+ // Set during mode selection. Read during block encoding.
+ uint8_t zcoeff_blk[TX_SIZES][256];
+
+ // Accumulate the tx block eobs in a partition block.
+ int32_t sum_y_eobs[TX_SIZES];
+
+ int skip;
+
+ int encode_breakout;
+
+ // note that token_costs is the cost when eob node is skipped
+ vp9_coeff_cost token_costs[TX_SIZES];
+
+ int optimize;
+
+  // Indicates whether we are in the RD search loop or the encoding process.
+ int use_lp32x32fdct;
+ int skip_encode;
+
+ // In first pass, intra prediction is done based on source pixels
+ // at tile boundaries
+ int fp_src_pred;
+
+ // use fast quantization process
+ int quant_fp;
+
+ // skip forward transform and quantization
+ uint8_t skip_txfm[MAX_MB_PLANE << 2];
+#define SKIP_TXFM_NONE 0
+// TODO(chengchen): consider removing SKIP_TXFM_AC_DC from vp9 completely
+// since it increases risks of bad perceptual quality.
+// https://crbug.com/webm/1729
+#define SKIP_TXFM_AC_DC 1
+#define SKIP_TXFM_AC_ONLY 2
+
+// cf. https://bugs.chromium.org/p/webm/issues/detail?id=1054
+#if !defined(_MSC_VER) || _MSC_VER >= 1900
+ int64_t bsse[MAX_MB_PLANE << 2];
+#endif
+
+ // Used to store sub partition's choices.
+ MV pred_mv[MAX_REF_FRAMES];
+
+ // Strong color activity detection. Used in RTC coding mode to enhance
+ // the visual quality at the boundary of moving color objects.
+ uint8_t color_sensitivity[2];
+
+ uint8_t sb_is_skin;
+
+ uint8_t skip_low_source_sad;
+
+ uint8_t lowvar_highsumdiff;
+
+ uint8_t last_sb_high_content;
+
+ int sb_use_mv_part;
+
+ int sb_mvcol_part;
+
+ int sb_mvrow_part;
+
+ int sb_pickmode_part;
+
+ int zero_temp_sad_source;
+
+ // For each superblock: saves the content value (e.g., low/high sad/sumdiff)
+ // based on source sad, prior to encoding the frame.
+ uint8_t content_state_sb;
+
+ // Used to save the status of whether a block has a low variance in
+ // choose_partitioning. 0 for 64x64, 1~2 for 64x32, 3~4 for 32x64, 5~8 for
+ // 32x32, 9~24 for 16x16.
+ uint8_t variance_low[25];
+
+ uint8_t arf_frame_usage;
+ uint8_t lastgolden_frame_usage;
+
+ void (*fwd_txfm4x4)(const int16_t *input, tran_low_t *output, int stride);
+ void (*inv_txfm_add)(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob);
+#if CONFIG_VP9_HIGHBITDEPTH
+ void (*highbd_inv_txfm_add)(const tran_low_t *input, uint16_t *dest,
+ int stride, int eob, int bd);
+#endif
+ DECLARE_ALIGNED(16, uint8_t, est_pred[64 * 64]);
+
+ struct scale_factors *me_sf;
+};
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP9_ENCODER_VP9_BLOCK_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_blockiness.c b/media/libvpx/libvpx/vp9/encoder/vp9_blockiness.c
new file mode 100644
index 0000000000..da68a3c3c3
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_blockiness.c
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include <stdlib.h>
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/system_state.h"
+#include "vp9/encoder/vp9_blockiness.h"
+
+static int horizontal_filter(const uint8_t *s) {
+ return (s[1] - s[-2]) * 2 + (s[-1] - s[0]) * 6;
+}
+
+static int vertical_filter(const uint8_t *s, int p) {
+ return (s[p] - s[-2 * p]) * 2 + (s[-p] - s[0]) * 6;
+}
+
+static int variance(int sum, int sum_squared, int size) {
+ return sum_squared / size - (sum / size) * (sum / size);
+}
+// Calculate a blockiness level for a vertical block edge.
+// This function returns a new blockiness metric that's defined as
+
+// p0 p1 p2 p3
+// q0 q1 q2 q3
+// block edge ->
+// r0 r1 r2 r3
+// s0 s1 s2 s3
+
+// blockiness = p0*-2+q0*6+r0*-6+s0*2 +
+// p1*-2+q1*6+r1*-6+s1*2 +
+// p2*-2+q2*6+r2*-6+s2*2 +
+// p3*-2+q3*6+r3*-6+s3*2 ;
+
+// reconstructed_blockiness = max(blockiness from reconstructed buffer -
+//                                blockiness from source buffer, 0)
+//
+// I make the assumption that flat blocks are much more visible than high
+// contrast blocks. As such, I scale the result of the blockiness calc
+// by dividing the blockiness by the variance of the pixels on either side
+// of the edge as follows:
+// var_0 = (q0^2+q1^2+q2^2+q3^2)/4 - ((q0 + q1 + q2 + q3) / 4 )^2
+// var_1 = (r0^2+r1^2+r2^2+r3^2)/4 - ((r0 + r1 + r2 + r3) / 4 )^2
+// The returned blockiness is the scaled value
+// Reconstructed blockiness / ( 1 + var_0 + var_1 ) ;
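+//
+// Worked example (illustrative values): with a source that is flat across the
+// edge (p = q = r = s = 100) the source blockiness is 0. A reconstruction
+// with a step of 10 at the edge (p = q = 100, r = s = 90) contributes
+// (90 - 100) * 2 + (100 - 90) * 6 = 40 per row, i.e. 160 over a 4-pixel edge.
+// Both source columns are flat, so var_0 = var_1 = 0 and the function
+// returns (160 - 0) / (1 + 0 + 0) = 160.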
+static int blockiness_vertical(const uint8_t *s, int sp, const uint8_t *r,
+ int rp, int size) {
+ int s_blockiness = 0;
+ int r_blockiness = 0;
+ int sum_0 = 0;
+ int sum_sq_0 = 0;
+ int sum_1 = 0;
+ int sum_sq_1 = 0;
+ int i;
+ int var_0;
+ int var_1;
+ for (i = 0; i < size; ++i, s += sp, r += rp) {
+ s_blockiness += horizontal_filter(s);
+ r_blockiness += horizontal_filter(r);
+ sum_0 += s[0];
+ sum_sq_0 += s[0] * s[0];
+ sum_1 += s[-1];
+ sum_sq_1 += s[-1] * s[-1];
+ }
+ var_0 = variance(sum_0, sum_sq_0, size);
+ var_1 = variance(sum_1, sum_sq_1, size);
+ r_blockiness = abs(r_blockiness);
+ s_blockiness = abs(s_blockiness);
+
+ if (r_blockiness > s_blockiness)
+ return (r_blockiness - s_blockiness) / (1 + var_0 + var_1);
+ else
+ return 0;
+}
+
+// Calculate a blockiness level for a horizontal block edge
+// same as above.
+static int blockiness_horizontal(const uint8_t *s, int sp, const uint8_t *r,
+ int rp, int size) {
+ int s_blockiness = 0;
+ int r_blockiness = 0;
+ int sum_0 = 0;
+ int sum_sq_0 = 0;
+ int sum_1 = 0;
+ int sum_sq_1 = 0;
+ int i;
+ int var_0;
+ int var_1;
+ for (i = 0; i < size; ++i, ++s, ++r) {
+ s_blockiness += vertical_filter(s, sp);
+ r_blockiness += vertical_filter(r, rp);
+ sum_0 += s[0];
+ sum_sq_0 += s[0] * s[0];
+ sum_1 += s[-sp];
+ sum_sq_1 += s[-sp] * s[-sp];
+ }
+ var_0 = variance(sum_0, sum_sq_0, size);
+ var_1 = variance(sum_1, sum_sq_1, size);
+ r_blockiness = abs(r_blockiness);
+ s_blockiness = abs(s_blockiness);
+
+ if (r_blockiness > s_blockiness)
+ return (r_blockiness - s_blockiness) / (1 + var_0 + var_1);
+ else
+ return 0;
+}
+
+// This function returns the blockiness for the entire frame currently by
+// looking at all borders in steps of 4.
+double vp9_get_blockiness(const uint8_t *img1, int img1_pitch,
+ const uint8_t *img2, int img2_pitch, int width,
+ int height) {
+ double blockiness = 0;
+ int i, j;
+ vpx_clear_system_state();
+ for (i = 0; i < height;
+ i += 4, img1 += img1_pitch * 4, img2 += img2_pitch * 4) {
+ for (j = 0; j < width; j += 4) {
+ if (i > 0 && i < height && j > 0 && j < width) {
+ blockiness +=
+ blockiness_vertical(img1 + j, img1_pitch, img2 + j, img2_pitch, 4);
+ blockiness += blockiness_horizontal(img1 + j, img1_pitch, img2 + j,
+ img2_pitch, 4);
+ }
+ }
+ }
+ blockiness /= width * height / 16;
+ return blockiness;
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_blockiness.h b/media/libvpx/libvpx/vp9/encoder/vp9_blockiness.h
new file mode 100644
index 0000000000..e840cb2518
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_blockiness.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2019 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_VP9_BLOCKINESS_H_
+#define VPX_VP9_ENCODER_VP9_BLOCKINESS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+double vp9_get_blockiness(const uint8_t *img1, int img1_pitch,
+ const uint8_t *img2, int img2_pitch, int width,
+ int height);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP9_ENCODER_VP9_BLOCKINESS_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.c b/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.c
new file mode 100644
index 0000000000..42073f756c
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.c
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp9/encoder/vp9_context_tree.h"
+#include "vp9/encoder/vp9_encoder.h"
+
+static const BLOCK_SIZE square[] = {
+ BLOCK_8X8,
+ BLOCK_16X16,
+ BLOCK_32X32,
+ BLOCK_64X64,
+};
+
+static void alloc_mode_context(VP9_COMMON *cm, int num_4x4_blk,
+ PICK_MODE_CONTEXT *ctx) {
+ const int num_blk = (num_4x4_blk < 4 ? 4 : num_4x4_blk);
+ const int num_pix = num_blk << 4;
+ int i, k;
+ ctx->num_4x4_blk = num_blk;
+
+ CHECK_MEM_ERROR(&cm->error, ctx->zcoeff_blk,
+ vpx_calloc(num_blk, sizeof(uint8_t)));
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ for (k = 0; k < 3; ++k) {
+ CHECK_MEM_ERROR(&cm->error, ctx->coeff[i][k],
+ vpx_memalign(32, num_pix * sizeof(*ctx->coeff[i][k])));
+ CHECK_MEM_ERROR(&cm->error, ctx->qcoeff[i][k],
+ vpx_memalign(32, num_pix * sizeof(*ctx->qcoeff[i][k])));
+ CHECK_MEM_ERROR(&cm->error, ctx->dqcoeff[i][k],
+ vpx_memalign(32, num_pix * sizeof(*ctx->dqcoeff[i][k])));
+ CHECK_MEM_ERROR(&cm->error, ctx->eobs[i][k],
+ vpx_memalign(32, num_blk * sizeof(*ctx->eobs[i][k])));
+ ctx->coeff_pbuf[i][k] = ctx->coeff[i][k];
+ ctx->qcoeff_pbuf[i][k] = ctx->qcoeff[i][k];
+ ctx->dqcoeff_pbuf[i][k] = ctx->dqcoeff[i][k];
+ ctx->eobs_pbuf[i][k] = ctx->eobs[i][k];
+ }
+ }
+}
+
+static void free_mode_context(PICK_MODE_CONTEXT *ctx) {
+ int i, k;
+ vpx_free(ctx->zcoeff_blk);
+ ctx->zcoeff_blk = 0;
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ for (k = 0; k < 3; ++k) {
+ vpx_free(ctx->coeff[i][k]);
+ ctx->coeff[i][k] = 0;
+ vpx_free(ctx->qcoeff[i][k]);
+ ctx->qcoeff[i][k] = 0;
+ vpx_free(ctx->dqcoeff[i][k]);
+ ctx->dqcoeff[i][k] = 0;
+ vpx_free(ctx->eobs[i][k]);
+ ctx->eobs[i][k] = 0;
+ }
+ }
+}
+
+static void alloc_tree_contexts(VP9_COMMON *cm, PC_TREE *tree,
+ int num_4x4_blk) {
+ alloc_mode_context(cm, num_4x4_blk, &tree->none);
+ alloc_mode_context(cm, num_4x4_blk / 2, &tree->horizontal[0]);
+ alloc_mode_context(cm, num_4x4_blk / 2, &tree->vertical[0]);
+
+ if (num_4x4_blk > 4) {
+ alloc_mode_context(cm, num_4x4_blk / 2, &tree->horizontal[1]);
+ alloc_mode_context(cm, num_4x4_blk / 2, &tree->vertical[1]);
+ } else {
+ memset(&tree->horizontal[1], 0, sizeof(tree->horizontal[1]));
+ memset(&tree->vertical[1], 0, sizeof(tree->vertical[1]));
+ }
+}
+
+static void free_tree_contexts(PC_TREE *tree) {
+ free_mode_context(&tree->none);
+ free_mode_context(&tree->horizontal[0]);
+ free_mode_context(&tree->horizontal[1]);
+ free_mode_context(&tree->vertical[0]);
+ free_mode_context(&tree->vertical[1]);
+}
+
+// This function sets up a tree of contexts so that at each square partition
+// level there are contexts for none, horizontal, vertical, and split, along
+// with a block_size value and a selected block_size that represent the state
+// of our search.
+void vp9_setup_pc_tree(VP9_COMMON *cm, ThreadData *td) {
+ int i, j;
+ const int leaf_nodes = 64;
+ const int tree_nodes = 64 + 16 + 4 + 1;
+ int pc_tree_index = 0;
+ PC_TREE *this_pc;
+ PICK_MODE_CONTEXT *this_leaf;
+ int square_index = 1;
+ int nodes;
+
+ vpx_free(td->leaf_tree);
+ CHECK_MEM_ERROR(&cm->error, td->leaf_tree,
+ vpx_calloc(leaf_nodes, sizeof(*td->leaf_tree)));
+ vpx_free(td->pc_tree);
+ CHECK_MEM_ERROR(&cm->error, td->pc_tree,
+ vpx_calloc(tree_nodes, sizeof(*td->pc_tree)));
+
+ this_pc = &td->pc_tree[0];
+ this_leaf = &td->leaf_tree[0];
+
+  // Sub-8x8 (4x4) blocks within the same 8x8 block share the same context, so
+  // we only need to allocate one context for each 8x8 block.
+ for (i = 0; i < leaf_nodes; ++i) alloc_mode_context(cm, 1, &td->leaf_tree[i]);
+
+ // Sets up all the leaf nodes in the tree.
+ for (pc_tree_index = 0; pc_tree_index < leaf_nodes; ++pc_tree_index) {
+ PC_TREE *const tree = &td->pc_tree[pc_tree_index];
+ tree->block_size = square[0];
+ alloc_tree_contexts(cm, tree, 4);
+ tree->leaf_split[0] = this_leaf++;
+ for (j = 1; j < 4; j++) tree->leaf_split[j] = tree->leaf_split[0];
+ }
+
+  // Each node has 4 child nodes; fill each block_size level of the tree
+  // from the leaves to the root.
+ for (nodes = 16; nodes > 0; nodes >>= 2) {
+ for (i = 0; i < nodes; ++i) {
+ PC_TREE *const tree = &td->pc_tree[pc_tree_index];
+ alloc_tree_contexts(cm, tree, 4 << (2 * square_index));
+ tree->block_size = square[square_index];
+ for (j = 0; j < 4; j++) tree->split[j] = this_pc++;
+ ++pc_tree_index;
+ }
+ ++square_index;
+ }
+ td->pc_root = &td->pc_tree[tree_nodes - 1];
+ td->pc_root[0].none.best_mode_index = 2;
+}
+
+void vp9_free_pc_tree(ThreadData *td) {
+ int i;
+
+ if (td == NULL) return;
+
+ if (td->leaf_tree != NULL) {
+    // Free all leaf mode contexts.
+ for (i = 0; i < 64; ++i) free_mode_context(&td->leaf_tree[i]);
+ vpx_free(td->leaf_tree);
+ td->leaf_tree = NULL;
+ }
+
+ if (td->pc_tree != NULL) {
+ const int tree_nodes = 64 + 16 + 4 + 1;
+    // Free the contexts of every node in the tree.
+ for (i = 0; i < tree_nodes; ++i) free_tree_contexts(&td->pc_tree[i]);
+ vpx_free(td->pc_tree);
+ td->pc_tree = NULL;
+ }
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.h b/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.h
new file mode 100644
index 0000000000..4e301cc17d
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_context_tree.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_VP9_CONTEXT_TREE_H_
+#define VPX_VP9_ENCODER_VP9_CONTEXT_TREE_H_
+
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/encoder/vp9_block.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VP9_COMP;
+struct VP9Common;
+struct ThreadData;
+
+// Structure to hold snapshot of coding context during the mode picking process
+typedef struct {
+ MODE_INFO mic;
+ MB_MODE_INFO_EXT mbmi_ext;
+ uint8_t *zcoeff_blk;
+ tran_low_t *coeff[MAX_MB_PLANE][3];
+ tran_low_t *qcoeff[MAX_MB_PLANE][3];
+ tran_low_t *dqcoeff[MAX_MB_PLANE][3];
+ uint16_t *eobs[MAX_MB_PLANE][3];
+
+ // dual buffer pointers, 0: in use, 1: best in store
+ tran_low_t *coeff_pbuf[MAX_MB_PLANE][3];
+ tran_low_t *qcoeff_pbuf[MAX_MB_PLANE][3];
+ tran_low_t *dqcoeff_pbuf[MAX_MB_PLANE][3];
+ uint16_t *eobs_pbuf[MAX_MB_PLANE][3];
+
+ int is_coded;
+ int num_4x4_blk;
+ int skip;
+ int pred_pixel_ready;
+ // For current partition, only if all Y, U, and V transform blocks'
+ // coefficients are quantized to 0, skippable is set to 0.
+ int skippable;
+ uint8_t skip_txfm[MAX_MB_PLANE << 2];
+ int best_mode_index;
+ int hybrid_pred_diff;
+ int comp_pred_diff;
+ int single_pred_diff;
+ int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
+
+  // TODO(jingning) Use RD_COST struct here instead. This involves a broader
+ // scope of refactoring.
+ int rate;
+ int64_t dist;
+ int64_t rdcost;
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+ unsigned int newmv_sse;
+ unsigned int zeromv_sse;
+ unsigned int zeromv_lastref_sse;
+ PREDICTION_MODE best_sse_inter_mode;
+ int_mv best_sse_mv;
+ MV_REFERENCE_FRAME best_reference_frame;
+ MV_REFERENCE_FRAME best_zeromv_reference_frame;
+ int sb_skip_denoising;
+#endif
+
+ // motion vector cache for adaptive motion search control in partition
+ // search loop
+ MV pred_mv[MAX_REF_FRAMES];
+ INTERP_FILTER pred_interp_filter;
+
+ // Used for the machine learning-based early termination
+ int32_t sum_y_eobs;
+ // Skip certain ref frames during RD search of rectangular partitions.
+ uint8_t skip_ref_frame_mask;
+} PICK_MODE_CONTEXT;
+
+typedef struct PC_TREE {
+ int index;
+ PARTITION_TYPE partitioning;
+ BLOCK_SIZE block_size;
+ PICK_MODE_CONTEXT none;
+ PICK_MODE_CONTEXT horizontal[2];
+ PICK_MODE_CONTEXT vertical[2];
+ union {
+ struct PC_TREE *split[4];
+ PICK_MODE_CONTEXT *leaf_split[4];
+ };
+ // Obtained from a simple motion search. Used by the ML based partition search
+ // speed feature.
+ MV mv;
+} PC_TREE;
+
+void vp9_setup_pc_tree(struct VP9Common *cm, struct ThreadData *td);
+void vp9_free_pc_tree(struct ThreadData *td);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP9_ENCODER_VP9_CONTEXT_TREE_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_cost.c b/media/libvpx/libvpx/vp9/encoder/vp9_cost.c
new file mode 100644
index 0000000000..81581a80c2
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_cost.c
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include <assert.h>
+
+#include "vp9/encoder/vp9_cost.h"
+
+/* round(-log2(i/256.) * (1 << VP9_PROB_COST_SHIFT))
+ Begins with a bogus entry for simpler addressing. */
+const uint16_t vp9_prob_cost[256] = {
+ 4096, 4096, 3584, 3284, 3072, 2907, 2772, 2659, 2560, 2473, 2395, 2325, 2260,
+ 2201, 2147, 2096, 2048, 2003, 1961, 1921, 1883, 1847, 1813, 1780, 1748, 1718,
+ 1689, 1661, 1635, 1609, 1584, 1559, 1536, 1513, 1491, 1470, 1449, 1429, 1409,
+ 1390, 1371, 1353, 1335, 1318, 1301, 1284, 1268, 1252, 1236, 1221, 1206, 1192,
+ 1177, 1163, 1149, 1136, 1123, 1110, 1097, 1084, 1072, 1059, 1047, 1036, 1024,
+ 1013, 1001, 990, 979, 968, 958, 947, 937, 927, 917, 907, 897, 887,
+ 878, 868, 859, 850, 841, 832, 823, 814, 806, 797, 789, 780, 772,
+ 764, 756, 748, 740, 732, 724, 717, 709, 702, 694, 687, 680, 673,
+ 665, 658, 651, 644, 637, 631, 624, 617, 611, 604, 598, 591, 585,
+ 578, 572, 566, 560, 554, 547, 541, 535, 530, 524, 518, 512, 506,
+ 501, 495, 489, 484, 478, 473, 467, 462, 456, 451, 446, 441, 435,
+ 430, 425, 420, 415, 410, 405, 400, 395, 390, 385, 380, 375, 371,
+ 366, 361, 356, 352, 347, 343, 338, 333, 329, 324, 320, 316, 311,
+ 307, 302, 298, 294, 289, 285, 281, 277, 273, 268, 264, 260, 256,
+ 252, 248, 244, 240, 236, 232, 228, 224, 220, 216, 212, 209, 205,
+ 201, 197, 194, 190, 186, 182, 179, 175, 171, 168, 164, 161, 157,
+ 153, 150, 146, 143, 139, 136, 132, 129, 125, 122, 119, 115, 112,
+ 109, 105, 102, 99, 95, 92, 89, 86, 82, 79, 76, 73, 70,
+ 66, 63, 60, 57, 54, 51, 48, 45, 42, 38, 35, 32, 29,
+ 26, 23, 20, 18, 15, 12, 9, 6, 3
+};
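+
+// Spot checks of the formula above: vp9_prob_cost[1] ==
+// round(-log2(1 / 256.) * 512) == 4096 and vp9_prob_cost[128] ==
+// round(-log2(128 / 256.) * 512) == 512, i.e. an equiprobable symbol costs
+// exactly one bit in these units.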
+
+static void cost(int *costs, vpx_tree tree, const vpx_prob *probs, int i,
+ int c) {
+ const vpx_prob prob = probs[i / 2];
+ int b;
+
+ assert(prob != 0);
+ for (b = 0; b <= 1; ++b) {
+ const int cc = c + vp9_cost_bit(prob, b);
+ const vpx_tree_index ii = tree[i + b];
+
+ if (ii <= 0)
+ costs[-ii] = cc;
+ else
+ cost(costs, tree, probs, ii, cc);
+ }
+}
+
+void vp9_cost_tokens(int *costs, const vpx_prob *probs, vpx_tree tree) {
+ cost(costs, tree, probs, 0, 0);
+}
+
+void vp9_cost_tokens_skip(int *costs, const vpx_prob *probs, vpx_tree tree) {
+ assert(tree[0] <= 0 && tree[1] > 0);
+
+ costs[-tree[0]] = vp9_cost_bit(probs[0], 0);
+ cost(costs, tree, probs, 2, 0);
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_cost.h b/media/libvpx/libvpx/vp9/encoder/vp9_cost.h
new file mode 100644
index 0000000000..ee0033fa31
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_cost.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_VP9_COST_H_
+#define VPX_VP9_ENCODER_VP9_COST_H_
+
+#include "vpx_dsp/prob.h"
+#include "vpx/vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern const uint16_t vp9_prob_cost[256];
+
+// The factor to scale from cost in bits to cost in vp9_prob_cost units.
+#define VP9_PROB_COST_SHIFT 9
+
+#define vp9_cost_zero(prob) (vp9_prob_cost[prob])
+
+#define vp9_cost_one(prob) vp9_cost_zero(256 - (prob))
+
+#define vp9_cost_bit(prob, bit) vp9_cost_zero((bit) ? 256 - (prob) : (prob))
+
+static INLINE uint64_t cost_branch256(const unsigned int ct[2], vpx_prob p) {
+ return (uint64_t)ct[0] * vp9_cost_zero(p) + (uint64_t)ct[1] * vp9_cost_one(p);
+}
+
+static INLINE int treed_cost(vpx_tree tree, const vpx_prob *probs, int bits,
+ int len) {
+ int cost = 0;
+ vpx_tree_index i = 0;
+
+ do {
+ const int bit = (bits >> --len) & 1;
+ cost += vp9_cost_bit(probs[i >> 1], bit);
+ i = tree[i + bit];
+ } while (len);
+
+ return cost;
+}
+
+void vp9_cost_tokens(int *costs, const vpx_prob *probs, vpx_tree tree);
+void vp9_cost_tokens_skip(int *costs, const vpx_prob *probs, vpx_tree tree);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP9_ENCODER_VP9_COST_H_
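Judging from the visible endpoints of the table in vp9_cost.c, vp9_prob_cost[p] appears to equal -log2(p / 256) scaled by 1 << VP9_PROB_COST_SHIFT, i.e. bit costs stored in 1/512ths of a bit: a probability of one half costs exactly one bit (512 units), and the final entry for p = 255 is 3, matching -log2(255/256) * 512. A short stand-alone sketch of that relationship follows; it is not part of the libvpx sources and approx_prob_cost() is a hypothetical helper.

#include <math.h>
#include <stdio.h>

/* Approximates vp9_prob_cost[p] for p in [1, 255]: the cost, in 1/512ths of
 * a bit, of coding a zero whose probability is p/256. */
static int approx_prob_cost(int p) {
  return (int)lrint(-log2(p / 256.0) * 512.0);
}

int main(void) {
  /* Probability 1/2 costs one bit (512 units); a near-certain zero (p = 255)
   * costs about 3 units, matching the table's last entry. */
  printf("%d %d\n", approx_prob_cost(128), approx_prob_cost(255));
  return 0;
}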
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_dct.c b/media/libvpx/libvpx/vp9/encoder/vp9_dct.c
new file mode 100644
index 0000000000..2f42c6afc2
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_dct.c
@@ -0,0 +1,687 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <math.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_idct.h"
+#include "vpx_dsp/fwd_txfm.h"
+#include "vpx_ports/mem.h"
+
+static void fdct4(const tran_low_t *input, tran_low_t *output) {
+ tran_high_t step[4];
+ tran_high_t temp1, temp2;
+
+ step[0] = input[0] + input[3];
+ step[1] = input[1] + input[2];
+ step[2] = input[1] - input[2];
+ step[3] = input[0] - input[3];
+
+ temp1 = (step[0] + step[1]) * cospi_16_64;
+ temp2 = (step[0] - step[1]) * cospi_16_64;
+ output[0] = (tran_low_t)fdct_round_shift(temp1);
+ output[2] = (tran_low_t)fdct_round_shift(temp2);
+ temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
+ temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
+ output[1] = (tran_low_t)fdct_round_shift(temp1);
+ output[3] = (tran_low_t)fdct_round_shift(temp2);
+}
+
+static void fdct8(const tran_low_t *input, tran_low_t *output) {
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
+ tran_high_t t0, t1, t2, t3; // needs32
+ tran_high_t x0, x1, x2, x3; // canbe16
+
+ // stage 1
+ s0 = input[0] + input[7];
+ s1 = input[1] + input[6];
+ s2 = input[2] + input[5];
+ s3 = input[3] + input[4];
+ s4 = input[3] - input[4];
+ s5 = input[2] - input[5];
+ s6 = input[1] - input[6];
+ s7 = input[0] - input[7];
+
+ // fdct4(step, step);
+ x0 = s0 + s3;
+ x1 = s1 + s2;
+ x2 = s1 - s2;
+ x3 = s0 - s3;
+ t0 = (x0 + x1) * cospi_16_64;
+ t1 = (x0 - x1) * cospi_16_64;
+ t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
+ t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
+ output[0] = (tran_low_t)fdct_round_shift(t0);
+ output[2] = (tran_low_t)fdct_round_shift(t2);
+ output[4] = (tran_low_t)fdct_round_shift(t1);
+ output[6] = (tran_low_t)fdct_round_shift(t3);
+
+ // Stage 2
+ t0 = (s6 - s5) * cospi_16_64;
+ t1 = (s6 + s5) * cospi_16_64;
+ t2 = (tran_low_t)fdct_round_shift(t0);
+ t3 = (tran_low_t)fdct_round_shift(t1);
+
+ // Stage 3
+ x0 = s4 + t2;
+ x1 = s4 - t2;
+ x2 = s7 - t3;
+ x3 = s7 + t3;
+
+ // Stage 4
+ t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
+ t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
+ t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
+ t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
+ output[1] = (tran_low_t)fdct_round_shift(t0);
+ output[3] = (tran_low_t)fdct_round_shift(t2);
+ output[5] = (tran_low_t)fdct_round_shift(t1);
+ output[7] = (tran_low_t)fdct_round_shift(t3);
+}
+
+static void fdct16(const tran_low_t in[16], tran_low_t out[16]) {
+ tran_high_t step1[8]; // canbe16
+ tran_high_t step2[8]; // canbe16
+ tran_high_t step3[8]; // canbe16
+ tran_high_t input[8]; // canbe16
+ tran_high_t temp1, temp2; // needs32
+
+ // step 1
+ input[0] = in[0] + in[15];
+ input[1] = in[1] + in[14];
+ input[2] = in[2] + in[13];
+ input[3] = in[3] + in[12];
+ input[4] = in[4] + in[11];
+ input[5] = in[5] + in[10];
+ input[6] = in[6] + in[9];
+ input[7] = in[7] + in[8];
+
+ step1[0] = in[7] - in[8];
+ step1[1] = in[6] - in[9];
+ step1[2] = in[5] - in[10];
+ step1[3] = in[4] - in[11];
+ step1[4] = in[3] - in[12];
+ step1[5] = in[2] - in[13];
+ step1[6] = in[1] - in[14];
+ step1[7] = in[0] - in[15];
+
+ // fdct8(step, step);
+ {
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
+ tran_high_t t0, t1, t2, t3; // needs32
+ tran_high_t x0, x1, x2, x3; // canbe16
+
+ // stage 1
+ s0 = input[0] + input[7];
+ s1 = input[1] + input[6];
+ s2 = input[2] + input[5];
+ s3 = input[3] + input[4];
+ s4 = input[3] - input[4];
+ s5 = input[2] - input[5];
+ s6 = input[1] - input[6];
+ s7 = input[0] - input[7];
+
+ // fdct4(step, step);
+ x0 = s0 + s3;
+ x1 = s1 + s2;
+ x2 = s1 - s2;
+ x3 = s0 - s3;
+ t0 = (x0 + x1) * cospi_16_64;
+ t1 = (x0 - x1) * cospi_16_64;
+ t2 = x3 * cospi_8_64 + x2 * cospi_24_64;
+ t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
+ out[0] = (tran_low_t)fdct_round_shift(t0);
+ out[4] = (tran_low_t)fdct_round_shift(t2);
+ out[8] = (tran_low_t)fdct_round_shift(t1);
+ out[12] = (tran_low_t)fdct_round_shift(t3);
+
+ // Stage 2
+ t0 = (s6 - s5) * cospi_16_64;
+ t1 = (s6 + s5) * cospi_16_64;
+ t2 = fdct_round_shift(t0);
+ t3 = fdct_round_shift(t1);
+
+ // Stage 3
+ x0 = s4 + t2;
+ x1 = s4 - t2;
+ x2 = s7 - t3;
+ x3 = s7 + t3;
+
+ // Stage 4
+ t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
+ t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
+ t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
+ t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
+ out[2] = (tran_low_t)fdct_round_shift(t0);
+ out[6] = (tran_low_t)fdct_round_shift(t2);
+ out[10] = (tran_low_t)fdct_round_shift(t1);
+ out[14] = (tran_low_t)fdct_round_shift(t3);
+ }
+
+ // step 2
+ temp1 = (step1[5] - step1[2]) * cospi_16_64;
+ temp2 = (step1[4] - step1[3]) * cospi_16_64;
+ step2[2] = fdct_round_shift(temp1);
+ step2[3] = fdct_round_shift(temp2);
+ temp1 = (step1[4] + step1[3]) * cospi_16_64;
+ temp2 = (step1[5] + step1[2]) * cospi_16_64;
+ step2[4] = fdct_round_shift(temp1);
+ step2[5] = fdct_round_shift(temp2);
+
+ // step 3
+ step3[0] = step1[0] + step2[3];
+ step3[1] = step1[1] + step2[2];
+ step3[2] = step1[1] - step2[2];
+ step3[3] = step1[0] - step2[3];
+ step3[4] = step1[7] - step2[4];
+ step3[5] = step1[6] - step2[5];
+ step3[6] = step1[6] + step2[5];
+ step3[7] = step1[7] + step2[4];
+
+ // step 4
+ temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64;
+ temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64;
+ step2[1] = fdct_round_shift(temp1);
+ step2[2] = fdct_round_shift(temp2);
+ temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
+ temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64;
+ step2[5] = fdct_round_shift(temp1);
+ step2[6] = fdct_round_shift(temp2);
+
+ // step 5
+ step1[0] = step3[0] + step2[1];
+ step1[1] = step3[0] - step2[1];
+ step1[2] = step3[3] + step2[2];
+ step1[3] = step3[3] - step2[2];
+ step1[4] = step3[4] - step2[5];
+ step1[5] = step3[4] + step2[5];
+ step1[6] = step3[7] - step2[6];
+ step1[7] = step3[7] + step2[6];
+
+ // step 6
+ temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64;
+ temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
+ out[1] = (tran_low_t)fdct_round_shift(temp1);
+ out[9] = (tran_low_t)fdct_round_shift(temp2);
+
+ temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
+ temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64;
+ out[5] = (tran_low_t)fdct_round_shift(temp1);
+ out[13] = (tran_low_t)fdct_round_shift(temp2);
+
+ temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64;
+ temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
+ out[3] = (tran_low_t)fdct_round_shift(temp1);
+ out[11] = (tran_low_t)fdct_round_shift(temp2);
+
+ temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
+ temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64;
+ out[7] = (tran_low_t)fdct_round_shift(temp1);
+ out[15] = (tran_low_t)fdct_round_shift(temp2);
+}
+
+static void fadst4(const tran_low_t *input, tran_low_t *output) {
+ tran_high_t x0, x1, x2, x3;
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+ x0 = input[0];
+ x1 = input[1];
+ x2 = input[2];
+ x3 = input[3];
+
+ if (!(x0 | x1 | x2 | x3)) {
+ output[0] = output[1] = output[2] = output[3] = 0;
+ return;
+ }
+
+ s0 = sinpi_1_9 * x0;
+ s1 = sinpi_4_9 * x0;
+ s2 = sinpi_2_9 * x1;
+ s3 = sinpi_1_9 * x1;
+ s4 = sinpi_3_9 * x2;
+ s5 = sinpi_4_9 * x3;
+ s6 = sinpi_2_9 * x3;
+ s7 = x0 + x1 - x3;
+
+ x0 = s0 + s2 + s5;
+ x1 = sinpi_3_9 * s7;
+ x2 = s1 - s3 + s6;
+ x3 = s4;
+
+ s0 = x0 + x3;
+ s1 = x1;
+ s2 = x2 - x3;
+ s3 = x2 - x0 + x3;
+
+ // 1-D transform scaling factor is sqrt(2).
+ output[0] = (tran_low_t)fdct_round_shift(s0);
+ output[1] = (tran_low_t)fdct_round_shift(s1);
+ output[2] = (tran_low_t)fdct_round_shift(s2);
+ output[3] = (tran_low_t)fdct_round_shift(s3);
+}
+
+static void fadst8(const tran_low_t *input, tran_low_t *output) {
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+ tran_high_t x0 = input[7];
+ tran_high_t x1 = input[0];
+ tran_high_t x2 = input[5];
+ tran_high_t x3 = input[2];
+ tran_high_t x4 = input[3];
+ tran_high_t x5 = input[4];
+ tran_high_t x6 = input[1];
+ tran_high_t x7 = input[6];
+
+ // stage 1
+ s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
+ s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
+ s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
+ s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
+ s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
+ s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
+ s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
+ s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
+
+ x0 = fdct_round_shift(s0 + s4);
+ x1 = fdct_round_shift(s1 + s5);
+ x2 = fdct_round_shift(s2 + s6);
+ x3 = fdct_round_shift(s3 + s7);
+ x4 = fdct_round_shift(s0 - s4);
+ x5 = fdct_round_shift(s1 - s5);
+ x6 = fdct_round_shift(s2 - s6);
+ x7 = fdct_round_shift(s3 - s7);
+
+ // stage 2
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+ s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
+ s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
+ s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
+ s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
+
+ x0 = s0 + s2;
+ x1 = s1 + s3;
+ x2 = s0 - s2;
+ x3 = s1 - s3;
+ x4 = fdct_round_shift(s4 + s6);
+ x5 = fdct_round_shift(s5 + s7);
+ x6 = fdct_round_shift(s4 - s6);
+ x7 = fdct_round_shift(s5 - s7);
+
+ // stage 3
+ s2 = cospi_16_64 * (x2 + x3);
+ s3 = cospi_16_64 * (x2 - x3);
+ s6 = cospi_16_64 * (x6 + x7);
+ s7 = cospi_16_64 * (x6 - x7);
+
+ x2 = fdct_round_shift(s2);
+ x3 = fdct_round_shift(s3);
+ x6 = fdct_round_shift(s6);
+ x7 = fdct_round_shift(s7);
+
+ output[0] = (tran_low_t)x0;
+ output[1] = (tran_low_t)-x4;
+ output[2] = (tran_low_t)x6;
+ output[3] = (tran_low_t)-x2;
+ output[4] = (tran_low_t)x3;
+ output[5] = (tran_low_t)-x7;
+ output[6] = (tran_low_t)x5;
+ output[7] = (tran_low_t)-x1;
+}
+
+static void fadst16(const tran_low_t *input, tran_low_t *output) {
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
+ tran_high_t s9, s10, s11, s12, s13, s14, s15;
+
+ tran_high_t x0 = input[15];
+ tran_high_t x1 = input[0];
+ tran_high_t x2 = input[13];
+ tran_high_t x3 = input[2];
+ tran_high_t x4 = input[11];
+ tran_high_t x5 = input[4];
+ tran_high_t x6 = input[9];
+ tran_high_t x7 = input[6];
+ tran_high_t x8 = input[7];
+ tran_high_t x9 = input[8];
+ tran_high_t x10 = input[5];
+ tran_high_t x11 = input[10];
+ tran_high_t x12 = input[3];
+ tran_high_t x13 = input[12];
+ tran_high_t x14 = input[1];
+ tran_high_t x15 = input[14];
+
+ // stage 1
+ s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
+ s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
+ s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
+ s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
+ s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
+ s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
+ s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
+ s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
+ s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
+ s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
+ s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
+ s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
+ s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
+ s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
+ s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
+ s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
+
+ x0 = fdct_round_shift(s0 + s8);
+ x1 = fdct_round_shift(s1 + s9);
+ x2 = fdct_round_shift(s2 + s10);
+ x3 = fdct_round_shift(s3 + s11);
+ x4 = fdct_round_shift(s4 + s12);
+ x5 = fdct_round_shift(s5 + s13);
+ x6 = fdct_round_shift(s6 + s14);
+ x7 = fdct_round_shift(s7 + s15);
+ x8 = fdct_round_shift(s0 - s8);
+ x9 = fdct_round_shift(s1 - s9);
+ x10 = fdct_round_shift(s2 - s10);
+ x11 = fdct_round_shift(s3 - s11);
+ x12 = fdct_round_shift(s4 - s12);
+ x13 = fdct_round_shift(s5 - s13);
+ x14 = fdct_round_shift(s6 - s14);
+ x15 = fdct_round_shift(s7 - s15);
+
+ // stage 2
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+ s4 = x4;
+ s5 = x5;
+ s6 = x6;
+ s7 = x7;
+ s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
+ s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
+ s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
+ s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
+ s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
+ s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
+ s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
+ s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
+
+ x0 = s0 + s4;
+ x1 = s1 + s5;
+ x2 = s2 + s6;
+ x3 = s3 + s7;
+ x4 = s0 - s4;
+ x5 = s1 - s5;
+ x6 = s2 - s6;
+ x7 = s3 - s7;
+ x8 = fdct_round_shift(s8 + s12);
+ x9 = fdct_round_shift(s9 + s13);
+ x10 = fdct_round_shift(s10 + s14);
+ x11 = fdct_round_shift(s11 + s15);
+ x12 = fdct_round_shift(s8 - s12);
+ x13 = fdct_round_shift(s9 - s13);
+ x14 = fdct_round_shift(s10 - s14);
+ x15 = fdct_round_shift(s11 - s15);
+
+ // stage 3
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+ s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
+ s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
+ s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
+ s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
+ s8 = x8;
+ s9 = x9;
+ s10 = x10;
+ s11 = x11;
+ s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
+ s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
+ s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
+ s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
+
+ x0 = s0 + s2;
+ x1 = s1 + s3;
+ x2 = s0 - s2;
+ x3 = s1 - s3;
+ x4 = fdct_round_shift(s4 + s6);
+ x5 = fdct_round_shift(s5 + s7);
+ x6 = fdct_round_shift(s4 - s6);
+ x7 = fdct_round_shift(s5 - s7);
+ x8 = s8 + s10;
+ x9 = s9 + s11;
+ x10 = s8 - s10;
+ x11 = s9 - s11;
+ x12 = fdct_round_shift(s12 + s14);
+ x13 = fdct_round_shift(s13 + s15);
+ x14 = fdct_round_shift(s12 - s14);
+ x15 = fdct_round_shift(s13 - s15);
+
+ // stage 4
+ s2 = (-cospi_16_64) * (x2 + x3);
+ s3 = cospi_16_64 * (x2 - x3);
+ s6 = cospi_16_64 * (x6 + x7);
+ s7 = cospi_16_64 * (-x6 + x7);
+ s10 = cospi_16_64 * (x10 + x11);
+ s11 = cospi_16_64 * (-x10 + x11);
+ s14 = (-cospi_16_64) * (x14 + x15);
+ s15 = cospi_16_64 * (x14 - x15);
+
+ x2 = fdct_round_shift(s2);
+ x3 = fdct_round_shift(s3);
+ x6 = fdct_round_shift(s6);
+ x7 = fdct_round_shift(s7);
+ x10 = fdct_round_shift(s10);
+ x11 = fdct_round_shift(s11);
+ x14 = fdct_round_shift(s14);
+ x15 = fdct_round_shift(s15);
+
+ output[0] = (tran_low_t)x0;
+ output[1] = (tran_low_t)-x8;
+ output[2] = (tran_low_t)x12;
+ output[3] = (tran_low_t)-x4;
+ output[4] = (tran_low_t)x6;
+ output[5] = (tran_low_t)x14;
+ output[6] = (tran_low_t)x10;
+ output[7] = (tran_low_t)x2;
+ output[8] = (tran_low_t)x3;
+ output[9] = (tran_low_t)x11;
+ output[10] = (tran_low_t)x15;
+ output[11] = (tran_low_t)x7;
+ output[12] = (tran_low_t)x5;
+ output[13] = (tran_low_t)-x13;
+ output[14] = (tran_low_t)x9;
+ output[15] = (tran_low_t)-x1;
+}
+
+static const transform_2d FHT_4[] = {
+ { fdct4, fdct4 }, // DCT_DCT = 0
+ { fadst4, fdct4 }, // ADST_DCT = 1
+ { fdct4, fadst4 }, // DCT_ADST = 2
+ { fadst4, fadst4 } // ADST_ADST = 3
+};
+
+static const transform_2d FHT_8[] = {
+ { fdct8, fdct8 }, // DCT_DCT = 0
+ { fadst8, fdct8 }, // ADST_DCT = 1
+ { fdct8, fadst8 }, // DCT_ADST = 2
+ { fadst8, fadst8 } // ADST_ADST = 3
+};
+
+static const transform_2d FHT_16[] = {
+ { fdct16, fdct16 }, // DCT_DCT = 0
+ { fadst16, fdct16 }, // ADST_DCT = 1
+ { fdct16, fadst16 }, // DCT_ADST = 2
+ { fadst16, fadst16 } // ADST_ADST = 3
+};
+
+void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ if (tx_type == DCT_DCT) {
+ vpx_fdct4x4_c(input, output, stride);
+ } else {
+ tran_low_t out[4 * 4];
+ int i, j;
+ tran_low_t temp_in[4], temp_out[4];
+ const transform_2d ht = FHT_4[tx_type];
+
+ // Columns
+ for (i = 0; i < 4; ++i) {
+ for (j = 0; j < 4; ++j) temp_in[j] = input[j * stride + i] * 16;
+ if (i == 0 && temp_in[0]) temp_in[0] += 1;
+ ht.cols(temp_in, temp_out);
+ for (j = 0; j < 4; ++j) out[j * 4 + i] = temp_out[j];
+ }
+
+ // Rows
+ for (i = 0; i < 4; ++i) {
+ for (j = 0; j < 4; ++j) temp_in[j] = out[j + i * 4];
+ ht.rows(temp_in, temp_out);
+ for (j = 0; j < 4; ++j) output[j + i * 4] = (temp_out[j] + 1) >> 2;
+ }
+ }
+}
+
+void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ if (tx_type == DCT_DCT) {
+ vpx_fdct8x8_c(input, output, stride);
+ } else {
+ tran_low_t out[64];
+ int i, j;
+ tran_low_t temp_in[8], temp_out[8];
+ const transform_2d ht = FHT_8[tx_type];
+
+ // Columns
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j) temp_in[j] = input[j * stride + i] * 4;
+ ht.cols(temp_in, temp_out);
+ for (j = 0; j < 8; ++j) out[j * 8 + i] = temp_out[j];
+ }
+
+ // Rows
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j) temp_in[j] = out[j + i * 8];
+ ht.rows(temp_in, temp_out);
+ for (j = 0; j < 8; ++j)
+ output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
+ }
+ }
+}
+
+/* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
+ pixel. */
+void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {
+ int i;
+ tran_high_t a1, b1, c1, d1, e1;
+ const int16_t *ip_pass0 = input;
+ const tran_low_t *ip = NULL;
+ tran_low_t *op = output;
+
+ for (i = 0; i < 4; i++) {
+ a1 = ip_pass0[0 * stride];
+ b1 = ip_pass0[1 * stride];
+ c1 = ip_pass0[2 * stride];
+ d1 = ip_pass0[3 * stride];
+
+ a1 += b1;
+ d1 = d1 - c1;
+ e1 = (a1 - d1) >> 1;
+ b1 = e1 - b1;
+ c1 = e1 - c1;
+ a1 -= c1;
+ d1 += b1;
+ op[0] = (tran_low_t)a1;
+ op[4] = (tran_low_t)c1;
+ op[8] = (tran_low_t)d1;
+ op[12] = (tran_low_t)b1;
+
+ ip_pass0++;
+ op++;
+ }
+ ip = output;
+ op = output;
+
+ for (i = 0; i < 4; i++) {
+ a1 = ip[0];
+ b1 = ip[1];
+ c1 = ip[2];
+ d1 = ip[3];
+
+ a1 += b1;
+ d1 -= c1;
+ e1 = (a1 - d1) >> 1;
+ b1 = e1 - b1;
+ c1 = e1 - c1;
+ a1 -= c1;
+ d1 += b1;
+ op[0] = (tran_low_t)(a1 * UNIT_QUANT_FACTOR);
+ op[1] = (tran_low_t)(c1 * UNIT_QUANT_FACTOR);
+ op[2] = (tran_low_t)(d1 * UNIT_QUANT_FACTOR);
+ op[3] = (tran_low_t)(b1 * UNIT_QUANT_FACTOR);
+
+ ip += 4;
+ op += 4;
+ }
+}
+
+void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ if (tx_type == DCT_DCT) {
+ vpx_fdct16x16_c(input, output, stride);
+ } else {
+ tran_low_t out[256];
+ int i, j;
+ tran_low_t temp_in[16], temp_out[16];
+ const transform_2d ht = FHT_16[tx_type];
+
+ // Columns
+ for (i = 0; i < 16; ++i) {
+ for (j = 0; j < 16; ++j) temp_in[j] = input[j * stride + i] * 4;
+ ht.cols(temp_in, temp_out);
+ for (j = 0; j < 16; ++j)
+ out[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
+ }
+
+ // Rows
+ for (i = 0; i < 16; ++i) {
+ for (j = 0; j < 16; ++j) temp_in[j] = out[j + i * 16];
+ ht.rows(temp_in, temp_out);
+ for (j = 0; j < 16; ++j) output[j + i * 16] = temp_out[j];
+ }
+ }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ vp9_fht4x4_c(input, output, stride, tx_type);
+}
+
+void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ vp9_fht8x8_c(input, output, stride, tx_type);
+}
+
+void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output,
+ int stride) {
+ vp9_fwht4x4_c(input, output, stride);
+}
+
+void vp9_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ vp9_fht16x16_c(input, output, stride, tx_type);
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
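Each vp9_fht*_c above follows the same separable pattern: a 1-D transform selected by tx_type is run down every column, the intermediate block is written back, and a second 1-D transform is then run across every row, with input scaling and output rounding wrapped around those passes. The stand-alone toy below shows only that driver structure; it is not part of the libvpx sources, it substitutes a 4-point Walsh-Hadamard stage for fdct4/fadst4, omits the scaling, and the names separable_4x4/wht4 are hypothetical.

#include <stdio.h>

typedef void (*tx_1d)(const int *in, int *out);

/* A 4-point Walsh-Hadamard butterfly used here only as a placeholder 1-D
 * transform; it is not the VP9 fdct4/fadst4. */
static void wht4(const int *in, int *out) {
  out[0] = in[0] + in[1] + in[2] + in[3];
  out[1] = in[0] + in[1] - in[2] - in[3];
  out[2] = in[0] - in[1] - in[2] + in[3];
  out[3] = in[0] - in[1] + in[2] - in[3];
}

/* Column pass first, then row pass over the intermediate block, mirroring
 * the loop structure of vp9_fht4x4_c(). */
static void separable_4x4(const int *input, int *output, tx_1d col_tx,
                          tx_1d row_tx) {
  int tmp[16], in[4], out[4];
  int i, j;
  for (i = 0; i < 4; ++i) {
    for (j = 0; j < 4; ++j) in[j] = input[j * 4 + i];
    col_tx(in, out);
    for (j = 0; j < 4; ++j) tmp[j * 4 + i] = out[j];
  }
  for (i = 0; i < 4; ++i) {
    for (j = 0; j < 4; ++j) in[j] = tmp[i * 4 + j];
    row_tx(in, out);
    for (j = 0; j < 4; ++j) output[i * 4 + j] = out[j];
  }
}

int main(void) {
  int block[16], coeffs[16], i;
  for (i = 0; i < 16; ++i) block[i] = i;
  separable_4x4(block, coeffs, wht4, wht4);
  printf("DC term: %d\n", coeffs[0]); /* sum of all 16 samples = 120 */
  return 0;
}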
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_denoiser.c b/media/libvpx/libvpx/vp9/encoder/vp9_denoiser.c
new file mode 100644
index 0000000000..baea8ebb3c
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_denoiser.c
@@ -0,0 +1,839 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_scale/yv12config.h"
+#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/encoder/vp9_context_tree.h"
+#include "vp9/encoder/vp9_denoiser.h"
+#include "vp9/encoder/vp9_encoder.h"
+
+#ifdef OUTPUT_YUV_DENOISED
+static void make_grayscale(YV12_BUFFER_CONFIG *yuv);
+#endif
+
+static int absdiff_thresh(BLOCK_SIZE bs, int increase_denoising) {
+ (void)bs;
+ return 3 + (increase_denoising ? 1 : 0);
+}
+
+static int delta_thresh(BLOCK_SIZE bs, int increase_denoising) {
+ (void)bs;
+ (void)increase_denoising;
+ return 4;
+}
+
+static int noise_motion_thresh(BLOCK_SIZE bs, int increase_denoising) {
+ (void)bs;
+ (void)increase_denoising;
+ return 625;
+}
+
+static unsigned int sse_thresh(BLOCK_SIZE bs, int increase_denoising) {
+ return (1 << num_pels_log2_lookup[bs]) * (increase_denoising ? 80 : 40);
+}
+
+static int sse_diff_thresh(BLOCK_SIZE bs, int increase_denoising,
+ int motion_magnitude) {
+ if (motion_magnitude > noise_motion_thresh(bs, increase_denoising)) {
+ if (increase_denoising)
+ return (1 << num_pels_log2_lookup[bs]) << 2;
+ else
+ return 0;
+ } else {
+ return (1 << num_pels_log2_lookup[bs]) << 4;
+ }
+}
+
+static int total_adj_weak_thresh(BLOCK_SIZE bs, int increase_denoising) {
+ return (1 << num_pels_log2_lookup[bs]) * (increase_denoising ? 3 : 2);
+}
+
+// TODO(jackychen): If increase_denoising is enabled in the future,
+// we might need to update the code for calculating 'total_adj' in
+// case the C code is not bit-exact with corresponding sse2 code.
+int vp9_denoiser_filter_c(const uint8_t *sig, int sig_stride,
+ const uint8_t *mc_avg, int mc_avg_stride,
+ uint8_t *avg, int avg_stride, int increase_denoising,
+ BLOCK_SIZE bs, int motion_magnitude) {
+ int r, c;
+ const uint8_t *sig_start = sig;
+ const uint8_t *mc_avg_start = mc_avg;
+ uint8_t *avg_start = avg;
+ int diff, adj, absdiff, delta;
+ int adj_val[] = { 3, 4, 6 };
+ int total_adj = 0;
+ int shift_inc = 1;
+
+ // If motion_magnitude is small, make the denoiser more aggressive by
+ // increasing the adjustment for each level. Add another increment for
+ // blocks that are labeled for increased denoising.
+ if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) {
+ if (increase_denoising) {
+ shift_inc = 2;
+ }
+ adj_val[0] += shift_inc;
+ adj_val[1] += shift_inc;
+ adj_val[2] += shift_inc;
+ }
+
+ // First attempt to apply a strong temporal denoising filter.
+ for (r = 0; r < (4 << b_height_log2_lookup[bs]); ++r) {
+ for (c = 0; c < (4 << b_width_log2_lookup[bs]); ++c) {
+ diff = mc_avg[c] - sig[c];
+ absdiff = abs(diff);
+
+ if (absdiff <= absdiff_thresh(bs, increase_denoising)) {
+ avg[c] = mc_avg[c];
+ total_adj += diff;
+ } else {
+ switch (absdiff) {
+ case 4:
+ case 5:
+ case 6:
+ case 7: adj = adj_val[0]; break;
+ case 8:
+ case 9:
+ case 10:
+ case 11:
+ case 12:
+ case 13:
+ case 14:
+ case 15: adj = adj_val[1]; break;
+ default: adj = adj_val[2];
+ }
+ if (diff > 0) {
+ avg[c] = VPXMIN(UINT8_MAX, sig[c] + adj);
+ total_adj += adj;
+ } else {
+ avg[c] = VPXMAX(0, sig[c] - adj);
+ total_adj -= adj;
+ }
+ }
+ }
+ sig += sig_stride;
+ avg += avg_stride;
+ mc_avg += mc_avg_stride;
+ }
+
+ // If the strong filter did not modify the signal too much, we're all set.
+ if (abs(total_adj) <= total_adj_strong_thresh(bs, increase_denoising)) {
+ return FILTER_BLOCK;
+ }
+
+ // Otherwise, we try to dampen the filter if the delta is not too high.
+ delta = ((abs(total_adj) - total_adj_strong_thresh(bs, increase_denoising)) >>
+ num_pels_log2_lookup[bs]) +
+ 1;
+
+ if (delta >= delta_thresh(bs, increase_denoising)) {
+ return COPY_BLOCK;
+ }
+
+ mc_avg = mc_avg_start;
+ avg = avg_start;
+ sig = sig_start;
+ for (r = 0; r < (4 << b_height_log2_lookup[bs]); ++r) {
+ for (c = 0; c < (4 << b_width_log2_lookup[bs]); ++c) {
+ diff = mc_avg[c] - sig[c];
+ adj = abs(diff);
+ if (adj > delta) {
+ adj = delta;
+ }
+ if (diff > 0) {
+ // A positive diff means the first (strong-filter) pass applied a positive
+ // adjustment, so now make a negative adjustment to bring the denoised
+ // signal down.
+ avg[c] = VPXMAX(0, avg[c] - adj);
+ total_adj -= adj;
+ } else {
+ // A negative diff means the first (strong-filter) pass applied a negative
+ // adjustment, so now make a positive adjustment to bring the denoised
+ // signal up.
+ avg[c] = VPXMIN(UINT8_MAX, avg[c] + adj);
+ total_adj += adj;
+ }
+ }
+ sig += sig_stride;
+ avg += avg_stride;
+ mc_avg += mc_avg_stride;
+ }
+
+ // We can use the filter if it has been sufficiently dampened
+ if (abs(total_adj) <= total_adj_weak_thresh(bs, increase_denoising)) {
+ return FILTER_BLOCK;
+ }
+ return COPY_BLOCK;
+}
+
+static uint8_t *block_start(uint8_t *framebuf, int stride, int mi_row,
+ int mi_col) {
+ return framebuf + (stride * mi_row << 3) + (mi_col << 3);
+}
+
+static VP9_DENOISER_DECISION perform_motion_compensation(
+ VP9_COMMON *const cm, VP9_DENOISER *denoiser, MACROBLOCK *mb, BLOCK_SIZE bs,
+ int increase_denoising, int mi_row, int mi_col, PICK_MODE_CONTEXT *ctx,
+ int motion_magnitude, int is_skin, int *zeromv_filter, int consec_zeromv,
+ int num_spatial_layers, int width, int lst_fb_idx, int gld_fb_idx,
+ int use_svc, int spatial_layer, int use_gf_temporal_ref) {
+ const int sse_diff = (ctx->newmv_sse == UINT_MAX)
+ ? 0
+ : ((int)ctx->zeromv_sse - (int)ctx->newmv_sse);
+ int frame;
+ int denoise_layer_idx = 0;
+ MACROBLOCKD *filter_mbd = &mb->e_mbd;
+ MODE_INFO *mi = filter_mbd->mi[0];
+ MODE_INFO saved_mi;
+ int i;
+ struct buf_2d saved_dst[MAX_MB_PLANE];
+ struct buf_2d saved_pre[MAX_MB_PLANE];
+ const RefBuffer *saved_block_refs[2];
+ MV_REFERENCE_FRAME saved_frame;
+
+ frame = ctx->best_reference_frame;
+
+ saved_mi = *mi;
+
+ if (is_skin && (motion_magnitude > 0 || consec_zeromv < 4)) return COPY_BLOCK;
+
+ // Avoid denoising small blocks. Denoise 16x16 blocks only when the noise
+ // level is above kDenLow or the frame width is at most 480.
+ if (bs == BLOCK_8X8 || bs == BLOCK_8X16 || bs == BLOCK_16X8 ||
+ (bs == BLOCK_16X16 && width > 480 &&
+ denoiser->denoising_level <= kDenLow))
+ return COPY_BLOCK;
+
+ // If the best reference frame uses inter-prediction and there is enough of a
+ // difference in sum-squared-error, use it.
+ if (frame != INTRA_FRAME && frame != ALTREF_FRAME && frame != GOLDEN_FRAME &&
+ sse_diff > sse_diff_thresh(bs, increase_denoising, motion_magnitude)) {
+ mi->ref_frame[0] = ctx->best_reference_frame;
+ mi->mode = ctx->best_sse_inter_mode;
+ mi->mv[0] = ctx->best_sse_mv;
+ } else {
+ // Otherwise, use the zero reference frame.
+ frame = ctx->best_zeromv_reference_frame;
+ ctx->newmv_sse = ctx->zeromv_sse;
+ // Bias to last reference.
+ if ((num_spatial_layers > 1 && !use_gf_temporal_ref) ||
+ frame == ALTREF_FRAME ||
+ (frame == GOLDEN_FRAME && use_gf_temporal_ref) ||
+ (frame != LAST_FRAME &&
+ ((ctx->zeromv_lastref_sse < (5 * ctx->zeromv_sse) >> 2) ||
+ denoiser->denoising_level >= kDenHigh))) {
+ frame = LAST_FRAME;
+ ctx->newmv_sse = ctx->zeromv_lastref_sse;
+ }
+ mi->ref_frame[0] = frame;
+ mi->mode = ZEROMV;
+ mi->mv[0].as_int = 0;
+ ctx->best_sse_inter_mode = ZEROMV;
+ ctx->best_sse_mv.as_int = 0;
+ *zeromv_filter = 1;
+ if (denoiser->denoising_level > kDenMedium) {
+ motion_magnitude = 0;
+ }
+ }
+
+ saved_frame = frame;
+ // When using SVC, we need to map REF_FRAME to the frame buffer index.
+ if (use_svc) {
+ if (frame == LAST_FRAME)
+ frame = lst_fb_idx + 1;
+ else if (frame == GOLDEN_FRAME)
+ frame = gld_fb_idx + 1;
+ // Shift for the second spatial layer.
+ if (num_spatial_layers - spatial_layer == 2)
+ frame = frame + denoiser->num_ref_frames;
+ denoise_layer_idx = num_spatial_layers - spatial_layer - 1;
+ }
+
+ // Force copy (no denoise, copy source in denoised buffer) if
+ // running_avg_y[frame] is NULL.
+ if (denoiser->running_avg_y[frame].buffer_alloc == NULL) {
+ // Restore everything to its original state
+ *mi = saved_mi;
+ return COPY_BLOCK;
+ }
+
+ if (ctx->newmv_sse > sse_thresh(bs, increase_denoising)) {
+ // Restore everything to its original state
+ *mi = saved_mi;
+ return COPY_BLOCK;
+ }
+ if (motion_magnitude > (noise_motion_thresh(bs, increase_denoising) << 3)) {
+ // Restore everything to its original state
+ *mi = saved_mi;
+ return COPY_BLOCK;
+ }
+
+ // We will restore these after motion compensation.
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ saved_pre[i] = filter_mbd->plane[i].pre[0];
+ saved_dst[i] = filter_mbd->plane[i].dst;
+ }
+ saved_block_refs[0] = filter_mbd->block_refs[0];
+
+ // Set the pointers in the MACROBLOCKD to point to the buffers in the denoiser
+ // struct.
+ filter_mbd->plane[0].pre[0].buf =
+ block_start(denoiser->running_avg_y[frame].y_buffer,
+ denoiser->running_avg_y[frame].y_stride, mi_row, mi_col);
+ filter_mbd->plane[0].pre[0].stride = denoiser->running_avg_y[frame].y_stride;
+ filter_mbd->plane[1].pre[0].buf =
+ block_start(denoiser->running_avg_y[frame].u_buffer,
+ denoiser->running_avg_y[frame].uv_stride, mi_row, mi_col);
+ filter_mbd->plane[1].pre[0].stride = denoiser->running_avg_y[frame].uv_stride;
+ filter_mbd->plane[2].pre[0].buf =
+ block_start(denoiser->running_avg_y[frame].v_buffer,
+ denoiser->running_avg_y[frame].uv_stride, mi_row, mi_col);
+ filter_mbd->plane[2].pre[0].stride = denoiser->running_avg_y[frame].uv_stride;
+
+ filter_mbd->plane[0].dst.buf = block_start(
+ denoiser->mc_running_avg_y[denoise_layer_idx].y_buffer,
+ denoiser->mc_running_avg_y[denoise_layer_idx].y_stride, mi_row, mi_col);
+ filter_mbd->plane[0].dst.stride =
+ denoiser->mc_running_avg_y[denoise_layer_idx].y_stride;
+ filter_mbd->plane[1].dst.buf = block_start(
+ denoiser->mc_running_avg_y[denoise_layer_idx].u_buffer,
+ denoiser->mc_running_avg_y[denoise_layer_idx].uv_stride, mi_row, mi_col);
+ filter_mbd->plane[1].dst.stride =
+ denoiser->mc_running_avg_y[denoise_layer_idx].uv_stride;
+ filter_mbd->plane[2].dst.buf = block_start(
+ denoiser->mc_running_avg_y[denoise_layer_idx].v_buffer,
+ denoiser->mc_running_avg_y[denoise_layer_idx].uv_stride, mi_row, mi_col);
+ filter_mbd->plane[2].dst.stride =
+ denoiser->mc_running_avg_y[denoise_layer_idx].uv_stride;
+
+ set_ref_ptrs(cm, filter_mbd, saved_frame, NONE);
+ vp9_build_inter_predictors_sby(filter_mbd, mi_row, mi_col, bs);
+
+ // Restore everything to its original state
+ *mi = saved_mi;
+ filter_mbd->block_refs[0] = saved_block_refs[0];
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ filter_mbd->plane[i].pre[0] = saved_pre[i];
+ filter_mbd->plane[i].dst = saved_dst[i];
+ }
+
+ return FILTER_BLOCK;
+}
+
+void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col,
+ BLOCK_SIZE bs, PICK_MODE_CONTEXT *ctx,
+ VP9_DENOISER_DECISION *denoiser_decision,
+ int use_gf_temporal_ref) {
+ int mv_col, mv_row;
+ int motion_magnitude = 0;
+ int zeromv_filter = 0;
+ VP9_DENOISER *denoiser = &cpi->denoiser;
+ VP9_DENOISER_DECISION decision = COPY_BLOCK;
+
+ const int shift =
+ cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id == 2
+ ? denoiser->num_ref_frames
+ : 0;
+ YV12_BUFFER_CONFIG avg = denoiser->running_avg_y[INTRA_FRAME + shift];
+ const int denoise_layer_index =
+ cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id - 1;
+ YV12_BUFFER_CONFIG mc_avg = denoiser->mc_running_avg_y[denoise_layer_index];
+ uint8_t *avg_start = block_start(avg.y_buffer, avg.y_stride, mi_row, mi_col);
+
+ uint8_t *mc_avg_start =
+ block_start(mc_avg.y_buffer, mc_avg.y_stride, mi_row, mi_col);
+ struct buf_2d src = mb->plane[0].src;
+ int is_skin = 0;
+ int increase_denoising = 0;
+ int consec_zeromv = 0;
+ int last_is_reference = cpi->ref_frame_flags & VP9_LAST_FLAG;
+ mv_col = ctx->best_sse_mv.as_mv.col;
+ mv_row = ctx->best_sse_mv.as_mv.row;
+ motion_magnitude = mv_row * mv_row + mv_col * mv_col;
+
+ if (cpi->use_skin_detection && bs <= BLOCK_32X32 &&
+ denoiser->denoising_level < kDenHigh) {
+ int motion_level = (motion_magnitude < 16) ? 0 : 1;
+ // If motion for current block is small/zero, compute consec_zeromv for
+ // skin detection (early exit in skin detection is done for large
+ // consec_zeromv when current block has small/zero motion).
+ consec_zeromv = 0;
+ if (motion_level == 0) {
+ VP9_COMMON *const cm = &cpi->common;
+ int j, i;
+ // Loop through the 8x8 sub-blocks.
+ const int bw = num_8x8_blocks_wide_lookup[bs];
+ const int bh = num_8x8_blocks_high_lookup[bs];
+ const int xmis = VPXMIN(cm->mi_cols - mi_col, bw);
+ const int ymis = VPXMIN(cm->mi_rows - mi_row, bh);
+ const int block_index = mi_row * cm->mi_cols + mi_col;
+ consec_zeromv = 100;
+ for (i = 0; i < ymis; i++) {
+ for (j = 0; j < xmis; j++) {
+ int bl_index = block_index + i * cm->mi_cols + j;
+ consec_zeromv = VPXMIN(cpi->consec_zero_mv[bl_index], consec_zeromv);
+ // No need to keep checking 8x8 blocks if any of the sub-blocks
+ // has a small consec_zeromv (since the threshold for no_skin based on
+ // zero/small motion in skin detection is high, i.e., > 4).
+ if (consec_zeromv < 4) {
+ i = ymis;
+ break;
+ }
+ }
+ }
+ }
+ // TODO(marpan): Compute skin detection over sub-blocks.
+ is_skin = vp9_compute_skin_block(
+ mb->plane[0].src.buf, mb->plane[1].src.buf, mb->plane[2].src.buf,
+ mb->plane[0].src.stride, mb->plane[1].src.stride, bs, consec_zeromv,
+ motion_level);
+ }
+ if (!is_skin && denoiser->denoising_level == kDenHigh) increase_denoising = 1;
+
+ // Copy the block if LAST_FRAME is not a reference.
+ // LAST does not always exist when SVC layers are changed dynamically, e.g.
+ // the top spatial layer has no last reference when it is brought up for the
+ // first time on the fly.
+ if (last_is_reference && denoiser->denoising_level >= kDenLow &&
+ !ctx->sb_skip_denoising)
+ decision = perform_motion_compensation(
+ &cpi->common, denoiser, mb, bs, increase_denoising, mi_row, mi_col, ctx,
+ motion_magnitude, is_skin, &zeromv_filter, consec_zeromv,
+ cpi->svc.number_spatial_layers, cpi->Source->y_width, cpi->lst_fb_idx,
+ cpi->gld_fb_idx, cpi->use_svc, cpi->svc.spatial_layer_id,
+ use_gf_temporal_ref);
+
+ if (decision == FILTER_BLOCK) {
+ decision = vp9_denoiser_filter(src.buf, src.stride, mc_avg_start,
+ mc_avg.y_stride, avg_start, avg.y_stride,
+ increase_denoising, bs, motion_magnitude);
+ }
+
+ if (decision == FILTER_BLOCK) {
+ vpx_convolve_copy(avg_start, avg.y_stride, src.buf, src.stride, NULL, 0, 0,
+ 0, 0, num_4x4_blocks_wide_lookup[bs] << 2,
+ num_4x4_blocks_high_lookup[bs] << 2);
+ } else { // COPY_BLOCK
+ vpx_convolve_copy(src.buf, src.stride, avg_start, avg.y_stride, NULL, 0, 0,
+ 0, 0, num_4x4_blocks_wide_lookup[bs] << 2,
+ num_4x4_blocks_high_lookup[bs] << 2);
+ }
+ *denoiser_decision = decision;
+ if (decision == FILTER_BLOCK && zeromv_filter == 1)
+ *denoiser_decision = FILTER_ZEROMV_BLOCK;
+}
+
+static void copy_frame(YV12_BUFFER_CONFIG *const dest,
+ const YV12_BUFFER_CONFIG *const src) {
+ int r;
+ const uint8_t *srcbuf = src->y_buffer;
+ uint8_t *destbuf = dest->y_buffer;
+
+ assert(dest->y_width == src->y_width);
+ assert(dest->y_height == src->y_height);
+
+ for (r = 0; r < dest->y_height; ++r) {
+ memcpy(destbuf, srcbuf, dest->y_width);
+ destbuf += dest->y_stride;
+ srcbuf += src->y_stride;
+ }
+}
+
+static void swap_frame_buffer(YV12_BUFFER_CONFIG *const dest,
+ YV12_BUFFER_CONFIG *const src) {
+ uint8_t *tmp_buf = dest->y_buffer;
+ assert(dest->y_width == src->y_width);
+ assert(dest->y_height == src->y_height);
+ dest->y_buffer = src->y_buffer;
+ src->y_buffer = tmp_buf;
+}
+
+void vp9_denoiser_update_frame_info(
+ VP9_DENOISER *denoiser, YV12_BUFFER_CONFIG src, struct SVC *svc,
+ FRAME_TYPE frame_type, int refresh_alt_ref_frame, int refresh_golden_frame,
+ int refresh_last_frame, int alt_fb_idx, int gld_fb_idx, int lst_fb_idx,
+ int resized, int svc_refresh_denoiser_buffers, int second_spatial_layer) {
+ const int shift = second_spatial_layer ? denoiser->num_ref_frames : 0;
+ // Copy source into denoised reference buffers on KEY_FRAME or
+ // if the just encoded frame was resized. For SVC, copy source if the base
+ // spatial layer was key frame.
+ if (frame_type == KEY_FRAME || resized != 0 || denoiser->reset ||
+ svc_refresh_denoiser_buffers) {
+ int i;
+ // Start at 1 so as not to overwrite the INTRA_FRAME buffer.
+ for (i = 1; i < denoiser->num_ref_frames; ++i) {
+ if (denoiser->running_avg_y[i + shift].buffer_alloc != NULL)
+ copy_frame(&denoiser->running_avg_y[i + shift], &src);
+ }
+ denoiser->reset = 0;
+ return;
+ }
+
+ if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS &&
+ svc->use_set_ref_frame_config) {
+ int i;
+ for (i = 0; i < REF_FRAMES; i++) {
+ if (svc->update_buffer_slot[svc->spatial_layer_id] & (1 << i))
+ copy_frame(&denoiser->running_avg_y[i + 1 + shift],
+ &denoiser->running_avg_y[INTRA_FRAME + shift]);
+ }
+ } else {
+ // If more than one refresh occurs, must copy frame buffer.
+ if ((refresh_alt_ref_frame + refresh_golden_frame + refresh_last_frame) >
+ 1) {
+ if (refresh_alt_ref_frame) {
+ copy_frame(&denoiser->running_avg_y[alt_fb_idx + 1 + shift],
+ &denoiser->running_avg_y[INTRA_FRAME + shift]);
+ }
+ if (refresh_golden_frame) {
+ copy_frame(&denoiser->running_avg_y[gld_fb_idx + 1 + shift],
+ &denoiser->running_avg_y[INTRA_FRAME + shift]);
+ }
+ if (refresh_last_frame) {
+ copy_frame(&denoiser->running_avg_y[lst_fb_idx + 1 + shift],
+ &denoiser->running_avg_y[INTRA_FRAME + shift]);
+ }
+ } else {
+ if (refresh_alt_ref_frame) {
+ swap_frame_buffer(&denoiser->running_avg_y[alt_fb_idx + 1 + shift],
+ &denoiser->running_avg_y[INTRA_FRAME + shift]);
+ }
+ if (refresh_golden_frame) {
+ swap_frame_buffer(&denoiser->running_avg_y[gld_fb_idx + 1 + shift],
+ &denoiser->running_avg_y[INTRA_FRAME + shift]);
+ }
+ if (refresh_last_frame) {
+ swap_frame_buffer(&denoiser->running_avg_y[lst_fb_idx + 1 + shift],
+ &denoiser->running_avg_y[INTRA_FRAME + shift]);
+ }
+ }
+ }
+}
+
+void vp9_denoiser_reset_frame_stats(PICK_MODE_CONTEXT *ctx) {
+ ctx->zeromv_sse = UINT_MAX;
+ ctx->newmv_sse = UINT_MAX;
+ ctx->zeromv_lastref_sse = UINT_MAX;
+ ctx->best_sse_mv.as_int = 0;
+}
+
+void vp9_denoiser_update_frame_stats(MODE_INFO *mi, unsigned int sse,
+ PREDICTION_MODE mode,
+ PICK_MODE_CONTEXT *ctx) {
+ if (mi->mv[0].as_int == 0 && sse < ctx->zeromv_sse) {
+ ctx->zeromv_sse = sse;
+ ctx->best_zeromv_reference_frame = mi->ref_frame[0];
+ if (mi->ref_frame[0] == LAST_FRAME) ctx->zeromv_lastref_sse = sse;
+ }
+
+ if (mi->mv[0].as_int != 0 && sse < ctx->newmv_sse) {
+ ctx->newmv_sse = sse;
+ ctx->best_sse_inter_mode = mode;
+ ctx->best_sse_mv = mi->mv[0];
+ ctx->best_reference_frame = mi->ref_frame[0];
+ }
+}
+
+static int vp9_denoiser_realloc_svc_helper(VP9_COMMON *cm,
+ VP9_DENOISER *denoiser, int fb_idx) {
+ int fail = 0;
+ if (denoiser->running_avg_y[fb_idx].buffer_alloc == NULL) {
+ fail =
+ vpx_alloc_frame_buffer(&denoiser->running_avg_y[fb_idx], cm->width,
+ cm->height, cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
+ VP9_ENC_BORDER_IN_PIXELS, 0);
+ if (fail) {
+ vp9_denoiser_free(denoiser);
+ return 1;
+ }
+ }
+ return 0;
+}
+
+int vp9_denoiser_realloc_svc(VP9_COMMON *cm, VP9_DENOISER *denoiser,
+ struct SVC *svc, int svc_buf_shift,
+ int refresh_alt, int refresh_gld, int refresh_lst,
+ int alt_fb_idx, int gld_fb_idx, int lst_fb_idx) {
+ int fail = 0;
+ if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS &&
+ svc->use_set_ref_frame_config) {
+ int i;
+ for (i = 0; i < REF_FRAMES; i++) {
+ if (cm->frame_type == KEY_FRAME ||
+ svc->update_buffer_slot[svc->spatial_layer_id] & (1 << i)) {
+ fail = vp9_denoiser_realloc_svc_helper(cm, denoiser,
+ i + 1 + svc_buf_shift);
+ }
+ }
+ } else {
+ if (refresh_alt) {
+ // Increase the frame buffer index by 1 to map it to the buffer index in
+ // the denoiser.
+ fail = vp9_denoiser_realloc_svc_helper(cm, denoiser,
+ alt_fb_idx + 1 + svc_buf_shift);
+ if (fail) return 1;
+ }
+ if (refresh_gld) {
+ fail = vp9_denoiser_realloc_svc_helper(cm, denoiser,
+ gld_fb_idx + 1 + svc_buf_shift);
+ if (fail) return 1;
+ }
+ if (refresh_lst) {
+ fail = vp9_denoiser_realloc_svc_helper(cm, denoiser,
+ lst_fb_idx + 1 + svc_buf_shift);
+ if (fail) return 1;
+ }
+ }
+ return 0;
+}
+
+int vp9_denoiser_alloc(VP9_COMMON *cm, struct SVC *svc, VP9_DENOISER *denoiser,
+ int use_svc, int noise_sen, int width, int height,
+ int ssx, int ssy,
+#if CONFIG_VP9_HIGHBITDEPTH
+ int use_highbitdepth,
+#endif
+ int border) {
+ int i, layer, fail, init_num_ref_frames;
+ const int legacy_byte_alignment = 0;
+ int num_layers = 1;
+ int scaled_width = width;
+ int scaled_height = height;
+ if (use_svc) {
+ LAYER_CONTEXT *lc = &svc->layer_context[svc->spatial_layer_id *
+ svc->number_temporal_layers +
+ svc->temporal_layer_id];
+ get_layer_resolution(width, height, lc->scaling_factor_num,
+ lc->scaling_factor_den, &scaled_width, &scaled_height);
+ // For SVC: denoise at most the two highest spatial layers.
+ if (noise_sen >= 2)
+ // Denoise from one spatial layer below the top.
+ svc->first_layer_denoise = VPXMAX(svc->number_spatial_layers - 2, 0);
+ else
+ // Only denoise the top spatial layer.
+ svc->first_layer_denoise = VPXMAX(svc->number_spatial_layers - 1, 0);
+ num_layers = svc->number_spatial_layers - svc->first_layer_denoise;
+ }
+ assert(denoiser != NULL);
+ denoiser->num_ref_frames = use_svc ? SVC_REF_FRAMES : NONSVC_REF_FRAMES;
+ init_num_ref_frames = use_svc ? MAX_REF_FRAMES : NONSVC_REF_FRAMES;
+ denoiser->num_layers = num_layers;
+ CHECK_MEM_ERROR(&cm->error, denoiser->running_avg_y,
+ vpx_calloc(denoiser->num_ref_frames * num_layers,
+ sizeof(denoiser->running_avg_y[0])));
+ CHECK_MEM_ERROR(
+ &cm->error, denoiser->mc_running_avg_y,
+ vpx_calloc(num_layers, sizeof(denoiser->mc_running_avg_y[0])));
+
+ for (layer = 0; layer < num_layers; ++layer) {
+ const int denoise_width = (layer == 0) ? width : scaled_width;
+ const int denoise_height = (layer == 0) ? height : scaled_height;
+ for (i = 0; i < init_num_ref_frames; ++i) {
+ fail = vpx_alloc_frame_buffer(
+ &denoiser->running_avg_y[i + denoiser->num_ref_frames * layer],
+ denoise_width, denoise_height, ssx, ssy,
+#if CONFIG_VP9_HIGHBITDEPTH
+ use_highbitdepth,
+#endif
+ border, legacy_byte_alignment);
+ if (fail) {
+ vp9_denoiser_free(denoiser);
+ return 1;
+ }
+#ifdef OUTPUT_YUV_DENOISED
+ make_grayscale(&denoiser->running_avg_y[i]);
+#endif
+ }
+
+ fail = vpx_alloc_frame_buffer(&denoiser->mc_running_avg_y[layer],
+ denoise_width, denoise_height, ssx, ssy,
+#if CONFIG_VP9_HIGHBITDEPTH
+ use_highbitdepth,
+#endif
+ border, legacy_byte_alignment);
+ if (fail) {
+ vp9_denoiser_free(denoiser);
+ return 1;
+ }
+ }
+
+ // denoiser->last_source only used for noise_estimation, so only for top
+ // layer.
+ fail = vpx_alloc_frame_buffer(&denoiser->last_source, width, height, ssx, ssy,
+#if CONFIG_VP9_HIGHBITDEPTH
+ use_highbitdepth,
+#endif
+ border, legacy_byte_alignment);
+ if (fail) {
+ vp9_denoiser_free(denoiser);
+ return 1;
+ }
+#ifdef OUTPUT_YUV_DENOISED
+ make_grayscale(&denoiser->running_avg_y[i]);
+#endif
+ denoiser->frame_buffer_initialized = 1;
+ denoiser->denoising_level = kDenMedium;
+ denoiser->prev_denoising_level = kDenMedium;
+ denoiser->reset = 0;
+ denoiser->current_denoiser_frame = 0;
+ return 0;
+}
+
+void vp9_denoiser_free(VP9_DENOISER *denoiser) {
+ int i;
+ if (denoiser == NULL) {
+ return;
+ }
+ denoiser->frame_buffer_initialized = 0;
+ for (i = 0; i < denoiser->num_ref_frames * denoiser->num_layers; ++i) {
+ vpx_free_frame_buffer(&denoiser->running_avg_y[i]);
+ }
+ vpx_free(denoiser->running_avg_y);
+ denoiser->running_avg_y = NULL;
+
+ for (i = 0; i < denoiser->num_layers; ++i) {
+ vpx_free_frame_buffer(&denoiser->mc_running_avg_y[i]);
+ }
+
+ vpx_free(denoiser->mc_running_avg_y);
+ denoiser->mc_running_avg_y = NULL;
+ vpx_free_frame_buffer(&denoiser->last_source);
+}
+
+static void force_refresh_longterm_ref(VP9_COMP *const cpi) {
+ SVC *const svc = &cpi->svc;
+ // If long term reference is used, force refresh of that slot, so
+ // denoiser buffer for long term reference stays in sync.
+ if (svc->use_gf_temporal_ref_current_layer) {
+ int index = svc->spatial_layer_id;
+ if (svc->number_spatial_layers == 3) index = svc->spatial_layer_id - 1;
+ assert(index >= 0);
+ cpi->alt_fb_idx = svc->buffer_gf_temporal_ref[index].idx;
+ cpi->refresh_alt_ref_frame = 1;
+ }
+}
+
+void vp9_denoiser_set_noise_level(VP9_COMP *const cpi, int noise_level) {
+ VP9_DENOISER *const denoiser = &cpi->denoiser;
+ denoiser->denoising_level = noise_level;
+ if (denoiser->denoising_level > kDenLowLow &&
+ denoiser->prev_denoising_level == kDenLowLow) {
+ denoiser->reset = 1;
+ force_refresh_longterm_ref(cpi);
+ } else {
+ denoiser->reset = 0;
+ }
+ denoiser->prev_denoising_level = denoiser->denoising_level;
+}
+
+// Scale/increase the partition threshold
+// for denoiser speed-up.
+int64_t vp9_scale_part_thresh(int64_t threshold, VP9_DENOISER_LEVEL noise_level,
+ int content_state, int temporal_layer_id) {
+ if ((content_state == kLowSadLowSumdiff) ||
+ (content_state == kHighSadLowSumdiff) ||
+ (content_state == kLowVarHighSumdiff) || (noise_level == kDenHigh) ||
+ (temporal_layer_id != 0)) {
+ int64_t scaled_thr =
+ (temporal_layer_id < 2) ? (3 * threshold) >> 1 : (7 * threshold) >> 2;
+ return scaled_thr;
+ } else {
+ return (5 * threshold) >> 2;
+ }
+}
+
+// Scale/increase the ac skip threshold for
+// denoiser speed-up.
+int64_t vp9_scale_acskip_thresh(int64_t threshold,
+ VP9_DENOISER_LEVEL noise_level, int abs_sumdiff,
+ int temporal_layer_id) {
+ if (noise_level >= kDenLow && abs_sumdiff < 5)
+ return threshold *= (noise_level == kDenLow) ? 2
+ : (temporal_layer_id == 2) ? 10
+ : 6;
+ else
+ return threshold;
+}
+
+void vp9_denoiser_reset_on_first_frame(VP9_COMP *const cpi) {
+ if (vp9_denoise_svc_non_key(cpi) &&
+ cpi->denoiser.current_denoiser_frame == 0) {
+ cpi->denoiser.reset = 1;
+ force_refresh_longterm_ref(cpi);
+ }
+}
+
+void vp9_denoiser_update_ref_frame(VP9_COMP *const cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ SVC *const svc = &cpi->svc;
+
+ if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) &&
+ cpi->denoiser.denoising_level > kDenLowLow) {
+ int svc_refresh_denoiser_buffers = 0;
+ int denoise_svc_second_layer = 0;
+ FRAME_TYPE frame_type = cm->intra_only ? KEY_FRAME : cm->frame_type;
+ cpi->denoiser.current_denoiser_frame++;
+ if (cpi->use_svc) {
+ const int svc_buf_shift =
+ svc->number_spatial_layers - svc->spatial_layer_id == 2
+ ? cpi->denoiser.num_ref_frames
+ : 0;
+ int layer =
+ LAYER_IDS_TO_IDX(svc->spatial_layer_id, svc->temporal_layer_id,
+ svc->number_temporal_layers);
+ LAYER_CONTEXT *const lc = &svc->layer_context[layer];
+ svc_refresh_denoiser_buffers =
+ lc->is_key_frame || svc->spatial_layer_sync[svc->spatial_layer_id];
+ denoise_svc_second_layer =
+ svc->number_spatial_layers - svc->spatial_layer_id == 2 ? 1 : 0;
+ // Check if we need to allocate extra buffers in the denoiser
+ // for refreshed frames.
+ if (vp9_denoiser_realloc_svc(cm, &cpi->denoiser, svc, svc_buf_shift,
+ cpi->refresh_alt_ref_frame,
+ cpi->refresh_golden_frame,
+ cpi->refresh_last_frame, cpi->alt_fb_idx,
+ cpi->gld_fb_idx, cpi->lst_fb_idx))
+ vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+ "Failed to re-allocate denoiser for SVC");
+ }
+ vp9_denoiser_update_frame_info(
+ &cpi->denoiser, *cpi->Source, svc, frame_type,
+ cpi->refresh_alt_ref_frame, cpi->refresh_golden_frame,
+ cpi->refresh_last_frame, cpi->alt_fb_idx, cpi->gld_fb_idx,
+ cpi->lst_fb_idx, cpi->resize_pending, svc_refresh_denoiser_buffers,
+ denoise_svc_second_layer);
+ }
+}
+
+#ifdef OUTPUT_YUV_DENOISED
+static void make_grayscale(YV12_BUFFER_CONFIG *yuv) {
+ int r, c;
+ uint8_t *u = yuv->u_buffer;
+ uint8_t *v = yuv->v_buffer;
+
+ for (r = 0; r < yuv->uv_height; ++r) {
+ for (c = 0; c < yuv->uv_width; ++c) {
+ u[c] = UINT8_MAX / 2;
+ v[c] = UINT8_MAX / 2;
+ }
+ u += yuv->uv_stride;
+ v += yuv->uv_stride;
+ }
+}
+#endif
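The strong-filter pass in vp9_denoiser_filter_c() above nudges each pixel toward its motion-compensated average: a small difference takes the average outright, while a larger one moves by a bounded step (adj_val of 3/4/6, raised when motion is low). The stand-alone helper below reproduces just that per-pixel rule under the default absdiff threshold of 3; it is not part of the libvpx sources and denoise_pixel() is a hypothetical name.

#include <stdio.h>
#include <stdlib.h>

/* One pixel of the strong-filter pass: copy the motion-compensated average
 * for small differences, otherwise step toward it by adj_val[0..2]. */
static int denoise_pixel(int sig, int mc_avg, const int adj_val[3],
                         int absdiff_thresh) {
  const int diff = mc_avg - sig;
  const int absdiff = abs(diff);
  int adj;
  if (absdiff <= absdiff_thresh) return mc_avg;
  if (absdiff <= 7)
    adj = adj_val[0];
  else if (absdiff <= 15)
    adj = adj_val[1];
  else
    adj = adj_val[2];
  if (diff > 0) return sig + adj < 255 ? sig + adj : 255;
  return sig - adj > 0 ? sig - adj : 0;
}

int main(void) {
  const int adj_val[3] = { 3, 4, 6 };
  /* Small difference: take the average (102). Large difference: move by the
   * largest step only (100 + 6 = 106). */
  printf("%d %d\n", denoise_pixel(100, 102, adj_val, 3),
         denoise_pixel(100, 140, adj_val, 3));
  return 0;
}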
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_denoiser.h b/media/libvpx/libvpx/vp9/encoder/vp9_denoiser.h
new file mode 100644
index 0000000000..1973e98988
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_denoiser.h
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_VP9_DENOISER_H_
+#define VPX_VP9_ENCODER_VP9_DENOISER_H_
+
+#include "vp9/encoder/vp9_block.h"
+#include "vp9/encoder/vp9_skin_detection.h"
+#include "vpx_scale/yv12config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MOTION_MAGNITUDE_THRESHOLD (8 * 3)
+
+// The denoiser is used in non-SVC real-time mode, which does not use alt-ref,
+// so there is no need to allocate a buffer for it; hence MAX_REF_FRAMES - 1.
+#define NONSVC_REF_FRAMES (MAX_REF_FRAMES - 1)
+
+// Number of frame buffers when SVC is used. [0] for current denoised buffer and
+// [1..8] for REF_FRAMES
+#define SVC_REF_FRAMES 9
+
+typedef enum vp9_denoiser_decision {
+ COPY_BLOCK,
+ FILTER_BLOCK,
+ FILTER_ZEROMV_BLOCK
+} VP9_DENOISER_DECISION;
+
+typedef enum vp9_denoiser_level {
+ kDenLowLow,
+ kDenLow,
+ kDenMedium,
+ kDenHigh
+} VP9_DENOISER_LEVEL;
+
+typedef struct vp9_denoiser {
+ YV12_BUFFER_CONFIG *running_avg_y;
+ YV12_BUFFER_CONFIG *mc_running_avg_y;
+ YV12_BUFFER_CONFIG last_source;
+ int frame_buffer_initialized;
+ int reset;
+ int num_ref_frames;
+ int num_layers;
+ unsigned int current_denoiser_frame;
+ VP9_DENOISER_LEVEL denoising_level;
+ VP9_DENOISER_LEVEL prev_denoising_level;
+} VP9_DENOISER;
+
+typedef struct {
+ int64_t zero_last_cost_orig;
+ int *ref_frame_cost;
+ int_mv (*frame_mv)[MAX_REF_FRAMES];
+ int reuse_inter_pred;
+ TX_SIZE best_tx_size;
+ PREDICTION_MODE best_mode;
+ MV_REFERENCE_FRAME best_ref_frame;
+ INTERP_FILTER best_pred_filter;
+ uint8_t best_mode_skip_txfm;
+} VP9_PICKMODE_CTX_DEN;
+
+struct VP9_COMP;
+struct SVC;
+
+void vp9_denoiser_update_frame_info(
+ VP9_DENOISER *denoiser, YV12_BUFFER_CONFIG src, struct SVC *svc,
+ FRAME_TYPE frame_type, int refresh_alt_ref_frame, int refresh_golden_frame,
+ int refresh_last_frame, int alt_fb_idx, int gld_fb_idx, int lst_fb_idx,
+ int resized, int svc_refresh_denoiser_buffers, int second_spatial_layer);
+
+void vp9_denoiser_denoise(struct VP9_COMP *cpi, MACROBLOCK *mb, int mi_row,
+ int mi_col, BLOCK_SIZE bs, PICK_MODE_CONTEXT *ctx,
+ VP9_DENOISER_DECISION *denoiser_decision,
+ int use_gf_temporal_ref);
+
+void vp9_denoiser_reset_frame_stats(PICK_MODE_CONTEXT *ctx);
+
+void vp9_denoiser_update_frame_stats(MODE_INFO *mi, unsigned int sse,
+ PREDICTION_MODE mode,
+ PICK_MODE_CONTEXT *ctx);
+
+int vp9_denoiser_realloc_svc(VP9_COMMON *cm, VP9_DENOISER *denoiser,
+ struct SVC *svc, int svc_buf_shift,
+ int refresh_alt, int refresh_gld, int refresh_lst,
+ int alt_fb_idx, int gld_fb_idx, int lst_fb_idx);
+
+int vp9_denoiser_alloc(VP9_COMMON *cm, struct SVC *svc, VP9_DENOISER *denoiser,
+ int use_svc, int noise_sen, int width, int height,
+ int ssx, int ssy,
+#if CONFIG_VP9_HIGHBITDEPTH
+ int use_highbitdepth,
+#endif
+ int border);
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+// This function is used by both the C and SSE2 denoiser implementations.
+// Define it as a static function within the scope where vp9_denoiser.h
+// is referenced.
+static INLINE int total_adj_strong_thresh(BLOCK_SIZE bs,
+ int increase_denoising) {
+ return (1 << num_pels_log2_lookup[bs]) * (increase_denoising ? 3 : 2);
+}
+#endif
+
+void vp9_denoiser_free(VP9_DENOISER *denoiser);
+
+void vp9_denoiser_set_noise_level(struct VP9_COMP *const cpi, int noise_level);
+
+void vp9_denoiser_reset_on_first_frame(struct VP9_COMP *const cpi);
+
+int64_t vp9_scale_part_thresh(int64_t threshold, VP9_DENOISER_LEVEL noise_level,
+ int content_state, int temporal_layer_id);
+
+int64_t vp9_scale_acskip_thresh(int64_t threshold,
+ VP9_DENOISER_LEVEL noise_level, int abs_sumdiff,
+ int temporal_layer_id);
+
+void vp9_denoiser_update_ref_frame(struct VP9_COMP *const cpi);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP9_ENCODER_VP9_DENOISER_H_
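Throughout vp9_denoiser.c, running_avg_y[] is addressed with the pattern fb_idx + 1 + shift: slot 0 (INTRA_FRAME) holds the current denoised frame, the following slots hold the denoised reference buffers, and when a second spatial layer is denoised its whole bank is shifted by num_ref_frames. A small stand-alone sketch of that indexing, assuming the SVC_REF_FRAMES layout defined above; it is not part of the libvpx sources and denoiser_slot() is a hypothetical helper.

#include <stdio.h>

#define SVC_REF_FRAMES 9

/* Index into running_avg_y[] for a given frame-buffer index, mirroring the
 * "fb_idx + 1 + shift" pattern used in vp9_denoiser.c. */
static int denoiser_slot(int fb_idx, int second_spatial_layer) {
  const int shift = second_spatial_layer ? SVC_REF_FRAMES : 0;
  return fb_idx + 1 + shift;
}

int main(void) {
  /* Frame buffer 0 of the top layer lands in slot 1; the same buffer for the
   * second denoised layer lands in slot 10. */
  printf("%d %d\n", denoiser_slot(0, 0), denoiser_slot(0, 1));
  return 0;
}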
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.c b/media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.c
new file mode 100644
index 0000000000..a979ae1c93
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.c
@@ -0,0 +1,6539 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <float.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/vpx_timer.h"
+#include "vpx_ports/system_state.h"
+
+#if CONFIG_MISMATCH_DEBUG
+#include "vpx_util/vpx_debug_util.h"
+#endif // CONFIG_MISMATCH_DEBUG
+
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_entropy.h"
+#include "vp9/common/vp9_entropymode.h"
+#include "vp9/common/vp9_idct.h"
+#include "vp9/common/vp9_mvref_common.h"
+#include "vp9/common/vp9_pred_common.h"
+#include "vp9/common/vp9_quant_common.h"
+#include "vp9/common/vp9_reconintra.h"
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/common/vp9_seg_common.h"
+#include "vp9/common/vp9_tile_common.h"
+#if !CONFIG_REALTIME_ONLY
+#include "vp9/encoder/vp9_aq_360.h"
+#include "vp9/encoder/vp9_aq_complexity.h"
+#endif
+#include "vp9/encoder/vp9_aq_cyclicrefresh.h"
+#if !CONFIG_REALTIME_ONLY
+#include "vp9/encoder/vp9_aq_variance.h"
+#endif
+#include "vp9/encoder/vp9_encodeframe.h"
+#include "vp9/encoder/vp9_encodemb.h"
+#include "vp9/encoder/vp9_encodemv.h"
+#include "vp9/encoder/vp9_ethread.h"
+#include "vp9/encoder/vp9_extend.h"
+#include "vp9/encoder/vp9_multi_thread.h"
+#include "vp9/encoder/vp9_partition_models.h"
+#include "vp9/encoder/vp9_pickmode.h"
+#include "vp9/encoder/vp9_rd.h"
+#include "vp9/encoder/vp9_rdopt.h"
+#include "vp9/encoder/vp9_segmentation.h"
+#include "vp9/encoder/vp9_tokenize.h"
+
+static void encode_superblock(VP9_COMP *cpi, ThreadData *td, TOKENEXTRA **t,
+ int output_enabled, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx);
+
+// This is used as a reference when computing the source variance for the
+// purpose of activity masking.
+// Eventually this should be replaced by custom no-reference routines,
+// which will be faster.
+static const uint8_t VP9_VAR_OFFS[64] = {
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128
+};
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static const uint16_t VP9_HIGH_VAR_OFFS_8[64] = {
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128
+};
+
+static const uint16_t VP9_HIGH_VAR_OFFS_10[64] = {
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4
+};
+
+static const uint16_t VP9_HIGH_VAR_OFFS_12[64] = {
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16
+};
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+unsigned int vp9_get_sby_variance(VP9_COMP *cpi, const struct buf_2d *ref,
+ BLOCK_SIZE bs) {
+ unsigned int sse;
+ const unsigned int var =
+ cpi->fn_ptr[bs].vf(ref->buf, ref->stride, VP9_VAR_OFFS, 0, &sse);
+ return var;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+unsigned int vp9_high_get_sby_variance(VP9_COMP *cpi, const struct buf_2d *ref,
+ BLOCK_SIZE bs, int bd) {
+ unsigned int var, sse;
+ switch (bd) {
+ case 10:
+ var =
+ cpi->fn_ptr[bs].vf(ref->buf, ref->stride,
+ CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_10), 0, &sse);
+ break;
+ case 12:
+ var =
+ cpi->fn_ptr[bs].vf(ref->buf, ref->stride,
+ CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_12), 0, &sse);
+ break;
+ case 8:
+ default:
+ var =
+ cpi->fn_ptr[bs].vf(ref->buf, ref->stride,
+ CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_8), 0, &sse);
+ break;
+ }
+ return var;
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+unsigned int vp9_get_sby_perpixel_variance(VP9_COMP *cpi,
+ const struct buf_2d *ref,
+ BLOCK_SIZE bs) {
+ return ROUND_POWER_OF_TWO(vp9_get_sby_variance(cpi, ref, bs),
+ num_pels_log2_lookup[bs]);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+unsigned int vp9_high_get_sby_perpixel_variance(VP9_COMP *cpi,
+ const struct buf_2d *ref,
+ BLOCK_SIZE bs, int bd) {
+ return (unsigned int)ROUND64_POWER_OF_TWO(
+ (int64_t)vp9_high_get_sby_variance(cpi, ref, bs, bd),
+ num_pels_log2_lookup[bs]);
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
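+
+// The two per-pixel helpers above normalize the block variance: the vf()
+// result is a sum of squared deviations over the whole block, so
+// right-shifting it by num_pels_log2_lookup[bs] (e.g. 12 for the 2^12 pixels
+// of a 64x64 block), with rounding, gives the average variance per pixel.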
+
+static void set_segment_index(VP9_COMP *cpi, MACROBLOCK *const x, int mi_row,
+ int mi_col, BLOCK_SIZE bsize, int segment_index) {
+ VP9_COMMON *const cm = &cpi->common;
+ const struct segmentation *const seg = &cm->seg;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MODE_INFO *mi = xd->mi[0];
+
+ const AQ_MODE aq_mode = cpi->oxcf.aq_mode;
+ const uint8_t *const map =
+ seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map;
+
+ // Initialize the segmentation index as 0.
+ mi->segment_id = 0;
+
+ // Skip the rest if segmentation (and with it segment-based AQ) is disabled.
+ if (!seg->enabled) return;
+
+ switch (aq_mode) {
+ case CYCLIC_REFRESH_AQ:
+ mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col);
+ break;
+#if !CONFIG_REALTIME_ONLY
+ case VARIANCE_AQ:
+ if (cm->frame_type == KEY_FRAME || cpi->refresh_alt_ref_frame ||
+ cpi->force_update_segmentation ||
+ (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
+ int min_energy;
+ int max_energy;
+ // Get sub block energy range
+ if (bsize >= BLOCK_32X32) {
+ vp9_get_sub_block_energy(cpi, x, mi_row, mi_col, bsize, &min_energy,
+ &max_energy);
+ } else {
+ min_energy = bsize <= BLOCK_16X16 ? x->mb_energy
+ : vp9_block_energy(cpi, x, bsize);
+ }
+ mi->segment_id = vp9_vaq_segment_id(min_energy);
+ } else {
+ mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col);
+ }
+ break;
+ case EQUATOR360_AQ:
+ if (cm->frame_type == KEY_FRAME || cpi->force_update_segmentation)
+ mi->segment_id = vp9_360aq_segment_id(mi_row, cm->mi_rows);
+ else
+ mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col);
+ break;
+#endif
+ case LOOKAHEAD_AQ:
+ mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col);
+ break;
+ case PSNR_AQ: mi->segment_id = segment_index; break;
+ case PERCEPTUAL_AQ: mi->segment_id = x->segment_id; break;
+ default:
+ // NO_AQ, or an AQ mode whose segment id is set elsewhere.
+ break;
+ }
+
+ // Set segment index if ROI map or active_map is enabled.
+ if (cpi->roi.enabled || cpi->active_map.enabled)
+ mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col);
+
+ vp9_init_plane_quantizers(cpi, x);
+}
+
+// Lighter version of set_offsets that only sets the mode info
+// pointers.
+static INLINE void set_mode_info_offsets(VP9_COMMON *const cm,
+ MACROBLOCK *const x,
+ MACROBLOCKD *const xd, int mi_row,
+ int mi_col) {
+ const int idx_str = xd->mi_stride * mi_row + mi_col;
+ xd->mi = cm->mi_grid_visible + idx_str;
+ xd->mi[0] = cm->mi + idx_str;
+ x->mbmi_ext = x->mbmi_ext_base + (mi_row * cm->mi_cols + mi_col);
+}
+
+static void set_ssim_rdmult(VP9_COMP *const cpi, MACROBLOCK *const x,
+ const BLOCK_SIZE bsize, const int mi_row,
+ const int mi_col, int *const rdmult) {
+ const VP9_COMMON *const cm = &cpi->common;
+
+ const int bsize_base = BLOCK_16X16;
+ const int num_8x8_w = num_8x8_blocks_wide_lookup[bsize_base];
+ const int num_8x8_h = num_8x8_blocks_high_lookup[bsize_base];
+ const int num_cols = (cm->mi_cols + num_8x8_w - 1) / num_8x8_w;
+ const int num_rows = (cm->mi_rows + num_8x8_h - 1) / num_8x8_h;
+ const int num_bcols =
+ (num_8x8_blocks_wide_lookup[bsize] + num_8x8_w - 1) / num_8x8_w;
+ const int num_brows =
+ (num_8x8_blocks_high_lookup[bsize] + num_8x8_h - 1) / num_8x8_h;
+ int row, col;
+ double num_of_mi = 0.0;
+ double geom_mean_of_scale = 0.0;
+
+ assert(cpi->oxcf.tuning == VP8_TUNE_SSIM);
+
+ for (row = mi_row / num_8x8_w;
+ row < num_rows && row < mi_row / num_8x8_w + num_brows; ++row) {
+ for (col = mi_col / num_8x8_h;
+ col < num_cols && col < mi_col / num_8x8_h + num_bcols; ++col) {
+ const int index = row * num_cols + col;
+ geom_mean_of_scale += log(cpi->mi_ssim_rdmult_scaling_factors[index]);
+ num_of_mi += 1.0;
+ }
+ }
+ geom_mean_of_scale = exp(geom_mean_of_scale / num_of_mi);
+
+ *rdmult = (int)((double)(*rdmult) * geom_mean_of_scale);
+ *rdmult = VPXMAX(*rdmult, 0);
+ set_error_per_bit(x, *rdmult);
+ vpx_clear_system_state();
+}
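+
+// set_ssim_rdmult() scales the RD multiplier by the geometric mean of the
+// per-16x16 SSIM scaling factors (mi_ssim_rdmult_scaling_factors[]) that the
+// block covers: the logs of the factors are averaged and then exponentiated,
+// so a block spanning several 16x16 units gets a smoothed, SSIM-tuned rdmult.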
+
+static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile,
+ MACROBLOCK *const x, int mi_row, int mi_col,
+ BLOCK_SIZE bsize) {
+ VP9_COMMON *const cm = &cpi->common;
+ const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+ const int mi_height = num_8x8_blocks_high_lookup[bsize];
+ MvLimits *const mv_limits = &x->mv_limits;
+
+ set_skip_context(xd, mi_row, mi_col);
+
+ set_mode_info_offsets(cm, x, xd, mi_row, mi_col);
+
+ // Set up destination pointers.
+ vp9_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col);
+
+ // Set up limit values for MV components.
+ // MVs beyond this range do not produce a new/different prediction block.
+ mv_limits->row_min = -(((mi_row + mi_height) * MI_SIZE) + VP9_INTERP_EXTEND);
+ mv_limits->col_min = -(((mi_col + mi_width) * MI_SIZE) + VP9_INTERP_EXTEND);
+ mv_limits->row_max = (cm->mi_rows - mi_row) * MI_SIZE + VP9_INTERP_EXTEND;
+ mv_limits->col_max = (cm->mi_cols - mi_col) * MI_SIZE + VP9_INTERP_EXTEND;
+
+ // Set up distance of MB to edge of frame in 1/8th pel units.
+ assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1)));
+ set_mi_row_col(xd, tile, mi_row, mi_height, mi_col, mi_width, cm->mi_rows,
+ cm->mi_cols);
+
+ // Set up source buffers.
+ vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col);
+
+ // R/D setup.
+ x->rddiv = cpi->rd.RDDIV;
+ x->rdmult = cpi->rd.RDMULT;
+ if (oxcf->tuning == VP8_TUNE_SSIM) {
+ set_ssim_rdmult(cpi, x, bsize, mi_row, mi_col, &x->rdmult);
+ }
+
+ // required by vp9_append_sub8x8_mvs_for_idx() and vp9_find_best_ref_mvs()
+ xd->tile = *tile;
+}
+
+static void duplicate_mode_info_in_sb(VP9_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col,
+ BLOCK_SIZE bsize) {
+ const int block_width =
+ VPXMIN(num_8x8_blocks_wide_lookup[bsize], cm->mi_cols - mi_col);
+ const int block_height =
+ VPXMIN(num_8x8_blocks_high_lookup[bsize], cm->mi_rows - mi_row);
+ const int mi_stride = xd->mi_stride;
+ MODE_INFO *const src_mi = xd->mi[0];
+ int i, j;
+
+ for (j = 0; j < block_height; ++j)
+ for (i = 0; i < block_width; ++i) xd->mi[j * mi_stride + i] = src_mi;
+}
+
+static void set_block_size(VP9_COMP *const cpi, MACROBLOCK *const x,
+ MACROBLOCKD *const xd, int mi_row, int mi_col,
+ BLOCK_SIZE bsize) {
+ if (cpi->common.mi_cols > mi_col && cpi->common.mi_rows > mi_row) {
+ set_mode_info_offsets(&cpi->common, x, xd, mi_row, mi_col);
+ xd->mi[0]->sb_type = bsize;
+ }
+}
+
+typedef struct {
+ // This struct is used for computing variance in choose_partitioning(), where
+ // the max number of samples within a superblock is 16x16 (with 4x4 avg). Even
+ // in high bitdepth, uint32_t is enough for sum_square_error (2^12 * 2^12 * 16
+ // * 16 = 2^32).
+ uint32_t sum_square_error;
+ int32_t sum_error;
+ int log2_count;
+ int variance;
+} Var;
+
+typedef struct {
+ Var none;
+ Var horz[2];
+ Var vert[2];
+} partition_variance;
+
+typedef struct {
+ partition_variance part_variances;
+ Var split[4];
+} v4x4;
+
+typedef struct {
+ partition_variance part_variances;
+ v4x4 split[4];
+} v8x8;
+
+typedef struct {
+ partition_variance part_variances;
+ v8x8 split[4];
+} v16x16;
+
+typedef struct {
+ partition_variance part_variances;
+ v16x16 split[4];
+} v32x32;
+
+typedef struct {
+ partition_variance part_variances;
+ v32x32 split[4];
+} v64x64;
+
+typedef struct {
+ partition_variance *part_variances;
+ Var *split[4];
+} variance_node;
+
+typedef enum {
+ V16X16,
+ V32X32,
+ V64X64,
+} TREE_LEVEL;
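+
+// The nested structs above form a variance tree for one 64x64 superblock: a
+// v64x64 node holds four v32x32 children, each holding four v16x16 children,
+// and so on down to v4x4 nodes whose children are plain Var leaves.
+// tree_to_node() exposes any level through a uniform view: a pointer to its
+// part_variances plus pointers to the four children's overall variances.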
+
+static void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) {
+ int i;
+ node->part_variances = NULL;
+ switch (bsize) {
+ case BLOCK_64X64: {
+ v64x64 *vt = (v64x64 *)data;
+ node->part_variances = &vt->part_variances;
+ for (i = 0; i < 4; i++)
+ node->split[i] = &vt->split[i].part_variances.none;
+ break;
+ }
+ case BLOCK_32X32: {
+ v32x32 *vt = (v32x32 *)data;
+ node->part_variances = &vt->part_variances;
+ for (i = 0; i < 4; i++)
+ node->split[i] = &vt->split[i].part_variances.none;
+ break;
+ }
+ case BLOCK_16X16: {
+ v16x16 *vt = (v16x16 *)data;
+ node->part_variances = &vt->part_variances;
+ for (i = 0; i < 4; i++)
+ node->split[i] = &vt->split[i].part_variances.none;
+ break;
+ }
+ case BLOCK_8X8: {
+ v8x8 *vt = (v8x8 *)data;
+ node->part_variances = &vt->part_variances;
+ for (i = 0; i < 4; i++)
+ node->split[i] = &vt->split[i].part_variances.none;
+ break;
+ }
+ default: {
+ v4x4 *vt = (v4x4 *)data;
+ assert(bsize == BLOCK_4X4);
+ node->part_variances = &vt->part_variances;
+ for (i = 0; i < 4; i++) node->split[i] = &vt->split[i];
+ break;
+ }
+ }
+}
+
+// Fill a Var node with the sum square error, sum error and log2 sample count.
+static void fill_variance(uint32_t s2, int32_t s, int c, Var *v) {
+ v->sum_square_error = s2;
+ v->sum_error = s;
+ v->log2_count = c;
+}
+
+static void get_variance(Var *v) {
+ v->variance =
+ (int)(256 * (v->sum_square_error -
+ (uint32_t)(((int64_t)v->sum_error * v->sum_error) >>
+ v->log2_count)) >>
+ v->log2_count);
+}
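+
+// get_variance() evaluates, in integer arithmetic,
+//   variance = 256 * (sum_square_error - sum_error^2 / n) / n, n = 2^log2_count,
+// i.e. 256 times the (biased) sample variance of the n accumulated samples.
+// The extra factor of 256 keeps small variances representable after the
+// integer shifts.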
+
+static void sum_2_variances(const Var *a, const Var *b, Var *r) {
+ assert(a->log2_count == b->log2_count);
+ fill_variance(a->sum_square_error + b->sum_square_error,
+ a->sum_error + b->sum_error, a->log2_count + 1, r);
+}
+
+static void fill_variance_tree(void *data, BLOCK_SIZE bsize) {
+ variance_node node;
+ memset(&node, 0, sizeof(node));
+ tree_to_node(data, bsize, &node);
+ sum_2_variances(node.split[0], node.split[1], &node.part_variances->horz[0]);
+ sum_2_variances(node.split[2], node.split[3], &node.part_variances->horz[1]);
+ sum_2_variances(node.split[0], node.split[2], &node.part_variances->vert[0]);
+ sum_2_variances(node.split[1], node.split[3], &node.part_variances->vert[1]);
+ sum_2_variances(&node.part_variances->vert[0], &node.part_variances->vert[1],
+ &node.part_variances->none);
+}
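+
+// In fill_variance_tree() the four children are indexed in raster order
+// (0 = top-left, 1 = top-right, 2 = bottom-left, 3 = bottom-right), so
+// horz[0]/horz[1] accumulate the top/bottom halves, vert[0]/vert[1] the
+// left/right halves, and "none" the whole block; each sum_2_variances() call
+// also increments log2_count so the per-sample normalization stays correct.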
+
+static int set_vt_partitioning(VP9_COMP *cpi, MACROBLOCK *const x,
+ MACROBLOCKD *const xd, void *data,
+ BLOCK_SIZE bsize, int mi_row, int mi_col,
+ int64_t threshold, BLOCK_SIZE bsize_min,
+ int force_split) {
+ VP9_COMMON *const cm = &cpi->common;
+ variance_node vt;
+ const int block_width = num_8x8_blocks_wide_lookup[bsize];
+ const int block_height = num_8x8_blocks_high_lookup[bsize];
+
+ assert(block_height == block_width);
+ tree_to_node(data, bsize, &vt);
+
+ if (force_split == 1) return 0;
+
+ // For bsize == bsize_min (16x16/8x8 for 8x8/4x4 downsampling), select this
+ // bsize if the variance is below the threshold; otherwise split is selected.
+ // No vert/horz split check is done, as there are too few samples for a
+ // reliable variance estimate.
+ if (bsize == bsize_min) {
+ // Variance already computed to set the force_split.
+ if (frame_is_intra_only(cm)) get_variance(&vt.part_variances->none);
+ if (mi_col + block_width / 2 < cm->mi_cols &&
+ mi_row + block_height / 2 < cm->mi_rows &&
+ vt.part_variances->none.variance < threshold) {
+ set_block_size(cpi, x, xd, mi_row, mi_col, bsize);
+ return 1;
+ }
+ return 0;
+ } else if (bsize > bsize_min) {
+ // Variance already computed to set the force_split.
+ if (frame_is_intra_only(cm)) get_variance(&vt.part_variances->none);
+ // For key frame: take split for bsize above 32X32 or very high variance.
+ if (frame_is_intra_only(cm) &&
+ (bsize > BLOCK_32X32 ||
+ vt.part_variances->none.variance > (threshold << 4))) {
+ return 0;
+ }
+ // If variance is low, take the bsize (no split).
+ if (mi_col + block_width / 2 < cm->mi_cols &&
+ mi_row + block_height / 2 < cm->mi_rows &&
+ vt.part_variances->none.variance < threshold) {
+ set_block_size(cpi, x, xd, mi_row, mi_col, bsize);
+ return 1;
+ }
+
+ // Check vertical split.
+ if (mi_row + block_height / 2 < cm->mi_rows) {
+ BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_VERT);
+ get_variance(&vt.part_variances->vert[0]);
+ get_variance(&vt.part_variances->vert[1]);
+ if (vt.part_variances->vert[0].variance < threshold &&
+ vt.part_variances->vert[1].variance < threshold &&
+ get_plane_block_size(subsize, &xd->plane[1]) < BLOCK_INVALID) {
+ set_block_size(cpi, x, xd, mi_row, mi_col, subsize);
+ set_block_size(cpi, x, xd, mi_row, mi_col + block_width / 2, subsize);
+ return 1;
+ }
+ }
+ // Check horizontal split.
+ if (mi_col + block_width / 2 < cm->mi_cols) {
+ BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_HORZ);
+ get_variance(&vt.part_variances->horz[0]);
+ get_variance(&vt.part_variances->horz[1]);
+ if (vt.part_variances->horz[0].variance < threshold &&
+ vt.part_variances->horz[1].variance < threshold &&
+ get_plane_block_size(subsize, &xd->plane[1]) < BLOCK_INVALID) {
+ set_block_size(cpi, x, xd, mi_row, mi_col, subsize);
+ set_block_size(cpi, x, xd, mi_row + block_height / 2, mi_col, subsize);
+ return 1;
+ }
+ }
+
+ return 0;
+ }
+ return 0;
+}
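+
+// set_vt_partitioning() returns 1 when it has assigned block sizes at this
+// level (a whole, horizontal-split or vertical-split partition whose
+// variances are all below the threshold), and 0 when the caller should
+// recurse into the four split children instead; force_split short-circuits
+// straight to 0.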
+
+static int64_t scale_part_thresh_sumdiff(int64_t threshold_base, int speed,
+ int width, int height,
+ int content_state) {
+ if (speed >= 8) {
+ if (width <= 640 && height <= 480)
+ return (5 * threshold_base) >> 2;
+ else if ((content_state == kLowSadLowSumdiff) ||
+ (content_state == kHighSadLowSumdiff) ||
+ (content_state == kLowVarHighSumdiff))
+ return (5 * threshold_base) >> 2;
+ } else if (speed == 7) {
+ if ((content_state == kLowSadLowSumdiff) ||
+ (content_state == kHighSadLowSumdiff) ||
+ (content_state == kLowVarHighSumdiff)) {
+ return (5 * threshold_base) >> 2;
+ }
+ }
+ return threshold_base;
+}
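+
+// scale_part_thresh_sumdiff() either returns the base threshold unchanged or
+// raises it to (5 * threshold_base) >> 2, i.e. a 25% increase: at speed >= 8
+// this applies to resolutions up to 640x480 or to low-SAD/low-sumdiff content,
+// and at speed 7 to low-SAD/low-sumdiff content only. A larger threshold
+// biases the variance-based partitioning toward larger blocks.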
+
+// Set the variance split thresholds for the following block sizes:
+// 0 - threshold_64x64, 1 - threshold_32x32, 2 - threshold_16x16,
+// 3 - vbp_threshold_8x8. vbp_threshold_8x8 (to split to a 4x4 partition) is
+// currently only used on key frames.
+static void set_vbp_thresholds(VP9_COMP *cpi, int64_t thresholds[], int q,
+ int content_state) {
+ VP9_COMMON *const cm = &cpi->common;
+ const int is_key_frame = frame_is_intra_only(cm);
+ const int threshold_multiplier =
+ is_key_frame ? 20 : cpi->sf.variance_part_thresh_mult;
+ int64_t threshold_base =
+ (int64_t)(threshold_multiplier * cpi->y_dequant[q][1]);
+
+ if (is_key_frame) {
+ thresholds[0] = threshold_base;
+ thresholds[1] = threshold_base >> 2;
+ thresholds[2] = threshold_base >> 2;
+ thresholds[3] = threshold_base << 2;
+ } else {
+ // Increase base variance threshold based on estimated noise level.
+ if (cpi->noise_estimate.enabled && cm->width >= 640 && cm->height >= 480) {
+ NOISE_LEVEL noise_level =
+ vp9_noise_estimate_extract_level(&cpi->noise_estimate);
+ if (noise_level == kHigh)
+ threshold_base = 3 * threshold_base;
+ else if (noise_level == kMedium)
+ threshold_base = threshold_base << 1;
+ else if (noise_level < kLow)
+ threshold_base = (7 * threshold_base) >> 3;
+ }
+#if CONFIG_VP9_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) &&
+ cpi->oxcf.speed > 5 && cpi->denoiser.denoising_level >= kDenLow)
+ threshold_base =
+ vp9_scale_part_thresh(threshold_base, cpi->denoiser.denoising_level,
+ content_state, cpi->svc.temporal_layer_id);
+ else
+ threshold_base =
+ scale_part_thresh_sumdiff(threshold_base, cpi->oxcf.speed, cm->width,
+ cm->height, content_state);
+#else
+ // Increase base variance threshold based on content_state/sum_diff level.
+ threshold_base = scale_part_thresh_sumdiff(
+ threshold_base, cpi->oxcf.speed, cm->width, cm->height, content_state);
+#endif
+ thresholds[0] = threshold_base;
+ thresholds[2] = threshold_base << cpi->oxcf.speed;
+ if (cm->width >= 1280 && cm->height >= 720 && cpi->oxcf.speed < 7)
+ thresholds[2] = thresholds[2] << 1;
+ if (cm->width <= 352 && cm->height <= 288) {
+ thresholds[0] = threshold_base >> 3;
+ thresholds[1] = threshold_base >> 1;
+ thresholds[2] = threshold_base << 3;
+ if (cpi->rc.avg_frame_qindex[INTER_FRAME] > 220)
+ thresholds[2] = thresholds[2] << 2;
+ else if (cpi->rc.avg_frame_qindex[INTER_FRAME] > 200)
+ thresholds[2] = thresholds[2] << 1;
+ } else if (cm->width < 1280 && cm->height < 720) {
+ thresholds[1] = (5 * threshold_base) >> 2;
+ } else if (cm->width < 1920 && cm->height < 1080) {
+ thresholds[1] = threshold_base << 1;
+ } else {
+ thresholds[1] = (5 * threshold_base) >> 1;
+ }
+ if (cpi->sf.disable_16x16part_nonkey) thresholds[2] = INT64_MAX;
+ }
+}
+
+void vp9_set_variance_partition_thresholds(VP9_COMP *cpi, int q,
+ int content_state) {
+ VP9_COMMON *const cm = &cpi->common;
+ SPEED_FEATURES *const sf = &cpi->sf;
+ const int is_key_frame = frame_is_intra_only(cm);
+ if (sf->partition_search_type != VAR_BASED_PARTITION &&
+ sf->partition_search_type != REFERENCE_PARTITION) {
+ return;
+ } else {
+ set_vbp_thresholds(cpi, cpi->vbp_thresholds, q, content_state);
+ // The thresholds below are not changed locally.
+ if (is_key_frame) {
+ cpi->vbp_threshold_sad = 0;
+ cpi->vbp_threshold_copy = 0;
+ cpi->vbp_bsize_min = BLOCK_8X8;
+ } else {
+ if (cm->width <= 352 && cm->height <= 288)
+ cpi->vbp_threshold_sad = 10;
+ else
+ cpi->vbp_threshold_sad = (cpi->y_dequant[q][1] << 1) > 1000
+ ? (cpi->y_dequant[q][1] << 1)
+ : 1000;
+ cpi->vbp_bsize_min = BLOCK_16X16;
+ if (cm->width <= 352 && cm->height <= 288)
+ cpi->vbp_threshold_copy = 4000;
+ else if (cm->width <= 640 && cm->height <= 360)
+ cpi->vbp_threshold_copy = 8000;
+ else
+ cpi->vbp_threshold_copy = (cpi->y_dequant[q][1] << 3) > 8000
+ ? (cpi->y_dequant[q][1] << 3)
+ : 8000;
+ if (cpi->rc.high_source_sad ||
+ (cpi->use_svc && cpi->svc.high_source_sad_superframe)) {
+ cpi->vbp_threshold_sad = 0;
+ cpi->vbp_threshold_copy = 0;
+ }
+ }
+ cpi->vbp_threshold_minmax = 15 + (q >> 3);
+ }
+}
+
+// Compute the minmax over the 8x8 subblocks.
+static int compute_minmax_8x8(const uint8_t *s, int sp, const uint8_t *d,
+ int dp, int x16_idx, int y16_idx,
+#if CONFIG_VP9_HIGHBITDEPTH
+ int highbd_flag,
+#endif
+ int pixels_wide, int pixels_high) {
+ int k;
+ int minmax_max = 0;
+ int minmax_min = 255;
+ // Loop over the 4 8x8 subblocks.
+ for (k = 0; k < 4; k++) {
+ int x8_idx = x16_idx + ((k & 1) << 3);
+ int y8_idx = y16_idx + ((k >> 1) << 3);
+ int min = 0;
+ int max = 0;
+ if (x8_idx < pixels_wide && y8_idx < pixels_high) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
+ vpx_highbd_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
+ d + y8_idx * dp + x8_idx, dp, &min, &max);
+ } else {
+ vpx_minmax_8x8(s + y8_idx * sp + x8_idx, sp, d + y8_idx * dp + x8_idx,
+ dp, &min, &max);
+ }
+#else
+ vpx_minmax_8x8(s + y8_idx * sp + x8_idx, sp, d + y8_idx * dp + x8_idx, dp,
+ &min, &max);
+#endif
+ if ((max - min) > minmax_max) minmax_max = (max - min);
+ if ((max - min) < minmax_min) minmax_min = (max - min);
+ }
+ }
+ return (minmax_max - minmax_min);
+}
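+
+// For each of the four 8x8 sub-blocks, vpx_minmax_8x8() reports the smallest
+// and largest absolute source/reference pixel difference. compute_minmax_8x8()
+// returns the gap between the widest and the narrowest of those (max - min)
+// ranges, so a large value flags 16x16 blocks whose 8x8 sub-blocks behave very
+// differently; choose_partitioning() uses it to force a split down to 8x8.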
+
+static void fill_variance_4x4avg(const uint8_t *s, int sp, const uint8_t *d,
+ int dp, int x8_idx, int y8_idx, v8x8 *vst,
+#if CONFIG_VP9_HIGHBITDEPTH
+ int highbd_flag,
+#endif
+ int pixels_wide, int pixels_high,
+ int is_key_frame) {
+ int k;
+ for (k = 0; k < 4; k++) {
+ int x4_idx = x8_idx + ((k & 1) << 2);
+ int y4_idx = y8_idx + ((k >> 1) << 2);
+ unsigned int sse = 0;
+ int sum = 0;
+ if (x4_idx < pixels_wide && y4_idx < pixels_high) {
+ int s_avg;
+ int d_avg = 128;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
+ s_avg = vpx_highbd_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+ if (!is_key_frame)
+ d_avg = vpx_highbd_avg_4x4(d + y4_idx * dp + x4_idx, dp);
+ } else {
+ s_avg = vpx_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+ if (!is_key_frame) d_avg = vpx_avg_4x4(d + y4_idx * dp + x4_idx, dp);
+ }
+#else
+ s_avg = vpx_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+ if (!is_key_frame) d_avg = vpx_avg_4x4(d + y4_idx * dp + x4_idx, dp);
+#endif
+ sum = s_avg - d_avg;
+ sse = sum * sum;
+ }
+ fill_variance(sse, sum, 0, &vst->split[k].part_variances.none);
+ }
+}
+
+static void fill_variance_8x8avg(const uint8_t *s, int sp, const uint8_t *d,
+ int dp, int x16_idx, int y16_idx, v16x16 *vst,
+#if CONFIG_VP9_HIGHBITDEPTH
+ int highbd_flag,
+#endif
+ int pixels_wide, int pixels_high,
+ int is_key_frame) {
+ int k;
+ for (k = 0; k < 4; k++) {
+ int x8_idx = x16_idx + ((k & 1) << 3);
+ int y8_idx = y16_idx + ((k >> 1) << 3);
+ unsigned int sse = 0;
+ int sum = 0;
+ if (x8_idx < pixels_wide && y8_idx < pixels_high) {
+ int s_avg;
+ int d_avg = 128;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
+ s_avg = vpx_highbd_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+ if (!is_key_frame)
+ d_avg = vpx_highbd_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+ } else {
+ s_avg = vpx_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+ if (!is_key_frame) d_avg = vpx_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+ }
+#else
+ s_avg = vpx_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+ if (!is_key_frame) d_avg = vpx_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+#endif
+ sum = s_avg - d_avg;
+ sse = sum * sum;
+ }
+ fill_variance(sse, sum, 0, &vst->split[k].part_variances.none);
+ }
+}
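+
+// Both fill_variance_{4x4,8x8}avg() helpers reduce each sub-block to a single
+// sample: the difference between the source average and the reference average
+// (128 when there is no reference, i.e. on key frames). The variance tree is
+// therefore built on 4x4- or 8x8-downsampled data rather than on
+// full-resolution pixel differences.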
+
+// Check if most of the superblock is skin content, and if so, force split to
+// 32x32, and set x->sb_is_skin for use in mode selection.
+static int skin_sb_split(VP9_COMP *cpi, const int low_res, int mi_row,
+ int mi_col, int *force_split) {
+ VP9_COMMON *const cm = &cpi->common;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cm->use_highbitdepth) return 0;
+#endif
+ // Avoid checking superblocks on/near boundary and avoid low resolutions.
+ // Note superblock may still pick 64X64 if y_sad is very small
+ // (i.e., y_sad < cpi->vbp_threshold_sad) below. For now leave this as is.
+ if (!low_res && (mi_col >= 8 && mi_col + 8 < cm->mi_cols && mi_row >= 8 &&
+ mi_row + 8 < cm->mi_rows)) {
+ int num_16x16_skin = 0;
+ int num_16x16_nonskin = 0;
+ const int block_index = mi_row * cm->mi_cols + mi_col;
+ const int bw = num_8x8_blocks_wide_lookup[BLOCK_64X64];
+ const int bh = num_8x8_blocks_high_lookup[BLOCK_64X64];
+ const int xmis = VPXMIN(cm->mi_cols - mi_col, bw);
+ const int ymis = VPXMIN(cm->mi_rows - mi_row, bh);
+ // Loop through the 16x16 sub-blocks.
+ int i, j;
+ for (i = 0; i < ymis; i += 2) {
+ for (j = 0; j < xmis; j += 2) {
+ int bl_index = block_index + i * cm->mi_cols + j;
+ int is_skin = cpi->skin_map[bl_index];
+ num_16x16_skin += is_skin;
+ num_16x16_nonskin += (1 - is_skin);
+ if (num_16x16_nonskin > 3) {
+ // Exit loop if at least 4 of the 16x16 blocks are not skin.
+ i = ymis;
+ break;
+ }
+ }
+ }
+ if (num_16x16_skin > 12) {
+ *force_split = 1;
+ return 1;
+ }
+ }
+ return 0;
+}
+
+static void set_low_temp_var_flag(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
+ v64x64 *vt, int64_t thresholds[],
+ MV_REFERENCE_FRAME ref_frame_partition,
+ int mi_col, int mi_row) {
+ int i, j;
+ VP9_COMMON *const cm = &cpi->common;
+ const int mv_thr = cm->width > 640 ? 8 : 4;
+ // Check temporal variance for bsize >= 16x16, if LAST_FRAME was selected and
+ // int_pro mv is small. If the temporal variance is small set the flag
+ // variance_low for the block. The variance threshold can be adjusted, the
+ // higher the more aggressive.
+ if (ref_frame_partition == LAST_FRAME &&
+ (cpi->sf.short_circuit_low_temp_var == 1 ||
+ (xd->mi[0]->mv[0].as_mv.col < mv_thr &&
+ xd->mi[0]->mv[0].as_mv.col > -mv_thr &&
+ xd->mi[0]->mv[0].as_mv.row < mv_thr &&
+ xd->mi[0]->mv[0].as_mv.row > -mv_thr))) {
+ if (xd->mi[0]->sb_type == BLOCK_64X64) {
+ if ((vt->part_variances).none.variance < (thresholds[0] >> 1))
+ x->variance_low[0] = 1;
+ } else if (xd->mi[0]->sb_type == BLOCK_64X32) {
+ for (i = 0; i < 2; i++) {
+ if (vt->part_variances.horz[i].variance < (thresholds[0] >> 2))
+ x->variance_low[i + 1] = 1;
+ }
+ } else if (xd->mi[0]->sb_type == BLOCK_32X64) {
+ for (i = 0; i < 2; i++) {
+ if (vt->part_variances.vert[i].variance < (thresholds[0] >> 2))
+ x->variance_low[i + 3] = 1;
+ }
+ } else {
+ for (i = 0; i < 4; i++) {
+ const int idx[4][2] = { { 0, 0 }, { 0, 4 }, { 4, 0 }, { 4, 4 } };
+ const int idx_str =
+ cm->mi_stride * (mi_row + idx[i][0]) + mi_col + idx[i][1];
+ MODE_INFO **this_mi = cm->mi_grid_visible + idx_str;
+
+ if (cm->mi_cols <= mi_col + idx[i][1] ||
+ cm->mi_rows <= mi_row + idx[i][0])
+ continue;
+
+ if ((*this_mi)->sb_type == BLOCK_32X32) {
+ int64_t threshold_32x32 = (cpi->sf.short_circuit_low_temp_var == 1 ||
+ cpi->sf.short_circuit_low_temp_var == 3)
+ ? ((5 * thresholds[1]) >> 3)
+ : (thresholds[1] >> 1);
+ if (vt->split[i].part_variances.none.variance < threshold_32x32)
+ x->variance_low[i + 5] = 1;
+ } else if (cpi->sf.short_circuit_low_temp_var >= 2) {
+ // For 32x16 and 16x32 blocks, the flag is set on each 16x16 block
+ // inside.
+ if ((*this_mi)->sb_type == BLOCK_16X16 ||
+ (*this_mi)->sb_type == BLOCK_32X16 ||
+ (*this_mi)->sb_type == BLOCK_16X32) {
+ for (j = 0; j < 4; j++) {
+ if (vt->split[i].split[j].part_variances.none.variance <
+ (thresholds[2] >> 8))
+ x->variance_low[(i << 2) + j + 9] = 1;
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+static void copy_partitioning_helper(VP9_COMP *cpi, MACROBLOCK *x,
+ MACROBLOCKD *xd, BLOCK_SIZE bsize,
+ int mi_row, int mi_col) {
+ VP9_COMMON *const cm = &cpi->common;
+ BLOCK_SIZE *prev_part = cpi->prev_partition;
+ int start_pos = mi_row * cm->mi_stride + mi_col;
+
+ const int bsl = b_width_log2_lookup[bsize];
+ const int bs = (1 << bsl) >> 2;
+ BLOCK_SIZE subsize;
+ PARTITION_TYPE partition;
+
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
+ partition = partition_lookup[bsl][prev_part[start_pos]];
+ subsize = get_subsize(bsize, partition);
+
+ if (subsize < BLOCK_8X8) {
+ set_block_size(cpi, x, xd, mi_row, mi_col, bsize);
+ } else {
+ switch (partition) {
+ case PARTITION_NONE:
+ set_block_size(cpi, x, xd, mi_row, mi_col, bsize);
+ break;
+ case PARTITION_HORZ:
+ set_block_size(cpi, x, xd, mi_row, mi_col, subsize);
+ set_block_size(cpi, x, xd, mi_row + bs, mi_col, subsize);
+ break;
+ case PARTITION_VERT:
+ set_block_size(cpi, x, xd, mi_row, mi_col, subsize);
+ set_block_size(cpi, x, xd, mi_row, mi_col + bs, subsize);
+ break;
+ default:
+ assert(partition == PARTITION_SPLIT);
+ copy_partitioning_helper(cpi, x, xd, subsize, mi_row, mi_col);
+ copy_partitioning_helper(cpi, x, xd, subsize, mi_row + bs, mi_col);
+ copy_partitioning_helper(cpi, x, xd, subsize, mi_row, mi_col + bs);
+ copy_partitioning_helper(cpi, x, xd, subsize, mi_row + bs, mi_col + bs);
+ break;
+ }
+ }
+}
+
+static int copy_partitioning(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
+ int mi_row, int mi_col, int segment_id,
+ int sb_offset) {
+ int svc_copy_allowed = 1;
+ int frames_since_key_thresh = 1;
+ if (cpi->use_svc) {
+ // For SVC, don't allow copy if base spatial layer is key frame, or if
+ // frame is not a temporal enhancement layer frame.
+ int layer = LAYER_IDS_TO_IDX(0, cpi->svc.temporal_layer_id,
+ cpi->svc.number_temporal_layers);
+ const LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer];
+ if (lc->is_key_frame || !cpi->svc.non_reference_frame) svc_copy_allowed = 0;
+ frames_since_key_thresh = cpi->svc.number_spatial_layers << 1;
+ }
+ if (cpi->rc.frames_since_key > frames_since_key_thresh && svc_copy_allowed &&
+ !cpi->resize_pending && segment_id == CR_SEGMENT_ID_BASE &&
+ cpi->prev_segment_id[sb_offset] == CR_SEGMENT_ID_BASE &&
+ cpi->copied_frame_cnt[sb_offset] < cpi->max_copied_frame) {
+ if (cpi->prev_partition != NULL) {
+ copy_partitioning_helper(cpi, x, xd, BLOCK_64X64, mi_row, mi_col);
+ cpi->copied_frame_cnt[sb_offset] += 1;
+ memcpy(x->variance_low, &(cpi->prev_variance_low[sb_offset * 25]),
+ sizeof(x->variance_low));
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+static int scale_partitioning_svc(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
+ BLOCK_SIZE bsize, int mi_row, int mi_col,
+ int mi_row_high, int mi_col_high) {
+ VP9_COMMON *const cm = &cpi->common;
+ SVC *const svc = &cpi->svc;
+ BLOCK_SIZE *prev_part = svc->prev_partition_svc;
+ // Variables with _high are for higher resolution.
+ int bsize_high = 0;
+ int subsize_high = 0;
+ const int bsl_high = b_width_log2_lookup[bsize];
+ const int bs_high = (1 << bsl_high) >> 2;
+ const int has_rows = (mi_row_high + bs_high) < cm->mi_rows;
+ const int has_cols = (mi_col_high + bs_high) < cm->mi_cols;
+
+ const int row_boundary_block_scale_factor[BLOCK_SIZES] = { 13, 13, 13, 1, 0,
+ 1, 1, 0, 1, 1,
+ 0, 1, 0 };
+ const int col_boundary_block_scale_factor[BLOCK_SIZES] = { 13, 13, 13, 2, 2,
+ 0, 2, 2, 0, 2,
+ 2, 0, 0 };
+ int start_pos;
+ BLOCK_SIZE bsize_low;
+ PARTITION_TYPE partition_high;
+
+ if (mi_row_high >= cm->mi_rows || mi_col_high >= cm->mi_cols) return 0;
+ if (mi_row >= svc->mi_rows[svc->spatial_layer_id - 1] ||
+ mi_col >= svc->mi_cols[svc->spatial_layer_id - 1])
+ return 0;
+
+ // Find corresponding (mi_col/mi_row) block down-scaled by 2x2.
+ start_pos = mi_row * (svc->mi_stride[svc->spatial_layer_id - 1]) + mi_col;
+ bsize_low = prev_part[start_pos];
+ // The block size is too big for boundaries. Do variance based partitioning.
+ if ((!has_rows || !has_cols) && bsize_low > BLOCK_16X16) return 1;
+
+ // For reference frames: return 1 (do variance-based partitioning) if the
+ // superblock is not low source sad and the lower-resolution bsize is below
+ // 32x32.
+ if (!cpi->svc.non_reference_frame && !x->skip_low_source_sad &&
+ bsize_low < BLOCK_32X32)
+ return 1;
+
+ // Scale up the block size by 2x2. Force 64x64 for sizes of 32x32 and above.
+ if (bsize_low < BLOCK_32X32) {
+ bsize_high = bsize_low + 3;
+ } else if (bsize_low >= BLOCK_32X32) {
+ bsize_high = BLOCK_64X64;
+ }
+ // Scale up blocks on boundary.
+ if (!has_cols && has_rows) {
+ bsize_high = bsize_low + row_boundary_block_scale_factor[bsize_low];
+ } else if (has_cols && !has_rows) {
+ bsize_high = bsize_low + col_boundary_block_scale_factor[bsize_low];
+ } else if (!has_cols && !has_rows) {
+ bsize_high = bsize_low;
+ }
+
+ partition_high = partition_lookup[bsl_high][bsize_high];
+ subsize_high = get_subsize(bsize, partition_high);
+
+ if (subsize_high < BLOCK_8X8) {
+ set_block_size(cpi, x, xd, mi_row_high, mi_col_high, bsize_high);
+ } else {
+ const int bsl = b_width_log2_lookup[bsize];
+ const int bs = (1 << bsl) >> 2;
+ switch (partition_high) {
+ case PARTITION_NONE:
+ set_block_size(cpi, x, xd, mi_row_high, mi_col_high, bsize_high);
+ break;
+ case PARTITION_HORZ:
+ set_block_size(cpi, x, xd, mi_row_high, mi_col_high, subsize_high);
+ if (subsize_high < BLOCK_64X64)
+ set_block_size(cpi, x, xd, mi_row_high + bs_high, mi_col_high,
+ subsize_high);
+ break;
+ case PARTITION_VERT:
+ set_block_size(cpi, x, xd, mi_row_high, mi_col_high, subsize_high);
+ if (subsize_high < BLOCK_64X64)
+ set_block_size(cpi, x, xd, mi_row_high, mi_col_high + bs_high,
+ subsize_high);
+ break;
+ default:
+ assert(partition_high == PARTITION_SPLIT);
+ if (scale_partitioning_svc(cpi, x, xd, subsize_high, mi_row, mi_col,
+ mi_row_high, mi_col_high))
+ return 1;
+ if (scale_partitioning_svc(cpi, x, xd, subsize_high, mi_row + (bs >> 1),
+ mi_col, mi_row_high + bs_high, mi_col_high))
+ return 1;
+ if (scale_partitioning_svc(cpi, x, xd, subsize_high, mi_row,
+ mi_col + (bs >> 1), mi_row_high,
+ mi_col_high + bs_high))
+ return 1;
+ if (scale_partitioning_svc(cpi, x, xd, subsize_high, mi_row + (bs >> 1),
+ mi_col + (bs >> 1), mi_row_high + bs_high,
+ mi_col_high + bs_high))
+ return 1;
+ break;
+ }
+ }
+
+ return 0;
+}
+
+static void update_partition_svc(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row,
+ int mi_col) {
+ VP9_COMMON *const cm = &cpi->common;
+ BLOCK_SIZE *prev_part = cpi->svc.prev_partition_svc;
+ int start_pos = mi_row * cm->mi_stride + mi_col;
+ const int bsl = b_width_log2_lookup[bsize];
+ const int bs = (1 << bsl) >> 2;
+ BLOCK_SIZE subsize;
+ PARTITION_TYPE partition;
+ const MODE_INFO *mi = NULL;
+ int xx, yy;
+
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
+ mi = cm->mi_grid_visible[start_pos];
+ partition = partition_lookup[bsl][mi->sb_type];
+ subsize = get_subsize(bsize, partition);
+ if (subsize < BLOCK_8X8) {
+ prev_part[start_pos] = bsize;
+ } else {
+ switch (partition) {
+ case PARTITION_NONE:
+ prev_part[start_pos] = bsize;
+ if (bsize == BLOCK_64X64) {
+ for (xx = 0; xx < 8; xx += 4)
+ for (yy = 0; yy < 8; yy += 4) {
+ if ((mi_row + xx < cm->mi_rows) && (mi_col + yy < cm->mi_cols))
+ prev_part[start_pos + xx * cm->mi_stride + yy] = bsize;
+ }
+ }
+ break;
+ case PARTITION_HORZ:
+ prev_part[start_pos] = subsize;
+ if (mi_row + bs < cm->mi_rows)
+ prev_part[start_pos + bs * cm->mi_stride] = subsize;
+ break;
+ case PARTITION_VERT:
+ prev_part[start_pos] = subsize;
+ if (mi_col + bs < cm->mi_cols) prev_part[start_pos + bs] = subsize;
+ break;
+ default:
+ assert(partition == PARTITION_SPLIT);
+ update_partition_svc(cpi, subsize, mi_row, mi_col);
+ update_partition_svc(cpi, subsize, mi_row + bs, mi_col);
+ update_partition_svc(cpi, subsize, mi_row, mi_col + bs);
+ update_partition_svc(cpi, subsize, mi_row + bs, mi_col + bs);
+ break;
+ }
+ }
+}
+
+static void update_prev_partition_helper(VP9_COMP *cpi, BLOCK_SIZE bsize,
+ int mi_row, int mi_col) {
+ VP9_COMMON *const cm = &cpi->common;
+ BLOCK_SIZE *prev_part = cpi->prev_partition;
+ int start_pos = mi_row * cm->mi_stride + mi_col;
+ const int bsl = b_width_log2_lookup[bsize];
+ const int bs = (1 << bsl) >> 2;
+ BLOCK_SIZE subsize;
+ PARTITION_TYPE partition;
+ const MODE_INFO *mi = NULL;
+
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
+ mi = cm->mi_grid_visible[start_pos];
+ partition = partition_lookup[bsl][mi->sb_type];
+ subsize = get_subsize(bsize, partition);
+ if (subsize < BLOCK_8X8) {
+ prev_part[start_pos] = bsize;
+ } else {
+ switch (partition) {
+ case PARTITION_NONE: prev_part[start_pos] = bsize; break;
+ case PARTITION_HORZ:
+ prev_part[start_pos] = subsize;
+ if (mi_row + bs < cm->mi_rows)
+ prev_part[start_pos + bs * cm->mi_stride] = subsize;
+ break;
+ case PARTITION_VERT:
+ prev_part[start_pos] = subsize;
+ if (mi_col + bs < cm->mi_cols) prev_part[start_pos + bs] = subsize;
+ break;
+ default:
+ assert(partition == PARTITION_SPLIT);
+ update_prev_partition_helper(cpi, subsize, mi_row, mi_col);
+ update_prev_partition_helper(cpi, subsize, mi_row + bs, mi_col);
+ update_prev_partition_helper(cpi, subsize, mi_row, mi_col + bs);
+ update_prev_partition_helper(cpi, subsize, mi_row + bs, mi_col + bs);
+ break;
+ }
+ }
+}
+
+static void update_prev_partition(VP9_COMP *cpi, MACROBLOCK *x, int segment_id,
+ int mi_row, int mi_col, int sb_offset) {
+ update_prev_partition_helper(cpi, BLOCK_64X64, mi_row, mi_col);
+ cpi->prev_segment_id[sb_offset] = segment_id;
+ memcpy(&(cpi->prev_variance_low[sb_offset * 25]), x->variance_low,
+ sizeof(x->variance_low));
+ // Reset the counter for copy partitioning
+ cpi->copied_frame_cnt[sb_offset] = 0;
+}
+
+static void chroma_check(VP9_COMP *cpi, MACROBLOCK *x, int bsize,
+ unsigned int y_sad, int is_key_frame,
+ int scene_change_detected) {
+ int i;
+ MACROBLOCKD *xd = &x->e_mbd;
+ int shift = 2;
+
+ if (is_key_frame) return;
+
+ // For speed > 8, avoid the chroma check if y_sad is above threshold.
+ if (cpi->oxcf.speed > 8) {
+ if (y_sad > cpi->vbp_thresholds[1] &&
+ (!cpi->noise_estimate.enabled ||
+ vp9_noise_estimate_extract_level(&cpi->noise_estimate) < kMedium))
+ return;
+ }
+
+ if (cpi->oxcf.content == VP9E_CONTENT_SCREEN && scene_change_detected)
+ shift = 5;
+
+ for (i = 1; i <= 2; ++i) {
+ unsigned int uv_sad = UINT_MAX;
+ struct macroblock_plane *p = &x->plane[i];
+ struct macroblockd_plane *pd = &xd->plane[i];
+ const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
+
+ if (bs != BLOCK_INVALID)
+ uv_sad = cpi->fn_ptr[bs].sdf(p->src.buf, p->src.stride, pd->dst.buf,
+ pd->dst.stride);
+
+ // TODO(marpan): Investigate if we should lower this threshold if
+ // superblock is detected as skin.
+ x->color_sensitivity[i - 1] = uv_sad > (y_sad >> shift);
+ }
+}
+
+static uint64_t avg_source_sad(VP9_COMP *cpi, MACROBLOCK *x, int shift,
+ int sb_offset) {
+ unsigned int tmp_sse;
+ uint64_t tmp_sad;
+ unsigned int tmp_variance;
+ const BLOCK_SIZE bsize = BLOCK_64X64;
+ uint8_t *src_y = cpi->Source->y_buffer;
+ int src_ystride = cpi->Source->y_stride;
+ uint8_t *last_src_y = cpi->Last_Source->y_buffer;
+ int last_src_ystride = cpi->Last_Source->y_stride;
+ uint64_t avg_source_sad_threshold = 10000;
+ uint64_t avg_source_sad_threshold2 = 12000;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cpi->common.use_highbitdepth) return 0;
+#endif
+ src_y += shift;
+ last_src_y += shift;
+ tmp_sad =
+ cpi->fn_ptr[bsize].sdf(src_y, src_ystride, last_src_y, last_src_ystride);
+ tmp_variance = vpx_variance64x64(src_y, src_ystride, last_src_y,
+ last_src_ystride, &tmp_sse);
+ // Note: tmp_sse - tmp_variance = ((sum * sum) >> 12)
+ if (tmp_sad < avg_source_sad_threshold)
+ x->content_state_sb = ((tmp_sse - tmp_variance) < 25) ? kLowSadLowSumdiff
+ : kLowSadHighSumdiff;
+ else
+ x->content_state_sb = ((tmp_sse - tmp_variance) < 25) ? kHighSadLowSumdiff
+ : kHighSadHighSumdiff;
+
+ // Detect large lighting change.
+ if (cpi->oxcf.content != VP9E_CONTENT_SCREEN &&
+ cpi->oxcf.rc_mode == VPX_CBR && tmp_variance < (tmp_sse >> 3) &&
+ (tmp_sse - tmp_variance) > 10000)
+ x->content_state_sb = kLowVarHighSumdiff;
+ else if (tmp_sad > (avg_source_sad_threshold << 1))
+ x->content_state_sb = kVeryHighSad;
+
+ if (cpi->content_state_sb_fd != NULL) {
+ if (tmp_sad < avg_source_sad_threshold2) {
+ // Cap the increment to 255.
+ if (cpi->content_state_sb_fd[sb_offset] < 255)
+ cpi->content_state_sb_fd[sb_offset]++;
+ } else {
+ cpi->content_state_sb_fd[sb_offset] = 0;
+ }
+ }
+ if (tmp_sad == 0) x->zero_temp_sad_source = 1;
+ return tmp_sad;
+}
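+
+// Since vpx_variance64x64() returns tmp_sse - ((sum * sum) >> 12) over the
+// 4096 pixel differences, (tmp_sse - tmp_variance) is proportional to the
+// squared mean frame difference: a small value means little overall brightness
+// change ("low sumdiff"), while a large value combined with low variance is
+// used above to flag a large lighting change.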
+
+// This function chooses the partitioning based on the variance between the
+// source and the reconstructed last frame, where the variance is computed on
+// down-sampled inputs.
+static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
+ MACROBLOCK *x, int mi_row, int mi_col) {
+ VP9_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ int i, j, k, m;
+ v64x64 vt;
+ v16x16 *vt2 = NULL;
+ int force_split[21];
+ int avg_32x32;
+ int max_var_32x32 = 0;
+ int min_var_32x32 = INT_MAX;
+ int var_32x32;
+ int avg_16x16[4];
+ int maxvar_16x16[4];
+ int minvar_16x16[4];
+ int64_t threshold_4x4avg;
+ NOISE_LEVEL noise_level = kLow;
+ int content_state = 0;
+ uint8_t *s;
+ const uint8_t *d;
+ int sp;
+ int dp;
+ int compute_minmax_variance = 1;
+ unsigned int y_sad = UINT_MAX;
+ BLOCK_SIZE bsize = BLOCK_64X64;
+ // Ref frame used in partitioning.
+ MV_REFERENCE_FRAME ref_frame_partition = LAST_FRAME;
+ int pixels_wide = 64, pixels_high = 64;
+ int64_t thresholds[4] = { cpi->vbp_thresholds[0], cpi->vbp_thresholds[1],
+ cpi->vbp_thresholds[2], cpi->vbp_thresholds[3] };
+ int scene_change_detected =
+ cpi->rc.high_source_sad ||
+ (cpi->use_svc && cpi->svc.high_source_sad_superframe);
+ int force_64_split = scene_change_detected ||
+ (cpi->oxcf.content == VP9E_CONTENT_SCREEN &&
+ cpi->compute_source_sad_onepass &&
+ cpi->sf.use_source_sad && !x->zero_temp_sad_source);
+
+ // For the variance computation under SVC mode, we treat the frame as key if
+ // the reference (base layer frame) is key frame (i.e., is_key_frame == 1).
+ int is_key_frame =
+ (frame_is_intra_only(cm) ||
+ (is_one_pass_svc(cpi) &&
+ cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame));
+ // Always use 4x4 partition for key frame.
+ const int use_4x4_partition = frame_is_intra_only(cm);
+ const int low_res = (cm->width <= 352 && cm->height <= 288);
+ int variance4x4downsample[16];
+ int segment_id;
+ int sb_offset = (cm->mi_stride >> 3) * (mi_row >> 3) + (mi_col >> 3);
+
+ // For SVC: check if LAST frame is NULL or if the resolution of LAST is
+ // different than the current frame resolution, and if so, treat this frame
+ // as a key frame, for the purpose of the superblock partitioning.
+ // LAST == NULL can happen in some cases where enhancement spatial layers are
+ // enabled dynamically in the stream and the only reference is the spatial
+ // reference (GOLDEN).
+ if (cpi->use_svc) {
+ const YV12_BUFFER_CONFIG *const ref = get_ref_frame_buffer(cpi, LAST_FRAME);
+ if (ref == NULL || ref->y_crop_height != cm->height ||
+ ref->y_crop_width != cm->width)
+ is_key_frame = 1;
+ }
+
+ set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64);
+ set_segment_index(cpi, x, mi_row, mi_col, BLOCK_64X64, 0);
+ segment_id = xd->mi[0]->segment_id;
+
+ if (cpi->oxcf.speed >= 8 || (cpi->use_svc && cpi->svc.non_reference_frame))
+ compute_minmax_variance = 0;
+
+ memset(x->variance_low, 0, sizeof(x->variance_low));
+
+ if (cpi->sf.use_source_sad && !is_key_frame) {
+ int sb_offset2 = ((cm->mi_cols + 7) >> 3) * (mi_row >> 3) + (mi_col >> 3);
+ content_state = x->content_state_sb;
+ x->skip_low_source_sad = (content_state == kLowSadLowSumdiff ||
+ content_state == kLowSadHighSumdiff)
+ ? 1
+ : 0;
+ x->lowvar_highsumdiff = (content_state == kLowVarHighSumdiff) ? 1 : 0;
+ if (cpi->content_state_sb_fd != NULL)
+ x->last_sb_high_content = cpi->content_state_sb_fd[sb_offset2];
+
+ // For SVC on top spatial layer: use/scale the partition from
+ // the lower spatial resolution if svc_use_lowres_part is enabled.
+ if (cpi->sf.svc_use_lowres_part &&
+ cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1 &&
+ cpi->svc.prev_partition_svc != NULL && content_state != kVeryHighSad) {
+ if (!scale_partitioning_svc(cpi, x, xd, BLOCK_64X64, mi_row >> 1,
+ mi_col >> 1, mi_row, mi_col)) {
+ if (cpi->sf.copy_partition_flag) {
+ update_prev_partition(cpi, x, segment_id, mi_row, mi_col, sb_offset);
+ }
+ return 0;
+ }
+ }
+ // If source_sad is low copy the partition without computing the y_sad.
+ if (x->skip_low_source_sad && cpi->sf.copy_partition_flag &&
+ !force_64_split &&
+ copy_partitioning(cpi, x, xd, mi_row, mi_col, segment_id, sb_offset)) {
+ x->sb_use_mv_part = 1;
+ if (cpi->sf.svc_use_lowres_part &&
+ cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 2)
+ update_partition_svc(cpi, BLOCK_64X64, mi_row, mi_col);
+ return 0;
+ }
+ }
+
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled &&
+ cyclic_refresh_segment_id_boosted(segment_id)) {
+ int q = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex);
+ set_vbp_thresholds(cpi, thresholds, q, content_state);
+ } else {
+ set_vbp_thresholds(cpi, thresholds, cm->base_qindex, content_state);
+ }
+ // Decrease 32x32 split threshold for screen on base layer, for scene
+ // change/high motion frames.
+ if (cpi->oxcf.content == VP9E_CONTENT_SCREEN &&
+ cpi->svc.spatial_layer_id == 0 && force_64_split)
+ thresholds[1] = 3 * thresholds[1] >> 2;
+
+ // For non-key frames, disable the 4x4 averaging (used only at low
+ // resolutions) when speed >= 8.
+ threshold_4x4avg = (cpi->oxcf.speed < 8) ? thresholds[1] << 1 : INT64_MAX;
+
+ if (xd->mb_to_right_edge < 0) pixels_wide += (xd->mb_to_right_edge >> 3);
+ if (xd->mb_to_bottom_edge < 0) pixels_high += (xd->mb_to_bottom_edge >> 3);
+
+ s = x->plane[0].src.buf;
+ sp = x->plane[0].src.stride;
+
+ // Index for force_split: 0 for 64x64, 1-4 for 32x32 blocks,
+ // 5-20 for the 16x16 blocks.
+ force_split[0] = force_64_split;
+
+ if (!is_key_frame) {
+ // In the case of spatial/temporal scalable coding, the assumption here is
+ // that the temporal reference frame will always be of type LAST_FRAME.
+ // TODO(marpan): If that assumption is broken, we need to revisit this code.
+ MODE_INFO *mi = xd->mi[0];
+ YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
+
+ const YV12_BUFFER_CONFIG *yv12_g = NULL;
+ unsigned int y_sad_g, y_sad_thr, y_sad_last;
+ bsize = BLOCK_32X32 + (mi_col + 4 < cm->mi_cols) * 2 +
+ (mi_row + 4 < cm->mi_rows);
+
+ assert(yv12 != NULL);
+
+ if (!(is_one_pass_svc(cpi) && cpi->svc.spatial_layer_id) ||
+ cpi->svc.use_gf_temporal_ref_current_layer) {
+ // For now, GOLDEN will not be used for non-zero spatial layers, since
+ // it may not be a temporal reference.
+ yv12_g = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+ }
+
+ // Only compute y_sad_g (sad for golden reference) for speed < 8.
+ if (cpi->oxcf.speed < 8 && yv12_g && yv12_g != yv12 &&
+ (cpi->ref_frame_flags & VP9_GOLD_FLAG)) {
+ vp9_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
+ &cm->frame_refs[GOLDEN_FRAME - 1].sf);
+ y_sad_g = cpi->fn_ptr[bsize].sdf(
+ x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].pre[0].buf,
+ xd->plane[0].pre[0].stride);
+ } else {
+ y_sad_g = UINT_MAX;
+ }
+
+ if (cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR &&
+ cpi->rc.is_src_frame_alt_ref) {
+ yv12 = get_ref_frame_buffer(cpi, ALTREF_FRAME);
+ vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col,
+ &cm->frame_refs[ALTREF_FRAME - 1].sf);
+ mi->ref_frame[0] = ALTREF_FRAME;
+ y_sad_g = UINT_MAX;
+ } else {
+ vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col,
+ &cm->frame_refs[LAST_FRAME - 1].sf);
+ mi->ref_frame[0] = LAST_FRAME;
+ }
+ mi->ref_frame[1] = NONE;
+ mi->sb_type = BLOCK_64X64;
+ mi->mv[0].as_int = 0;
+ mi->interp_filter = BILINEAR;
+
+ if (cpi->oxcf.speed >= 8 && !low_res &&
+ x->content_state_sb != kVeryHighSad) {
+ y_sad = cpi->fn_ptr[bsize].sdf(
+ x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].pre[0].buf,
+ xd->plane[0].pre[0].stride);
+ } else {
+ const MV dummy_mv = { 0, 0 };
+ y_sad = vp9_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col,
+ &dummy_mv);
+ x->sb_use_mv_part = 1;
+ x->sb_mvcol_part = mi->mv[0].as_mv.col;
+ x->sb_mvrow_part = mi->mv[0].as_mv.row;
+ if (cpi->oxcf.content == VP9E_CONTENT_SCREEN &&
+ cpi->svc.spatial_layer_id == cpi->svc.first_spatial_layer_to_encode &&
+ cpi->svc.high_num_blocks_with_motion && !x->zero_temp_sad_source &&
+ cm->width > 640 && cm->height > 480) {
+ // Disable split below 16x16 block size when scroll motion (horz or
+ // vert) is detected.
+ // TODO(marpan/jianj): Improve this condition: issue is that search
+ // range is hard-coded/limited in vp9_int_pro_motion_estimation() so
+ // scroll motion may not be detected here.
+ if (((abs(x->sb_mvrow_part) >= 48 && abs(x->sb_mvcol_part) <= 8) ||
+ (abs(x->sb_mvcol_part) >= 48 && abs(x->sb_mvrow_part) <= 8)) &&
+ y_sad < 100000) {
+ compute_minmax_variance = 0;
+ thresholds[2] = INT64_MAX;
+ }
+ }
+ }
+
+ y_sad_last = y_sad;
+ // Pick ref frame for partitioning, bias last frame when y_sad_g and y_sad
+ // are close if short_circuit_low_temp_var is on.
+ y_sad_thr = cpi->sf.short_circuit_low_temp_var ? (y_sad * 7) >> 3 : y_sad;
+ if (y_sad_g < y_sad_thr) {
+ vp9_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
+ &cm->frame_refs[GOLDEN_FRAME - 1].sf);
+ mi->ref_frame[0] = GOLDEN_FRAME;
+ mi->mv[0].as_int = 0;
+ y_sad = y_sad_g;
+ ref_frame_partition = GOLDEN_FRAME;
+ } else {
+ x->pred_mv[LAST_FRAME] = mi->mv[0].as_mv;
+ ref_frame_partition = LAST_FRAME;
+ }
+
+ set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
+ vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_64X64);
+
+ if (cpi->use_skin_detection)
+ x->sb_is_skin = skin_sb_split(cpi, low_res, mi_row, mi_col, force_split);
+
+ d = xd->plane[0].dst.buf;
+ dp = xd->plane[0].dst.stride;
+
+ // If the y_sad is very small, take 64x64 as partition and exit.
+ // Don't check on boosted segment for now, as 64x64 is suppressed there.
+ if (segment_id == CR_SEGMENT_ID_BASE && y_sad < cpi->vbp_threshold_sad) {
+ const int block_width = num_8x8_blocks_wide_lookup[BLOCK_64X64];
+ const int block_height = num_8x8_blocks_high_lookup[BLOCK_64X64];
+ if (mi_col + block_width / 2 < cm->mi_cols &&
+ mi_row + block_height / 2 < cm->mi_rows) {
+ set_block_size(cpi, x, xd, mi_row, mi_col, BLOCK_64X64);
+ x->variance_low[0] = 1;
+ chroma_check(cpi, x, bsize, y_sad, is_key_frame, scene_change_detected);
+ if (cpi->sf.svc_use_lowres_part &&
+ cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 2)
+ update_partition_svc(cpi, BLOCK_64X64, mi_row, mi_col);
+ if (cpi->sf.copy_partition_flag) {
+ update_prev_partition(cpi, x, segment_id, mi_row, mi_col, sb_offset);
+ }
+ return 0;
+ }
+ }
+
+ // If the y_sad is small enough, copy the partition of the superblock in the
+ // last frame to current frame only if the last frame is not a keyframe.
+ // Stop the copy every cpi->max_copied_frame to refresh the partition.
+ // TODO(jianj) : tune the threshold.
+ if (cpi->sf.copy_partition_flag && y_sad_last < cpi->vbp_threshold_copy &&
+ copy_partitioning(cpi, x, xd, mi_row, mi_col, segment_id, sb_offset)) {
+ chroma_check(cpi, x, bsize, y_sad, is_key_frame, scene_change_detected);
+ if (cpi->sf.svc_use_lowres_part &&
+ cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 2)
+ update_partition_svc(cpi, BLOCK_64X64, mi_row, mi_col);
+ return 0;
+ }
+ } else {
+ d = VP9_VAR_OFFS;
+ dp = 0;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ switch (xd->bd) {
+ case 10: d = CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_10); break;
+ case 12: d = CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_12); break;
+ case 8:
+ default: d = CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_8); break;
+ }
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ }
+
+ if (low_res && threshold_4x4avg < INT64_MAX)
+ CHECK_MEM_ERROR(&cm->error, vt2, vpx_calloc(16, sizeof(*vt2)));
+ // Fill in the entire tree of 8x8 (or 4x4 under some conditions) variances
+ // for splits.
+ for (i = 0; i < 4; i++) {
+ const int x32_idx = ((i & 1) << 5);
+ const int y32_idx = ((i >> 1) << 5);
+ const int i2 = i << 2;
+ force_split[i + 1] = 0;
+ avg_16x16[i] = 0;
+ maxvar_16x16[i] = 0;
+ minvar_16x16[i] = INT_MAX;
+ for (j = 0; j < 4; j++) {
+ const int x16_idx = x32_idx + ((j & 1) << 4);
+ const int y16_idx = y32_idx + ((j >> 1) << 4);
+ const int split_index = 5 + i2 + j;
+ v16x16 *vst = &vt.split[i].split[j];
+ force_split[split_index] = 0;
+ variance4x4downsample[i2 + j] = 0;
+ if (!is_key_frame) {
+ fill_variance_8x8avg(s, sp, d, dp, x16_idx, y16_idx, vst,
+#if CONFIG_VP9_HIGHBITDEPTH
+ xd->cur_buf->flags,
+#endif
+ pixels_wide, pixels_high, is_key_frame);
+ fill_variance_tree(&vt.split[i].split[j], BLOCK_16X16);
+ get_variance(&vt.split[i].split[j].part_variances.none);
+ avg_16x16[i] += vt.split[i].split[j].part_variances.none.variance;
+ if (vt.split[i].split[j].part_variances.none.variance < minvar_16x16[i])
+ minvar_16x16[i] = vt.split[i].split[j].part_variances.none.variance;
+ if (vt.split[i].split[j].part_variances.none.variance > maxvar_16x16[i])
+ maxvar_16x16[i] = vt.split[i].split[j].part_variances.none.variance;
+ if (vt.split[i].split[j].part_variances.none.variance > thresholds[2]) {
+ // 16X16 variance is above threshold for split, so force split to 8x8
+ // for this 16x16 block (this also forces splits for upper levels).
+ force_split[split_index] = 1;
+ force_split[i + 1] = 1;
+ force_split[0] = 1;
+ } else if (compute_minmax_variance &&
+ vt.split[i].split[j].part_variances.none.variance >
+ thresholds[1] &&
+ !cyclic_refresh_segment_id_boosted(segment_id)) {
+ // We have some nominal amount of 16x16 variance (based on average),
+ // compute the minmax over the 8x8 sub-blocks, and if above threshold,
+ // force split to 8x8 block for this 16x16 block.
+ int minmax = compute_minmax_8x8(s, sp, d, dp, x16_idx, y16_idx,
+#if CONFIG_VP9_HIGHBITDEPTH
+ xd->cur_buf->flags,
+#endif
+ pixels_wide, pixels_high);
+ int thresh_minmax = (int)cpi->vbp_threshold_minmax;
+ if (x->content_state_sb == kVeryHighSad)
+ thresh_minmax = thresh_minmax << 1;
+ if (minmax > thresh_minmax) {
+ force_split[split_index] = 1;
+ force_split[i + 1] = 1;
+ force_split[0] = 1;
+ }
+ }
+ }
+ if (is_key_frame ||
+ (low_res && vt.split[i].split[j].part_variances.none.variance >
+ threshold_4x4avg)) {
+ force_split[split_index] = 0;
+ // Go down to 4x4 down-sampling for variance.
+ variance4x4downsample[i2 + j] = 1;
+ for (k = 0; k < 4; k++) {
+ int x8_idx = x16_idx + ((k & 1) << 3);
+ int y8_idx = y16_idx + ((k >> 1) << 3);
+ v8x8 *vst2 = is_key_frame ? &vst->split[k] : &vt2[i2 + j].split[k];
+ fill_variance_4x4avg(s, sp, d, dp, x8_idx, y8_idx, vst2,
+#if CONFIG_VP9_HIGHBITDEPTH
+ xd->cur_buf->flags,
+#endif
+ pixels_wide, pixels_high, is_key_frame);
+ }
+ }
+ }
+ }
+ if (cpi->noise_estimate.enabled)
+ noise_level = vp9_noise_estimate_extract_level(&cpi->noise_estimate);
+ // Fill the rest of the variance tree by summing split partition values.
+ avg_32x32 = 0;
+ for (i = 0; i < 4; i++) {
+ const int i2 = i << 2;
+ for (j = 0; j < 4; j++) {
+ if (variance4x4downsample[i2 + j] == 1) {
+ v16x16 *vtemp = (!is_key_frame) ? &vt2[i2 + j] : &vt.split[i].split[j];
+ for (m = 0; m < 4; m++) fill_variance_tree(&vtemp->split[m], BLOCK_8X8);
+ fill_variance_tree(vtemp, BLOCK_16X16);
+ // If variance of this 16x16 block is above the threshold, force block
+ // to split. This also forces a split on the upper levels.
+ get_variance(&vtemp->part_variances.none);
+ if (vtemp->part_variances.none.variance > thresholds[2]) {
+ force_split[5 + i2 + j] = 1;
+ force_split[i + 1] = 1;
+ force_split[0] = 1;
+ }
+ }
+ }
+ fill_variance_tree(&vt.split[i], BLOCK_32X32);
+ // If variance of this 32x32 block is above the threshold, or if it's above
+ // (some threshold of) the average variance over the sub-16x16 blocks, then
+ // force this block to split. This also forces a split on the upper
+ // (64x64) level.
+ if (!force_split[i + 1]) {
+ get_variance(&vt.split[i].part_variances.none);
+ var_32x32 = vt.split[i].part_variances.none.variance;
+ max_var_32x32 = VPXMAX(var_32x32, max_var_32x32);
+ min_var_32x32 = VPXMIN(var_32x32, min_var_32x32);
+ if (vt.split[i].part_variances.none.variance > thresholds[1] ||
+ (!is_key_frame &&
+ vt.split[i].part_variances.none.variance > (thresholds[1] >> 1) &&
+ vt.split[i].part_variances.none.variance > (avg_16x16[i] >> 1))) {
+ force_split[i + 1] = 1;
+ force_split[0] = 1;
+ } else if (!is_key_frame && noise_level < kLow && cm->height <= 360 &&
+ (maxvar_16x16[i] - minvar_16x16[i]) > (thresholds[1] >> 1) &&
+ maxvar_16x16[i] > thresholds[1]) {
+ force_split[i + 1] = 1;
+ force_split[0] = 1;
+ }
+ avg_32x32 += var_32x32;
+ }
+ }
+ if (!force_split[0]) {
+ fill_variance_tree(&vt, BLOCK_64X64);
+ get_variance(&vt.part_variances.none);
+ // If variance of this 64x64 block is above (some threshold of) the average
+ // variance over the sub-32x32 blocks, then force this block to split.
+ // Only checking this for noise level >= medium for now.
+ if (!is_key_frame && noise_level >= kMedium &&
+ vt.part_variances.none.variance > (9 * avg_32x32) >> 5)
+ force_split[0] = 1;
+ // Else if the maximum 32x32 variance minus the minimum 32x32 variance in
+ // a 64x64 block is greater than the threshold and the maximum 32x32 variance
+ // is above a minimum threshold, then force the split of the 64x64 block.
+ // Only check this for low noise.
+ else if (!is_key_frame && noise_level < kMedium &&
+ (max_var_32x32 - min_var_32x32) > 3 * (thresholds[0] >> 3) &&
+ max_var_32x32 > thresholds[0] >> 1)
+ force_split[0] = 1;
+ }
+
+ // Now go through the entire structure, splitting every block size until
+ // we get to one that's got a variance lower than our threshold.
+ if (mi_col + 8 > cm->mi_cols || mi_row + 8 > cm->mi_rows ||
+ !set_vt_partitioning(cpi, x, xd, &vt, BLOCK_64X64, mi_row, mi_col,
+ thresholds[0], BLOCK_16X16, force_split[0])) {
+ for (i = 0; i < 4; ++i) {
+ const int x32_idx = ((i & 1) << 2);
+ const int y32_idx = ((i >> 1) << 2);
+ const int i2 = i << 2;
+ if (!set_vt_partitioning(cpi, x, xd, &vt.split[i], BLOCK_32X32,
+ (mi_row + y32_idx), (mi_col + x32_idx),
+ thresholds[1], BLOCK_16X16,
+ force_split[i + 1])) {
+ for (j = 0; j < 4; ++j) {
+ const int x16_idx = ((j & 1) << 1);
+ const int y16_idx = ((j >> 1) << 1);
+ // For inter frames: if variance4x4downsample[] == 1 for this 16x16
+ // block, then the variance is based on 4x4 down-sampling, so use vt2
+ // in set_vt_partitioning(), otherwise use vt.
+ v16x16 *vtemp = (!is_key_frame && variance4x4downsample[i2 + j] == 1)
+ ? &vt2[i2 + j]
+ : &vt.split[i].split[j];
+ if (!set_vt_partitioning(
+ cpi, x, xd, vtemp, BLOCK_16X16, mi_row + y32_idx + y16_idx,
+ mi_col + x32_idx + x16_idx, thresholds[2], cpi->vbp_bsize_min,
+ force_split[5 + i2 + j])) {
+ for (k = 0; k < 4; ++k) {
+ const int x8_idx = (k & 1);
+ const int y8_idx = (k >> 1);
+ if (use_4x4_partition) {
+ if (!set_vt_partitioning(cpi, x, xd, &vtemp->split[k],
+ BLOCK_8X8,
+ mi_row + y32_idx + y16_idx + y8_idx,
+ mi_col + x32_idx + x16_idx + x8_idx,
+ thresholds[3], BLOCK_8X8, 0)) {
+ set_block_size(
+ cpi, x, xd, (mi_row + y32_idx + y16_idx + y8_idx),
+ (mi_col + x32_idx + x16_idx + x8_idx), BLOCK_4X4);
+ }
+ } else {
+ set_block_size(
+ cpi, x, xd, (mi_row + y32_idx + y16_idx + y8_idx),
+ (mi_col + x32_idx + x16_idx + x8_idx), BLOCK_8X8);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ if (!frame_is_intra_only(cm) && cpi->sf.copy_partition_flag) {
+ update_prev_partition(cpi, x, segment_id, mi_row, mi_col, sb_offset);
+ }
+
+ if (!frame_is_intra_only(cm) && cpi->sf.svc_use_lowres_part &&
+ cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 2)
+ update_partition_svc(cpi, BLOCK_64X64, mi_row, mi_col);
+
+ if (cpi->sf.short_circuit_low_temp_var) {
+ set_low_temp_var_flag(cpi, x, xd, &vt, thresholds, ref_frame_partition,
+ mi_col, mi_row);
+ }
+
+ chroma_check(cpi, x, bsize, y_sad, is_key_frame, scene_change_detected);
+ if (vt2) vpx_free(vt2);
+ return 0;
+}
+
+#if !CONFIG_REALTIME_ONLY
+static void update_state(VP9_COMP *cpi, ThreadData *td, PICK_MODE_CONTEXT *ctx,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ int output_enabled) {
+ int i, x_idx, y;
+ VP9_COMMON *const cm = &cpi->common;
+ RD_COUNTS *const rdc = &td->rd_counts;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblock_plane *const p = x->plane;
+ struct macroblockd_plane *const pd = xd->plane;
+ MODE_INFO *mi = &ctx->mic;
+ MODE_INFO *const xdmi = xd->mi[0];
+ MODE_INFO *mi_addr = xd->mi[0];
+ const struct segmentation *const seg = &cm->seg;
+ const int bw = num_8x8_blocks_wide_lookup[mi->sb_type];
+ const int bh = num_8x8_blocks_high_lookup[mi->sb_type];
+ const int x_mis = VPXMIN(bw, cm->mi_cols - mi_col);
+ const int y_mis = VPXMIN(bh, cm->mi_rows - mi_row);
+ MV_REF *const frame_mvs = cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col;
+ int w, h;
+
+ const int mis = cm->mi_stride;
+ const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+ const int mi_height = num_8x8_blocks_high_lookup[bsize];
+ int max_plane;
+
+ assert(mi->sb_type == bsize);
+
+ *mi_addr = *mi;
+ *x->mbmi_ext = ctx->mbmi_ext;
+
+  // If segmentation is in use
+ if (seg->enabled) {
+ // For in frame complexity AQ copy the segment id from the segment map.
+ if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) {
+ const uint8_t *const map =
+ seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map;
+ mi_addr->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col);
+ }
+ // Else for cyclic refresh mode update the segment map, set the segment id
+ // and then update the quantizer.
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+ cpi->cyclic_refresh->content_mode) {
+ vp9_cyclic_refresh_update_segment(cpi, xd->mi[0], mi_row, mi_col, bsize,
+ ctx->rate, ctx->dist, x->skip, p);
+ }
+ }
+
+ max_plane = is_inter_block(xdmi) ? MAX_MB_PLANE : 1;
+ for (i = 0; i < max_plane; ++i) {
+ p[i].coeff = ctx->coeff_pbuf[i][1];
+ p[i].qcoeff = ctx->qcoeff_pbuf[i][1];
+ pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1];
+ p[i].eobs = ctx->eobs_pbuf[i][1];
+ }
+
+ for (i = max_plane; i < MAX_MB_PLANE; ++i) {
+ p[i].coeff = ctx->coeff_pbuf[i][2];
+ p[i].qcoeff = ctx->qcoeff_pbuf[i][2];
+ pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][2];
+ p[i].eobs = ctx->eobs_pbuf[i][2];
+ }
+
+  // Restore the coding context of the MB to the one that was in place when
+  // the mode was picked for it.
+ for (y = 0; y < mi_height; y++)
+ for (x_idx = 0; x_idx < mi_width; x_idx++)
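+      // Only update mi entries that lie within the visible frame. The edge
+      // offsets are in 1/8-pel units; shifting by (3 + MI_SIZE_LOG2) converts
+      // them to mi (8x8) units.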
+ if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width > x_idx &&
+ (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > y) {
+ xd->mi[x_idx + y * mis] = mi_addr;
+ }
+
+ if (cpi->oxcf.aq_mode != NO_AQ) vp9_init_plane_quantizers(cpi, x);
+
+ if (is_inter_block(xdmi) && xdmi->sb_type < BLOCK_8X8) {
+ xdmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int;
+ xdmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int;
+ }
+
+ x->skip = ctx->skip;
+ memcpy(x->zcoeff_blk[xdmi->tx_size], ctx->zcoeff_blk,
+ sizeof(ctx->zcoeff_blk[0]) * ctx->num_4x4_blk);
+
+ if (!output_enabled) return;
+
+#if CONFIG_INTERNAL_STATS
+ if (frame_is_intra_only(cm)) {
+ static const int kf_mode_index[] = {
+ THR_DC /*DC_PRED*/, THR_V_PRED /*V_PRED*/,
+ THR_H_PRED /*H_PRED*/, THR_D45_PRED /*D45_PRED*/,
+ THR_D135_PRED /*D135_PRED*/, THR_D117_PRED /*D117_PRED*/,
+ THR_D153_PRED /*D153_PRED*/, THR_D207_PRED /*D207_PRED*/,
+ THR_D63_PRED /*D63_PRED*/, THR_TM /*TM_PRED*/,
+ };
+ ++cpi->mode_chosen_counts[kf_mode_index[xdmi->mode]];
+ } else {
+    // Note how often each mode is chosen as best
+ ++cpi->mode_chosen_counts[ctx->best_mode_index];
+ }
+#endif
+ if (!frame_is_intra_only(cm)) {
+ if (is_inter_block(xdmi)) {
+ vp9_update_mv_count(td);
+
+ if (cm->interp_filter == SWITCHABLE) {
+ const int ctx_interp = get_pred_context_switchable_interp(xd);
+ ++td->counts->switchable_interp[ctx_interp][xdmi->interp_filter];
+ }
+ }
+
+ rdc->comp_pred_diff[SINGLE_REFERENCE] += ctx->single_pred_diff;
+ rdc->comp_pred_diff[COMPOUND_REFERENCE] += ctx->comp_pred_diff;
+ rdc->comp_pred_diff[REFERENCE_MODE_SELECT] += ctx->hybrid_pred_diff;
+
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
+ rdc->filter_diff[i] += ctx->best_filter_diff[i];
+ }
+
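+  // Store this block's reference frames and MVs in the frame-level MV buffer
+  // so they can serve as previous-frame MVs when coding subsequent frames.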
+ for (h = 0; h < y_mis; ++h) {
+ MV_REF *const frame_mv = frame_mvs + h * cm->mi_cols;
+ for (w = 0; w < x_mis; ++w) {
+ MV_REF *const mv = frame_mv + w;
+ mv->ref_frame[0] = mi->ref_frame[0];
+ mv->ref_frame[1] = mi->ref_frame[1];
+ mv->mv[0].as_int = mi->mv[0].as_int;
+ mv->mv[1].as_int = mi->mv[1].as_int;
+ }
+ }
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+void vp9_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src,
+ int mi_row, int mi_col) {
+ uint8_t *const buffers[3] = { src->y_buffer, src->u_buffer, src->v_buffer };
+ const int strides[3] = { src->y_stride, src->uv_stride, src->uv_stride };
+ int i;
+
+ // Set current frame pointer.
+ x->e_mbd.cur_buf = src;
+
+ for (i = 0; i < MAX_MB_PLANE; i++)
+ setup_pred_plane(&x->plane[i].src, buffers[i], strides[i], mi_row, mi_col,
+ NULL, x->e_mbd.plane[i].subsampling_x,
+ x->e_mbd.plane[i].subsampling_y);
+}
+
+static void set_mode_info_seg_skip(MACROBLOCK *x, TX_MODE tx_mode,
+ INTERP_FILTER interp_filter,
+ RD_COST *rd_cost, BLOCK_SIZE bsize) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MODE_INFO *const mi = xd->mi[0];
+ INTERP_FILTER filter_ref;
+
+ filter_ref = get_pred_context_switchable_interp(xd);
+ if (interp_filter == BILINEAR)
+ filter_ref = BILINEAR;
+ else if (filter_ref == SWITCHABLE_FILTERS)
+ filter_ref = EIGHTTAP;
+
+ mi->sb_type = bsize;
+ mi->mode = ZEROMV;
+ mi->tx_size =
+ VPXMIN(max_txsize_lookup[bsize], tx_mode_to_biggest_tx_size[tx_mode]);
+ mi->skip = 1;
+ mi->uv_mode = DC_PRED;
+ mi->ref_frame[0] = LAST_FRAME;
+ mi->ref_frame[1] = NONE;
+ mi->mv[0].as_int = 0;
+ mi->interp_filter = filter_ref;
+
+ xd->mi[0]->bmi[0].as_mv[0].as_int = 0;
+ x->skip = 1;
+
+ vp9_rd_cost_init(rd_cost);
+}
+
+#if !CONFIG_REALTIME_ONLY
+static void set_segment_rdmult(VP9_COMP *const cpi, MACROBLOCK *const x,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ AQ_MODE aq_mode) {
+ VP9_COMMON *const cm = &cpi->common;
+ const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+ const uint8_t *const map =
+ cm->seg.update_map ? cpi->segmentation_map : cm->last_frame_seg_map;
+
+ vp9_init_plane_quantizers(cpi, x);
+ vpx_clear_system_state();
+
+ if (aq_mode == NO_AQ || aq_mode == PSNR_AQ) {
+ if (cpi->sf.enable_tpl_model) x->rdmult = x->cb_rdmult;
+ } else if (aq_mode == PERCEPTUAL_AQ) {
+ x->rdmult = x->cb_rdmult;
+ } else if (aq_mode == CYCLIC_REFRESH_AQ) {
+ // If segment is boosted, use rdmult for that segment.
+ if (cyclic_refresh_segment_id_boosted(
+ get_segment_id(cm, map, bsize, mi_row, mi_col)))
+ x->rdmult = vp9_cyclic_refresh_get_rdmult(cpi->cyclic_refresh);
+ } else {
+ x->rdmult = vp9_compute_rd_mult(cpi, cm->base_qindex + cm->y_dc_delta_q);
+ }
+
+ if (oxcf->tuning == VP8_TUNE_SSIM) {
+ set_ssim_rdmult(cpi, x, bsize, mi_row, mi_col, &x->rdmult);
+ }
+}
+
+static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data,
+ MACROBLOCK *const x, int mi_row, int mi_col,
+ RD_COST *rd_cost, BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx, int rate_in_best_rd,
+ int64_t dist_in_best_rd) {
+ VP9_COMMON *const cm = &cpi->common;
+ TileInfo *const tile_info = &tile_data->tile_info;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MODE_INFO *mi;
+ struct macroblock_plane *const p = x->plane;
+ struct macroblockd_plane *const pd = xd->plane;
+ const AQ_MODE aq_mode = cpi->oxcf.aq_mode;
+ int i, orig_rdmult;
+ int64_t best_rd = INT64_MAX;
+
+ vpx_clear_system_state();
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, rd_pick_sb_modes_time);
+#endif
+
+ // Use the lower precision, but faster, 32x32 fdct for mode selection.
+ x->use_lp32x32fdct = 1;
+
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+ mi = xd->mi[0];
+ mi->sb_type = bsize;
+
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ p[i].coeff = ctx->coeff_pbuf[i][0];
+ p[i].qcoeff = ctx->qcoeff_pbuf[i][0];
+ pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][0];
+ p[i].eobs = ctx->eobs_pbuf[i][0];
+ }
+ ctx->is_coded = 0;
+ ctx->skippable = 0;
+ ctx->pred_pixel_ready = 0;
+ x->skip_recode = 0;
+
+  // Set to zero to make sure we do not use the previously encoded frame stats
+ mi->skip = 0;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ x->source_variance = vp9_high_get_sby_perpixel_variance(
+ cpi, &x->plane[0].src, bsize, xd->bd);
+ } else {
+ x->source_variance =
+ vp9_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
+ }
+#else
+ x->source_variance =
+ vp9_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ // Save rdmult before it might be changed, so it can be restored later.
+ orig_rdmult = x->rdmult;
+
+ if ((cpi->sf.tx_domain_thresh > 0.0) ||
+ (cpi->sf.trellis_opt_tx_rd.thresh > 0.0)) {
+ double logvar = vp9_log_block_var(cpi, x, bsize);
+ // Check block complexity as part of decision on using pixel or transform
+ // domain distortion in rd tests.
+ x->block_tx_domain = cpi->sf.allow_txfm_domain_distortion &&
+ (logvar >= cpi->sf.tx_domain_thresh);
+
+ // Store block complexity to decide on using quantized coefficient
+ // optimization inside the rd loop.
+ x->log_block_src_var = logvar;
+ } else {
+ x->block_tx_domain = cpi->sf.allow_txfm_domain_distortion;
+ x->log_block_src_var = 0.0;
+ }
+
+ set_segment_index(cpi, x, mi_row, mi_col, bsize, 0);
+ set_segment_rdmult(cpi, x, mi_row, mi_col, bsize, aq_mode);
+ if (rate_in_best_rd < INT_MAX && dist_in_best_rd < INT64_MAX) {
+ best_rd = vp9_calculate_rd_cost(x->rdmult, x->rddiv, rate_in_best_rd,
+ dist_in_best_rd);
+ }
+
+ // Find best coding mode & reconstruct the MB so it is available
+ // as a predictor for MBs that follow in the SB
+ if (frame_is_intra_only(cm)) {
+ vp9_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, best_rd);
+ } else {
+ if (bsize >= BLOCK_8X8) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, vp9_rd_pick_inter_mode_sb_time);
+#endif
+ if (segfeature_active(&cm->seg, mi->segment_id, SEG_LVL_SKIP))
+ vp9_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, rd_cost, bsize,
+ ctx, best_rd);
+ else
+ vp9_rd_pick_inter_mode_sb(cpi, tile_data, x, mi_row, mi_col, rd_cost,
+ bsize, ctx, best_rd);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, vp9_rd_pick_inter_mode_sb_time);
+#endif
+ } else {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, vp9_rd_pick_inter_mode_sub8x8_time);
+#endif
+ vp9_rd_pick_inter_mode_sub8x8(cpi, tile_data, x, mi_row, mi_col, rd_cost,
+ bsize, ctx, best_rd);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, vp9_rd_pick_inter_mode_sub8x8_time);
+#endif
+ }
+ }
+
+ // Examine the resulting rate and for AQ mode 2 make a segment choice.
+ if ((rd_cost->rate != INT_MAX) && (aq_mode == COMPLEXITY_AQ) &&
+ (bsize >= BLOCK_16X16) &&
+ (cm->frame_type == KEY_FRAME || cpi->refresh_alt_ref_frame ||
+ (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref))) {
+ vp9_caq_select_segment(cpi, x, bsize, mi_row, mi_col, rd_cost->rate);
+ }
+
+ // TODO(jingning) The rate-distortion optimization flow needs to be
+  // refactored to provide proper exit/return handling.
+ if (rd_cost->rate == INT_MAX || rd_cost->dist == INT64_MAX)
+ rd_cost->rdcost = INT64_MAX;
+ else
+ rd_cost->rdcost = RDCOST(x->rdmult, x->rddiv, rd_cost->rate, rd_cost->dist);
+
+ x->rdmult = orig_rdmult;
+
+ ctx->rate = rd_cost->rate;
+ ctx->dist = rd_cost->dist;
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, rd_pick_sb_modes_time);
+#endif
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+static void update_stats(VP9_COMMON *cm, ThreadData *td) {
+ const MACROBLOCK *x = &td->mb;
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const MODE_INFO *const mi = xd->mi[0];
+ const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+ const BLOCK_SIZE bsize = mi->sb_type;
+
+ if (!frame_is_intra_only(cm)) {
+ FRAME_COUNTS *const counts = td->counts;
+ const int inter_block = is_inter_block(mi);
+ const int seg_ref_active =
+ segfeature_active(&cm->seg, mi->segment_id, SEG_LVL_REF_FRAME);
+ if (!seg_ref_active) {
+ counts->intra_inter[get_intra_inter_context(xd)][inter_block]++;
+      // If the segment reference feature is enabled, we have only a single
+      // reference frame allowed for the segment, so exclude it from
+      // the reference frame counts used to work out probabilities.
+ if (inter_block) {
+ const MV_REFERENCE_FRAME ref0 = mi->ref_frame[0];
+ if (cm->reference_mode == REFERENCE_MODE_SELECT)
+ counts->comp_inter[vp9_get_reference_mode_context(cm, xd)]
+ [has_second_ref(mi)]++;
+
+ if (has_second_ref(mi)) {
+ const int idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref];
+ const int ctx = vp9_get_pred_context_comp_ref_p(cm, xd);
+ const int bit = mi->ref_frame[!idx] == cm->comp_var_ref[1];
+ counts->comp_ref[ctx][bit]++;
+ } else {
+ counts->single_ref[vp9_get_pred_context_single_ref_p1(xd)][0]
+ [ref0 != LAST_FRAME]++;
+ if (ref0 != LAST_FRAME)
+ counts->single_ref[vp9_get_pred_context_single_ref_p2(xd)][1]
+ [ref0 != GOLDEN_FRAME]++;
+ }
+ }
+ }
+ if (inter_block &&
+ !segfeature_active(&cm->seg, mi->segment_id, SEG_LVL_SKIP)) {
+ const int mode_ctx = mbmi_ext->mode_context[mi->ref_frame[0]];
+ if (bsize >= BLOCK_8X8) {
+ const PREDICTION_MODE mode = mi->mode;
+ ++counts->inter_mode[mode_ctx][INTER_OFFSET(mode)];
+ } else {
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
+ int idx, idy;
+ for (idy = 0; idy < 2; idy += num_4x4_h) {
+ for (idx = 0; idx < 2; idx += num_4x4_w) {
+ const int j = idy * 2 + idx;
+ const PREDICTION_MODE b_mode = mi->bmi[j].as_mode;
+ ++counts->inter_mode[mode_ctx][INTER_OFFSET(b_mode)];
+ }
+ }
+ }
+ }
+ }
+}
+
+#if !CONFIG_REALTIME_ONLY
+static void restore_context(MACROBLOCK *const x, int mi_row, int mi_col,
+ ENTROPY_CONTEXT a[16 * MAX_MB_PLANE],
+ ENTROPY_CONTEXT l[16 * MAX_MB_PLANE],
+ PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8],
+ BLOCK_SIZE bsize) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ int p;
+ const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
+ const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
+ int mi_width = num_8x8_blocks_wide_lookup[bsize];
+ int mi_height = num_8x8_blocks_high_lookup[bsize];
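+  // Restore the above/left entropy and partition contexts previously captured
+  // by save_context().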
+ for (p = 0; p < MAX_MB_PLANE; p++) {
+ memcpy(xd->above_context[p] + ((mi_col * 2) >> xd->plane[p].subsampling_x),
+ a + num_4x4_blocks_wide * p,
+ (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >>
+ xd->plane[p].subsampling_x);
+ memcpy(xd->left_context[p] +
+ ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y),
+ l + num_4x4_blocks_high * p,
+ (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >>
+ xd->plane[p].subsampling_y);
+ }
+ memcpy(xd->above_seg_context + mi_col, sa,
+ sizeof(*xd->above_seg_context) * mi_width);
+ memcpy(xd->left_seg_context + (mi_row & MI_MASK), sl,
+ sizeof(xd->left_seg_context[0]) * mi_height);
+}
+
+static void save_context(MACROBLOCK *const x, int mi_row, int mi_col,
+ ENTROPY_CONTEXT a[16 * MAX_MB_PLANE],
+ ENTROPY_CONTEXT l[16 * MAX_MB_PLANE],
+ PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8],
+ BLOCK_SIZE bsize) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ int p;
+ const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
+ const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
+ int mi_width = num_8x8_blocks_wide_lookup[bsize];
+ int mi_height = num_8x8_blocks_high_lookup[bsize];
+
+  // Buffer the above/left context information of the block in search.
+ for (p = 0; p < MAX_MB_PLANE; ++p) {
+ memcpy(a + num_4x4_blocks_wide * p,
+ xd->above_context[p] + (mi_col * 2 >> xd->plane[p].subsampling_x),
+ (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >>
+ xd->plane[p].subsampling_x);
+ memcpy(l + num_4x4_blocks_high * p,
+ xd->left_context[p] +
+ ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y),
+ (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >>
+ xd->plane[p].subsampling_y);
+ }
+ memcpy(sa, xd->above_seg_context + mi_col,
+ sizeof(*xd->above_seg_context) * mi_width);
+ memcpy(sl, xd->left_seg_context + (mi_row & MI_MASK),
+ sizeof(xd->left_seg_context[0]) * mi_height);
+}
+
+static void encode_b(VP9_COMP *cpi, const TileInfo *const tile, ThreadData *td,
+ TOKENEXTRA **tp, int mi_row, int mi_col,
+ int output_enabled, BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx) {
+ MACROBLOCK *const x = &td->mb;
+ set_offsets(cpi, tile, x, mi_row, mi_col, bsize);
+
+ if (cpi->sf.enable_tpl_model &&
+ (cpi->oxcf.aq_mode == NO_AQ || cpi->oxcf.aq_mode == PERCEPTUAL_AQ)) {
+ const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+ x->rdmult = x->cb_rdmult;
+ if (oxcf->tuning == VP8_TUNE_SSIM) {
+ set_ssim_rdmult(cpi, x, bsize, mi_row, mi_col, &x->rdmult);
+ }
+ }
+
+ update_state(cpi, td, ctx, mi_row, mi_col, bsize, output_enabled);
+ encode_superblock(cpi, td, tp, output_enabled, mi_row, mi_col, bsize, ctx);
+
+ if (output_enabled) {
+ update_stats(&cpi->common, td);
+
+ (*tp)->token = EOSB_TOKEN;
+ (*tp)++;
+ }
+}
+
+static void encode_sb(VP9_COMP *cpi, ThreadData *td, const TileInfo *const tile,
+ TOKENEXTRA **tp, int mi_row, int mi_col,
+ int output_enabled, BLOCK_SIZE bsize, PC_TREE *pc_tree) {
+ VP9_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+
+ const int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
+ int ctx;
+ PARTITION_TYPE partition;
+ BLOCK_SIZE subsize = bsize;
+
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
+ if (bsize >= BLOCK_8X8) {
+ ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
+ subsize = get_subsize(bsize, pc_tree->partitioning);
+ } else {
+ ctx = 0;
+ subsize = BLOCK_4X4;
+ }
+
+ partition = partition_lookup[bsl][subsize];
+ if (output_enabled && bsize != BLOCK_4X4)
+ td->counts->partition[ctx][partition]++;
+
+ switch (partition) {
+ case PARTITION_NONE:
+ encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
+ &pc_tree->none);
+ break;
+ case PARTITION_VERT:
+ encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
+ &pc_tree->vertical[0]);
+ if (mi_col + hbs < cm->mi_cols && bsize > BLOCK_8X8) {
+ encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, output_enabled,
+ subsize, &pc_tree->vertical[1]);
+ }
+ break;
+ case PARTITION_HORZ:
+ encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
+ &pc_tree->horizontal[0]);
+ if (mi_row + hbs < cm->mi_rows && bsize > BLOCK_8X8) {
+ encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, output_enabled,
+ subsize, &pc_tree->horizontal[1]);
+ }
+ break;
+ default:
+ assert(partition == PARTITION_SPLIT);
+ if (bsize == BLOCK_8X8) {
+ encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
+ pc_tree->leaf_split[0]);
+ } else {
+ encode_sb(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize,
+ pc_tree->split[0]);
+ encode_sb(cpi, td, tile, tp, mi_row, mi_col + hbs, output_enabled,
+ subsize, pc_tree->split[1]);
+ encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col, output_enabled,
+ subsize, pc_tree->split[2]);
+ encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col + hbs, output_enabled,
+ subsize, pc_tree->split[3]);
+ }
+ break;
+ }
+
+ if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8)
+ update_partition_context(xd, mi_row, mi_col, subsize, bsize);
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+// Check to see if the given partition size is allowed for a specified number
+// of 8x8 block rows and columns remaining in the image.
+// If not, then return the largest allowed partition size.
+static BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize, int rows_left,
+ int cols_left, int *bh, int *bw) {
+ if (rows_left <= 0 || cols_left <= 0) {
+ return VPXMIN(bsize, BLOCK_8X8);
+ } else {
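+    // Each step of 3 in the BLOCK_SIZE enum moves to the next smaller square
+    // size (64x64 -> 32x32 -> 16x16 -> 8x8 -> 4x4).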
+ for (; bsize > 0; bsize -= 3) {
+ *bh = num_8x8_blocks_high_lookup[bsize];
+ *bw = num_8x8_blocks_wide_lookup[bsize];
+ if ((*bh <= rows_left) && (*bw <= cols_left)) {
+ break;
+ }
+ }
+ }
+ return bsize;
+}
+
+static void set_partial_b64x64_partition(MODE_INFO *mi, int mis, int bh_in,
+ int bw_in, int row8x8_remaining,
+ int col8x8_remaining, BLOCK_SIZE bsize,
+ MODE_INFO **mi_8x8) {
+ int bh = bh_in;
+ int r, c;
+ for (r = 0; r < MI_BLOCK_SIZE; r += bh) {
+ int bw = bw_in;
+ for (c = 0; c < MI_BLOCK_SIZE; c += bw) {
+ const int index = r * mis + c;
+ mi_8x8[index] = mi + index;
+ mi_8x8[index]->sb_type = find_partition_size(
+ bsize, row8x8_remaining - r, col8x8_remaining - c, &bh, &bw);
+ }
+ }
+}
+
+// This function attempts to set all mode info entries in a given SB64
+// to the same block partition size.
+// However, at the bottom and right borders of the image the requested size
+// may not be allowed in which case this code attempts to choose the largest
+// allowable partition.
+static void set_fixed_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
+ MODE_INFO **mi_8x8, int mi_row, int mi_col,
+ BLOCK_SIZE bsize) {
+ VP9_COMMON *const cm = &cpi->common;
+ const int mis = cm->mi_stride;
+ const int row8x8_remaining = tile->mi_row_end - mi_row;
+ const int col8x8_remaining = tile->mi_col_end - mi_col;
+ int block_row, block_col;
+ MODE_INFO *mi_upper_left = cm->mi + mi_row * mis + mi_col;
+ int bh = num_8x8_blocks_high_lookup[bsize];
+ int bw = num_8x8_blocks_wide_lookup[bsize];
+
+ assert((row8x8_remaining > 0) && (col8x8_remaining > 0));
+
+ // Apply the requested partition size to the SB64 if it is all "in image"
+ if ((col8x8_remaining >= MI_BLOCK_SIZE) &&
+ (row8x8_remaining >= MI_BLOCK_SIZE)) {
+ for (block_row = 0; block_row < MI_BLOCK_SIZE; block_row += bh) {
+ for (block_col = 0; block_col < MI_BLOCK_SIZE; block_col += bw) {
+ int index = block_row * mis + block_col;
+ mi_8x8[index] = mi_upper_left + index;
+ mi_8x8[index]->sb_type = bsize;
+ }
+ }
+ } else {
+ // Else this is a partial SB64.
+ set_partial_b64x64_partition(mi_upper_left, mis, bh, bw, row8x8_remaining,
+ col8x8_remaining, bsize, mi_8x8);
+ }
+}
+
+static const struct {
+ int row;
+ int col;
+} coord_lookup[16] = {
+ // 32x32 index = 0
+ { 0, 0 },
+ { 0, 2 },
+ { 2, 0 },
+ { 2, 2 },
+ // 32x32 index = 1
+ { 0, 4 },
+ { 0, 6 },
+ { 2, 4 },
+ { 2, 6 },
+ // 32x32 index = 2
+ { 4, 0 },
+ { 4, 2 },
+ { 6, 0 },
+ { 6, 2 },
+ // 32x32 index = 3
+ { 4, 4 },
+ { 4, 6 },
+ { 6, 4 },
+ { 6, 6 },
+};
+
+static void set_source_var_based_partition(VP9_COMP *cpi,
+ const TileInfo *const tile,
+ MACROBLOCK *const x,
+ MODE_INFO **mi_8x8, int mi_row,
+ int mi_col) {
+ VP9_COMMON *const cm = &cpi->common;
+ const int mis = cm->mi_stride;
+ const int row8x8_remaining = tile->mi_row_end - mi_row;
+ const int col8x8_remaining = tile->mi_col_end - mi_col;
+ MODE_INFO *mi_upper_left = cm->mi + mi_row * mis + mi_col;
+
+ vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col);
+
+ assert((row8x8_remaining > 0) && (col8x8_remaining > 0));
+
+ // In-image SB64
+ if ((col8x8_remaining >= MI_BLOCK_SIZE) &&
+ (row8x8_remaining >= MI_BLOCK_SIZE)) {
+ int i, j;
+ int index;
+ Diff d32[4];
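+    // cpi->source_diff_var is stored per 16x16 macroblock, so convert the mi
+    // (8x8) coordinates to macroblock units.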
+ const int offset = (mi_row >> 1) * cm->mb_cols + (mi_col >> 1);
+ int is_larger_better = 0;
+ int use32x32 = 0;
+ unsigned int thr = cpi->source_var_thresh;
+
+ memset(d32, 0, sizeof(d32));
+
+ for (i = 0; i < 4; i++) {
+ Diff *d16[4];
+
+ for (j = 0; j < 4; j++) {
+ int b_mi_row = coord_lookup[i * 4 + j].row;
+ int b_mi_col = coord_lookup[i * 4 + j].col;
+ int boffset = b_mi_row / 2 * cm->mb_cols + b_mi_col / 2;
+
+ d16[j] = cpi->source_diff_var + offset + boffset;
+
+ index = b_mi_row * mis + b_mi_col;
+ mi_8x8[index] = mi_upper_left + index;
+ mi_8x8[index]->sb_type = BLOCK_16X16;
+
+ // TODO(yunqingwang): If d16[j].var is very large, use 8x8 partition
+ // size to further improve quality.
+ }
+
+ is_larger_better = (d16[0]->var < thr) && (d16[1]->var < thr) &&
+ (d16[2]->var < thr) && (d16[3]->var < thr);
+
+ // Use 32x32 partition
+ if (is_larger_better) {
+ use32x32 += 1;
+
+ for (j = 0; j < 4; j++) {
+ d32[i].sse += d16[j]->sse;
+ d32[i].sum += d16[j]->sum;
+ }
+
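+        // Variance of the combined 32x32 block: SSE minus sum^2 / 1024
+        // (1024 = 32 * 32 pixels).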
+ d32[i].var =
+ (unsigned int)(d32[i].sse -
+ (unsigned int)(((int64_t)d32[i].sum * d32[i].sum) >>
+ 10));
+
+ index = coord_lookup[i * 4].row * mis + coord_lookup[i * 4].col;
+ mi_8x8[index] = mi_upper_left + index;
+ mi_8x8[index]->sb_type = BLOCK_32X32;
+ }
+ }
+
+ if (use32x32 == 4) {
+ thr <<= 1;
+ is_larger_better = (d32[0].var < thr) && (d32[1].var < thr) &&
+ (d32[2].var < thr) && (d32[3].var < thr);
+
+ // Use 64x64 partition
+ if (is_larger_better) {
+ mi_8x8[0] = mi_upper_left;
+ mi_8x8[0]->sb_type = BLOCK_64X64;
+ }
+ }
+ } else { // partial in-image SB64
+ int bh = num_8x8_blocks_high_lookup[BLOCK_16X16];
+ int bw = num_8x8_blocks_wide_lookup[BLOCK_16X16];
+ set_partial_b64x64_partition(mi_upper_left, mis, bh, bw, row8x8_remaining,
+ col8x8_remaining, BLOCK_16X16, mi_8x8);
+ }
+}
+
+static void update_state_rt(VP9_COMP *cpi, ThreadData *td,
+ PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col,
+ int bsize) {
+ VP9_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MODE_INFO *const mi = xd->mi[0];
+ struct macroblock_plane *const p = x->plane;
+ const struct segmentation *const seg = &cm->seg;
+ const int bw = num_8x8_blocks_wide_lookup[mi->sb_type];
+ const int bh = num_8x8_blocks_high_lookup[mi->sb_type];
+ const int x_mis = VPXMIN(bw, cm->mi_cols - mi_col);
+ const int y_mis = VPXMIN(bh, cm->mi_rows - mi_row);
+
+ *(xd->mi[0]) = ctx->mic;
+ *(x->mbmi_ext) = ctx->mbmi_ext;
+
+ if (seg->enabled && (cpi->oxcf.aq_mode != NO_AQ || cpi->roi.enabled ||
+ cpi->active_map.enabled)) {
+ // Setting segmentation map for cyclic_refresh.
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+ cpi->cyclic_refresh->content_mode) {
+ vp9_cyclic_refresh_update_segment(cpi, mi, mi_row, mi_col, bsize,
+ ctx->rate, ctx->dist, x->skip, p);
+ } else {
+ const uint8_t *const map =
+ seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map;
+ mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col);
+ }
+ vp9_init_plane_quantizers(cpi, x);
+ }
+
+ if (is_inter_block(mi)) {
+ vp9_update_mv_count(td);
+ if (cm->interp_filter == SWITCHABLE) {
+ const int pred_ctx = get_pred_context_switchable_interp(xd);
+ ++td->counts->switchable_interp[pred_ctx][mi->interp_filter];
+ }
+
+ if (mi->sb_type < BLOCK_8X8) {
+ mi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int;
+ mi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int;
+ }
+ }
+
+ if (cm->use_prev_frame_mvs || !cm->error_resilient_mode ||
+ (cpi->svc.use_base_mv && cpi->svc.number_spatial_layers > 1 &&
+ cpi->svc.spatial_layer_id != cpi->svc.number_spatial_layers - 1)) {
+ MV_REF *const frame_mvs =
+ cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col;
+ int w, h;
+
+ for (h = 0; h < y_mis; ++h) {
+ MV_REF *const frame_mv = frame_mvs + h * cm->mi_cols;
+ for (w = 0; w < x_mis; ++w) {
+ MV_REF *const mv = frame_mv + w;
+ mv->ref_frame[0] = mi->ref_frame[0];
+ mv->ref_frame[1] = mi->ref_frame[1];
+ mv->mv[0].as_int = mi->mv[0].as_int;
+ mv->mv[1].as_int = mi->mv[1].as_int;
+ }
+ }
+ }
+
+ x->skip = ctx->skip;
+ x->skip_txfm[0] = (mi->segment_id || xd->lossless) ? 0 : ctx->skip_txfm[0];
+}
+
+static void encode_b_rt(VP9_COMP *cpi, ThreadData *td,
+ const TileInfo *const tile, TOKENEXTRA **tp, int mi_row,
+ int mi_col, int output_enabled, BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx) {
+ MACROBLOCK *const x = &td->mb;
+ set_offsets(cpi, tile, x, mi_row, mi_col, bsize);
+ update_state_rt(cpi, td, ctx, mi_row, mi_col, bsize);
+
+ encode_superblock(cpi, td, tp, output_enabled, mi_row, mi_col, bsize, ctx);
+ update_stats(&cpi->common, td);
+
+ (*tp)->token = EOSB_TOKEN;
+ (*tp)++;
+}
+
+static void encode_sb_rt(VP9_COMP *cpi, ThreadData *td,
+ const TileInfo *const tile, TOKENEXTRA **tp,
+ int mi_row, int mi_col, int output_enabled,
+ BLOCK_SIZE bsize, PC_TREE *pc_tree) {
+ VP9_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+
+ const int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
+ int ctx;
+ PARTITION_TYPE partition;
+ BLOCK_SIZE subsize;
+
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
+ if (bsize >= BLOCK_8X8) {
+ const int idx_str = xd->mi_stride * mi_row + mi_col;
+ MODE_INFO **mi_8x8 = cm->mi_grid_visible + idx_str;
+ ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
+ subsize = mi_8x8[0]->sb_type;
+ } else {
+ ctx = 0;
+ subsize = BLOCK_4X4;
+ }
+
+ partition = partition_lookup[bsl][subsize];
+ if (output_enabled && bsize != BLOCK_4X4)
+ td->counts->partition[ctx][partition]++;
+
+ switch (partition) {
+ case PARTITION_NONE:
+ encode_b_rt(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize,
+ &pc_tree->none);
+ break;
+ case PARTITION_VERT:
+ encode_b_rt(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize,
+ &pc_tree->vertical[0]);
+ if (mi_col + hbs < cm->mi_cols && bsize > BLOCK_8X8) {
+ encode_b_rt(cpi, td, tile, tp, mi_row, mi_col + hbs, output_enabled,
+ subsize, &pc_tree->vertical[1]);
+ }
+ break;
+ case PARTITION_HORZ:
+ encode_b_rt(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize,
+ &pc_tree->horizontal[0]);
+ if (mi_row + hbs < cm->mi_rows && bsize > BLOCK_8X8) {
+ encode_b_rt(cpi, td, tile, tp, mi_row + hbs, mi_col, output_enabled,
+ subsize, &pc_tree->horizontal[1]);
+ }
+ break;
+ default:
+ assert(partition == PARTITION_SPLIT);
+ subsize = get_subsize(bsize, PARTITION_SPLIT);
+ encode_sb_rt(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize,
+ pc_tree->split[0]);
+ encode_sb_rt(cpi, td, tile, tp, mi_row, mi_col + hbs, output_enabled,
+ subsize, pc_tree->split[1]);
+ encode_sb_rt(cpi, td, tile, tp, mi_row + hbs, mi_col, output_enabled,
+ subsize, pc_tree->split[2]);
+ encode_sb_rt(cpi, td, tile, tp, mi_row + hbs, mi_col + hbs,
+ output_enabled, subsize, pc_tree->split[3]);
+ break;
+ }
+
+ if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8)
+ update_partition_context(xd, mi_row, mi_col, subsize, bsize);
+}
+
+#if !CONFIG_REALTIME_ONLY
+static void rd_use_partition(VP9_COMP *cpi, ThreadData *td,
+ TileDataEnc *tile_data, MODE_INFO **mi_8x8,
+ TOKENEXTRA **tp, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, int *rate, int64_t *dist,
+ int do_recon, PC_TREE *pc_tree) {
+ VP9_COMMON *const cm = &cpi->common;
+ TileInfo *const tile_info = &tile_data->tile_info;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int mis = cm->mi_stride;
+ const int bsl = b_width_log2_lookup[bsize];
+ const int mi_step = num_4x4_blocks_wide_lookup[bsize] / 2;
+ const int bss = (1 << bsl) / 4;
+ int i, pl;
+ PARTITION_TYPE partition = PARTITION_NONE;
+ BLOCK_SIZE subsize;
+ ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
+ PARTITION_CONTEXT sl[8], sa[8];
+ RD_COST last_part_rdc, none_rdc, chosen_rdc;
+ BLOCK_SIZE sub_subsize = BLOCK_4X4;
+ int splits_below = 0;
+ BLOCK_SIZE bs_type = mi_8x8[0]->sb_type;
+ int do_partition_search = 1;
+ PICK_MODE_CONTEXT *ctx = &pc_tree->none;
+
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
+ assert(num_4x4_blocks_wide_lookup[bsize] ==
+ num_4x4_blocks_high_lookup[bsize]);
+
+ vp9_rd_cost_reset(&last_part_rdc);
+ vp9_rd_cost_reset(&none_rdc);
+ vp9_rd_cost_reset(&chosen_rdc);
+
+ partition = partition_lookup[bsl][bs_type];
+ subsize = get_subsize(bsize, partition);
+
+ pc_tree->partitioning = partition;
+ save_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+
+ if (bsize == BLOCK_16X16 && cpi->oxcf.aq_mode != NO_AQ) {
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+ x->mb_energy = vp9_block_energy(cpi, x, bsize);
+ }
+
+ if (do_partition_search &&
+ cpi->sf.partition_search_type == SEARCH_PARTITION &&
+ cpi->sf.adjust_partitioning_from_last_frame) {
+ // Check if any of the sub blocks are further split.
+ if (partition == PARTITION_SPLIT && subsize > BLOCK_8X8) {
+ sub_subsize = get_subsize(subsize, PARTITION_SPLIT);
+ splits_below = 1;
+ for (i = 0; i < 4; i++) {
+ int jj = i >> 1, ii = i & 0x01;
+ MODE_INFO *this_mi = mi_8x8[jj * bss * mis + ii * bss];
+ if (this_mi && this_mi->sb_type >= sub_subsize) {
+ splits_below = 0;
+ }
+ }
+ }
+
+    // If the partition is not none, try none unless each of the 4 splits is
+    // split even further.
+ if (partition != PARTITION_NONE && !splits_below &&
+ mi_row + (mi_step >> 1) < cm->mi_rows &&
+ mi_col + (mi_step >> 1) < cm->mi_cols) {
+ pc_tree->partitioning = PARTITION_NONE;
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc, bsize, ctx,
+ INT_MAX, INT64_MAX);
+
+ pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+
+ if (none_rdc.rate < INT_MAX) {
+ none_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE];
+ none_rdc.rdcost =
+ RDCOST(x->rdmult, x->rddiv, none_rdc.rate, none_rdc.dist);
+ }
+
+ restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+ mi_8x8[0]->sb_type = bs_type;
+ pc_tree->partitioning = partition;
+ }
+ }
+
+ switch (partition) {
+ case PARTITION_NONE:
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, bsize,
+ ctx, INT_MAX, INT64_MAX);
+ break;
+ case PARTITION_HORZ:
+ pc_tree->horizontal[0].skip_ref_frame_mask = 0;
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+ subsize, &pc_tree->horizontal[0], INT_MAX, INT64_MAX);
+ if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
+ mi_row + (mi_step >> 1) < cm->mi_rows) {
+ RD_COST tmp_rdc;
+ PICK_MODE_CONTEXT *hctx = &pc_tree->horizontal[0];
+ vp9_rd_cost_init(&tmp_rdc);
+ update_state(cpi, td, hctx, mi_row, mi_col, subsize, 0);
+ encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, hctx);
+ pc_tree->horizontal[1].skip_ref_frame_mask = 0;
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row + (mi_step >> 1), mi_col,
+ &tmp_rdc, subsize, &pc_tree->horizontal[1], INT_MAX,
+ INT64_MAX);
+ if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+ vp9_rd_cost_reset(&last_part_rdc);
+ break;
+ }
+ last_part_rdc.rate += tmp_rdc.rate;
+ last_part_rdc.dist += tmp_rdc.dist;
+ last_part_rdc.rdcost += tmp_rdc.rdcost;
+ }
+ break;
+ case PARTITION_VERT:
+ pc_tree->vertical[0].skip_ref_frame_mask = 0;
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+ subsize, &pc_tree->vertical[0], INT_MAX, INT64_MAX);
+ if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
+ mi_col + (mi_step >> 1) < cm->mi_cols) {
+ RD_COST tmp_rdc;
+ PICK_MODE_CONTEXT *vctx = &pc_tree->vertical[0];
+ vp9_rd_cost_init(&tmp_rdc);
+ update_state(cpi, td, vctx, mi_row, mi_col, subsize, 0);
+ encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, vctx);
+ pc_tree->vertical[bsize > BLOCK_8X8].skip_ref_frame_mask = 0;
+ rd_pick_sb_modes(
+ cpi, tile_data, x, mi_row, mi_col + (mi_step >> 1), &tmp_rdc,
+ subsize, &pc_tree->vertical[bsize > BLOCK_8X8], INT_MAX, INT64_MAX);
+ if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+ vp9_rd_cost_reset(&last_part_rdc);
+ break;
+ }
+ last_part_rdc.rate += tmp_rdc.rate;
+ last_part_rdc.dist += tmp_rdc.dist;
+ last_part_rdc.rdcost += tmp_rdc.rdcost;
+ }
+ break;
+ default:
+ assert(partition == PARTITION_SPLIT);
+ if (bsize == BLOCK_8X8) {
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+ subsize, pc_tree->leaf_split[0], INT_MAX, INT64_MAX);
+ break;
+ }
+ last_part_rdc.rate = 0;
+ last_part_rdc.dist = 0;
+ last_part_rdc.rdcost = 0;
+ for (i = 0; i < 4; i++) {
+ int x_idx = (i & 1) * (mi_step >> 1);
+ int y_idx = (i >> 1) * (mi_step >> 1);
+ int jj = i >> 1, ii = i & 0x01;
+ RD_COST tmp_rdc;
+ if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
+ continue;
+
+ vp9_rd_cost_init(&tmp_rdc);
+ rd_use_partition(cpi, td, tile_data, mi_8x8 + jj * bss * mis + ii * bss,
+ tp, mi_row + y_idx, mi_col + x_idx, subsize,
+ &tmp_rdc.rate, &tmp_rdc.dist, i != 3,
+ pc_tree->split[i]);
+ if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+ vp9_rd_cost_reset(&last_part_rdc);
+ break;
+ }
+ last_part_rdc.rate += tmp_rdc.rate;
+ last_part_rdc.dist += tmp_rdc.dist;
+ }
+ break;
+ }
+
+ pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+ if (last_part_rdc.rate < INT_MAX) {
+ last_part_rdc.rate += cpi->partition_cost[pl][partition];
+ last_part_rdc.rdcost =
+ RDCOST(x->rdmult, x->rddiv, last_part_rdc.rate, last_part_rdc.dist);
+ }
+
+ if (do_partition_search && cpi->sf.adjust_partitioning_from_last_frame &&
+ cpi->sf.partition_search_type == SEARCH_PARTITION &&
+ partition != PARTITION_SPLIT && bsize > BLOCK_8X8 &&
+ (mi_row + mi_step < cm->mi_rows ||
+ mi_row + (mi_step >> 1) == cm->mi_rows) &&
+ (mi_col + mi_step < cm->mi_cols ||
+ mi_col + (mi_step >> 1) == cm->mi_cols)) {
+ BLOCK_SIZE split_subsize = get_subsize(bsize, PARTITION_SPLIT);
+ chosen_rdc.rate = 0;
+ chosen_rdc.dist = 0;
+ restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+ pc_tree->partitioning = PARTITION_SPLIT;
+
+ // Split partition.
+ for (i = 0; i < 4; i++) {
+ int x_idx = (i & 1) * (mi_step >> 1);
+ int y_idx = (i >> 1) * (mi_step >> 1);
+ RD_COST tmp_rdc;
+
+ if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
+ continue;
+
+ save_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+ pc_tree->split[i]->partitioning = PARTITION_NONE;
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx,
+ &tmp_rdc, split_subsize, &pc_tree->split[i]->none,
+ INT_MAX, INT64_MAX);
+
+ restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+
+ if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+ vp9_rd_cost_reset(&chosen_rdc);
+ break;
+ }
+
+ chosen_rdc.rate += tmp_rdc.rate;
+ chosen_rdc.dist += tmp_rdc.dist;
+
+ if (i != 3)
+ encode_sb(cpi, td, tile_info, tp, mi_row + y_idx, mi_col + x_idx, 0,
+ split_subsize, pc_tree->split[i]);
+
+ pl = partition_plane_context(xd, mi_row + y_idx, mi_col + x_idx,
+ split_subsize);
+ chosen_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE];
+ }
+ pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+ if (chosen_rdc.rate < INT_MAX) {
+ chosen_rdc.rate += cpi->partition_cost[pl][PARTITION_SPLIT];
+ chosen_rdc.rdcost =
+ RDCOST(x->rdmult, x->rddiv, chosen_rdc.rate, chosen_rdc.dist);
+ }
+ }
+
+ // If last_part is better set the partitioning to that.
+ if (last_part_rdc.rdcost < chosen_rdc.rdcost) {
+ mi_8x8[0]->sb_type = bsize;
+ if (bsize >= BLOCK_8X8) pc_tree->partitioning = partition;
+ chosen_rdc = last_part_rdc;
+ }
+ // If none was better set the partitioning to that.
+ if (none_rdc.rdcost < chosen_rdc.rdcost) {
+ if (bsize >= BLOCK_8X8) pc_tree->partitioning = PARTITION_NONE;
+ chosen_rdc = none_rdc;
+ }
+
+ restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+
+ // We must have chosen a partitioning and encoding or we'll fail later on.
+ // No other opportunities for success.
+ if (bsize == BLOCK_64X64)
+ assert(chosen_rdc.rate < INT_MAX && chosen_rdc.dist < INT64_MAX);
+
+ if (do_recon) {
+ int output_enabled = (bsize == BLOCK_64X64);
+ encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled, bsize,
+ pc_tree);
+ }
+
+ *rate = chosen_rdc.rate;
+ *dist = chosen_rdc.dist;
+}
+
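+// Per observed BLOCK_SIZE, the relaxed lower/upper bound applied to the
+// automatically derived partition search range.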
+static const BLOCK_SIZE min_partition_size[BLOCK_SIZES] = {
+ BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, BLOCK_4X4,
+ BLOCK_4X4, BLOCK_8X8, BLOCK_8X8, BLOCK_8X8, BLOCK_16X16,
+ BLOCK_16X16, BLOCK_16X16, BLOCK_16X16
+};
+
+static const BLOCK_SIZE max_partition_size[BLOCK_SIZES] = {
+ BLOCK_8X8, BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, BLOCK_32X32,
+ BLOCK_32X32, BLOCK_32X32, BLOCK_64X64, BLOCK_64X64, BLOCK_64X64,
+ BLOCK_64X64, BLOCK_64X64, BLOCK_64X64
+};
+
+// Look at all the mode_info entries for blocks that are part of this
+// partition and find the min and max values for sb_type.
+// At the moment this is designed to work on a 64x64 SB but could be
+// adjusted to use a size parameter.
+//
+// The min and max are assumed to have been initialized prior to calling this
+// function so repeat calls can accumulate a min and max of more than one sb64.
+static void get_sb_partition_size_range(MACROBLOCKD *xd, MODE_INFO **mi_8x8,
+ BLOCK_SIZE *min_block_size,
+ BLOCK_SIZE *max_block_size,
+ int bs_hist[BLOCK_SIZES]) {
+ int sb_width_in_blocks = MI_BLOCK_SIZE;
+ int sb_height_in_blocks = MI_BLOCK_SIZE;
+ int i, j;
+ int index = 0;
+
+ // Check the sb_type for each block that belongs to this region.
+ for (i = 0; i < sb_height_in_blocks; ++i) {
+ for (j = 0; j < sb_width_in_blocks; ++j) {
+ MODE_INFO *mi = mi_8x8[index + j];
+ BLOCK_SIZE sb_type = mi ? mi->sb_type : 0;
+ bs_hist[sb_type]++;
+ *min_block_size = VPXMIN(*min_block_size, sb_type);
+ *max_block_size = VPXMAX(*max_block_size, sb_type);
+ }
+ index += xd->mi_stride;
+ }
+}
+
+// Next square block size less than or equal to the current block size.
+static const BLOCK_SIZE next_square_size[BLOCK_SIZES] = {
+ BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, BLOCK_8X8, BLOCK_8X8,
+ BLOCK_8X8, BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, BLOCK_32X32,
+ BLOCK_32X32, BLOCK_32X32, BLOCK_64X64
+};
+
+// Look at neighboring blocks and set a min and max partition size based on
+// what they chose.
+static void rd_auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile,
+ MACROBLOCKD *const xd, int mi_row,
+ int mi_col, BLOCK_SIZE *min_block_size,
+ BLOCK_SIZE *max_block_size) {
+ VP9_COMMON *const cm = &cpi->common;
+ MODE_INFO **mi = xd->mi;
+ const int left_in_image = !!xd->left_mi;
+ const int above_in_image = !!xd->above_mi;
+ const int row8x8_remaining = tile->mi_row_end - mi_row;
+ const int col8x8_remaining = tile->mi_col_end - mi_col;
+ int bh, bw;
+ BLOCK_SIZE min_size = BLOCK_4X4;
+ BLOCK_SIZE max_size = BLOCK_64X64;
+ int bs_hist[BLOCK_SIZES] = { 0 };
+
+ // Trap case where we do not have a prediction.
+ if (left_in_image || above_in_image || cm->frame_type != KEY_FRAME) {
+ // Default "min to max" and "max to min"
+ min_size = BLOCK_64X64;
+ max_size = BLOCK_4X4;
+
+ // NOTE: each call to get_sb_partition_size_range() uses the previous
+ // passed in values for min and max as a starting point.
+ // Find the min and max partition used in previous frame at this location
+ if (cm->frame_type != KEY_FRAME) {
+ MODE_INFO **prev_mi =
+ &cm->prev_mi_grid_visible[mi_row * xd->mi_stride + mi_col];
+ get_sb_partition_size_range(xd, prev_mi, &min_size, &max_size, bs_hist);
+ }
+ // Find the min and max partition sizes used in the left SB64
+ if (left_in_image) {
+ MODE_INFO **left_sb64_mi = &mi[-MI_BLOCK_SIZE];
+ get_sb_partition_size_range(xd, left_sb64_mi, &min_size, &max_size,
+ bs_hist);
+ }
+ // Find the min and max partition sizes used in the above SB64.
+ if (above_in_image) {
+ MODE_INFO **above_sb64_mi = &mi[-xd->mi_stride * MI_BLOCK_SIZE];
+ get_sb_partition_size_range(xd, above_sb64_mi, &min_size, &max_size,
+ bs_hist);
+ }
+
+ // Adjust observed min and max for "relaxed" auto partition case.
+ if (cpi->sf.auto_min_max_partition_size == RELAXED_NEIGHBORING_MIN_MAX) {
+ min_size = min_partition_size[min_size];
+ max_size = max_partition_size[max_size];
+ }
+ }
+
+ // Check border cases where max and min from neighbors may not be legal.
+ max_size = find_partition_size(max_size, row8x8_remaining, col8x8_remaining,
+ &bh, &bw);
+ // Test for blocks at the edge of the active image.
+ // This may be the actual edge of the image or where there are formatting
+ // bars.
+ if (vp9_active_edge_sb(cpi, mi_row, mi_col)) {
+ min_size = BLOCK_4X4;
+ } else {
+ min_size =
+ VPXMIN(cpi->sf.rd_auto_partition_min_limit, VPXMIN(min_size, max_size));
+ }
+
+ // When use_square_partition_only is true, make sure at least one square
+ // partition is allowed by selecting the next smaller square size as
+ // *min_block_size.
+ if (cpi->sf.use_square_partition_only &&
+ next_square_size[max_size] < min_size) {
+ min_size = next_square_size[max_size];
+ }
+
+ *min_block_size = min_size;
+ *max_block_size = max_size;
+}
+
+// TODO(jingning) refactor functions setting partition search range
+static void set_partition_range(VP9_COMMON *cm, MACROBLOCKD *xd, int mi_row,
+ int mi_col, BLOCK_SIZE bsize,
+ BLOCK_SIZE *min_bs, BLOCK_SIZE *max_bs) {
+ int mi_width = num_8x8_blocks_wide_lookup[bsize];
+ int mi_height = num_8x8_blocks_high_lookup[bsize];
+ int idx, idy;
+
+ MODE_INFO *mi;
+ const int idx_str = cm->mi_stride * mi_row + mi_col;
+ MODE_INFO **prev_mi = &cm->prev_mi_grid_visible[idx_str];
+ BLOCK_SIZE bs, min_size, max_size;
+
+ min_size = BLOCK_64X64;
+ max_size = BLOCK_4X4;
+
+ if (prev_mi) {
+ for (idy = 0; idy < mi_height; ++idy) {
+ for (idx = 0; idx < mi_width; ++idx) {
+ mi = prev_mi[idy * cm->mi_stride + idx];
+ bs = mi ? mi->sb_type : bsize;
+ min_size = VPXMIN(min_size, bs);
+ max_size = VPXMAX(max_size, bs);
+ }
+ }
+ }
+
+ if (xd->left_mi) {
+ for (idy = 0; idy < mi_height; ++idy) {
+ mi = xd->mi[idy * cm->mi_stride - 1];
+ bs = mi ? mi->sb_type : bsize;
+ min_size = VPXMIN(min_size, bs);
+ max_size = VPXMAX(max_size, bs);
+ }
+ }
+
+ if (xd->above_mi) {
+ for (idx = 0; idx < mi_width; ++idx) {
+ mi = xd->mi[idx - cm->mi_stride];
+ bs = mi ? mi->sb_type : bsize;
+ min_size = VPXMIN(min_size, bs);
+ max_size = VPXMAX(max_size, bs);
+ }
+ }
+
+ if (min_size == max_size) {
+ min_size = min_partition_size[min_size];
+ max_size = max_partition_size[max_size];
+ }
+
+ *min_bs = min_size;
+ *max_bs = max_size;
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+static INLINE void store_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
+ memcpy(ctx->pred_mv, x->pred_mv, sizeof(x->pred_mv));
+}
+
+static INLINE void load_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
+ memcpy(x->pred_mv, ctx->pred_mv, sizeof(x->pred_mv));
+}
+
+// Calculate prediction based on the given input features and neural net config.
+// Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each hidden
+// layer.
+static void nn_predict(const float *features, const NN_CONFIG *nn_config,
+ float *output) {
+ int num_input_nodes = nn_config->num_inputs;
+ int buf_index = 0;
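+  // Two scratch buffers, used alternately as the output of one hidden layer
+  // and the input of the next.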
+ float buf[2][NN_MAX_NODES_PER_LAYER];
+ const float *input_nodes = features;
+
+ // Propagate hidden layers.
+ const int num_layers = nn_config->num_hidden_layers;
+ int layer, node, i;
+ assert(num_layers <= NN_MAX_HIDDEN_LAYERS);
+ for (layer = 0; layer < num_layers; ++layer) {
+ const float *weights = nn_config->weights[layer];
+ const float *bias = nn_config->bias[layer];
+ float *output_nodes = buf[buf_index];
+ const int num_output_nodes = nn_config->num_hidden_nodes[layer];
+ assert(num_output_nodes < NN_MAX_NODES_PER_LAYER);
+ for (node = 0; node < num_output_nodes; ++node) {
+ float val = 0.0f;
+ for (i = 0; i < num_input_nodes; ++i) val += weights[i] * input_nodes[i];
+ val += bias[node];
+ // ReLU as activation function.
+ val = VPXMAX(val, 0.0f);
+ output_nodes[node] = val;
+ weights += num_input_nodes;
+ }
+ num_input_nodes = num_output_nodes;
+ input_nodes = output_nodes;
+ buf_index = 1 - buf_index;
+ }
+
+ // Final output layer.
+ {
+ const float *weights = nn_config->weights[num_layers];
+ for (node = 0; node < nn_config->num_outputs; ++node) {
+ const float *bias = nn_config->bias[num_layers];
+ float val = 0.0f;
+ for (i = 0; i < num_input_nodes; ++i) val += weights[i] * input_nodes[i];
+ output[node] = val + bias[node];
+ weights += num_input_nodes;
+ }
+ }
+}
+
+#if !CONFIG_REALTIME_ONLY
+#define FEATURES 7
+// Machine-learning based partition search early termination.
+// Return 1 to skip split and rect partitions.
+static int ml_pruning_partition(VP9_COMMON *const cm, MACROBLOCKD *const xd,
+ PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col,
+ BLOCK_SIZE bsize) {
+ const int mag_mv =
+ abs(ctx->mic.mv[0].as_mv.col) + abs(ctx->mic.mv[0].as_mv.row);
+ const int left_in_image = !!xd->left_mi;
+ const int above_in_image = !!xd->above_mi;
+ MODE_INFO **prev_mi =
+ &cm->prev_mi_grid_visible[mi_col + cm->mi_stride * mi_row];
+ int above_par = 0; // above_partitioning
+ int left_par = 0; // left_partitioning
+ int last_par = 0; // last_partitioning
+ int offset = 0;
+ int i;
+ BLOCK_SIZE context_size;
+ const NN_CONFIG *nn_config = NULL;
+ const float *mean, *sd, *linear_weights;
+ float nn_score, linear_score;
+ float features[FEATURES];
+
+ assert(b_width_log2_lookup[bsize] == b_height_log2_lookup[bsize]);
+ vpx_clear_system_state();
+
+ switch (bsize) {
+ case BLOCK_64X64:
+ offset = 0;
+ nn_config = &vp9_partition_nnconfig_64x64;
+ break;
+ case BLOCK_32X32:
+ offset = 8;
+ nn_config = &vp9_partition_nnconfig_32x32;
+ break;
+ case BLOCK_16X16:
+ offset = 16;
+ nn_config = &vp9_partition_nnconfig_16x16;
+ break;
+ default: assert(0 && "Unexpected block size."); return 0;
+ }
+
+ if (above_in_image) {
+ context_size = xd->above_mi->sb_type;
+ if (context_size < bsize)
+ above_par = 2;
+ else if (context_size == bsize)
+ above_par = 1;
+ }
+
+ if (left_in_image) {
+ context_size = xd->left_mi->sb_type;
+ if (context_size < bsize)
+ left_par = 2;
+ else if (context_size == bsize)
+ left_par = 1;
+ }
+
+ if (prev_mi) {
+ context_size = prev_mi[0]->sb_type;
+ if (context_size < bsize)
+ last_par = 2;
+ else if (context_size == bsize)
+ last_par = 1;
+ }
+
+ mean = &vp9_partition_feature_mean[offset];
+ sd = &vp9_partition_feature_std[offset];
+ features[0] = ((float)ctx->rate - mean[0]) / sd[0];
+ features[1] = ((float)ctx->dist - mean[1]) / sd[1];
+ features[2] = ((float)mag_mv / 2 - mean[2]) * sd[2];
+ features[3] = ((float)(left_par + above_par) / 2 - mean[3]) * sd[3];
+ features[4] = ((float)ctx->sum_y_eobs - mean[4]) / sd[4];
+ features[5] = ((float)cm->base_qindex - mean[5]) * sd[5];
+ features[6] = ((float)last_par - mean[6]) * sd[6];
+
+ // Predict using linear model.
+ linear_weights = &vp9_partition_linear_weights[offset];
+ linear_score = linear_weights[FEATURES];
+ for (i = 0; i < FEATURES; ++i)
+ linear_score += linear_weights[i] * features[i];
+ if (linear_score > 0.1f) return 0;
+
+ // Predict using neural net model.
+ nn_predict(features, nn_config, &nn_score);
+
+ if (linear_score < -0.0f && nn_score < 0.1f) return 1;
+ if (nn_score < -0.0f && linear_score < 0.1f) return 1;
+ return 0;
+}
+#undef FEATURES
+
+#define FEATURES 4
+// ML-based partition search breakout.
+static int ml_predict_breakout(VP9_COMP *const cpi, BLOCK_SIZE bsize,
+ const MACROBLOCK *const x,
+ const RD_COST *const rd_cost) {
+ DECLARE_ALIGNED(16, static const uint8_t, vp9_64_zeros[64]) = { 0 };
+ const VP9_COMMON *const cm = &cpi->common;
+ float features[FEATURES];
+ const float *linear_weights = NULL; // Linear model weights.
+ float linear_score = 0.0f;
+ const int qindex = cm->base_qindex;
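+  // Bucket the base qindex into one of three quantizer contexts used to pick
+  // the model weights.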
+ const int q_ctx = qindex >= 200 ? 0 : (qindex >= 150 ? 1 : 2);
+ const int is_720p_or_larger = VPXMIN(cm->width, cm->height) >= 720;
+ const int resolution_ctx = is_720p_or_larger ? 1 : 0;
+
+ switch (bsize) {
+ case BLOCK_64X64:
+ linear_weights = vp9_partition_breakout_weights_64[resolution_ctx][q_ctx];
+ break;
+ case BLOCK_32X32:
+ linear_weights = vp9_partition_breakout_weights_32[resolution_ctx][q_ctx];
+ break;
+ case BLOCK_16X16:
+ linear_weights = vp9_partition_breakout_weights_16[resolution_ctx][q_ctx];
+ break;
+ case BLOCK_8X8:
+ linear_weights = vp9_partition_breakout_weights_8[resolution_ctx][q_ctx];
+ break;
+ default: assert(0 && "Unexpected block size."); return 0;
+ }
+ if (!linear_weights) return 0;
+
+ { // Generate feature values.
+#if CONFIG_VP9_HIGHBITDEPTH
+ const int ac_q =
+ vp9_ac_quant(cm->base_qindex, 0, cm->bit_depth) >> (x->e_mbd.bd - 8);
+#else
+ const int ac_q = vp9_ac_quant(qindex, 0, cm->bit_depth);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ const int num_pels_log2 = num_pels_log2_lookup[bsize];
+ int feature_index = 0;
+ unsigned int var, sse;
+ float rate_f, dist_f;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ var =
+ vp9_high_get_sby_variance(cpi, &x->plane[0].src, bsize, x->e_mbd.bd);
+ } else {
+ var = cpi->fn_ptr[bsize].vf(x->plane[0].src.buf, x->plane[0].src.stride,
+ vp9_64_zeros, 0, &sse);
+ }
+#else
+ var = cpi->fn_ptr[bsize].vf(x->plane[0].src.buf, x->plane[0].src.stride,
+ vp9_64_zeros, 0, &sse);
+#endif
+ var = var >> num_pels_log2;
+
+ vpx_clear_system_state();
+
+ rate_f = (float)VPXMIN(rd_cost->rate, INT_MAX);
+ dist_f = (float)(VPXMIN(rd_cost->dist, INT_MAX) >> num_pels_log2);
+ rate_f =
+ ((float)x->rdmult / 128.0f / 512.0f / (float)(1 << num_pels_log2)) *
+ rate_f;
+
+ features[feature_index++] = rate_f;
+ features[feature_index++] = dist_f;
+ features[feature_index++] = (float)var;
+ features[feature_index++] = (float)ac_q;
+ assert(feature_index == FEATURES);
+ }
+
+ { // Calculate the output score.
+ int i;
+ linear_score = linear_weights[FEATURES];
+ for (i = 0; i < FEATURES; ++i)
+ linear_score += linear_weights[i] * features[i];
+ }
+
+ return linear_score >= cpi->sf.rd_ml_partition.search_breakout_thresh[q_ctx];
+}
+#undef FEATURES
+
+#define FEATURES 8
+#define LABELS 4
+static void ml_prune_rect_partition(VP9_COMP *const cpi, MACROBLOCK *const x,
+ BLOCK_SIZE bsize,
+ const PC_TREE *const pc_tree,
+ int *allow_horz, int *allow_vert,
+ int64_t ref_rd) {
+ const NN_CONFIG *nn_config = NULL;
+ float score[LABELS] = {
+ 0.0f,
+ };
+ int thresh = -1;
+ int i;
+ (void)x;
+
+ if (ref_rd <= 0 || ref_rd > 1000000000) return;
+
+ switch (bsize) {
+ case BLOCK_8X8: break;
+ case BLOCK_16X16:
+ nn_config = &vp9_rect_part_nnconfig_16;
+ thresh = cpi->sf.rd_ml_partition.prune_rect_thresh[1];
+ break;
+ case BLOCK_32X32:
+ nn_config = &vp9_rect_part_nnconfig_32;
+ thresh = cpi->sf.rd_ml_partition.prune_rect_thresh[2];
+ break;
+ case BLOCK_64X64:
+ nn_config = &vp9_rect_part_nnconfig_64;
+ thresh = cpi->sf.rd_ml_partition.prune_rect_thresh[3];
+ break;
+ default: assert(0 && "Unexpected block size."); return;
+ }
+ if (!nn_config || thresh < 0) return;
+
+ // Feature extraction and model score calculation.
+ {
+ const VP9_COMMON *const cm = &cpi->common;
+#if CONFIG_VP9_HIGHBITDEPTH
+ const int dc_q =
+ vp9_dc_quant(cm->base_qindex, 0, cm->bit_depth) >> (x->e_mbd.bd - 8);
+#else
+ const int dc_q = vp9_dc_quant(cm->base_qindex, 0, cm->bit_depth);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ const int bs = 4 * num_4x4_blocks_wide_lookup[bsize];
+ int feature_index = 0;
+ float features[FEATURES];
+
+ features[feature_index++] = logf((float)dc_q + 1.0f);
+ features[feature_index++] =
+ (float)(pc_tree->partitioning == PARTITION_NONE);
+ features[feature_index++] = logf((float)ref_rd / bs / bs + 1.0f);
+
+ {
+ const float norm_factor = 1.0f / ((float)ref_rd + 1.0f);
+ const int64_t none_rdcost = pc_tree->none.rdcost;
+ float rd_ratio = 2.0f;
+ if (none_rdcost > 0 && none_rdcost < 1000000000)
+ rd_ratio = (float)none_rdcost * norm_factor;
+ features[feature_index++] = VPXMIN(rd_ratio, 2.0f);
+
+ for (i = 0; i < 4; ++i) {
+ const int64_t this_rd = pc_tree->split[i]->none.rdcost;
+ const int rd_valid = this_rd > 0 && this_rd < 1000000000;
+ // Ratio between sub-block RD and whole block RD.
+ features[feature_index++] =
+ rd_valid ? (float)this_rd * norm_factor : 1.0f;
+ }
+ }
+
+ assert(feature_index == FEATURES);
+ nn_predict(features, nn_config, score);
+ }
+
+ // Make decisions based on the model score.
+ {
+ int max_score = -1000;
+ int horz = 0, vert = 0;
+ int int_score[LABELS];
+ for (i = 0; i < LABELS; ++i) {
+ int_score[i] = (int)(100 * score[i]);
+ max_score = VPXMAX(int_score[i], max_score);
+ }
+ thresh = max_score - thresh;
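+    // Each label index acts as a bitmask: bit 0 allows the horizontal
+    // partition, bit 1 allows the vertical partition.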
+ for (i = 0; i < LABELS; ++i) {
+ if (int_score[i] >= thresh) {
+ if ((i >> 0) & 1) horz = 1;
+ if ((i >> 1) & 1) vert = 1;
+ }
+ }
+ *allow_horz = *allow_horz && horz;
+ *allow_vert = *allow_vert && vert;
+ }
+}
+#undef FEATURES
+#undef LABELS
+
+// Perform a fast and coarse motion search for the given block. This is a
+// pre-processing step for the ML-based partition search speedup.
+static void simple_motion_search(const VP9_COMP *const cpi, MACROBLOCK *const x,
+ BLOCK_SIZE bsize, int mi_row, int mi_col,
+ MV ref_mv, MV_REFERENCE_FRAME ref,
+ uint8_t *const pred_buf) {
+ const VP9_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MODE_INFO *const mi = xd->mi[0];
+ YV12_BUFFER_CONFIG *yv12;
+ YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi, ref);
+ const int step_param = 1;
+ const MvLimits tmp_mv_limits = x->mv_limits;
+ const SEARCH_METHODS search_method = NSTEP;
+ const int sadpb = x->sadperbit16;
+ MV ref_mv_full = { ref_mv.row >> 3, ref_mv.col >> 3 };
+ MV best_mv = { 0, 0 };
+ int cost_list[5];
+
+ if (scaled_ref_frame)
+ yv12 = scaled_ref_frame;
+ else
+ yv12 = get_ref_frame_buffer(cpi, ref);
+
+ assert(yv12 != NULL);
+ if (!yv12) return;
+ vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col,
+ &cm->frame_refs[ref - 1].sf);
+ mi->ref_frame[0] = ref;
+ mi->ref_frame[1] = NONE;
+ mi->sb_type = bsize;
+ vp9_set_mv_search_range(&x->mv_limits, &ref_mv);
+ vp9_full_pixel_search(cpi, x, bsize, &ref_mv_full, step_param, search_method,
+ sadpb, cond_cost_list(cpi, cost_list), &ref_mv,
+ &best_mv, 0, 0);
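+  // Convert the full-pel search result to the 1/8-pel units used by the MV.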
+ best_mv.row *= 8;
+ best_mv.col *= 8;
+ x->mv_limits = tmp_mv_limits;
+ mi->mv[0].as_mv = best_mv;
+
+ set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
+ xd->plane[0].dst.buf = pred_buf;
+ xd->plane[0].dst.stride = 64;
+ vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
+}
+
+// Use a neural-net model to prune the partition-none and partition-split
+// search. Features used: QP, spatial block-size contexts, and the variance of
+// the prediction residue after simple_motion_search.
+#define FEATURES 12
+static void ml_predict_var_rd_paritioning(const VP9_COMP *const cpi,
+ MACROBLOCK *const x,
+ PC_TREE *const pc_tree,
+ BLOCK_SIZE bsize, int mi_row,
+ int mi_col, int *none, int *split) {
+ const VP9_COMMON *const cm = &cpi->common;
+ const NN_CONFIG *nn_config = NULL;
+ const MACROBLOCKD *const xd = &x->e_mbd;
+#if CONFIG_VP9_HIGHBITDEPTH
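+  // In the high-bit-depth path the prediction samples are 16 bits wide, so the
+  // buffer is allocated at twice the 8-bit size.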
+ DECLARE_ALIGNED(16, uint8_t, pred_buffer[64 * 64 * 2]);
+ uint8_t *const pred_buf = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ ? (CONVERT_TO_BYTEPTR(pred_buffer))
+ : pred_buffer;
+#else
+ DECLARE_ALIGNED(16, uint8_t, pred_buffer[64 * 64]);
+ uint8_t *const pred_buf = pred_buffer;
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ const int speed = cpi->oxcf.speed;
+ float thresh = 0.0f;
+
+ switch (bsize) {
+ case BLOCK_64X64:
+ nn_config = &vp9_part_split_nnconfig_64;
+ thresh = speed > 0 ? 2.8f : 3.0f;
+ break;
+ case BLOCK_32X32:
+ nn_config = &vp9_part_split_nnconfig_32;
+ thresh = speed > 0 ? 3.5f : 3.0f;
+ break;
+ case BLOCK_16X16:
+ nn_config = &vp9_part_split_nnconfig_16;
+ thresh = speed > 0 ? 3.8f : 4.0f;
+ break;
+ case BLOCK_8X8:
+ nn_config = &vp9_part_split_nnconfig_8;
+ if (cm->width >= 720 && cm->height >= 720)
+ thresh = speed > 0 ? 2.5f : 2.0f;
+ else
+ thresh = speed > 0 ? 3.8f : 2.0f;
+ break;
+ default: assert(0 && "Unexpected block size."); return;
+ }
+
+ if (!nn_config) return;
+
+  // Do a simple single motion search to find a prediction for the current
+  // block. The variances of the residue are used as input features.
+ {
+ MV ref_mv;
+ const MV_REFERENCE_FRAME ref =
+ cpi->rc.is_src_frame_alt_ref ? ALTREF_FRAME : LAST_FRAME;
+    // If bsize is 64x64, use the zero MV as the reference; otherwise, use the
+    // MV result of the previous (larger) block as the reference.
+ if (bsize == BLOCK_64X64)
+ ref_mv.row = ref_mv.col = 0;
+ else
+ ref_mv = pc_tree->mv;
+ vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col);
+ simple_motion_search(cpi, x, bsize, mi_row, mi_col, ref_mv, ref, pred_buf);
+ pc_tree->mv = x->e_mbd.mi[0]->mv[0].as_mv;
+ }
+
+ vpx_clear_system_state();
+
+ {
+ float features[FEATURES] = { 0.0f };
+#if CONFIG_VP9_HIGHBITDEPTH
+ const int dc_q =
+ vp9_dc_quant(cm->base_qindex, 0, cm->bit_depth) >> (xd->bd - 8);
+#else
+ const int dc_q = vp9_dc_quant(cm->base_qindex, 0, cm->bit_depth);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ int feature_idx = 0;
+ float score;
+
+ // Generate model input features.
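+    // The FEATURES (12) inputs are: log of the DC quant step, above/left
+    // availability and their block-size contexts (6 values), log variance of
+    // the whole-block residue, and the 4 quarter-block variance ratios.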
+ features[feature_idx++] = logf((float)dc_q + 1.0f);
+
+ // Get the variance of the residue as input features.
+ {
+ const int bs = 4 * num_4x4_blocks_wide_lookup[bsize];
+ const BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_SPLIT);
+ const uint8_t *pred = pred_buf;
+ const uint8_t *src = x->plane[0].src.buf;
+ const int src_stride = x->plane[0].src.stride;
+ const int pred_stride = 64;
+ unsigned int sse;
+ // Variance of whole block.
+ const unsigned int var =
+ cpi->fn_ptr[bsize].vf(src, src_stride, pred, pred_stride, &sse);
+ const float factor = (var == 0) ? 1.0f : (1.0f / (float)var);
+ const int has_above = !!xd->above_mi;
+ const int has_left = !!xd->left_mi;
+ const BLOCK_SIZE above_bsize = has_above ? xd->above_mi->sb_type : bsize;
+ const BLOCK_SIZE left_bsize = has_left ? xd->left_mi->sb_type : bsize;
+ int i;
+
+ features[feature_idx++] = (float)has_above;
+ features[feature_idx++] = (float)b_width_log2_lookup[above_bsize];
+ features[feature_idx++] = (float)b_height_log2_lookup[above_bsize];
+ features[feature_idx++] = (float)has_left;
+ features[feature_idx++] = (float)b_width_log2_lookup[left_bsize];
+ features[feature_idx++] = (float)b_height_log2_lookup[left_bsize];
+ features[feature_idx++] = logf((float)var + 1.0f);
+ for (i = 0; i < 4; ++i) {
+ const int x_idx = (i & 1) * bs / 2;
+ const int y_idx = (i >> 1) * bs / 2;
+ const int src_offset = y_idx * src_stride + x_idx;
+ const int pred_offset = y_idx * pred_stride + x_idx;
+ // Variance of quarter block.
+ const unsigned int sub_var =
+ cpi->fn_ptr[subsize].vf(src + src_offset, src_stride,
+ pred + pred_offset, pred_stride, &sse);
+ const float var_ratio = (var == 0) ? 1.0f : factor * (float)sub_var;
+ features[feature_idx++] = var_ratio;
+ }
+ }
+ assert(feature_idx == FEATURES);
+
+ // Feed the features into the model to get the confidence score.
+ nn_predict(features, nn_config, &score);
+
+    // A higher score means the model is more confident that the split
+    // partition is better than the non-split partition. So if the score is
+    // high enough, skip the non-split (PARTITION_NONE) search; if the score is
+    // low enough, skip the split partition search.
+ if (score > thresh) *none = 0;
+ if (score < -thresh) *split = 0;
+ }
+}
+#undef FEATURES
+#endif // !CONFIG_REALTIME_ONLY
+
+static double log_wiener_var(int64_t wiener_variance) {
+ return log(1.0 + wiener_variance) / log(2.0);
+}
+
+static void build_kmeans_segmentation(VP9_COMP *cpi) {
+ VP9_COMMON *cm = &cpi->common;
+ BLOCK_SIZE bsize = BLOCK_64X64;
+ KMEANS_DATA *kmeans_data;
+
+ vp9_disable_segmentation(&cm->seg);
+ if (cm->show_frame) {
+ int mi_row, mi_col;
+ cpi->kmeans_data_size = 0;
+ cpi->kmeans_ctr_num = 8;
+
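+    // Collect one k-means sample per 64x64 block: the average Wiener variance
+    // of its macroblocks, in the log2 domain.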
+ for (mi_row = 0; mi_row < cm->mi_rows; mi_row += MI_BLOCK_SIZE) {
+ for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {
+ int mb_row_start = mi_row >> 1;
+ int mb_col_start = mi_col >> 1;
+ int mb_row_end = VPXMIN(
+ (mi_row + num_8x8_blocks_high_lookup[bsize]) >> 1, cm->mb_rows);
+ int mb_col_end = VPXMIN(
+ (mi_col + num_8x8_blocks_wide_lookup[bsize]) >> 1, cm->mb_cols);
+ int row, col;
+ int64_t wiener_variance = 0;
+
+ for (row = mb_row_start; row < mb_row_end; ++row)
+ for (col = mb_col_start; col < mb_col_end; ++col)
+ wiener_variance += cpi->mb_wiener_variance[row * cm->mb_cols + col];
+
+ wiener_variance /=
+ (mb_row_end - mb_row_start) * (mb_col_end - mb_col_start);
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(&cpi->kmeans_mutex);
+#endif // CONFIG_MULTITHREAD
+
+ kmeans_data = &cpi->kmeans_data_arr[cpi->kmeans_data_size++];
+ kmeans_data->value = log_wiener_var(wiener_variance);
+ kmeans_data->pos = mi_row * cpi->kmeans_data_stride + mi_col;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(&cpi->kmeans_mutex);
+#endif // CONFIG_MULTITHREAD
+ }
+ }
+
+ vp9_kmeans(cpi->kmeans_ctr_ls, cpi->kmeans_boundary_ls,
+ cpi->kmeans_count_ls, cpi->kmeans_ctr_num, cpi->kmeans_data_arr,
+ cpi->kmeans_data_size);
+
+ vp9_perceptual_aq_mode_setup(cpi, &cm->seg);
+ }
+}
+
+#if !CONFIG_REALTIME_ONLY
+static int wiener_var_segment(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row,
+ int mi_col) {
+ VP9_COMMON *cm = &cpi->common;
+ int mb_row_start = mi_row >> 1;
+ int mb_col_start = mi_col >> 1;
+ int mb_row_end =
+ VPXMIN((mi_row + num_8x8_blocks_high_lookup[bsize]) >> 1, cm->mb_rows);
+ int mb_col_end =
+ VPXMIN((mi_col + num_8x8_blocks_wide_lookup[bsize]) >> 1, cm->mb_cols);
+ int row, col, idx;
+ int64_t wiener_variance = 0;
+ int segment_id;
+ int8_t seg_hist[MAX_SEGMENTS] = { 0 };
+ int8_t max_count = 0, max_index = -1;
+
+ vpx_clear_system_state();
+
+ assert(cpi->norm_wiener_variance > 0);
+
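+  // Map the block to the k-means segment that occurs most often among the
+  // macroblocks it covers.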
+ for (row = mb_row_start; row < mb_row_end; ++row) {
+ for (col = mb_col_start; col < mb_col_end; ++col) {
+ wiener_variance = cpi->mb_wiener_variance[row * cm->mb_cols + col];
+ segment_id =
+ vp9_get_group_idx(log_wiener_var(wiener_variance),
+ cpi->kmeans_boundary_ls, cpi->kmeans_ctr_num);
+ ++seg_hist[segment_id];
+ }
+ }
+
+ for (idx = 0; idx < cpi->kmeans_ctr_num; ++idx) {
+ if (seg_hist[idx] > max_count) {
+ max_count = seg_hist[idx];
+ max_index = idx;
+ }
+ }
+
+ assert(max_index >= 0);
+ segment_id = max_index;
+
+ return segment_id;
+}
+
+static int get_rdmult_delta(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row,
+ int mi_col, int orig_rdmult) {
+ const int gf_group_index = cpi->twopass.gf_group.index;
+ int64_t intra_cost = 0;
+ int64_t mc_dep_cost = 0;
+ int mi_wide = num_8x8_blocks_wide_lookup[bsize];
+ int mi_high = num_8x8_blocks_high_lookup[bsize];
+ int row, col;
+
+ int dr = 0;
+ double r0, rk, beta;
+
+ TplDepFrame *tpl_frame;
+ TplDepStats *tpl_stats;
+ int tpl_stride;
+
+ if (gf_group_index >= MAX_ARF_GOP_SIZE) return orig_rdmult;
+ tpl_frame = &cpi->tpl_stats[gf_group_index];
+
+ if (tpl_frame->is_valid == 0) return orig_rdmult;
+ tpl_stats = tpl_frame->tpl_stats_ptr;
+ tpl_stride = tpl_frame->stride;
+
+ if (cpi->twopass.gf_group.layer_depth[gf_group_index] > 1) return orig_rdmult;
+
+ for (row = mi_row; row < mi_row + mi_high; ++row) {
+ for (col = mi_col; col < mi_col + mi_wide; ++col) {
+ TplDepStats *this_stats = &tpl_stats[row * tpl_stride + col];
+
+ if (row >= cpi->common.mi_rows || col >= cpi->common.mi_cols) continue;
+
+ intra_cost += this_stats->intra_cost;
+ mc_dep_cost += this_stats->mc_dep_cost;
+ }
+ }
+
+ vpx_clear_system_state();
+
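+  // beta compares this block's intra_cost / mc_dep_cost ratio (rk) against the
+  // frame-level ratio r0; vp9_get_adaptive_rdmult() maps beta to a new rdmult,
+  // which is then clamped to [orig_rdmult / 2, orig_rdmult * 3 / 2].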
+ r0 = cpi->rd.r0;
+ rk = (double)intra_cost / mc_dep_cost;
+ beta = r0 / rk;
+ dr = vp9_get_adaptive_rdmult(cpi, beta);
+
+ dr = VPXMIN(dr, orig_rdmult * 3 / 2);
+ dr = VPXMAX(dr, orig_rdmult * 1 / 2);
+
+ dr = VPXMAX(1, dr);
+
+ return dr;
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+#if CONFIG_RATE_CTRL
+static void assign_partition_info(
+ const int row_start_4x4, const int col_start_4x4, const int block_width_4x4,
+ const int block_height_4x4, const int num_unit_rows,
+ const int num_unit_cols, PARTITION_INFO *partition_info) {
+ int i, j;
+ for (i = 0; i < block_height_4x4; ++i) {
+ for (j = 0; j < block_width_4x4; ++j) {
+ const int row_4x4 = row_start_4x4 + i;
+ const int col_4x4 = col_start_4x4 + j;
+ const int unit_index = row_4x4 * num_unit_cols + col_4x4;
+ if (row_4x4 >= num_unit_rows || col_4x4 >= num_unit_cols) continue;
+ partition_info[unit_index].row = row_4x4 << 2;
+ partition_info[unit_index].column = col_4x4 << 2;
+ partition_info[unit_index].row_start = row_start_4x4 << 2;
+ partition_info[unit_index].column_start = col_start_4x4 << 2;
+ partition_info[unit_index].width = block_width_4x4 << 2;
+ partition_info[unit_index].height = block_height_4x4 << 2;
+ }
+ }
+}
+
+static void assign_motion_vector_info(const int block_width_4x4,
+ const int block_height_4x4,
+ const int row_start_4x4,
+ const int col_start_4x4,
+ const int num_unit_rows,
+ const int num_unit_cols, MV *source_mv[2],
+ MV_REFERENCE_FRAME source_ref_frame[2],
+ MOTION_VECTOR_INFO *motion_vector_info) {
+ int i, j;
+ for (i = 0; i < block_height_4x4; ++i) {
+ for (j = 0; j < block_width_4x4; ++j) {
+ const int row_4x4 = row_start_4x4 + i;
+ const int col_4x4 = col_start_4x4 + j;
+ const int unit_index = row_4x4 * num_unit_cols + col_4x4;
+ if (row_4x4 >= num_unit_rows || col_4x4 >= num_unit_cols) continue;
+ if (source_ref_frame[1] == NONE) {
+ assert(source_mv[1]->row == 0 && source_mv[1]->col == 0);
+ }
+ motion_vector_info[unit_index].ref_frame[0] = source_ref_frame[0];
+ motion_vector_info[unit_index].ref_frame[1] = source_ref_frame[1];
+ motion_vector_info[unit_index].mv[0].as_mv.row = source_mv[0]->row;
+ motion_vector_info[unit_index].mv[0].as_mv.col = source_mv[0]->col;
+ motion_vector_info[unit_index].mv[1].as_mv.row = source_mv[1]->row;
+ motion_vector_info[unit_index].mv[1].as_mv.col = source_mv[1]->col;
+ }
+ }
+}
+
+static void store_superblock_info(
+ const PC_TREE *const pc_tree, MODE_INFO **mi_grid_visible,
+ const int mi_stride, const int square_size_4x4, const int num_unit_rows,
+ const int num_unit_cols, const int row_start_4x4, const int col_start_4x4,
+ PARTITION_INFO *partition_info, MOTION_VECTOR_INFO *motion_vector_info) {
+ const int subblock_square_size_4x4 = square_size_4x4 >> 1;
+ if (row_start_4x4 >= num_unit_rows || col_start_4x4 >= num_unit_cols) return;
+ assert(pc_tree->partitioning != PARTITION_INVALID);
+ // End node, no split.
+ if (pc_tree->partitioning == PARTITION_NONE ||
+ pc_tree->partitioning == PARTITION_HORZ ||
+ pc_tree->partitioning == PARTITION_VERT || square_size_4x4 == 1) {
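+    // Positions are tracked in 4x4-block units; mode-info units are 8x8, hence
+    // the shift right by one.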
+ const int mi_row = row_start_4x4 >> 1;
+ const int mi_col = col_start_4x4 >> 1;
+ const int mi_idx = mi_stride * mi_row + mi_col;
+ MODE_INFO **mi = mi_grid_visible + mi_idx;
+ MV *source_mv[2];
+ MV_REFERENCE_FRAME source_ref_frame[2];
+
+ // partition info
+ const int block_width_4x4 = (pc_tree->partitioning == PARTITION_VERT)
+ ? square_size_4x4 >> 1
+ : square_size_4x4;
+ const int block_height_4x4 = (pc_tree->partitioning == PARTITION_HORZ)
+ ? square_size_4x4 >> 1
+ : square_size_4x4;
+ assign_partition_info(row_start_4x4, col_start_4x4, block_width_4x4,
+ block_height_4x4, num_unit_rows, num_unit_cols,
+ partition_info);
+ if (pc_tree->partitioning == PARTITION_VERT) {
+ assign_partition_info(row_start_4x4, col_start_4x4 + block_width_4x4,
+ block_width_4x4, block_height_4x4, num_unit_rows,
+ num_unit_cols, partition_info);
+ } else if (pc_tree->partitioning == PARTITION_HORZ) {
+ assign_partition_info(row_start_4x4 + block_height_4x4, col_start_4x4,
+ block_width_4x4, block_height_4x4, num_unit_rows,
+ num_unit_cols, partition_info);
+ }
+
+ // motion vector info
+ if (pc_tree->partitioning == PARTITION_HORZ) {
+ int is_valid_second_rectangle = 0;
+ assert(square_size_4x4 > 1);
+ // First rectangle.
+ source_ref_frame[0] = mi[0]->ref_frame[0];
+ source_ref_frame[1] = mi[0]->ref_frame[1];
+ source_mv[0] = &mi[0]->mv[0].as_mv;
+ source_mv[1] = &mi[0]->mv[1].as_mv;
+ assign_motion_vector_info(block_width_4x4, block_height_4x4,
+ row_start_4x4, col_start_4x4, num_unit_rows,
+ num_unit_cols, source_mv, source_ref_frame,
+ motion_vector_info);
+ // Second rectangle.
+ if (square_size_4x4 == 2) {
+ is_valid_second_rectangle = 1;
+ source_ref_frame[0] = mi[0]->ref_frame[0];
+ source_ref_frame[1] = mi[0]->ref_frame[1];
+ source_mv[0] = &mi[0]->bmi[2].as_mv[0].as_mv;
+ source_mv[1] = &mi[0]->bmi[2].as_mv[1].as_mv;
+ } else {
+ const int mi_row_2 = mi_row + (block_height_4x4 >> 1);
+ const int mi_col_2 = mi_col;
+ if (mi_row_2 * 2 < num_unit_rows && mi_col_2 * 2 < num_unit_cols) {
+ const int mi_idx_2 = mi_stride * mi_row_2 + mi_col_2;
+ is_valid_second_rectangle = 1;
+ mi = mi_grid_visible + mi_idx_2;
+ source_ref_frame[0] = mi[0]->ref_frame[0];
+ source_ref_frame[1] = mi[0]->ref_frame[1];
+ source_mv[0] = &mi[0]->mv[0].as_mv;
+ source_mv[1] = &mi[0]->mv[1].as_mv;
+ }
+ }
+ if (is_valid_second_rectangle) {
+ assign_motion_vector_info(
+ block_width_4x4, block_height_4x4, row_start_4x4 + block_height_4x4,
+ col_start_4x4, num_unit_rows, num_unit_cols, source_mv,
+ source_ref_frame, motion_vector_info);
+ }
+ } else if (pc_tree->partitioning == PARTITION_VERT) {
+ int is_valid_second_rectangle = 0;
+ assert(square_size_4x4 > 1);
+ // First rectangle.
+ source_ref_frame[0] = mi[0]->ref_frame[0];
+ source_ref_frame[1] = mi[0]->ref_frame[1];
+ source_mv[0] = &mi[0]->mv[0].as_mv;
+ source_mv[1] = &mi[0]->mv[1].as_mv;
+ assign_motion_vector_info(block_width_4x4, block_height_4x4,
+ row_start_4x4, col_start_4x4, num_unit_rows,
+ num_unit_cols, source_mv, source_ref_frame,
+ motion_vector_info);
+ // Second rectangle.
+ if (square_size_4x4 == 2) {
+ is_valid_second_rectangle = 1;
+ source_ref_frame[0] = mi[0]->ref_frame[0];
+ source_ref_frame[1] = mi[0]->ref_frame[1];
+ source_mv[0] = &mi[0]->bmi[1].as_mv[0].as_mv;
+ source_mv[1] = &mi[0]->bmi[1].as_mv[1].as_mv;
+ } else {
+ const int mi_row_2 = mi_row;
+ const int mi_col_2 = mi_col + (block_width_4x4 >> 1);
+ if (mi_row_2 * 2 < num_unit_rows && mi_col_2 * 2 < num_unit_cols) {
+ const int mi_idx_2 = mi_stride * mi_row_2 + mi_col_2;
+ is_valid_second_rectangle = 1;
+ mi = mi_grid_visible + mi_idx_2;
+ source_ref_frame[0] = mi[0]->ref_frame[0];
+ source_ref_frame[1] = mi[0]->ref_frame[1];
+ source_mv[0] = &mi[0]->mv[0].as_mv;
+ source_mv[1] = &mi[0]->mv[1].as_mv;
+ }
+ }
+ if (is_valid_second_rectangle) {
+ assign_motion_vector_info(
+ block_width_4x4, block_height_4x4, row_start_4x4,
+ col_start_4x4 + block_width_4x4, num_unit_rows, num_unit_cols,
+ source_mv, source_ref_frame, motion_vector_info);
+ }
+ } else {
+ assert(pc_tree->partitioning == PARTITION_NONE || square_size_4x4 == 1);
+ source_ref_frame[0] = mi[0]->ref_frame[0];
+ source_ref_frame[1] = mi[0]->ref_frame[1];
+ if (square_size_4x4 == 1) {
+ const int sub8x8_row = row_start_4x4 % 2;
+ const int sub8x8_col = col_start_4x4 % 2;
+ const int sub8x8_idx = sub8x8_row * 2 + sub8x8_col;
+ source_mv[0] = &mi[0]->bmi[sub8x8_idx].as_mv[0].as_mv;
+ source_mv[1] = &mi[0]->bmi[sub8x8_idx].as_mv[1].as_mv;
+ } else {
+ source_mv[0] = &mi[0]->mv[0].as_mv;
+ source_mv[1] = &mi[0]->mv[1].as_mv;
+ }
+ assign_motion_vector_info(block_width_4x4, block_height_4x4,
+ row_start_4x4, col_start_4x4, num_unit_rows,
+ num_unit_cols, source_mv, source_ref_frame,
+ motion_vector_info);
+ }
+
+ return;
+ }
+  // Recursively traverse the partition tree when the partition is split.
+ assert(pc_tree->partitioning == PARTITION_SPLIT);
+ store_superblock_info(pc_tree->split[0], mi_grid_visible, mi_stride,
+ subblock_square_size_4x4, num_unit_rows, num_unit_cols,
+ row_start_4x4, col_start_4x4, partition_info,
+ motion_vector_info);
+ store_superblock_info(pc_tree->split[1], mi_grid_visible, mi_stride,
+ subblock_square_size_4x4, num_unit_rows, num_unit_cols,
+ row_start_4x4, col_start_4x4 + subblock_square_size_4x4,
+ partition_info, motion_vector_info);
+ store_superblock_info(pc_tree->split[2], mi_grid_visible, mi_stride,
+ subblock_square_size_4x4, num_unit_rows, num_unit_cols,
+ row_start_4x4 + subblock_square_size_4x4, col_start_4x4,
+ partition_info, motion_vector_info);
+ store_superblock_info(pc_tree->split[3], mi_grid_visible, mi_stride,
+ subblock_square_size_4x4, num_unit_rows, num_unit_cols,
+ row_start_4x4 + subblock_square_size_4x4,
+ col_start_4x4 + subblock_square_size_4x4,
+ partition_info, motion_vector_info);
+}
+#endif // CONFIG_RATE_CTRL
+
+#if !CONFIG_REALTIME_ONLY
+// TODO(jingning,jimbankoski,rbultje): properly skip partition types that are
+// unlikely to be selected depending on previous rate-distortion optimization
+// results, for encoding speed-up.
+static int rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
+ TileDataEnc *tile_data, TOKENEXTRA **tp,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ RD_COST *rd_cost, RD_COST best_rdc,
+ PC_TREE *pc_tree) {
+ VP9_COMMON *const cm = &cpi->common;
+ const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+ TileInfo *const tile_info = &tile_data->tile_info;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int mi_step = num_8x8_blocks_wide_lookup[bsize] / 2;
+ ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
+ PARTITION_CONTEXT sl[8], sa[8];
+ TOKENEXTRA *tp_orig = *tp;
+ PICK_MODE_CONTEXT *const ctx = &pc_tree->none;
+ int i;
+ const int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+ BLOCK_SIZE subsize;
+ RD_COST this_rdc, sum_rdc;
+ int do_split = bsize >= BLOCK_8X8;
+ int do_rect = 1;
+ INTERP_FILTER pred_interp_filter;
+
+ // Override skipping rectangular partition operations for edge blocks
+ const int force_horz_split = (mi_row + mi_step >= cm->mi_rows);
+ const int force_vert_split = (mi_col + mi_step >= cm->mi_cols);
+ const int xss = x->e_mbd.plane[1].subsampling_x;
+ const int yss = x->e_mbd.plane[1].subsampling_y;
+
+ BLOCK_SIZE min_size = x->min_partition_size;
+ BLOCK_SIZE max_size = x->max_partition_size;
+
+ int partition_none_allowed = !force_horz_split && !force_vert_split;
+ int partition_horz_allowed =
+ !force_vert_split && yss <= xss && bsize >= BLOCK_8X8;
+ int partition_vert_allowed =
+ !force_horz_split && xss <= yss && bsize >= BLOCK_8X8;
+
+ int64_t dist_breakout_thr = cpi->sf.partition_search_breakout_thr.dist;
+ int rate_breakout_thr = cpi->sf.partition_search_breakout_thr.rate;
+ int must_split = 0;
+ int should_encode_sb = 0;
+
+  // Reference frames picked in the i-th quarter subblock during the square
+  // partition RD search. May be used to prune the reference frame selection
+  // for rectangular partitions.
+ uint8_t ref_frames_used[4] = { 0, 0, 0, 0 };
+
+ int partition_mul = x->cb_rdmult;
+
+ (void)*tp_orig;
+
+ assert(num_8x8_blocks_wide_lookup[bsize] ==
+ num_8x8_blocks_high_lookup[bsize]);
+
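+  // Scale the breakout thresholds to the block size: the distortion threshold
+  // is scaled in proportion to the block area relative to 64x64 (e.g., divided
+  // by 4 for a 32x32 block), and the rate threshold scales with the block's
+  // log2 pixel count.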
+ dist_breakout_thr >>=
+ 8 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
+
+ rate_breakout_thr *= num_pels_log2_lookup[bsize];
+
+ vp9_rd_cost_init(&this_rdc);
+ vp9_rd_cost_init(&sum_rdc);
+
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+
+ if (oxcf->tuning == VP8_TUNE_SSIM) {
+ set_ssim_rdmult(cpi, x, bsize, mi_row, mi_col, &partition_mul);
+ }
+ vp9_rd_cost_update(partition_mul, x->rddiv, &best_rdc);
+
+ if (bsize == BLOCK_16X16 && cpi->oxcf.aq_mode != NO_AQ &&
+ cpi->oxcf.aq_mode != LOOKAHEAD_AQ)
+ x->mb_energy = vp9_block_energy(cpi, x, bsize);
+
+ if (cpi->sf.cb_partition_search && bsize == BLOCK_16X16) {
+ int cb_partition_search_ctrl =
+ ((pc_tree->index == 0 || pc_tree->index == 3) +
+ get_chessboard_index(cm->current_video_frame)) &
+ 0x1;
+
+ if (cb_partition_search_ctrl && bsize > min_size && bsize < max_size)
+ set_partition_range(cm, xd, mi_row, mi_col, bsize, &min_size, &max_size);
+ }
+
+  // Get the sub-block energy range.
+ if (bsize >= BLOCK_16X16) {
+ int min_energy, max_energy;
+ vp9_get_sub_block_energy(cpi, x, mi_row, mi_col, bsize, &min_energy,
+ &max_energy);
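+    // Force a split when at least one sub-block has very low energy and the
+    // energy spread across sub-blocks is large (heterogeneous content).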
+ must_split = (min_energy < -3) && (max_energy - min_energy > 2);
+ }
+
+  // Determine which partition types to search according to the speed features.
+  // The size thresholds set here must be square block sizes.
+ if (cpi->sf.auto_min_max_partition_size) {
+ partition_none_allowed &= (bsize <= max_size);
+ partition_horz_allowed &=
+ ((bsize <= max_size && bsize > min_size) || force_horz_split);
+ partition_vert_allowed &=
+ ((bsize <= max_size && bsize > min_size) || force_vert_split);
+ do_split &= bsize > min_size;
+ }
+
+ if (cpi->sf.use_square_partition_only &&
+ (bsize > cpi->sf.use_square_only_thresh_high ||
+ bsize < cpi->sf.use_square_only_thresh_low)) {
+ if (cpi->use_svc) {
+ if (!vp9_active_h_edge(cpi, mi_row, mi_step) || x->e_mbd.lossless)
+ partition_horz_allowed &= force_horz_split;
+ if (!vp9_active_v_edge(cpi, mi_row, mi_step) || x->e_mbd.lossless)
+ partition_vert_allowed &= force_vert_split;
+ } else {
+ partition_horz_allowed &= force_horz_split;
+ partition_vert_allowed &= force_vert_split;
+ }
+ }
+
+ save_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+
+ pc_tree->partitioning = PARTITION_NONE;
+
+ if (cpi->sf.rd_ml_partition.var_pruning && !frame_is_intra_only(cm)) {
+ const int do_rd_ml_partition_var_pruning =
+ partition_none_allowed && do_split &&
+ mi_row + num_8x8_blocks_high_lookup[bsize] <= cm->mi_rows &&
+ mi_col + num_8x8_blocks_wide_lookup[bsize] <= cm->mi_cols;
+ if (do_rd_ml_partition_var_pruning) {
+ ml_predict_var_rd_paritioning(cpi, x, pc_tree, bsize, mi_row, mi_col,
+ &partition_none_allowed, &do_split);
+ } else {
+ vp9_zero(pc_tree->mv);
+ }
+ if (bsize > BLOCK_8X8) { // Store MV result as reference for subblocks.
+ for (i = 0; i < 4; ++i) pc_tree->split[i]->mv = pc_tree->mv;
+ }
+ }
+
+ // PARTITION_NONE
+ if (partition_none_allowed) {
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, bsize, ctx,
+ best_rdc.rate, best_rdc.dist);
+ ctx->rdcost = this_rdc.rdcost;
+ if (this_rdc.rate != INT_MAX) {
+ if (cpi->sf.prune_ref_frame_for_rect_partitions) {
+ const int ref1 = ctx->mic.ref_frame[0];
+ const int ref2 = ctx->mic.ref_frame[1];
+ for (i = 0; i < 4; ++i) {
+ ref_frames_used[i] |= (1 << ref1);
+ if (ref2 > 0) ref_frames_used[i] |= (1 << ref2);
+ }
+ }
+ if (bsize >= BLOCK_8X8) {
+ this_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE];
+ vp9_rd_cost_update(partition_mul, x->rddiv, &this_rdc);
+ }
+
+ if (this_rdc.rdcost < best_rdc.rdcost) {
+ MODE_INFO *mi = xd->mi[0];
+
+ best_rdc = this_rdc;
+ should_encode_sb = 1;
+ if (bsize >= BLOCK_8X8) pc_tree->partitioning = PARTITION_NONE;
+
+ if (cpi->sf.rd_ml_partition.search_early_termination) {
+          // Currently, the machine-learning based partition search early
+          // termination is only used when bsize is 16x16, 32x32 or 64x64,
+          // VPXMIN(cm->width, cm->height) >= 480, and speed = 0.
+ if (!x->e_mbd.lossless &&
+ !segfeature_active(&cm->seg, mi->segment_id, SEG_LVL_SKIP) &&
+ ctx->mic.mode >= INTRA_MODES && bsize >= BLOCK_16X16) {
+ if (ml_pruning_partition(cm, xd, ctx, mi_row, mi_col, bsize)) {
+ do_split = 0;
+ do_rect = 0;
+ }
+ }
+ }
+
+ if ((do_split || do_rect) && !x->e_mbd.lossless && ctx->skippable) {
+ const int use_ml_based_breakout =
+ cpi->sf.rd_ml_partition.search_breakout && cm->base_qindex >= 100;
+ if (use_ml_based_breakout) {
+ if (ml_predict_breakout(cpi, bsize, x, &this_rdc)) {
+ do_split = 0;
+ do_rect = 0;
+ }
+ } else {
+ if (!cpi->sf.rd_ml_partition.search_early_termination) {
+ if ((best_rdc.dist < (dist_breakout_thr >> 2)) ||
+ (best_rdc.dist < dist_breakout_thr &&
+ best_rdc.rate < rate_breakout_thr)) {
+ do_split = 0;
+ do_rect = 0;
+ }
+ }
+ }
+ }
+ }
+ }
+ restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+ } else {
+ vp9_zero(ctx->pred_mv);
+ ctx->mic.interp_filter = EIGHTTAP;
+ }
+
+ // store estimated motion vector
+ store_pred_mv(x, ctx);
+
+  // If the interp_filter is marked as SWITCHABLE_FILTERS, it was set for an
+  // intra block and is only used for context purposes.
+ if (ctx->mic.interp_filter == SWITCHABLE_FILTERS) {
+ pred_interp_filter = EIGHTTAP;
+ } else {
+ pred_interp_filter = ctx->mic.interp_filter;
+ }
+
+ // PARTITION_SPLIT
+ // TODO(jingning): use the motion vectors given by the above search as
+ // the starting point of motion search in the following partition type check.
+ pc_tree->split[0]->none.rdcost = 0;
+ pc_tree->split[1]->none.rdcost = 0;
+ pc_tree->split[2]->none.rdcost = 0;
+ pc_tree->split[3]->none.rdcost = 0;
+ if (do_split || must_split) {
+ subsize = get_subsize(bsize, PARTITION_SPLIT);
+ load_pred_mv(x, ctx);
+ if (bsize == BLOCK_8X8) {
+ i = 4;
+ if (cpi->sf.adaptive_pred_interp_filter && partition_none_allowed)
+ pc_tree->leaf_split[0]->pred_interp_filter = pred_interp_filter;
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize,
+ pc_tree->leaf_split[0], best_rdc.rate, best_rdc.dist);
+ if (sum_rdc.rate == INT_MAX) {
+ sum_rdc.rdcost = INT64_MAX;
+ } else {
+ if (cpi->sf.prune_ref_frame_for_rect_partitions) {
+ const int ref1 = pc_tree->leaf_split[0]->mic.ref_frame[0];
+ const int ref2 = pc_tree->leaf_split[0]->mic.ref_frame[1];
+ for (i = 0; i < 4; ++i) {
+ ref_frames_used[i] |= (1 << ref1);
+ if (ref2 > 0) ref_frames_used[i] |= (1 << ref2);
+ }
+ }
+ }
+ } else {
+ for (i = 0; (i < 4) && ((sum_rdc.rdcost < best_rdc.rdcost) || must_split);
+ ++i) {
+ const int x_idx = (i & 1) * mi_step;
+ const int y_idx = (i >> 1) * mi_step;
+ int found_best_rd = 0;
+ RD_COST best_rdc_split;
+ vp9_rd_cost_reset(&best_rdc_split);
+
+ if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX) {
+          // A must-split test here increases the number of sub-partitions but
+          // hurts the metrics quite a bit, so this extra test is commented out
+          // pending further evaluation of whether it adds much in terms of
+          // visual quality.
+ // (must_split) ? best_rdc.rate
+ // : best_rdc.rate - sum_rdc.rate,
+ // (must_split) ? best_rdc.dist
+ // : best_rdc.dist - sum_rdc.dist,
+ best_rdc_split.rate = best_rdc.rate - sum_rdc.rate;
+ best_rdc_split.dist = best_rdc.dist - sum_rdc.dist;
+ }
+
+ if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols)
+ continue;
+
+ pc_tree->split[i]->index = i;
+ if (cpi->sf.prune_ref_frame_for_rect_partitions)
+ pc_tree->split[i]->none.rate = INT_MAX;
+ found_best_rd = rd_pick_partition(
+ cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx, subsize,
+ &this_rdc, best_rdc_split, pc_tree->split[i]);
+
+ if (found_best_rd == 0) {
+ sum_rdc.rdcost = INT64_MAX;
+ break;
+ } else {
+ if (cpi->sf.prune_ref_frame_for_rect_partitions &&
+ pc_tree->split[i]->none.rate != INT_MAX) {
+ const int ref1 = pc_tree->split[i]->none.mic.ref_frame[0];
+ const int ref2 = pc_tree->split[i]->none.mic.ref_frame[1];
+ ref_frames_used[i] |= (1 << ref1);
+ if (ref2 > 0) ref_frames_used[i] |= (1 << ref2);
+ }
+ sum_rdc.rate += this_rdc.rate;
+ sum_rdc.dist += this_rdc.dist;
+ vp9_rd_cost_update(partition_mul, x->rddiv, &sum_rdc);
+ }
+ }
+ }
+
+ if (((sum_rdc.rdcost < best_rdc.rdcost) || must_split) && i == 4) {
+ sum_rdc.rate += cpi->partition_cost[pl][PARTITION_SPLIT];
+ vp9_rd_cost_update(partition_mul, x->rddiv, &sum_rdc);
+
+ if ((sum_rdc.rdcost < best_rdc.rdcost) ||
+ (must_split && (sum_rdc.dist < best_rdc.dist))) {
+ best_rdc = sum_rdc;
+ should_encode_sb = 1;
+ pc_tree->partitioning = PARTITION_SPLIT;
+
+ // Rate and distortion based partition search termination clause.
+ if (!cpi->sf.rd_ml_partition.search_early_termination &&
+ !x->e_mbd.lossless &&
+ ((best_rdc.dist < (dist_breakout_thr >> 2)) ||
+ (best_rdc.dist < dist_breakout_thr &&
+ best_rdc.rate < rate_breakout_thr))) {
+ do_rect = 0;
+ }
+ }
+ } else {
+ // skip rectangular partition test when larger block size
+ // gives better rd cost
+ if (cpi->sf.less_rectangular_check &&
+ (bsize > cpi->sf.use_square_only_thresh_high ||
+ best_rdc.dist < dist_breakout_thr))
+ do_rect &= !partition_none_allowed;
+ }
+ restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+ }
+
+ pc_tree->horizontal[0].skip_ref_frame_mask = 0;
+ pc_tree->horizontal[1].skip_ref_frame_mask = 0;
+ pc_tree->vertical[0].skip_ref_frame_mask = 0;
+ pc_tree->vertical[1].skip_ref_frame_mask = 0;
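+  // Restrict each rectangular sub-partition to the reference frames that were
+  // actually picked by the overlapping square sub-blocks; all other references
+  // are masked out of its search.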
+ if (cpi->sf.prune_ref_frame_for_rect_partitions) {
+ uint8_t used_frames;
+ used_frames = ref_frames_used[0] | ref_frames_used[1];
+ if (used_frames) {
+ pc_tree->horizontal[0].skip_ref_frame_mask = ~used_frames & 0xff;
+ }
+ used_frames = ref_frames_used[2] | ref_frames_used[3];
+ if (used_frames) {
+ pc_tree->horizontal[1].skip_ref_frame_mask = ~used_frames & 0xff;
+ }
+ used_frames = ref_frames_used[0] | ref_frames_used[2];
+ if (used_frames) {
+ pc_tree->vertical[0].skip_ref_frame_mask = ~used_frames & 0xff;
+ }
+ used_frames = ref_frames_used[1] | ref_frames_used[3];
+ if (used_frames) {
+ pc_tree->vertical[1].skip_ref_frame_mask = ~used_frames & 0xff;
+ }
+ }
+
+ {
+ const int do_ml_rect_partition_pruning =
+ !frame_is_intra_only(cm) && !force_horz_split && !force_vert_split &&
+ (partition_horz_allowed || partition_vert_allowed) && bsize > BLOCK_8X8;
+ if (do_ml_rect_partition_pruning) {
+ ml_prune_rect_partition(cpi, x, bsize, pc_tree, &partition_horz_allowed,
+ &partition_vert_allowed, best_rdc.rdcost);
+ }
+ }
+
+ // PARTITION_HORZ
+ if (partition_horz_allowed &&
+ (do_rect || vp9_active_h_edge(cpi, mi_row, mi_step))) {
+ const int part_mode_rate = cpi->partition_cost[pl][PARTITION_HORZ];
+ subsize = get_subsize(bsize, PARTITION_HORZ);
+ load_pred_mv(x, ctx);
+ if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
+ partition_none_allowed)
+ pc_tree->horizontal[0].pred_interp_filter = pred_interp_filter;
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize,
+ &pc_tree->horizontal[0], best_rdc.rate - part_mode_rate,
+ best_rdc.dist);
+ if (sum_rdc.rdcost < INT64_MAX) {
+ sum_rdc.rate += part_mode_rate;
+ vp9_rd_cost_update(partition_mul, x->rddiv, &sum_rdc);
+ }
+
+ if (sum_rdc.rdcost < best_rdc.rdcost && mi_row + mi_step < cm->mi_rows &&
+ bsize > BLOCK_8X8) {
+ PICK_MODE_CONTEXT *hctx = &pc_tree->horizontal[0];
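+      // Encode the top half (with output disabled) so that its reconstruction
+      // provides context for the bottom-half mode search.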
+ update_state(cpi, td, hctx, mi_row, mi_col, subsize, 0);
+ encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, hctx);
+ if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
+ partition_none_allowed)
+ pc_tree->horizontal[1].pred_interp_filter = pred_interp_filter;
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row + mi_step, mi_col, &this_rdc,
+ subsize, &pc_tree->horizontal[1],
+ best_rdc.rate - sum_rdc.rate,
+ best_rdc.dist - sum_rdc.dist);
+ if (this_rdc.rate == INT_MAX) {
+ sum_rdc.rdcost = INT64_MAX;
+ } else {
+ sum_rdc.rate += this_rdc.rate;
+ sum_rdc.dist += this_rdc.dist;
+ vp9_rd_cost_update(partition_mul, x->rddiv, &sum_rdc);
+ }
+ }
+
+ if (sum_rdc.rdcost < best_rdc.rdcost) {
+ best_rdc = sum_rdc;
+ should_encode_sb = 1;
+ pc_tree->partitioning = PARTITION_HORZ;
+
+ if (cpi->sf.less_rectangular_check &&
+ bsize > cpi->sf.use_square_only_thresh_high)
+ do_rect = 0;
+ }
+ restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+ }
+
+ // PARTITION_VERT
+ if (partition_vert_allowed &&
+ (do_rect || vp9_active_v_edge(cpi, mi_col, mi_step))) {
+ const int part_mode_rate = cpi->partition_cost[pl][PARTITION_VERT];
+ subsize = get_subsize(bsize, PARTITION_VERT);
+ load_pred_mv(x, ctx);
+ if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
+ partition_none_allowed)
+ pc_tree->vertical[0].pred_interp_filter = pred_interp_filter;
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize,
+ &pc_tree->vertical[0], best_rdc.rate - part_mode_rate,
+ best_rdc.dist);
+ if (sum_rdc.rdcost < INT64_MAX) {
+ sum_rdc.rate += part_mode_rate;
+ vp9_rd_cost_update(partition_mul, x->rddiv, &sum_rdc);
+ }
+
+ if (sum_rdc.rdcost < best_rdc.rdcost && mi_col + mi_step < cm->mi_cols &&
+ bsize > BLOCK_8X8) {
+ update_state(cpi, td, &pc_tree->vertical[0], mi_row, mi_col, subsize, 0);
+ encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize,
+ &pc_tree->vertical[0]);
+ if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
+ partition_none_allowed)
+ pc_tree->vertical[1].pred_interp_filter = pred_interp_filter;
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + mi_step, &this_rdc,
+ subsize, &pc_tree->vertical[1],
+ best_rdc.rate - sum_rdc.rate,
+ best_rdc.dist - sum_rdc.dist);
+ if (this_rdc.rate == INT_MAX) {
+ sum_rdc.rdcost = INT64_MAX;
+ } else {
+ sum_rdc.rate += this_rdc.rate;
+ sum_rdc.dist += this_rdc.dist;
+ vp9_rd_cost_update(partition_mul, x->rddiv, &sum_rdc);
+ }
+ }
+
+ if (sum_rdc.rdcost < best_rdc.rdcost) {
+ best_rdc = sum_rdc;
+ should_encode_sb = 1;
+ pc_tree->partitioning = PARTITION_VERT;
+ }
+ restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+ }
+
+ *rd_cost = best_rdc;
+
+ if (should_encode_sb && pc_tree->index != 3) {
+ int output_enabled = (bsize == BLOCK_64X64);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, encode_sb_time);
+#endif
+ encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled, bsize,
+ pc_tree);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, encode_sb_time);
+#endif
+#if CONFIG_RATE_CTRL
+ if (oxcf->use_simple_encode_api) {
+        // Store the partition and motion vector info of the superblock.
+ if (output_enabled) {
+ const int num_unit_rows =
+ get_num_unit_4x4(cpi->frame_info.frame_height);
+ const int num_unit_cols = get_num_unit_4x4(cpi->frame_info.frame_width);
+ store_superblock_info(pc_tree, cm->mi_grid_visible, cm->mi_stride,
+ num_4x4_blocks_wide_lookup[BLOCK_64X64],
+ num_unit_rows, num_unit_cols, mi_row << 1,
+ mi_col << 1, cpi->partition_info,
+ cpi->motion_vector_info);
+ }
+ }
+#endif // CONFIG_RATE_CTRL
+ }
+
+ if (bsize == BLOCK_64X64) {
+ assert(tp_orig < *tp);
+ assert(best_rdc.rate < INT_MAX);
+ assert(best_rdc.dist < INT64_MAX);
+ } else {
+ assert(tp_orig == *tp);
+ }
+
+ return should_encode_sb;
+}
+
+static void encode_rd_sb_row(VP9_COMP *cpi, ThreadData *td,
+ TileDataEnc *tile_data, int mi_row,
+ TOKENEXTRA **tp) {
+ VP9_COMMON *const cm = &cpi->common;
+ TileInfo *const tile_info = &tile_data->tile_info;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ SPEED_FEATURES *const sf = &cpi->sf;
+ const int mi_col_start = tile_info->mi_col_start;
+ const int mi_col_end = tile_info->mi_col_end;
+ int mi_col;
+ const int sb_row = mi_row >> MI_BLOCK_SIZE_LOG2;
+ const int num_sb_cols =
+ get_num_cols(tile_data->tile_info, MI_BLOCK_SIZE_LOG2);
+ int sb_col_in_tile;
+
+ // Initialize the left context for the new SB row
+ memset(&xd->left_context, 0, sizeof(xd->left_context));
+ memset(xd->left_seg_context, 0, sizeof(xd->left_seg_context));
+
+ // Code each SB in the row
+ for (mi_col = mi_col_start, sb_col_in_tile = 0; mi_col < mi_col_end;
+ mi_col += MI_BLOCK_SIZE, sb_col_in_tile++) {
+ const struct segmentation *const seg = &cm->seg;
+ int dummy_rate;
+ int64_t dummy_dist;
+ RD_COST dummy_rdc;
+ int i;
+ int seg_skip = 0;
+ int orig_rdmult = cpi->rd.RDMULT;
+
+ const int idx_str = cm->mi_stride * mi_row + mi_col;
+ MODE_INFO **mi = cm->mi_grid_visible + idx_str;
+
+ vp9_rd_cost_reset(&dummy_rdc);
+ (*(cpi->row_mt_sync_read_ptr))(&tile_data->row_mt_sync, sb_row,
+ sb_col_in_tile);
+
+ if (sf->adaptive_pred_interp_filter) {
+ for (i = 0; i < 64; ++i) td->leaf_tree[i].pred_interp_filter = SWITCHABLE;
+
+ for (i = 0; i < 64; ++i) {
+ td->pc_tree[i].vertical[0].pred_interp_filter = SWITCHABLE;
+ td->pc_tree[i].vertical[1].pred_interp_filter = SWITCHABLE;
+ td->pc_tree[i].horizontal[0].pred_interp_filter = SWITCHABLE;
+ td->pc_tree[i].horizontal[1].pred_interp_filter = SWITCHABLE;
+ }
+ }
+
+ for (i = 0; i < MAX_REF_FRAMES; ++i) {
+ x->pred_mv[i].row = INT16_MAX;
+ x->pred_mv[i].col = INT16_MAX;
+ }
+ td->pc_root->index = 0;
+
+ if (seg->enabled) {
+ const uint8_t *const map =
+ seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map;
+ int segment_id = get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col);
+ seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP);
+ }
+
+ x->source_variance = UINT_MAX;
+
+ x->cb_rdmult = orig_rdmult;
+
+ if (sf->partition_search_type == FIXED_PARTITION || seg_skip) {
+ const BLOCK_SIZE bsize =
+ seg_skip ? BLOCK_64X64 : sf->always_this_block_size;
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64);
+ set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
+ rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, BLOCK_64X64,
+ &dummy_rate, &dummy_dist, 1, td->pc_root);
+ } else if (sf->partition_search_type == VAR_BASED_PARTITION &&
+ cm->frame_type != KEY_FRAME) {
+ choose_partitioning(cpi, tile_info, x, mi_row, mi_col);
+ rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, BLOCK_64X64,
+ &dummy_rate, &dummy_dist, 1, td->pc_root);
+ } else {
+ if (cpi->twopass.gf_group.index > 0 && cpi->sf.enable_tpl_model) {
+ int dr =
+ get_rdmult_delta(cpi, BLOCK_64X64, mi_row, mi_col, orig_rdmult);
+ x->cb_rdmult = dr;
+ }
+
+ if (cpi->oxcf.aq_mode == PERCEPTUAL_AQ && cm->show_frame) {
+ x->segment_id = wiener_var_segment(cpi, BLOCK_64X64, mi_row, mi_col);
+ x->cb_rdmult = vp9_compute_rd_mult(
+ cpi, vp9_get_qindex(&cm->seg, x->segment_id, cm->base_qindex));
+ }
+
+      // If required, set upper and lower partition size limits.
+ if (sf->auto_min_max_partition_size) {
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64);
+ rd_auto_partition_range(cpi, tile_info, xd, mi_row, mi_col,
+ &x->min_partition_size, &x->max_partition_size);
+ }
+ td->pc_root->none.rdcost = 0;
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, rd_pick_partition_time);
+#endif
+ rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, BLOCK_64X64,
+ &dummy_rdc, dummy_rdc, td->pc_root);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, rd_pick_partition_time);
+#endif
+ }
+ (*(cpi->row_mt_sync_write_ptr))(&tile_data->row_mt_sync, sb_row,
+ sb_col_in_tile, num_sb_cols);
+ }
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+static void init_encode_frame_mb_context(VP9_COMP *cpi) {
+ MACROBLOCK *const x = &cpi->td.mb;
+ VP9_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
+
+  // Copy data over into macroblock data structures.
+ vp9_setup_src_planes(x, cpi->Source, 0, 0);
+
+ vp9_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
+
+ // Note: this memset assumes above_context[0], [1] and [2]
+ // are allocated as part of the same buffer.
+ memset(xd->above_context[0], 0,
+ sizeof(*xd->above_context[0]) * 2 * aligned_mi_cols * MAX_MB_PLANE);
+ memset(xd->above_seg_context, 0,
+ sizeof(*xd->above_seg_context) * aligned_mi_cols);
+}
+
+static int check_dual_ref_flags(VP9_COMP *cpi) {
+ const int ref_flags = cpi->ref_frame_flags;
+
+ if (segfeature_active(&cpi->common.seg, 1, SEG_LVL_REF_FRAME)) {
+ return 0;
+ } else {
+ return (!!(ref_flags & VP9_GOLD_FLAG) + !!(ref_flags & VP9_LAST_FLAG) +
+ !!(ref_flags & VP9_ALT_FLAG)) >= 2;
+ }
+}
+
+static void reset_skip_tx_size(VP9_COMMON *cm, TX_SIZE max_tx_size) {
+ int mi_row, mi_col;
+ const int mis = cm->mi_stride;
+ MODE_INFO **mi_ptr = cm->mi_grid_visible;
+
+ for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row, mi_ptr += mis) {
+ for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) {
+ if (mi_ptr[mi_col]->tx_size > max_tx_size)
+ mi_ptr[mi_col]->tx_size = max_tx_size;
+ }
+ }
+}
+
+static MV_REFERENCE_FRAME get_frame_type(const VP9_COMP *cpi) {
+ if (frame_is_intra_only(&cpi->common))
+ return INTRA_FRAME;
+ else if (cpi->rc.is_src_frame_alt_ref && cpi->refresh_golden_frame)
+ return ALTREF_FRAME;
+ else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)
+ return GOLDEN_FRAME;
+ else
+ return LAST_FRAME;
+}
+
+static TX_MODE select_tx_mode(const VP9_COMP *cpi, MACROBLOCKD *const xd) {
+ if (xd->lossless) return ONLY_4X4;
+ if (cpi->common.frame_type == KEY_FRAME && cpi->sf.use_nonrd_pick_mode)
+ return ALLOW_16X16;
+ if (cpi->sf.tx_size_search_method == USE_LARGESTALL)
+ return ALLOW_32X32;
+ else if (cpi->sf.tx_size_search_method == USE_FULL_RD ||
+ cpi->sf.tx_size_search_method == USE_TX_8X8)
+ return TX_MODE_SELECT;
+ else
+ return cpi->common.tx_mode;
+}
+
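+// Hybrid intra search: use the full RD intra search for small blocks when the
+// nonrd_keyframe speed feature is off; otherwise fall back to the fast non-RD
+// intra pick.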
+static void hybrid_intra_mode_search(VP9_COMP *cpi, MACROBLOCK *const x,
+ RD_COST *rd_cost, BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx) {
+ if (!cpi->sf.nonrd_keyframe && bsize < BLOCK_16X16)
+ vp9_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, INT64_MAX);
+ else
+ vp9_pick_intra_mode(cpi, x, rd_cost, bsize, ctx);
+}
+
+static void hybrid_search_svc_baseiskey(VP9_COMP *cpi, MACROBLOCK *const x,
+ RD_COST *rd_cost, BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx,
+ TileDataEnc *tile_data, int mi_row,
+ int mi_col) {
+ if (!cpi->sf.nonrd_keyframe && bsize <= BLOCK_8X8) {
+ vp9_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, INT64_MAX);
+ } else {
+ if (cpi->svc.disable_inter_layer_pred == INTER_LAYER_PRED_OFF)
+ vp9_pick_intra_mode(cpi, x, rd_cost, bsize, ctx);
+ else if (bsize >= BLOCK_8X8)
+ vp9_pick_inter_mode(cpi, x, tile_data, mi_row, mi_col, rd_cost, bsize,
+ ctx);
+ else
+ vp9_pick_inter_mode_sub8x8(cpi, x, mi_row, mi_col, rd_cost, bsize, ctx);
+ }
+}
+
+static void hybrid_search_scene_change(VP9_COMP *cpi, MACROBLOCK *const x,
+ RD_COST *rd_cost, BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx,
+ TileDataEnc *tile_data, int mi_row,
+ int mi_col) {
+ if (!cpi->sf.nonrd_keyframe && bsize <= BLOCK_8X8) {
+ vp9_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, INT64_MAX);
+ } else {
+ vp9_pick_inter_mode(cpi, x, tile_data, mi_row, mi_col, rd_cost, bsize, ctx);
+ }
+}
+
+static void nonrd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data,
+ MACROBLOCK *const x, int mi_row, int mi_col,
+ RD_COST *rd_cost, BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx) {
+ VP9_COMMON *const cm = &cpi->common;
+ TileInfo *const tile_info = &tile_data->tile_info;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MODE_INFO *mi;
+ ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
+ BLOCK_SIZE bs = VPXMAX(bsize, BLOCK_8X8); // processing unit block size
+ const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bs];
+ const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bs];
+ int plane;
+
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+
+ set_segment_index(cpi, x, mi_row, mi_col, bsize, 0);
+
+ mi = xd->mi[0];
+ mi->sb_type = bsize;
+
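+  // Save the above/left entropy contexts; they are restored after the mode
+  // search below.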
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+ struct macroblockd_plane *pd = &xd->plane[plane];
+ memcpy(a + num_4x4_blocks_wide * plane, pd->above_context,
+ (sizeof(a[0]) * num_4x4_blocks_wide) >> pd->subsampling_x);
+ memcpy(l + num_4x4_blocks_high * plane, pd->left_context,
+ (sizeof(l[0]) * num_4x4_blocks_high) >> pd->subsampling_y);
+ }
+
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled)
+ if (cyclic_refresh_segment_id_boosted(mi->segment_id))
+ x->rdmult = vp9_cyclic_refresh_get_rdmult(cpi->cyclic_refresh);
+
+ if (frame_is_intra_only(cm))
+ hybrid_intra_mode_search(cpi, x, rd_cost, bsize, ctx);
+ else if (cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame)
+ hybrid_search_svc_baseiskey(cpi, x, rd_cost, bsize, ctx, tile_data, mi_row,
+ mi_col);
+ else if (segfeature_active(&cm->seg, mi->segment_id, SEG_LVL_SKIP))
+ set_mode_info_seg_skip(x, cm->tx_mode, cm->interp_filter, rd_cost, bsize);
+ else if (bsize >= BLOCK_8X8) {
+ if (cpi->rc.hybrid_intra_scene_change)
+ hybrid_search_scene_change(cpi, x, rd_cost, bsize, ctx, tile_data, mi_row,
+ mi_col);
+ else
+ vp9_pick_inter_mode(cpi, x, tile_data, mi_row, mi_col, rd_cost, bsize,
+ ctx);
+ } else {
+ vp9_pick_inter_mode_sub8x8(cpi, x, mi_row, mi_col, rd_cost, bsize, ctx);
+ }
+
+ duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize);
+
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+ struct macroblockd_plane *pd = &xd->plane[plane];
+ memcpy(pd->above_context, a + num_4x4_blocks_wide * plane,
+ (sizeof(a[0]) * num_4x4_blocks_wide) >> pd->subsampling_x);
+ memcpy(pd->left_context, l + num_4x4_blocks_high * plane,
+ (sizeof(l[0]) * num_4x4_blocks_high) >> pd->subsampling_y);
+ }
+
+ if (rd_cost->rate == INT_MAX) vp9_rd_cost_reset(rd_cost);
+
+ ctx->rate = rd_cost->rate;
+ ctx->dist = rd_cost->dist;
+}
+
+static void fill_mode_info_sb(VP9_COMMON *cm, MACROBLOCK *x, int mi_row,
+ int mi_col, BLOCK_SIZE bsize, PC_TREE *pc_tree) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
+ PARTITION_TYPE partition = pc_tree->partitioning;
+ BLOCK_SIZE subsize = get_subsize(bsize, partition);
+
+ assert(bsize >= BLOCK_8X8);
+
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
+ switch (partition) {
+ case PARTITION_NONE:
+ set_mode_info_offsets(cm, x, xd, mi_row, mi_col);
+ *(xd->mi[0]) = pc_tree->none.mic;
+ *(x->mbmi_ext) = pc_tree->none.mbmi_ext;
+ duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize);
+ break;
+ case PARTITION_VERT:
+ set_mode_info_offsets(cm, x, xd, mi_row, mi_col);
+ *(xd->mi[0]) = pc_tree->vertical[0].mic;
+ *(x->mbmi_ext) = pc_tree->vertical[0].mbmi_ext;
+ duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, subsize);
+
+ if (mi_col + hbs < cm->mi_cols) {
+ set_mode_info_offsets(cm, x, xd, mi_row, mi_col + hbs);
+ *(xd->mi[0]) = pc_tree->vertical[1].mic;
+ *(x->mbmi_ext) = pc_tree->vertical[1].mbmi_ext;
+ duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col + hbs, subsize);
+ }
+ break;
+ case PARTITION_HORZ:
+ set_mode_info_offsets(cm, x, xd, mi_row, mi_col);
+ *(xd->mi[0]) = pc_tree->horizontal[0].mic;
+ *(x->mbmi_ext) = pc_tree->horizontal[0].mbmi_ext;
+ duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, subsize);
+ if (mi_row + hbs < cm->mi_rows) {
+ set_mode_info_offsets(cm, x, xd, mi_row + hbs, mi_col);
+ *(xd->mi[0]) = pc_tree->horizontal[1].mic;
+ *(x->mbmi_ext) = pc_tree->horizontal[1].mbmi_ext;
+ duplicate_mode_info_in_sb(cm, xd, mi_row + hbs, mi_col, subsize);
+ }
+ break;
+ case PARTITION_SPLIT: {
+ fill_mode_info_sb(cm, x, mi_row, mi_col, subsize, pc_tree->split[0]);
+ fill_mode_info_sb(cm, x, mi_row, mi_col + hbs, subsize,
+ pc_tree->split[1]);
+ fill_mode_info_sb(cm, x, mi_row + hbs, mi_col, subsize,
+ pc_tree->split[2]);
+ fill_mode_info_sb(cm, x, mi_row + hbs, mi_col + hbs, subsize,
+ pc_tree->split[3]);
+ break;
+ }
+ default: break;
+ }
+}
+
+// Reset the prediction pixel ready flag recursively.
+static void pred_pixel_ready_reset(PC_TREE *pc_tree, BLOCK_SIZE bsize) {
+ pc_tree->none.pred_pixel_ready = 0;
+ pc_tree->horizontal[0].pred_pixel_ready = 0;
+ pc_tree->horizontal[1].pred_pixel_ready = 0;
+ pc_tree->vertical[0].pred_pixel_ready = 0;
+ pc_tree->vertical[1].pred_pixel_ready = 0;
+
+ if (bsize > BLOCK_8X8) {
+ BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_SPLIT);
+ int i;
+ for (i = 0; i < 4; ++i) pred_pixel_ready_reset(pc_tree->split[i], subsize);
+ }
+}
+
+#define FEATURES 6
+#define LABELS 2
+static int ml_predict_var_paritioning(VP9_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int mi_row,
+ int mi_col) {
+ VP9_COMMON *const cm = &cpi->common;
+ const NN_CONFIG *nn_config = NULL;
+
+ switch (bsize) {
+ case BLOCK_64X64: nn_config = &vp9_var_part_nnconfig_64; break;
+ case BLOCK_32X32: nn_config = &vp9_var_part_nnconfig_32; break;
+ case BLOCK_16X16: nn_config = &vp9_var_part_nnconfig_16; break;
+ case BLOCK_8X8: break;
+ default: assert(0 && "Unexpected block size."); return -1;
+ }
+
+ if (!nn_config) return -1;
+
+ vpx_clear_system_state();
+
+ {
+ const float thresh = cpi->oxcf.speed <= 5 ? 1.25f : 0.0f;
+ float features[FEATURES] = { 0.0f };
+ const int dc_q = vp9_dc_quant(cm->base_qindex, 0, cm->bit_depth);
+ int feature_idx = 0;
+ float score[LABELS];
+
+ features[feature_idx++] = logf((float)(dc_q * dc_q) / 256.0f + 1.0f);
+ vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col);
+ {
+ const int bs = 4 * num_4x4_blocks_wide_lookup[bsize];
+ const BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_SPLIT);
+ const int sb_offset_row = 8 * (mi_row & 7);
+ const int sb_offset_col = 8 * (mi_col & 7);
+ const uint8_t *pred = x->est_pred + sb_offset_row * 64 + sb_offset_col;
+ const uint8_t *src = x->plane[0].src.buf;
+ const int src_stride = x->plane[0].src.stride;
+ const int pred_stride = 64;
+ unsigned int sse;
+ int i;
+ // Variance of whole block.
+ const unsigned int var =
+ cpi->fn_ptr[bsize].vf(src, src_stride, pred, pred_stride, &sse);
+ const float factor = (var == 0) ? 1.0f : (1.0f / (float)var);
+
+ features[feature_idx++] = logf((float)var + 1.0f);
+ for (i = 0; i < 4; ++i) {
+ const int x_idx = (i & 1) * bs / 2;
+ const int y_idx = (i >> 1) * bs / 2;
+ const int src_offset = y_idx * src_stride + x_idx;
+ const int pred_offset = y_idx * pred_stride + x_idx;
+ // Variance of quarter block.
+ const unsigned int sub_var =
+ cpi->fn_ptr[subsize].vf(src + src_offset, src_stride,
+ pred + pred_offset, pred_stride, &sse);
+ const float var_ratio = (var == 0) ? 1.0f : factor * (float)sub_var;
+ features[feature_idx++] = var_ratio;
+ }
+ }
+
+ assert(feature_idx == FEATURES);
+ nn_predict(features, nn_config, score);
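+    // score[0] is the model's split-vs-none confidence: a large positive value
+    // selects PARTITION_SPLIT, a large negative value selects PARTITION_NONE,
+    // and anything in between abstains (-1).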
+ if (score[0] > thresh) return PARTITION_SPLIT;
+ if (score[0] < -thresh) return PARTITION_NONE;
+ return -1;
+ }
+}
+#undef FEATURES
+#undef LABELS
+
+static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td,
+ TileDataEnc *tile_data, TOKENEXTRA **tp,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ RD_COST *rd_cost, int do_recon,
+ int64_t best_rd, PC_TREE *pc_tree) {
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ VP9_COMMON *const cm = &cpi->common;
+ TileInfo *const tile_info = &tile_data->tile_info;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int ms = num_8x8_blocks_wide_lookup[bsize] / 2;
+ TOKENEXTRA *tp_orig = *tp;
+ PICK_MODE_CONTEXT *ctx = &pc_tree->none;
+ int i;
+ BLOCK_SIZE subsize = bsize;
+ RD_COST this_rdc, sum_rdc, best_rdc;
+ int do_split = bsize >= BLOCK_8X8;
+ int do_rect = 1;
+ // Override skipping rectangular partition operations for edge blocks
+ const int force_horz_split = (mi_row + ms >= cm->mi_rows);
+ const int force_vert_split = (mi_col + ms >= cm->mi_cols);
+ const int xss = x->e_mbd.plane[1].subsampling_x;
+ const int yss = x->e_mbd.plane[1].subsampling_y;
+
+ int partition_none_allowed = !force_horz_split && !force_vert_split;
+ int partition_horz_allowed =
+ !force_vert_split && yss <= xss && bsize >= BLOCK_8X8;
+ int partition_vert_allowed =
+ !force_horz_split && xss <= yss && bsize >= BLOCK_8X8;
+ const int use_ml_based_partitioning =
+ sf->partition_search_type == ML_BASED_PARTITION;
+
+ (void)*tp_orig;
+
+ // Avoid checking for rectangular partitions for speed >= 5.
+ if (cpi->oxcf.speed >= 5) do_rect = 0;
+
+ assert(num_8x8_blocks_wide_lookup[bsize] ==
+ num_8x8_blocks_high_lookup[bsize]);
+
+ vp9_rd_cost_init(&sum_rdc);
+ vp9_rd_cost_reset(&best_rdc);
+ best_rdc.rdcost = best_rd;
+
+  // Determine which partition types to search according to the speed features.
+  // The size thresholds set here must be square block sizes.
+ if (sf->auto_min_max_partition_size) {
+ partition_none_allowed &=
+ (bsize <= x->max_partition_size && bsize >= x->min_partition_size);
+ partition_horz_allowed &=
+ ((bsize <= x->max_partition_size && bsize > x->min_partition_size) ||
+ force_horz_split);
+ partition_vert_allowed &=
+ ((bsize <= x->max_partition_size && bsize > x->min_partition_size) ||
+ force_vert_split);
+ do_split &= bsize > x->min_partition_size;
+ }
+ if (sf->use_square_partition_only) {
+ partition_horz_allowed &= force_horz_split;
+ partition_vert_allowed &= force_vert_split;
+ }
+
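+  // With ML-based partitioning, rectangular partitions are searched only when
+  // both square options are ruled out; the variance model may additionally
+  // disable either the none or the split search.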
+ if (use_ml_based_partitioning) {
+ if (partition_none_allowed || do_split) do_rect = 0;
+ if (partition_none_allowed && do_split) {
+ const int ml_predicted_partition =
+ ml_predict_var_paritioning(cpi, x, bsize, mi_row, mi_col);
+ if (ml_predicted_partition == PARTITION_NONE) do_split = 0;
+ if (ml_predicted_partition == PARTITION_SPLIT) partition_none_allowed = 0;
+ }
+ }
+
+ if (!partition_none_allowed && !do_split) do_rect = 1;
+
+ ctx->pred_pixel_ready =
+ !(partition_vert_allowed || partition_horz_allowed || do_split);
+
+ // PARTITION_NONE
+ if (partition_none_allowed) {
+ nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, bsize,
+ ctx);
+ ctx->mic = *xd->mi[0];
+ ctx->mbmi_ext = *x->mbmi_ext;
+ ctx->skip_txfm[0] = x->skip_txfm[0];
+ ctx->skip = x->skip;
+
+ if (this_rdc.rate != INT_MAX) {
+ const int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+ this_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE];
+ this_rdc.rdcost =
+ RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist);
+ if (this_rdc.rdcost < best_rdc.rdcost) {
+ best_rdc = this_rdc;
+ if (bsize >= BLOCK_8X8) pc_tree->partitioning = PARTITION_NONE;
+
+ if (!use_ml_based_partitioning) {
+ int64_t dist_breakout_thr = sf->partition_search_breakout_thr.dist;
+ int64_t rate_breakout_thr = sf->partition_search_breakout_thr.rate;
+ dist_breakout_thr >>=
+ 8 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
+ rate_breakout_thr *= num_pels_log2_lookup[bsize];
+ if (!x->e_mbd.lossless && this_rdc.rate < rate_breakout_thr &&
+ this_rdc.dist < dist_breakout_thr) {
+ do_split = 0;
+ do_rect = 0;
+ }
+ }
+ }
+ }
+ }
+
+ // store estimated motion vector
+ store_pred_mv(x, ctx);
+
+ // PARTITION_SPLIT
+ if (do_split) {
+ int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+ sum_rdc.rate += cpi->partition_cost[pl][PARTITION_SPLIT];
+ sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+ subsize = get_subsize(bsize, PARTITION_SPLIT);
+ for (i = 0; i < 4 && sum_rdc.rdcost < best_rdc.rdcost; ++i) {
+ const int x_idx = (i & 1) * ms;
+ const int y_idx = (i >> 1) * ms;
+
+ if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols)
+ continue;
+ load_pred_mv(x, ctx);
+ nonrd_pick_partition(cpi, td, tile_data, tp, mi_row + y_idx,
+ mi_col + x_idx, subsize, &this_rdc, 0,
+ best_rdc.rdcost - sum_rdc.rdcost, pc_tree->split[i]);
+
+ if (this_rdc.rate == INT_MAX) {
+ vp9_rd_cost_reset(&sum_rdc);
+ } else {
+ sum_rdc.rate += this_rdc.rate;
+ sum_rdc.dist += this_rdc.dist;
+ sum_rdc.rdcost += this_rdc.rdcost;
+ }
+ }
+
+ if (sum_rdc.rdcost < best_rdc.rdcost) {
+ best_rdc = sum_rdc;
+ pc_tree->partitioning = PARTITION_SPLIT;
+ } else {
+ // skip rectangular partition test when larger block size
+ // gives better rd cost
+ if (sf->less_rectangular_check) do_rect &= !partition_none_allowed;
+ }
+ }
+
+ // PARTITION_HORZ
+ if (partition_horz_allowed && do_rect) {
+ subsize = get_subsize(bsize, PARTITION_HORZ);
+ load_pred_mv(x, ctx);
+ pc_tree->horizontal[0].pred_pixel_ready = 1;
+ nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize,
+ &pc_tree->horizontal[0]);
+
+ pc_tree->horizontal[0].mic = *xd->mi[0];
+ pc_tree->horizontal[0].mbmi_ext = *x->mbmi_ext;
+ pc_tree->horizontal[0].skip_txfm[0] = x->skip_txfm[0];
+ pc_tree->horizontal[0].skip = x->skip;
+
+ if (sum_rdc.rdcost < best_rdc.rdcost && mi_row + ms < cm->mi_rows) {
+ load_pred_mv(x, ctx);
+ pc_tree->horizontal[1].pred_pixel_ready = 1;
+ nonrd_pick_sb_modes(cpi, tile_data, x, mi_row + ms, mi_col, &this_rdc,
+ subsize, &pc_tree->horizontal[1]);
+
+ pc_tree->horizontal[1].mic = *xd->mi[0];
+ pc_tree->horizontal[1].mbmi_ext = *x->mbmi_ext;
+ pc_tree->horizontal[1].skip_txfm[0] = x->skip_txfm[0];
+ pc_tree->horizontal[1].skip = x->skip;
+
+ if (this_rdc.rate == INT_MAX) {
+ vp9_rd_cost_reset(&sum_rdc);
+ } else {
+ int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+ this_rdc.rate += cpi->partition_cost[pl][PARTITION_HORZ];
+ sum_rdc.rate += this_rdc.rate;
+ sum_rdc.dist += this_rdc.dist;
+ sum_rdc.rdcost =
+ RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+ }
+ }
+
+ if (sum_rdc.rdcost < best_rdc.rdcost) {
+ best_rdc = sum_rdc;
+ pc_tree->partitioning = PARTITION_HORZ;
+ } else {
+ pred_pixel_ready_reset(pc_tree, bsize);
+ }
+ }
+
+ // PARTITION_VERT
+ if (partition_vert_allowed && do_rect) {
+ subsize = get_subsize(bsize, PARTITION_VERT);
+ load_pred_mv(x, ctx);
+ pc_tree->vertical[0].pred_pixel_ready = 1;
+ nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize,
+ &pc_tree->vertical[0]);
+ pc_tree->vertical[0].mic = *xd->mi[0];
+ pc_tree->vertical[0].mbmi_ext = *x->mbmi_ext;
+ pc_tree->vertical[0].skip_txfm[0] = x->skip_txfm[0];
+ pc_tree->vertical[0].skip = x->skip;
+
+ if (sum_rdc.rdcost < best_rdc.rdcost && mi_col + ms < cm->mi_cols) {
+ load_pred_mv(x, ctx);
+ pc_tree->vertical[1].pred_pixel_ready = 1;
+ nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + ms, &this_rdc,
+ subsize, &pc_tree->vertical[1]);
+ pc_tree->vertical[1].mic = *xd->mi[0];
+ pc_tree->vertical[1].mbmi_ext = *x->mbmi_ext;
+ pc_tree->vertical[1].skip_txfm[0] = x->skip_txfm[0];
+ pc_tree->vertical[1].skip = x->skip;
+
+ if (this_rdc.rate == INT_MAX) {
+ vp9_rd_cost_reset(&sum_rdc);
+ } else {
+ int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+ sum_rdc.rate += cpi->partition_cost[pl][PARTITION_VERT];
+ sum_rdc.rate += this_rdc.rate;
+ sum_rdc.dist += this_rdc.dist;
+ sum_rdc.rdcost =
+ RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+ }
+ }
+
+ if (sum_rdc.rdcost < best_rdc.rdcost) {
+ best_rdc = sum_rdc;
+ pc_tree->partitioning = PARTITION_VERT;
+ } else {
+ pred_pixel_ready_reset(pc_tree, bsize);
+ }
+ }
+
+ *rd_cost = best_rdc;
+
+ if (best_rdc.rate == INT_MAX) {
+ vp9_rd_cost_reset(rd_cost);
+ return;
+ }
+
+ // update mode info array
+ fill_mode_info_sb(cm, x, mi_row, mi_col, bsize, pc_tree);
+
+ if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX && do_recon) {
+ int output_enabled = (bsize == BLOCK_64X64);
+ encode_sb_rt(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled, bsize,
+ pc_tree);
+ }
+
+ if (bsize == BLOCK_64X64 && do_recon) {
+ assert(tp_orig < *tp);
+ assert(best_rdc.rate < INT_MAX);
+ assert(best_rdc.dist < INT64_MAX);
+ } else {
+ assert(tp_orig == *tp);
+ }
+}
+
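+// Refine the partitioning already stored in the mode-info grid (e.g. by
+// variance-based partitioning): for 32x32 and 16x16 blocks whose stored
+// partition meets the conditions below, re-run the non-RD partition search
+// within a constrained size range; otherwise follow the stored partition
+// and pick modes for each leaf block.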
+static void nonrd_select_partition(VP9_COMP *cpi, ThreadData *td,
+ TileDataEnc *tile_data, MODE_INFO **mi,
+ TOKENEXTRA **tp, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, int output_enabled,
+ RD_COST *rd_cost, PC_TREE *pc_tree) {
+ VP9_COMMON *const cm = &cpi->common;
+ TileInfo *const tile_info = &tile_data->tile_info;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
+ const int mis = cm->mi_stride;
+ PARTITION_TYPE partition;
+ BLOCK_SIZE subsize;
+ RD_COST this_rdc;
+ BLOCK_SIZE subsize_ref =
+ (cpi->sf.adapt_partition_source_sad) ? BLOCK_8X8 : BLOCK_16X16;
+
+ vp9_rd_cost_reset(&this_rdc);
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
+ subsize = (bsize >= BLOCK_8X8) ? mi[0]->sb_type : BLOCK_4X4;
+ partition = partition_lookup[bsl][subsize];
+
+ if (bsize == BLOCK_32X32 && subsize == BLOCK_32X32) {
+ x->max_partition_size = BLOCK_32X32;
+ x->min_partition_size = BLOCK_16X16;
+ nonrd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, bsize, rd_cost,
+ 0, INT64_MAX, pc_tree);
+ } else if (bsize == BLOCK_32X32 && partition != PARTITION_NONE &&
+ subsize >= subsize_ref) {
+ x->max_partition_size = BLOCK_32X32;
+ x->min_partition_size = BLOCK_8X8;
+ nonrd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, bsize, rd_cost,
+ 0, INT64_MAX, pc_tree);
+ } else if (bsize == BLOCK_16X16 && partition != PARTITION_NONE) {
+ x->max_partition_size = BLOCK_16X16;
+ x->min_partition_size = BLOCK_8X8;
+ nonrd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, bsize, rd_cost,
+ 0, INT64_MAX, pc_tree);
+ } else {
+ switch (partition) {
+ case PARTITION_NONE:
+ pc_tree->none.pred_pixel_ready = 1;
+ nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, rd_cost, subsize,
+ &pc_tree->none);
+ pc_tree->none.mic = *xd->mi[0];
+ pc_tree->none.mbmi_ext = *x->mbmi_ext;
+ pc_tree->none.skip_txfm[0] = x->skip_txfm[0];
+ pc_tree->none.skip = x->skip;
+ break;
+ case PARTITION_VERT:
+ pc_tree->vertical[0].pred_pixel_ready = 1;
+ nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, rd_cost, subsize,
+ &pc_tree->vertical[0]);
+ pc_tree->vertical[0].mic = *xd->mi[0];
+ pc_tree->vertical[0].mbmi_ext = *x->mbmi_ext;
+ pc_tree->vertical[0].skip_txfm[0] = x->skip_txfm[0];
+ pc_tree->vertical[0].skip = x->skip;
+ if (mi_col + hbs < cm->mi_cols) {
+ pc_tree->vertical[1].pred_pixel_ready = 1;
+ nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs,
+ &this_rdc, subsize, &pc_tree->vertical[1]);
+ pc_tree->vertical[1].mic = *xd->mi[0];
+ pc_tree->vertical[1].mbmi_ext = *x->mbmi_ext;
+ pc_tree->vertical[1].skip_txfm[0] = x->skip_txfm[0];
+ pc_tree->vertical[1].skip = x->skip;
+ if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
+ rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
+ rd_cost->rate += this_rdc.rate;
+ rd_cost->dist += this_rdc.dist;
+ }
+ }
+ break;
+ case PARTITION_HORZ:
+ pc_tree->horizontal[0].pred_pixel_ready = 1;
+ nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, rd_cost, subsize,
+ &pc_tree->horizontal[0]);
+ pc_tree->horizontal[0].mic = *xd->mi[0];
+ pc_tree->horizontal[0].mbmi_ext = *x->mbmi_ext;
+ pc_tree->horizontal[0].skip_txfm[0] = x->skip_txfm[0];
+ pc_tree->horizontal[0].skip = x->skip;
+ if (mi_row + hbs < cm->mi_rows) {
+ pc_tree->horizontal[1].pred_pixel_ready = 1;
+ nonrd_pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col,
+ &this_rdc, subsize, &pc_tree->horizontal[1]);
+ pc_tree->horizontal[1].mic = *xd->mi[0];
+ pc_tree->horizontal[1].mbmi_ext = *x->mbmi_ext;
+ pc_tree->horizontal[1].skip_txfm[0] = x->skip_txfm[0];
+ pc_tree->horizontal[1].skip = x->skip;
+ if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
+ rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
+ rd_cost->rate += this_rdc.rate;
+ rd_cost->dist += this_rdc.dist;
+ }
+ }
+ break;
+ default:
+ assert(partition == PARTITION_SPLIT);
+ subsize = get_subsize(bsize, PARTITION_SPLIT);
+ nonrd_select_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
+ subsize, output_enabled, rd_cost,
+ pc_tree->split[0]);
+ nonrd_select_partition(cpi, td, tile_data, mi + hbs, tp, mi_row,
+ mi_col + hbs, subsize, output_enabled, &this_rdc,
+ pc_tree->split[1]);
+ if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
+ rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
+ rd_cost->rate += this_rdc.rate;
+ rd_cost->dist += this_rdc.dist;
+ }
+ nonrd_select_partition(cpi, td, tile_data, mi + hbs * mis, tp,
+ mi_row + hbs, mi_col, subsize, output_enabled,
+ &this_rdc, pc_tree->split[2]);
+ if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
+ rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
+ rd_cost->rate += this_rdc.rate;
+ rd_cost->dist += this_rdc.dist;
+ }
+ nonrd_select_partition(cpi, td, tile_data, mi + hbs * mis + hbs, tp,
+ mi_row + hbs, mi_col + hbs, subsize,
+ output_enabled, &this_rdc, pc_tree->split[3]);
+ if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
+ rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
+ rd_cost->rate += this_rdc.rate;
+ rd_cost->dist += this_rdc.dist;
+ }
+ break;
+ }
+ }
+
+ if (bsize == BLOCK_64X64 && output_enabled)
+ encode_sb_rt(cpi, td, tile_info, tp, mi_row, mi_col, 1, bsize, pc_tree);
+}
+
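+// Encode the block strictly following the partitioning stored in the
+// mode-info grid: no further partition search is done; modes are picked
+// with the non-RD path and each leaf is encoded (and counted) as visited.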
+static void nonrd_use_partition(VP9_COMP *cpi, ThreadData *td,
+ TileDataEnc *tile_data, MODE_INFO **mi,
+ TOKENEXTRA **tp, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, int output_enabled,
+ RD_COST *dummy_cost, PC_TREE *pc_tree) {
+ VP9_COMMON *const cm = &cpi->common;
+ TileInfo *tile_info = &tile_data->tile_info;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
+ const int mis = cm->mi_stride;
+ PARTITION_TYPE partition;
+ BLOCK_SIZE subsize;
+
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
+ subsize = (bsize >= BLOCK_8X8) ? mi[0]->sb_type : BLOCK_4X4;
+ partition = partition_lookup[bsl][subsize];
+
+ if (output_enabled && bsize != BLOCK_4X4) {
+ int ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
+ td->counts->partition[ctx][partition]++;
+ }
+
+ switch (partition) {
+ case PARTITION_NONE:
+ pc_tree->none.pred_pixel_ready = 1;
+ nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, dummy_cost,
+ subsize, &pc_tree->none);
+ pc_tree->none.mic = *xd->mi[0];
+ pc_tree->none.mbmi_ext = *x->mbmi_ext;
+ pc_tree->none.skip_txfm[0] = x->skip_txfm[0];
+ pc_tree->none.skip = x->skip;
+ encode_b_rt(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled,
+ subsize, &pc_tree->none);
+ break;
+ case PARTITION_VERT:
+ pc_tree->vertical[0].pred_pixel_ready = 1;
+ nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, dummy_cost,
+ subsize, &pc_tree->vertical[0]);
+ pc_tree->vertical[0].mic = *xd->mi[0];
+ pc_tree->vertical[0].mbmi_ext = *x->mbmi_ext;
+ pc_tree->vertical[0].skip_txfm[0] = x->skip_txfm[0];
+ pc_tree->vertical[0].skip = x->skip;
+ encode_b_rt(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled,
+ subsize, &pc_tree->vertical[0]);
+ if (mi_col + hbs < cm->mi_cols && bsize > BLOCK_8X8) {
+ pc_tree->vertical[1].pred_pixel_ready = 1;
+ nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs, dummy_cost,
+ subsize, &pc_tree->vertical[1]);
+ pc_tree->vertical[1].mic = *xd->mi[0];
+ pc_tree->vertical[1].mbmi_ext = *x->mbmi_ext;
+ pc_tree->vertical[1].skip_txfm[0] = x->skip_txfm[0];
+ pc_tree->vertical[1].skip = x->skip;
+ encode_b_rt(cpi, td, tile_info, tp, mi_row, mi_col + hbs,
+ output_enabled, subsize, &pc_tree->vertical[1]);
+ }
+ break;
+ case PARTITION_HORZ:
+ pc_tree->horizontal[0].pred_pixel_ready = 1;
+ nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, dummy_cost,
+ subsize, &pc_tree->horizontal[0]);
+ pc_tree->horizontal[0].mic = *xd->mi[0];
+ pc_tree->horizontal[0].mbmi_ext = *x->mbmi_ext;
+ pc_tree->horizontal[0].skip_txfm[0] = x->skip_txfm[0];
+ pc_tree->horizontal[0].skip = x->skip;
+ encode_b_rt(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled,
+ subsize, &pc_tree->horizontal[0]);
+
+ if (mi_row + hbs < cm->mi_rows && bsize > BLOCK_8X8) {
+ pc_tree->horizontal[1].pred_pixel_ready = 1;
+ nonrd_pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col, dummy_cost,
+ subsize, &pc_tree->horizontal[1]);
+ pc_tree->horizontal[1].mic = *xd->mi[0];
+ pc_tree->horizontal[1].mbmi_ext = *x->mbmi_ext;
+ pc_tree->horizontal[1].skip_txfm[0] = x->skip_txfm[0];
+ pc_tree->horizontal[1].skip = x->skip;
+ encode_b_rt(cpi, td, tile_info, tp, mi_row + hbs, mi_col,
+ output_enabled, subsize, &pc_tree->horizontal[1]);
+ }
+ break;
+ default:
+ assert(partition == PARTITION_SPLIT);
+ subsize = get_subsize(bsize, PARTITION_SPLIT);
+ if (bsize == BLOCK_8X8) {
+ nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, dummy_cost,
+ subsize, pc_tree->leaf_split[0]);
+ encode_b_rt(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled,
+ subsize, pc_tree->leaf_split[0]);
+ } else {
+ nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, subsize,
+ output_enabled, dummy_cost, pc_tree->split[0]);
+ nonrd_use_partition(cpi, td, tile_data, mi + hbs, tp, mi_row,
+ mi_col + hbs, subsize, output_enabled, dummy_cost,
+ pc_tree->split[1]);
+ nonrd_use_partition(cpi, td, tile_data, mi + hbs * mis, tp,
+ mi_row + hbs, mi_col, subsize, output_enabled,
+ dummy_cost, pc_tree->split[2]);
+ nonrd_use_partition(cpi, td, tile_data, mi + hbs * mis + hbs, tp,
+ mi_row + hbs, mi_col + hbs, subsize, output_enabled,
+ dummy_cost, pc_tree->split[3]);
+ }
+ break;
+ }
+
+ if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8)
+ update_partition_context(xd, mi_row, mi_col, subsize, bsize);
+}
+
+// Get a prediction (stored in x->est_pred) for the whole 64x64 superblock.
+static void get_estimated_pred(VP9_COMP *cpi, const TileInfo *const tile,
+ MACROBLOCK *x, int mi_row, int mi_col) {
+ VP9_COMMON *const cm = &cpi->common;
+ const int is_key_frame = frame_is_intra_only(cm);
+ MACROBLOCKD *xd = &x->e_mbd;
+
+ set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64);
+
+ if (!is_key_frame) {
+ MODE_INFO *mi = xd->mi[0];
+ YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
+ const YV12_BUFFER_CONFIG *yv12_g = NULL;
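+    // Pick the SAD block size for the coarse motion search: start from
+    // 32x32 and widen to 64 in each dimension where more than half of the
+    // 64x64 superblock lies inside the frame (32x32/32x64/64x32/64x64).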
+ const BLOCK_SIZE bsize = BLOCK_32X32 + (mi_col + 4 < cm->mi_cols) * 2 +
+ (mi_row + 4 < cm->mi_rows);
+ unsigned int y_sad_g, y_sad_thr;
+ unsigned int y_sad = UINT_MAX;
+
+ assert(yv12 != NULL);
+
+ if (!(is_one_pass_svc(cpi) && cpi->svc.spatial_layer_id) ||
+ cpi->svc.use_gf_temporal_ref_current_layer) {
+ // For now, GOLDEN will not be used for non-zero spatial layers, since
+ // it may not be a temporal reference.
+ yv12_g = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+ }
+
+ // Only compute y_sad_g (sad for golden reference) for speed < 8.
+ if (cpi->oxcf.speed < 8 && yv12_g && yv12_g != yv12 &&
+ (cpi->ref_frame_flags & VP9_GOLD_FLAG)) {
+ vp9_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
+ &cm->frame_refs[GOLDEN_FRAME - 1].sf);
+ y_sad_g = cpi->fn_ptr[bsize].sdf(
+ x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].pre[0].buf,
+ xd->plane[0].pre[0].stride);
+ } else {
+ y_sad_g = UINT_MAX;
+ }
+
+ if (cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR &&
+ cpi->rc.is_src_frame_alt_ref) {
+ yv12 = get_ref_frame_buffer(cpi, ALTREF_FRAME);
+ vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col,
+ &cm->frame_refs[ALTREF_FRAME - 1].sf);
+ mi->ref_frame[0] = ALTREF_FRAME;
+ y_sad_g = UINT_MAX;
+ } else {
+ vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col,
+ &cm->frame_refs[LAST_FRAME - 1].sf);
+ mi->ref_frame[0] = LAST_FRAME;
+ }
+ mi->ref_frame[1] = NONE;
+ mi->sb_type = BLOCK_64X64;
+ mi->mv[0].as_int = 0;
+ mi->interp_filter = BILINEAR;
+
+ {
+ const MV dummy_mv = { 0, 0 };
+ y_sad = vp9_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col,
+ &dummy_mv);
+ x->sb_use_mv_part = 1;
+ x->sb_mvcol_part = mi->mv[0].as_mv.col;
+ x->sb_mvrow_part = mi->mv[0].as_mv.row;
+ }
+
+    // Pick the reference frame for partitioning, biasing toward the last
+    // frame when y_sad_g and y_sad are close and short_circuit_low_temp_var
+    // is on.
+ y_sad_thr = cpi->sf.short_circuit_low_temp_var ? (y_sad * 7) >> 3 : y_sad;
+ if (y_sad_g < y_sad_thr) {
+ vp9_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
+ &cm->frame_refs[GOLDEN_FRAME - 1].sf);
+ mi->ref_frame[0] = GOLDEN_FRAME;
+ mi->mv[0].as_int = 0;
+ } else {
+ x->pred_mv[LAST_FRAME] = mi->mv[0].as_mv;
+ }
+
+ set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
+ xd->plane[0].dst.buf = x->est_pred;
+ xd->plane[0].dst.stride = 64;
+ vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_64X64);
+ } else {
+#if CONFIG_VP9_HIGHBITDEPTH
+ switch (xd->bd) {
+ case 8: memset(x->est_pred, 128, 64 * 64 * sizeof(x->est_pred[0])); break;
+ case 10:
+ memset(x->est_pred, 128 * 4, 64 * 64 * sizeof(x->est_pred[0]));
+ break;
+ case 12:
+ memset(x->est_pred, 128 * 16, 64 * 64 * sizeof(x->est_pred[0]));
+ break;
+ }
+#else
+ memset(x->est_pred, 128, 64 * 64 * sizeof(x->est_pred[0]));
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ }
+}
+
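+// Non-RD (real-time) encoding of one superblock row in a tile: for each
+// 64x64 block, pick a partition search strategy based on the active speed
+// features and segmentation, then partition, pick modes and encode it.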
+static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td,
+ TileDataEnc *tile_data, int mi_row,
+ TOKENEXTRA **tp) {
+ SPEED_FEATURES *const sf = &cpi->sf;
+ VP9_COMMON *const cm = &cpi->common;
+ TileInfo *const tile_info = &tile_data->tile_info;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int mi_col_start = tile_info->mi_col_start;
+ const int mi_col_end = tile_info->mi_col_end;
+ int mi_col;
+ const int sb_row = mi_row >> MI_BLOCK_SIZE_LOG2;
+ const int num_sb_cols =
+ get_num_cols(tile_data->tile_info, MI_BLOCK_SIZE_LOG2);
+ int sb_col_in_tile;
+
+ // Initialize the left context for the new SB row
+ memset(&xd->left_context, 0, sizeof(xd->left_context));
+ memset(xd->left_seg_context, 0, sizeof(xd->left_seg_context));
+
+ // Code each SB in the row
+ for (mi_col = mi_col_start, sb_col_in_tile = 0; mi_col < mi_col_end;
+ mi_col += MI_BLOCK_SIZE, ++sb_col_in_tile) {
+ const struct segmentation *const seg = &cm->seg;
+ RD_COST dummy_rdc;
+ const int idx_str = cm->mi_stride * mi_row + mi_col;
+ MODE_INFO **mi = cm->mi_grid_visible + idx_str;
+ PARTITION_SEARCH_TYPE partition_search_type = sf->partition_search_type;
+ BLOCK_SIZE bsize = BLOCK_64X64;
+ int seg_skip = 0;
+ int i;
+
+ (*(cpi->row_mt_sync_read_ptr))(&tile_data->row_mt_sync, sb_row,
+ sb_col_in_tile);
+
+ if (cpi->use_skin_detection) {
+ vp9_compute_skin_sb(cpi, BLOCK_16X16, mi_row, mi_col);
+ }
+
+ x->source_variance = UINT_MAX;
+ for (i = 0; i < MAX_REF_FRAMES; ++i) {
+ x->pred_mv[i].row = INT16_MAX;
+ x->pred_mv[i].col = INT16_MAX;
+ }
+ vp9_rd_cost_init(&dummy_rdc);
+ x->color_sensitivity[0] = 0;
+ x->color_sensitivity[1] = 0;
+ x->sb_is_skin = 0;
+ x->skip_low_source_sad = 0;
+ x->lowvar_highsumdiff = 0;
+ x->content_state_sb = 0;
+ x->zero_temp_sad_source = 0;
+ x->sb_use_mv_part = 0;
+ x->sb_mvcol_part = 0;
+ x->sb_mvrow_part = 0;
+ x->sb_pickmode_part = 0;
+ x->arf_frame_usage = 0;
+ x->lastgolden_frame_usage = 0;
+
+ if (cpi->compute_source_sad_onepass && cpi->sf.use_source_sad) {
+ int shift = cpi->Source->y_stride * (mi_row << 3) + (mi_col << 3);
+ int sb_offset2 = ((cm->mi_cols + 7) >> 3) * (mi_row >> 3) + (mi_col >> 3);
+ int64_t source_sad = avg_source_sad(cpi, x, shift, sb_offset2);
+ if (sf->adapt_partition_source_sad &&
+ (cpi->oxcf.rc_mode == VPX_VBR && !cpi->rc.is_src_frame_alt_ref &&
+ source_sad > sf->adapt_partition_thresh &&
+ (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)))
+ partition_search_type = REFERENCE_PARTITION;
+ }
+
+ if (seg->enabled) {
+ const uint8_t *const map =
+ seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map;
+ int segment_id = get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col);
+ seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP);
+
+ if (cpi->roi.enabled && cpi->roi.skip[BACKGROUND_SEG_SKIP_ID] &&
+ cpi->rc.frames_since_key > FRAMES_NO_SKIPPING_AFTER_KEY &&
+ x->content_state_sb > kLowSadLowSumdiff) {
+ // For ROI with skip, force segment = 0 (no skip) over whole
+ // superblock to avoid artifacts if temporal change in source_sad is
+ // not 0.
+ int xi, yi;
+ const int bw = num_8x8_blocks_wide_lookup[BLOCK_64X64];
+ const int bh = num_8x8_blocks_high_lookup[BLOCK_64X64];
+ const int xmis = VPXMIN(cm->mi_cols - mi_col, bw);
+ const int ymis = VPXMIN(cm->mi_rows - mi_row, bh);
+ const int block_index = mi_row * cm->mi_cols + mi_col;
+ set_mode_info_offsets(cm, x, xd, mi_row, mi_col);
+ for (yi = 0; yi < ymis; yi++)
+ for (xi = 0; xi < xmis; xi++) {
+ int map_offset = block_index + yi * cm->mi_cols + xi;
+ cpi->segmentation_map[map_offset] = 0;
+ }
+ set_segment_index(cpi, x, mi_row, mi_col, BLOCK_64X64, 0);
+ seg_skip = 0;
+ }
+ if (seg_skip) {
+ partition_search_type = FIXED_PARTITION;
+ }
+ }
+
+ // Set the partition type of the 64X64 block
+ switch (partition_search_type) {
+ case VAR_BASED_PARTITION:
+ // TODO(jingning, marpan): The mode decision and encoding process
+ // support both intra and inter sub8x8 block coding for RTC mode.
+ // Tune the thresholds accordingly to use sub8x8 block coding for
+ // coding performance improvement.
+ choose_partitioning(cpi, tile_info, x, mi_row, mi_col);
+ nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
+ BLOCK_64X64, 1, &dummy_rdc, td->pc_root);
+ break;
+ case ML_BASED_PARTITION:
+ get_estimated_pred(cpi, tile_info, x, mi_row, mi_col);
+ x->max_partition_size = BLOCK_64X64;
+ x->min_partition_size = BLOCK_8X8;
+ x->sb_pickmode_part = 1;
+ nonrd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col,
+ BLOCK_64X64, &dummy_rdc, 1, INT64_MAX,
+ td->pc_root);
+ break;
+ case SOURCE_VAR_BASED_PARTITION:
+ set_source_var_based_partition(cpi, tile_info, x, mi, mi_row, mi_col);
+ nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
+ BLOCK_64X64, 1, &dummy_rdc, td->pc_root);
+ break;
+ case FIXED_PARTITION:
+ if (!seg_skip) bsize = sf->always_this_block_size;
+ set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
+ nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
+ BLOCK_64X64, 1, &dummy_rdc, td->pc_root);
+ break;
+ default:
+ assert(partition_search_type == REFERENCE_PARTITION);
+ x->sb_pickmode_part = 1;
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64);
+ // Use nonrd_pick_partition on scene-cut for VBR mode.
+ // nonrd_pick_partition does not support 4x4 partition, so avoid it
+ // on key frame for now.
+ if ((cpi->oxcf.rc_mode == VPX_VBR && cpi->rc.high_source_sad &&
+ cpi->oxcf.speed < 6 && !frame_is_intra_only(cm) &&
+ (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) {
+          // Use a lower max_partition_size for low resolutions.
+ if (cm->width <= 352 && cm->height <= 288)
+ x->max_partition_size = BLOCK_32X32;
+ else
+ x->max_partition_size = BLOCK_64X64;
+ x->min_partition_size = BLOCK_8X8;
+ nonrd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col,
+ BLOCK_64X64, &dummy_rdc, 1, INT64_MAX,
+ td->pc_root);
+ } else {
+ choose_partitioning(cpi, tile_info, x, mi_row, mi_col);
+ // TODO(marpan): Seems like nonrd_select_partition does not support
+ // 4x4 partition. Since 4x4 is used on key frame, use this switch
+ // for now.
+ if (frame_is_intra_only(cm))
+ nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
+ BLOCK_64X64, 1, &dummy_rdc, td->pc_root);
+ else
+ nonrd_select_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
+ BLOCK_64X64, 1, &dummy_rdc, td->pc_root);
+ }
+
+ break;
+ }
+
+ // Update ref_frame usage for inter frame if this group is ARF group.
+ if (!cpi->rc.is_src_frame_alt_ref && !cpi->refresh_golden_frame &&
+ !cpi->refresh_alt_ref_frame && cpi->rc.alt_ref_gf_group &&
+ cpi->sf.use_altref_onepass) {
+ int sboffset = ((cm->mi_cols + 7) >> 3) * (mi_row >> 3) + (mi_col >> 3);
+ if (cpi->count_arf_frame_usage != NULL)
+ cpi->count_arf_frame_usage[sboffset] = x->arf_frame_usage;
+ if (cpi->count_lastgolden_frame_usage != NULL)
+ cpi->count_lastgolden_frame_usage[sboffset] = x->lastgolden_frame_usage;
+ }
+
+ (*(cpi->row_mt_sync_write_ptr))(&tile_data->row_mt_sync, sb_row,
+ sb_col_in_tile, num_sb_cols);
+ }
+}
+// end RTC play code
+
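+// Variance proxy for a 16x16 block (sum of squared deviations from the
+// mean): sse - sum^2 / 256; the >> 8 accounts for the block's 256 pixels.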
+static INLINE uint32_t variance(const Diff *const d) {
+ return d->sse - (uint32_t)(((int64_t)d->sum * d->sum) >> 8);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE uint32_t variance_highbd(Diff *const d) {
+ const int64_t var = (int64_t)d->sse - (((int64_t)d->sum * d->sum) >> 8);
+ return (var >= 0) ? (uint32_t)var : 0;
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+static int set_var_thresh_from_histogram(VP9_COMP *cpi) {
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ const VP9_COMMON *const cm = &cpi->common;
+
+ const uint8_t *src = cpi->Source->y_buffer;
+ const uint8_t *last_src = cpi->Last_Source->y_buffer;
+ const int src_stride = cpi->Source->y_stride;
+ const int last_stride = cpi->Last_Source->y_stride;
+
+ // Pick cutoff threshold
+ const int cutoff = (VPXMIN(cm->width, cm->height) >= 720)
+ ? (cm->MBs * VAR_HIST_LARGE_CUT_OFF / 100)
+ : (cm->MBs * VAR_HIST_SMALL_CUT_OFF / 100);
+ DECLARE_ALIGNED(16, int, hist[VAR_HIST_BINS]);
+ Diff *var16 = cpi->source_diff_var;
+
+ int sum = 0;
+ int i, j;
+
+ memset(hist, 0, VAR_HIST_BINS * sizeof(hist[0]));
+
+ for (i = 0; i < cm->mb_rows; i++) {
+ for (j = 0; j < cm->mb_cols; j++) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cm->use_highbitdepth) {
+ switch (cm->bit_depth) {
+ case VPX_BITS_8:
+ vpx_highbd_8_get16x16var(src, src_stride, last_src, last_stride,
+ &var16->sse, &var16->sum);
+ var16->var = variance(var16);
+ break;
+ case VPX_BITS_10:
+ vpx_highbd_10_get16x16var(src, src_stride, last_src, last_stride,
+ &var16->sse, &var16->sum);
+ var16->var = variance_highbd(var16);
+ break;
+ default:
+ assert(cm->bit_depth == VPX_BITS_12);
+ vpx_highbd_12_get16x16var(src, src_stride, last_src, last_stride,
+ &var16->sse, &var16->sum);
+ var16->var = variance_highbd(var16);
+ break;
+ }
+ } else {
+ vpx_get16x16var(src, src_stride, last_src, last_stride, &var16->sse,
+ &var16->sum);
+ var16->var = variance(var16);
+ }
+#else
+ vpx_get16x16var(src, src_stride, last_src, last_stride, &var16->sse,
+ &var16->sum);
+ var16->var = variance(var16);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ if (var16->var >= VAR_HIST_MAX_BG_VAR)
+ hist[VAR_HIST_BINS - 1]++;
+ else
+ hist[var16->var / VAR_HIST_FACTOR]++;
+
+ src += 16;
+ last_src += 16;
+ var16++;
+ }
+
+ src = src - cm->mb_cols * 16 + 16 * src_stride;
+ last_src = last_src - cm->mb_cols * 16 + 16 * last_stride;
+ }
+
+ cpi->source_var_thresh = 0;
+
+ if (hist[VAR_HIST_BINS - 1] < cutoff) {
+ for (i = 0; i < VAR_HIST_BINS - 1; i++) {
+ sum += hist[i];
+
+ if (sum > cutoff) {
+ cpi->source_var_thresh = (i + 1) * VAR_HIST_FACTOR;
+ return 0;
+ }
+ }
+ }
+
+ return sf->search_type_check_frequency;
+}
+
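+// Select the partition search method when SOURCE_VAR_BASED_PARTITION is
+// enabled: full search on key frames, fixed partitioning on intra-only
+// frames; otherwise a periodic variance-histogram check decides for how
+// many frames fixed partitioning may be reused before re-evaluating.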
+static void source_var_based_partition_search_method(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ SPEED_FEATURES *const sf = &cpi->sf;
+
+ if (cm->frame_type == KEY_FRAME) {
+ // For key frame, use SEARCH_PARTITION.
+ sf->partition_search_type = SEARCH_PARTITION;
+ } else if (cm->intra_only) {
+ sf->partition_search_type = FIXED_PARTITION;
+ } else {
+ if (cm->last_width != cm->width || cm->last_height != cm->height) {
+ if (cpi->source_diff_var) vpx_free(cpi->source_diff_var);
+
+ CHECK_MEM_ERROR(&cm->error, cpi->source_diff_var,
+                      vpx_calloc(cm->MBs, sizeof(*cpi->source_diff_var)));
+ }
+
+ if (!cpi->frames_till_next_var_check)
+ cpi->frames_till_next_var_check = set_var_thresh_from_histogram(cpi);
+
+ if (cpi->frames_till_next_var_check > 0) {
+ sf->partition_search_type = FIXED_PARTITION;
+ cpi->frames_till_next_var_check--;
+ }
+ }
+}
+
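+// The encode stage may be skipped only for shown inter frames in which
+// intra blocks account for less than a quarter of the inter blocks,
+// judged from the collected intra/inter counts.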
+static int get_skip_encode_frame(const VP9_COMMON *cm, ThreadData *const td) {
+ unsigned int intra_count = 0, inter_count = 0;
+ int j;
+
+ for (j = 0; j < INTRA_INTER_CONTEXTS; ++j) {
+ intra_count += td->counts->intra_inter[j][0];
+ inter_count += td->counts->intra_inter[j][1];
+ }
+
+ return (intra_count << 2) < inter_count && cm->frame_type != KEY_FRAME &&
+ cm->show_frame;
+}
+
+void vp9_init_tile_data(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ const int tile_cols = 1 << cm->log2_tile_cols;
+ const int tile_rows = 1 << cm->log2_tile_rows;
+ int tile_col, tile_row;
+ TOKENEXTRA *pre_tok = cpi->tile_tok[0][0];
+ TOKENLIST *tplist = cpi->tplist[0][0];
+ int tile_tok = 0;
+ int tplist_count = 0;
+
+ if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows) {
+ if (cpi->tile_data != NULL) vpx_free(cpi->tile_data);
+ CHECK_MEM_ERROR(
+ &cm->error, cpi->tile_data,
+ vpx_malloc(tile_cols * tile_rows * sizeof(*cpi->tile_data)));
+ cpi->allocated_tiles = tile_cols * tile_rows;
+
+ for (tile_row = 0; tile_row < tile_rows; ++tile_row)
+ for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
+ TileDataEnc *tile_data =
+ &cpi->tile_data[tile_row * tile_cols + tile_col];
+ int i, j;
+ for (i = 0; i < BLOCK_SIZES; ++i) {
+ for (j = 0; j < MAX_MODES; ++j) {
+ tile_data->thresh_freq_fact[i][j] = RD_THRESH_INIT_FACT;
+ tile_data->thresh_freq_fact_prev[i][j] = RD_THRESH_INIT_FACT;
+ tile_data->mode_map[i][j] = j;
+ }
+ }
+#if CONFIG_MULTITHREAD
+ tile_data->row_base_thresh_freq_fact = NULL;
+#endif
+ }
+ }
+
+ for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
+ for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
+ TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col];
+ TileInfo *tile_info = &this_tile->tile_info;
+ if (cpi->sf.adaptive_rd_thresh_row_mt &&
+ this_tile->row_base_thresh_freq_fact == NULL)
+ vp9_row_mt_alloc_rd_thresh(cpi, this_tile);
+ vp9_tile_init(tile_info, cm, tile_row, tile_col);
+
+ cpi->tile_tok[tile_row][tile_col] = pre_tok + tile_tok;
+ pre_tok = cpi->tile_tok[tile_row][tile_col];
+ tile_tok = allocated_tokens(*tile_info);
+
+ cpi->tplist[tile_row][tile_col] = tplist + tplist_count;
+ tplist = cpi->tplist[tile_row][tile_col];
+ tplist_count = get_num_vert_units(*tile_info, MI_BLOCK_SIZE_LOG2);
+ }
+ }
+}
+
+void vp9_encode_sb_row(VP9_COMP *cpi, ThreadData *td, int tile_row,
+ int tile_col, int mi_row) {
+ VP9_COMMON *const cm = &cpi->common;
+ const int tile_cols = 1 << cm->log2_tile_cols;
+ TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col];
+ const TileInfo *const tile_info = &this_tile->tile_info;
+ TOKENEXTRA *tok = NULL;
+ int tile_sb_row;
+ int tile_mb_cols = (tile_info->mi_col_end - tile_info->mi_col_start + 1) >> 1;
+
+ tile_sb_row = mi_cols_aligned_to_sb(mi_row - tile_info->mi_row_start) >>
+ MI_BLOCK_SIZE_LOG2;
+ get_start_tok(cpi, tile_row, tile_col, mi_row, &tok);
+ cpi->tplist[tile_row][tile_col][tile_sb_row].start = tok;
+
+#if CONFIG_REALTIME_ONLY
+ assert(cpi->sf.use_nonrd_pick_mode);
+ encode_nonrd_sb_row(cpi, td, this_tile, mi_row, &tok);
+#else
+ if (cpi->sf.use_nonrd_pick_mode)
+ encode_nonrd_sb_row(cpi, td, this_tile, mi_row, &tok);
+ else
+ encode_rd_sb_row(cpi, td, this_tile, mi_row, &tok);
+#endif
+
+ cpi->tplist[tile_row][tile_col][tile_sb_row].stop = tok;
+ cpi->tplist[tile_row][tile_col][tile_sb_row].count =
+ (unsigned int)(cpi->tplist[tile_row][tile_col][tile_sb_row].stop -
+ cpi->tplist[tile_row][tile_col][tile_sb_row].start);
+ assert(tok - cpi->tplist[tile_row][tile_col][tile_sb_row].start <=
+ get_token_alloc(MI_BLOCK_SIZE >> 1, tile_mb_cols));
+
+ (void)tile_mb_cols;
+}
+
+void vp9_encode_tile(VP9_COMP *cpi, ThreadData *td, int tile_row,
+ int tile_col) {
+ VP9_COMMON *const cm = &cpi->common;
+ const int tile_cols = 1 << cm->log2_tile_cols;
+ TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col];
+ const TileInfo *const tile_info = &this_tile->tile_info;
+ const int mi_row_start = tile_info->mi_row_start;
+ const int mi_row_end = tile_info->mi_row_end;
+ int mi_row;
+
+ for (mi_row = mi_row_start; mi_row < mi_row_end; mi_row += MI_BLOCK_SIZE)
+ vp9_encode_sb_row(cpi, td, tile_row, tile_col, mi_row);
+}
+
+static void encode_tiles(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ const int tile_cols = 1 << cm->log2_tile_cols;
+ const int tile_rows = 1 << cm->log2_tile_rows;
+ int tile_col, tile_row;
+
+ vp9_init_tile_data(cpi);
+
+ for (tile_row = 0; tile_row < tile_rows; ++tile_row)
+ for (tile_col = 0; tile_col < tile_cols; ++tile_col)
+ vp9_encode_tile(cpi, &cpi->td, tile_row, tile_col);
+}
+
+static int compare_kmeans_data(const void *a, const void *b) {
+ if (((const KMEANS_DATA *)a)->value > ((const KMEANS_DATA *)b)->value) {
+ return 1;
+ } else if (((const KMEANS_DATA *)a)->value <
+ ((const KMEANS_DATA *)b)->value) {
+ return -1;
+ } else {
+ return 0;
+ }
+}
+
+static void compute_boundary_ls(const double *ctr_ls, int k,
+ double *boundary_ls) {
+ // boundary_ls[j] is the upper bound of data centered at ctr_ls[j]
+ int j;
+ for (j = 0; j < k - 1; ++j) {
+ boundary_ls[j] = (ctr_ls[j] + ctr_ls[j + 1]) / 2.;
+ }
+ boundary_ls[k - 1] = DBL_MAX;
+}
+
+int vp9_get_group_idx(double value, double *boundary_ls, int k) {
+ int group_idx = 0;
+ while (value >= boundary_ls[group_idx]) {
+ ++group_idx;
+ if (group_idx == k - 1) {
+ break;
+ }
+ }
+ return group_idx;
+}
+
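+// One-dimensional k-means over the |size| values in |arr|: the data is
+// sorted, centers are initialized at evenly spaced quantiles, and ten
+// Lloyd iterations alternately assign each value to a group (via the
+// midpoint boundaries) and recompute each center as its group mean.
+// Outputs the centers, group boundaries and per-group counts, and writes
+// each sample's final group index back into |arr|.
+//
+// Illustrative use (names hypothetical):
+//   double ctr[3], boundary[3]; int count[3];
+//   vp9_kmeans(ctr, boundary, count, 3, data, num_samples);
+//   group = vp9_get_group_idx(value, boundary, 3);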
+void vp9_kmeans(double *ctr_ls, double *boundary_ls, int *count_ls, int k,
+ KMEANS_DATA *arr, int size) {
+ int i, j;
+ int itr;
+ int group_idx;
+ double sum[MAX_KMEANS_GROUPS];
+ int count[MAX_KMEANS_GROUPS];
+
+ vpx_clear_system_state();
+
+ assert(k >= 2 && k <= MAX_KMEANS_GROUPS);
+
+ qsort(arr, size, sizeof(*arr), compare_kmeans_data);
+
+ // initialize the center points
+ for (j = 0; j < k; ++j) {
+ ctr_ls[j] = arr[(size * (2 * j + 1)) / (2 * k)].value;
+ }
+
+ for (itr = 0; itr < 10; ++itr) {
+ compute_boundary_ls(ctr_ls, k, boundary_ls);
+ for (i = 0; i < MAX_KMEANS_GROUPS; ++i) {
+ sum[i] = 0;
+ count[i] = 0;
+ }
+
+ // Both the data and centers are sorted in ascending order.
+ // As each data point is processed in order, its corresponding group index
+ // can only increase. So we only need to reset the group index to zero here.
+ group_idx = 0;
+ for (i = 0; i < size; ++i) {
+ while (arr[i].value >= boundary_ls[group_idx]) {
+ // place samples into clusters
+ ++group_idx;
+ if (group_idx == k - 1) {
+ break;
+ }
+ }
+ sum[group_idx] += arr[i].value;
+ ++count[group_idx];
+ }
+
+ for (group_idx = 0; group_idx < k; ++group_idx) {
+ if (count[group_idx] > 0)
+ ctr_ls[group_idx] = sum[group_idx] / count[group_idx];
+
+ sum[group_idx] = 0;
+ count[group_idx] = 0;
+ }
+ }
+
+ // compute group_idx, boundary_ls and count_ls
+ for (j = 0; j < k; ++j) {
+ count_ls[j] = 0;
+ }
+ compute_boundary_ls(ctr_ls, k, boundary_ls);
+ group_idx = 0;
+ for (i = 0; i < size; ++i) {
+ while (arr[i].value >= boundary_ls[group_idx]) {
+ ++group_idx;
+ if (group_idx == k - 1) {
+ break;
+ }
+ }
+ arr[i].group_idx = group_idx;
+ ++count_ls[group_idx];
+ }
+}
+
+static void encode_frame_internal(VP9_COMP *cpi) {
+ SPEED_FEATURES *const sf = &cpi->sf;
+ ThreadData *const td = &cpi->td;
+ MACROBLOCK *const x = &td->mb;
+ VP9_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int gf_group_index = cpi->twopass.gf_group.index;
+
+ xd->mi = cm->mi_grid_visible;
+ xd->mi[0] = cm->mi;
+ vp9_zero(*td->counts);
+ vp9_zero(cpi->td.rd_counts);
+
+ xd->lossless = cm->base_qindex == 0 && cm->y_dc_delta_q == 0 &&
+ cm->uv_dc_delta_q == 0 && cm->uv_ac_delta_q == 0;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cm->use_highbitdepth)
+ x->fwd_txfm4x4 = xd->lossless ? vp9_highbd_fwht4x4 : vpx_highbd_fdct4x4;
+ else
+ x->fwd_txfm4x4 = xd->lossless ? vp9_fwht4x4 : vpx_fdct4x4;
+ x->highbd_inv_txfm_add =
+ xd->lossless ? vp9_highbd_iwht4x4_add : vp9_highbd_idct4x4_add;
+#else
+ x->fwd_txfm4x4 = xd->lossless ? vp9_fwht4x4 : vpx_fdct4x4;
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ x->inv_txfm_add = xd->lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;
+ x->optimize = sf->optimize_coefficients == 1 && cpi->oxcf.pass != 1;
+ if (xd->lossless) x->optimize = 0;
+ x->sharpness = cpi->oxcf.sharpness;
+ x->adjust_rdmult_by_segment = (cpi->oxcf.aq_mode == VARIANCE_AQ);
+
+ cm->tx_mode = select_tx_mode(cpi, xd);
+
+ vp9_frame_init_quantizer(cpi);
+
+ vp9_initialize_rd_consts(cpi);
+ vp9_initialize_me_consts(cpi, x, cm->base_qindex);
+ init_encode_frame_mb_context(cpi);
+ cm->use_prev_frame_mvs =
+ !cm->error_resilient_mode && cm->width == cm->last_width &&
+ cm->height == cm->last_height && !cm->intra_only && cm->last_show_frame;
+ // Special case: set prev_mi to NULL when the previous mode info
+ // context cannot be used.
+ cm->prev_mi =
+ cm->use_prev_frame_mvs ? cm->prev_mip + cm->mi_stride + 1 : NULL;
+
+ x->quant_fp = cpi->sf.use_quant_fp;
+ vp9_zero(x->skip_txfm);
+ if (sf->use_nonrd_pick_mode) {
+ // Initialize internal buffer pointers for rtc coding, where non-RD
+ // mode decision is used and hence no buffer pointer swap needed.
+ int i;
+ struct macroblock_plane *const p = x->plane;
+ struct macroblockd_plane *const pd = xd->plane;
+ PICK_MODE_CONTEXT *ctx = &cpi->td.pc_root->none;
+
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ p[i].coeff = ctx->coeff_pbuf[i][0];
+ p[i].qcoeff = ctx->qcoeff_pbuf[i][0];
+ pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][0];
+ p[i].eobs = ctx->eobs_pbuf[i][0];
+ }
+ vp9_zero(x->zcoeff_blk);
+
+ if (cm->frame_type != KEY_FRAME && cpi->rc.frames_since_golden == 0 &&
+ !(cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR) &&
+ !cpi->use_svc)
+ cpi->ref_frame_flags &= (~VP9_GOLD_FLAG);
+
+ if (sf->partition_search_type == SOURCE_VAR_BASED_PARTITION)
+ source_var_based_partition_search_method(cpi);
+ } else if (gf_group_index && gf_group_index < MAX_ARF_GOP_SIZE &&
+ cpi->sf.enable_tpl_model) {
+ TplDepFrame *tpl_frame = &cpi->tpl_stats[cpi->twopass.gf_group.index];
+ TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
+
+ int tpl_stride = tpl_frame->stride;
+ int64_t intra_cost_base = 0;
+ int64_t mc_dep_cost_base = 0;
+ int row, col;
+
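+    // Accumulate frame-level TPL statistics; r0 is the ratio of the total
+    // intra cost to the total motion-compensated dependency cost (roughly,
+    // a lower r0 means more of this frame propagates to later frames).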
+ for (row = 0; row < cm->mi_rows && tpl_frame->is_valid; ++row) {
+ for (col = 0; col < cm->mi_cols; ++col) {
+ TplDepStats *this_stats = &tpl_stats[row * tpl_stride + col];
+ intra_cost_base += this_stats->intra_cost;
+ mc_dep_cost_base += this_stats->mc_dep_cost;
+ }
+ }
+
+ vpx_clear_system_state();
+
+ if (tpl_frame->is_valid)
+ cpi->rd.r0 = (double)intra_cost_base / mc_dep_cost_base;
+ }
+
+ // Frame segmentation
+ if (cpi->oxcf.aq_mode == PERCEPTUAL_AQ) build_kmeans_segmentation(cpi);
+
+ {
+ struct vpx_usec_timer emr_timer;
+ vpx_usec_timer_start(&emr_timer);
+
+ if (!cpi->row_mt) {
+ cpi->row_mt_sync_read_ptr = vp9_row_mt_sync_read_dummy;
+ cpi->row_mt_sync_write_ptr = vp9_row_mt_sync_write_dummy;
+      // When row-based multi-threading is disabled, encode tiles in
+      // parallel if allowed, with one thread handling one tile.
+ if (VPXMIN(cpi->oxcf.max_threads, 1 << cm->log2_tile_cols) > 1)
+ vp9_encode_tiles_mt(cpi);
+ else
+ encode_tiles(cpi);
+ } else {
+ cpi->row_mt_sync_read_ptr = vp9_row_mt_sync_read;
+ cpi->row_mt_sync_write_ptr = vp9_row_mt_sync_write;
+ vp9_encode_tiles_row_mt(cpi);
+ }
+
+ vpx_usec_timer_mark(&emr_timer);
+ cpi->time_encode_sb_row += vpx_usec_timer_elapsed(&emr_timer);
+ }
+
+ sf->skip_encode_frame =
+ sf->skip_encode_sb ? get_skip_encode_frame(cm, td) : 0;
+
+#if 0
+ // Keep record of the total distortion this time around for future use
+ cpi->last_frame_distortion = cpi->frame_distortion;
+#endif
+}
+
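+// Pick a fixed interpolation filter for the frame from the accumulated
+// per-filter RD thresholds; EIGHTTAP_SMOOTH is never chosen for ALT_REF
+// frames, and SWITCHABLE is kept when no single filter clearly wins.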
+static INTERP_FILTER get_interp_filter(
+ const int64_t threshes[SWITCHABLE_FILTER_CONTEXTS], int is_alt_ref) {
+ if (!is_alt_ref && threshes[EIGHTTAP_SMOOTH] > threshes[EIGHTTAP] &&
+ threshes[EIGHTTAP_SMOOTH] > threshes[EIGHTTAP_SHARP] &&
+ threshes[EIGHTTAP_SMOOTH] > threshes[SWITCHABLE - 1]) {
+ return EIGHTTAP_SMOOTH;
+ } else if (threshes[EIGHTTAP_SHARP] > threshes[EIGHTTAP] &&
+ threshes[EIGHTTAP_SHARP] > threshes[SWITCHABLE - 1]) {
+ return EIGHTTAP_SHARP;
+ } else if (threshes[EIGHTTAP] > threshes[SWITCHABLE - 1]) {
+ return EIGHTTAP;
+ } else {
+ return SWITCHABLE;
+ }
+}
+
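+// Average of the segment-level Q delta (SEG_LVL_ALT_Q) over all 8x8
+// mode-info units in the frame; reported as the frame's mean AQ offset.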
+static int compute_frame_aq_offset(struct VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ MODE_INFO **mi_8x8_ptr = cm->mi_grid_visible;
+ struct segmentation *const seg = &cm->seg;
+
+ int mi_row, mi_col;
+ int sum_delta = 0;
+ int qdelta_index;
+ int segment_id;
+
+ for (mi_row = 0; mi_row < cm->mi_rows; mi_row++) {
+ MODE_INFO **mi_8x8 = mi_8x8_ptr;
+ for (mi_col = 0; mi_col < cm->mi_cols; mi_col++, mi_8x8++) {
+ segment_id = mi_8x8[0]->segment_id;
+ qdelta_index = get_segdata(seg, segment_id, SEG_LVL_ALT_Q);
+ sum_delta += qdelta_index;
+ }
+ mi_8x8_ptr += cm->mi_stride;
+ }
+
+ return sum_delta / (cm->mi_rows * cm->mi_cols);
+}
+
+static void restore_encode_params(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ int tile_idx;
+ int i, j;
+ TileDataEnc *tile_data;
+ RD_OPT *rd_opt = &cpi->rd;
+ for (i = 0; i < MAX_REF_FRAMES; i++) {
+ for (j = 0; j < REFERENCE_MODES; j++)
+ rd_opt->prediction_type_threshes[i][j] =
+ rd_opt->prediction_type_threshes_prev[i][j];
+
+ for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; j++)
+ rd_opt->filter_threshes[i][j] = rd_opt->filter_threshes_prev[i][j];
+ }
+
+ for (tile_idx = 0; tile_idx < cpi->allocated_tiles; tile_idx++) {
+ assert(cpi->tile_data);
+ tile_data = &cpi->tile_data[tile_idx];
+ vp9_copy(tile_data->thresh_freq_fact, tile_data->thresh_freq_fact_prev);
+ }
+
+ cm->interp_filter = cpi->sf.default_interp_filter;
+}
+
+void vp9_encode_frame(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+
+ restore_encode_params(cpi);
+
+#if CONFIG_MISMATCH_DEBUG
+ mismatch_reset_frame(MAX_MB_PLANE);
+#endif
+
+ // In the longer term the encoder should be generalized to match the
+ // decoder such that we allow compound where one of the 3 buffers has a
+ // different sign bias and that buffer is then the fixed ref. However, this
+ // requires further work in the rd loop. For now the only supported encoder
+ // side behavior is where the ALT ref buffer has opposite sign bias to
+ // the other two.
+ if (!frame_is_intra_only(cm)) {
+ if (vp9_compound_reference_allowed(cm)) {
+ cpi->allow_comp_inter_inter = 1;
+ vp9_setup_compound_reference_mode(cm);
+ } else {
+ cpi->allow_comp_inter_inter = 0;
+ }
+ }
+
+ if (cpi->sf.frame_parameter_update) {
+ int i;
+ RD_OPT *const rd_opt = &cpi->rd;
+ FRAME_COUNTS *counts = cpi->td.counts;
+ RD_COUNTS *const rdc = &cpi->td.rd_counts;
+
+ // This code does a single RD pass over the whole frame assuming
+ // either compound, single or hybrid prediction as per whatever has
+ // worked best for that type of frame in the past.
+ // It also predicts whether another coding mode would have worked
+ // better than this coding mode. If that is the case, it remembers
+ // that for subsequent frames.
+ // It also does the same analysis for transform size selection.
+ const MV_REFERENCE_FRAME frame_type = get_frame_type(cpi);
+ int64_t *const mode_thrs = rd_opt->prediction_type_threshes[frame_type];
+ int64_t *const filter_thrs = rd_opt->filter_threshes[frame_type];
+ const int is_alt_ref = frame_type == ALTREF_FRAME;
+
+ /* prediction (compound, single or hybrid) mode selection */
+ if (is_alt_ref || !cpi->allow_comp_inter_inter)
+ cm->reference_mode = SINGLE_REFERENCE;
+ else if (mode_thrs[COMPOUND_REFERENCE] > mode_thrs[SINGLE_REFERENCE] &&
+ mode_thrs[COMPOUND_REFERENCE] > mode_thrs[REFERENCE_MODE_SELECT] &&
+ check_dual_ref_flags(cpi) && cpi->static_mb_pct == 100)
+ cm->reference_mode = COMPOUND_REFERENCE;
+ else if (mode_thrs[SINGLE_REFERENCE] > mode_thrs[REFERENCE_MODE_SELECT])
+ cm->reference_mode = SINGLE_REFERENCE;
+ else
+ cm->reference_mode = REFERENCE_MODE_SELECT;
+
+ if (cm->interp_filter == SWITCHABLE)
+ cm->interp_filter = get_interp_filter(filter_thrs, is_alt_ref);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, encode_frame_internal_time);
+#endif
+ encode_frame_internal(cpi);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, encode_frame_internal_time);
+#endif
+
+ for (i = 0; i < REFERENCE_MODES; ++i)
+ mode_thrs[i] = (mode_thrs[i] + rdc->comp_pred_diff[i] / cm->MBs) / 2;
+
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
+ filter_thrs[i] = (filter_thrs[i] + rdc->filter_diff[i] / cm->MBs) / 2;
+
+ if (cm->reference_mode == REFERENCE_MODE_SELECT) {
+ int single_count_zero = 0;
+ int comp_count_zero = 0;
+
+ for (i = 0; i < COMP_INTER_CONTEXTS; i++) {
+ single_count_zero += counts->comp_inter[i][0];
+ comp_count_zero += counts->comp_inter[i][1];
+ }
+
+ if (comp_count_zero == 0) {
+ cm->reference_mode = SINGLE_REFERENCE;
+ vp9_zero(counts->comp_inter);
+ } else if (single_count_zero == 0) {
+ cm->reference_mode = COMPOUND_REFERENCE;
+ vp9_zero(counts->comp_inter);
+ }
+ }
+
+ if (cm->tx_mode == TX_MODE_SELECT) {
+ int count4x4 = 0;
+ int count8x8_lp = 0, count8x8_8x8p = 0;
+ int count16x16_16x16p = 0, count16x16_lp = 0;
+ int count32x32 = 0;
+
+ for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
+ count4x4 += counts->tx.p32x32[i][TX_4X4];
+ count4x4 += counts->tx.p16x16[i][TX_4X4];
+ count4x4 += counts->tx.p8x8[i][TX_4X4];
+
+ count8x8_lp += counts->tx.p32x32[i][TX_8X8];
+ count8x8_lp += counts->tx.p16x16[i][TX_8X8];
+ count8x8_8x8p += counts->tx.p8x8[i][TX_8X8];
+
+ count16x16_16x16p += counts->tx.p16x16[i][TX_16X16];
+ count16x16_lp += counts->tx.p32x32[i][TX_16X16];
+ count32x32 += counts->tx.p32x32[i][TX_32X32];
+ }
+ if (count4x4 == 0 && count16x16_lp == 0 && count16x16_16x16p == 0 &&
+ count32x32 == 0) {
+ cm->tx_mode = ALLOW_8X8;
+ reset_skip_tx_size(cm, TX_8X8);
+ } else if (count8x8_8x8p == 0 && count16x16_16x16p == 0 &&
+ count8x8_lp == 0 && count16x16_lp == 0 && count32x32 == 0) {
+ cm->tx_mode = ONLY_4X4;
+ reset_skip_tx_size(cm, TX_4X4);
+ } else if (count8x8_lp == 0 && count16x16_lp == 0 && count4x4 == 0) {
+ cm->tx_mode = ALLOW_32X32;
+ } else if (count32x32 == 0 && count8x8_lp == 0 && count4x4 == 0) {
+ cm->tx_mode = ALLOW_16X16;
+ reset_skip_tx_size(cm, TX_16X16);
+ }
+ }
+ } else {
+ FRAME_COUNTS *counts = cpi->td.counts;
+ cm->reference_mode = SINGLE_REFERENCE;
+ if (cpi->allow_comp_inter_inter && cpi->sf.use_compound_nonrd_pickmode &&
+ cpi->rc.alt_ref_gf_group && !cpi->rc.is_src_frame_alt_ref &&
+ cm->frame_type != KEY_FRAME)
+ cm->reference_mode = REFERENCE_MODE_SELECT;
+
+ encode_frame_internal(cpi);
+
+ if (cm->reference_mode == REFERENCE_MODE_SELECT) {
+ int single_count_zero = 0;
+ int comp_count_zero = 0;
+ int i;
+ for (i = 0; i < COMP_INTER_CONTEXTS; i++) {
+ single_count_zero += counts->comp_inter[i][0];
+ comp_count_zero += counts->comp_inter[i][1];
+ }
+ if (comp_count_zero == 0) {
+ cm->reference_mode = SINGLE_REFERENCE;
+ vp9_zero(counts->comp_inter);
+ } else if (single_count_zero == 0) {
+ cm->reference_mode = COMPOUND_REFERENCE;
+ vp9_zero(counts->comp_inter);
+ }
+ }
+ }
+
+ // If segmented AQ is enabled compute the average AQ weighting.
+ if (cm->seg.enabled && (cpi->oxcf.aq_mode != NO_AQ) &&
+ (cm->seg.update_map || cm->seg.update_data)) {
+ cm->seg.aq_av_offset = compute_frame_aq_offset(cpi);
+ }
+}
+
+static void sum_intra_stats(FRAME_COUNTS *counts, const MODE_INFO *mi) {
+ const PREDICTION_MODE y_mode = mi->mode;
+ const PREDICTION_MODE uv_mode = mi->uv_mode;
+ const BLOCK_SIZE bsize = mi->sb_type;
+
+ if (bsize < BLOCK_8X8) {
+ int idx, idy;
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
+ for (idy = 0; idy < 2; idy += num_4x4_h)
+ for (idx = 0; idx < 2; idx += num_4x4_w)
+ ++counts->y_mode[0][mi->bmi[idy * 2 + idx].as_mode];
+ } else {
+ ++counts->y_mode[size_group_lookup[bsize]][y_mode];
+ }
+
+ ++counts->uv_mode[y_mode][uv_mode];
+}
+
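+// Per 8x8 block, count consecutive frames coded with a (near-)zero motion
+// vector against LAST_FRAME; the counter saturates at 255 and resets when
+// the block moves. The counts are consumed elsewhere in the encoder (e.g.
+// noise estimation and cyclic refresh).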
+static void update_zeromv_cnt(VP9_COMP *const cpi, const MODE_INFO *const mi,
+ int mi_row, int mi_col, BLOCK_SIZE bsize) {
+ const VP9_COMMON *const cm = &cpi->common;
+ MV mv = mi->mv[0].as_mv;
+ const int bw = num_8x8_blocks_wide_lookup[bsize];
+ const int bh = num_8x8_blocks_high_lookup[bsize];
+ const int xmis = VPXMIN(cm->mi_cols - mi_col, bw);
+ const int ymis = VPXMIN(cm->mi_rows - mi_row, bh);
+ const int block_index = mi_row * cm->mi_cols + mi_col;
+ int x, y;
+ for (y = 0; y < ymis; y++)
+ for (x = 0; x < xmis; x++) {
+ int map_offset = block_index + y * cm->mi_cols + x;
+ if (mi->ref_frame[0] == LAST_FRAME && is_inter_block(mi) &&
+ mi->segment_id <= CR_SEGMENT_ID_BOOST2) {
+ if (abs(mv.row) < 8 && abs(mv.col) < 8) {
+ if (cpi->consec_zero_mv[map_offset] < 255)
+ cpi->consec_zero_mv[map_offset]++;
+ } else {
+ cpi->consec_zero_mv[map_offset] = 0;
+ }
+ }
+ }
+}
+
+static void encode_superblock(VP9_COMP *cpi, ThreadData *td, TOKENEXTRA **t,
+ int output_enabled, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
+ VP9_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MODE_INFO *mi = xd->mi[0];
+ const int seg_skip =
+ segfeature_active(&cm->seg, mi->segment_id, SEG_LVL_SKIP);
+ x->skip_recode = !x->select_tx_size && mi->sb_type >= BLOCK_8X8 &&
+ cpi->oxcf.aq_mode != COMPLEXITY_AQ &&
+ cpi->oxcf.aq_mode != CYCLIC_REFRESH_AQ &&
+ cpi->sf.allow_skip_recode;
+
+ if (!x->skip_recode && !cpi->sf.use_nonrd_pick_mode)
+ memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
+
+ x->skip_optimize = ctx->is_coded;
+ ctx->is_coded = 1;
+ x->use_lp32x32fdct = cpi->sf.use_lp32x32fdct;
+ x->skip_encode = (!output_enabled && cpi->sf.skip_encode_frame &&
+ x->q_index < QIDX_SKIP_THRESH);
+
+ if (x->skip_encode) return;
+
+ if (!is_inter_block(mi)) {
+ int plane;
+#if CONFIG_BETTER_HW_COMPATIBILITY && CONFIG_VP9_HIGHBITDEPTH
+ if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) &&
+ (xd->above_mi == NULL || xd->left_mi == NULL) &&
+ need_top_left[mi->uv_mode])
+ assert(0);
+#endif // CONFIG_BETTER_HW_COMPATIBILITY && CONFIG_VP9_HIGHBITDEPTH
+ mi->skip = 1;
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane)
+ vp9_encode_intra_block_plane(x, VPXMAX(bsize, BLOCK_8X8), plane, 1);
+ if (output_enabled) sum_intra_stats(td->counts, mi);
+ vp9_tokenize_sb(cpi, td, t, !output_enabled, seg_skip,
+ VPXMAX(bsize, BLOCK_8X8));
+ } else {
+ int ref;
+ const int is_compound = has_second_ref(mi);
+ set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
+ YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, mi->ref_frame[ref]);
+ assert(cfg != NULL);
+ vp9_setup_pre_planes(xd, ref, cfg, mi_row, mi_col,
+ &xd->block_refs[ref]->sf);
+ }
+ if (!(cpi->sf.reuse_inter_pred_sby && ctx->pred_pixel_ready) || seg_skip)
+ vp9_build_inter_predictors_sby(xd, mi_row, mi_col,
+ VPXMAX(bsize, BLOCK_8X8));
+
+ vp9_build_inter_predictors_sbuv(xd, mi_row, mi_col,
+ VPXMAX(bsize, BLOCK_8X8));
+
+#if CONFIG_MISMATCH_DEBUG
+ if (output_enabled) {
+ int plane;
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+ const struct macroblockd_plane *pd = &xd->plane[plane];
+ int pixel_c, pixel_r;
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(VPXMAX(bsize, BLOCK_8X8), &xd->plane[plane]);
+ const int bw = get_block_width(plane_bsize);
+ const int bh = get_block_height(plane_bsize);
+ mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, 0, 0,
+ pd->subsampling_x, pd->subsampling_y);
+
+ mismatch_record_block_pre(pd->dst.buf, pd->dst.stride, plane, pixel_c,
+ pixel_r, bw, bh,
+ xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH);
+ }
+ }
+#endif
+
+ vp9_encode_sb(x, VPXMAX(bsize, BLOCK_8X8), mi_row, mi_col, output_enabled);
+ vp9_tokenize_sb(cpi, td, t, !output_enabled, seg_skip,
+ VPXMAX(bsize, BLOCK_8X8));
+ }
+
+ if (seg_skip) {
+ assert(mi->skip);
+ }
+
+ if (output_enabled) {
+ if (cm->tx_mode == TX_MODE_SELECT && mi->sb_type >= BLOCK_8X8 &&
+ !(is_inter_block(mi) && mi->skip)) {
+ ++get_tx_counts(max_txsize_lookup[bsize], get_tx_size_context(xd),
+ &td->counts->tx)[mi->tx_size];
+ } else {
+ // The new intra coding scheme requires no change of transform size
+ if (is_inter_block(mi)) {
+ mi->tx_size = VPXMIN(tx_mode_to_biggest_tx_size[cm->tx_mode],
+ max_txsize_lookup[bsize]);
+ } else {
+ mi->tx_size = (bsize >= BLOCK_8X8) ? mi->tx_size : TX_4X4;
+ }
+ }
+
+ ++td->counts->tx.tx_totals[mi->tx_size];
+ ++td->counts->tx.tx_totals[get_uv_tx_size(mi, &xd->plane[1])];
+ if (cm->seg.enabled && cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+ cpi->cyclic_refresh->content_mode)
+ vp9_cyclic_refresh_update_sb_postencode(cpi, mi, mi_row, mi_col, bsize);
+ if (cpi->oxcf.pass == 0 && cpi->svc.temporal_layer_id == 0 &&
+ (!cpi->use_svc ||
+ (cpi->use_svc &&
+ !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame &&
+ cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)))
+ update_zeromv_cnt(cpi, mi, mi_row, mi_col, bsize);
+ }
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.h b/media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.h
new file mode 100644
index 0000000000..fd0a9c517e
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_VP9_ENCODEFRAME_H_
+#define VPX_VP9_ENCODER_VP9_ENCODEFRAME_H_
+
+#include "vpx/vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct macroblock;
+struct yv12_buffer_config;
+struct VP9_COMP;
+struct ThreadData;
+
+// Constants used in SOURCE_VAR_BASED_PARTITION
+#define VAR_HIST_MAX_BG_VAR 1000
+#define VAR_HIST_FACTOR 10
+#define VAR_HIST_BINS (VAR_HIST_MAX_BG_VAR / VAR_HIST_FACTOR + 1)
+#define VAR_HIST_LARGE_CUT_OFF 75
+#define VAR_HIST_SMALL_CUT_OFF 45
+
+void vp9_setup_src_planes(struct macroblock *x,
+ const struct yv12_buffer_config *src, int mi_row,
+ int mi_col);
+
+void vp9_encode_frame(struct VP9_COMP *cpi);
+
+void vp9_init_tile_data(struct VP9_COMP *cpi);
+void vp9_encode_tile(struct VP9_COMP *cpi, struct ThreadData *td, int tile_row,
+ int tile_col);
+
+void vp9_encode_sb_row(struct VP9_COMP *cpi, struct ThreadData *td,
+ int tile_row, int tile_col, int mi_row);
+
+void vp9_set_variance_partition_thresholds(struct VP9_COMP *cpi, int q,
+ int content_state);
+
+struct KMEANS_DATA;
+void vp9_kmeans(double *ctr_ls, double *boundary_ls, int *count_ls, int k,
+ struct KMEANS_DATA *arr, int size);
+int vp9_get_group_idx(double value, double *boundary_ls, int k);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP9_ENCODER_VP9_ENCODEFRAME_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_encodemb.c b/media/libvpx/libvpx/vp9/encoder/vp9_encodemb.c
new file mode 100644
index 0000000000..946a1c3ee8
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_encodemb.c
@@ -0,0 +1,1078 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx_dsp/quantize.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+
+#if CONFIG_MISMATCH_DEBUG
+#include "vpx_util/vpx_debug_util.h"
+#endif
+
+#include "vp9/common/vp9_idct.h"
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/common/vp9_reconintra.h"
+#include "vp9/common/vp9_scan.h"
+
+#include "vp9/encoder/vp9_encodemb.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_rd.h"
+#include "vp9/encoder/vp9_tokenize.h"
+
+struct optimize_ctx {
+ ENTROPY_CONTEXT ta[MAX_MB_PLANE][16];
+ ENTROPY_CONTEXT tl[MAX_MB_PLANE][16];
+};
+
+void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
+ struct macroblock_plane *const p = &x->plane[plane];
+ const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane];
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+ const int bw = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+ const int bh = 4 * num_4x4_blocks_high_lookup[plane_bsize];
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ vpx_highbd_subtract_block(bh, bw, p->src_diff, bw, p->src.buf,
+ p->src.stride, pd->dst.buf, pd->dst.stride,
+ x->e_mbd.bd);
+ return;
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ vpx_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
+ pd->dst.buf, pd->dst.stride);
+}
+
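+// Rate scaling factors used by the coefficient optimizer, indexed by
+// [intra/inter][luma/chroma].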
+static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = {
+ { 10, 6 },
+ { 8, 5 },
+};
+
+// 'num' can be negative, but 'shift' must be non-negative.
+#define RIGHT_SHIFT_POSSIBLY_NEGATIVE(num, shift) \
+ (((num) >= 0) ? (num) >> (shift) : -((-(num)) >> (shift)))
+
+int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
+ int ctx) {
+ MACROBLOCKD *const xd = &mb->e_mbd;
+ struct macroblock_plane *const p = &mb->plane[plane];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int ref = is_inter_block(xd->mi[0]);
+ uint8_t token_cache[1024];
+ const tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+ tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ const int eob = p->eobs[block];
+ const PLANE_TYPE plane_type = get_plane_type(plane);
+ const int default_eob = 16 << (tx_size << 1);
+ const int shift = (tx_size == TX_32X32);
+ const int16_t *const dequant_ptr = pd->dequant;
+ const uint8_t *const band_translate = get_band_translate(tx_size);
+ const ScanOrder *const so = get_scan(xd, tx_size, plane_type, block);
+ const int16_t *const scan = so->scan;
+ const int16_t *const nb = so->neighbors;
+ const MODE_INFO *mbmi = xd->mi[0];
+ const int sharpness = mb->sharpness;
+ const int64_t rdadj = (int64_t)mb->rdmult * plane_rd_mult[ref][plane_type];
+ const int64_t rdmult =
+ (sharpness == 0 ? rdadj >> 1
+ : (rdadj * (8 - sharpness + mbmi->segment_id)) >> 4);
+
+ const int64_t rddiv = mb->rddiv;
+ int64_t rd_cost0, rd_cost1;
+ int64_t rate0, rate1;
+ int16_t t0, t1;
+ int i, final_eob;
+ int count_high_values_after_eob = 0;
+#if CONFIG_VP9_HIGHBITDEPTH
+ const uint16_t *cat6_high_cost = vp9_get_high_cost_table(xd->bd);
+#else
+ const uint16_t *cat6_high_cost = vp9_get_high_cost_table(8);
+#endif
+ unsigned int(*const token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
+ mb->token_costs[tx_size][plane_type][ref];
+ unsigned int(*token_costs_cur)[2][COEFF_CONTEXTS][ENTROPY_TOKENS];
+ int64_t eob_cost0, eob_cost1;
+ const int ctx0 = ctx;
+ int64_t accu_rate = 0;
+ // Initialized to the worst possible error for the largest transform size.
+ // This ensures that it never goes negative.
+ int64_t accu_error = ((int64_t)1) << 50;
+ int64_t best_block_rd_cost = INT64_MAX;
+ int x_prev = 1;
+ tran_low_t before_best_eob_qc = 0;
+ tran_low_t before_best_eob_dqc = 0;
+
+ assert((!plane_type && !plane) || (plane_type && plane));
+ assert(eob <= default_eob);
+
+ for (i = 0; i < eob; i++) {
+ const int rc = scan[i];
+ token_cache[rc] = vp9_pt_energy_class[vp9_get_token(qcoeff[rc])];
+ }
+ final_eob = 0;
+
+ // Initial RD cost.
+ token_costs_cur = token_costs + band_translate[0];
+ rate0 = (*token_costs_cur)[0][ctx0][EOB_TOKEN];
+ best_block_rd_cost = RDCOST(rdmult, rddiv, rate0, accu_error);
+
+ // For each token, pick one of two choices greedily:
+ // (i) First candidate: Keep current quantized value, OR
+ // (ii) Second candidate: Reduce quantized value by 1.
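+  // The two candidates are compared with RDCOST(rdmult, rddiv, rate, dist),
+  // which folds rate and distortion into one Lagrangian cost (rate weighted
+  // by rdmult, distortion scaled by rddiv; see the RDCOST macro in vp9_rd.h
+  // for the exact arithmetic); the candidate with the smaller cost wins.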
+ for (i = 0; i < eob; i++) {
+ const int rc = scan[i];
+ const int x = qcoeff[rc];
+ const int band_cur = band_translate[i];
+ const int ctx_cur = (i == 0) ? ctx : get_coef_context(nb, token_cache, i);
+ const int token_tree_sel_cur = (x_prev == 0);
+ token_costs_cur = token_costs + band_cur;
+ if (x == 0) { // No need to search
+ const int token = vp9_get_token(x);
+ rate0 = (*token_costs_cur)[token_tree_sel_cur][ctx_cur][token];
+ accu_rate += rate0;
+ x_prev = 0;
+ // Note: accu_error does not change.
+ } else {
+ const int dqv = dequant_ptr[rc != 0];
+ // Compute the distortion for quantizing to 0.
+ const int diff_for_zero_raw = (0 - coeff[rc]) * (1 << shift);
+ const int diff_for_zero =
+#if CONFIG_VP9_HIGHBITDEPTH
+ (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ ? RIGHT_SHIFT_POSSIBLY_NEGATIVE(diff_for_zero_raw, xd->bd - 8)
+ :
+#endif
+ diff_for_zero_raw;
+ const int64_t distortion_for_zero =
+ (int64_t)diff_for_zero * diff_for_zero;
+
+ // Compute the distortion for the first candidate
+ const int diff0_raw = (dqcoeff[rc] - coeff[rc]) * (1 << shift);
+ const int diff0 =
+#if CONFIG_VP9_HIGHBITDEPTH
+ (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ ? RIGHT_SHIFT_POSSIBLY_NEGATIVE(diff0_raw, xd->bd - 8)
+ :
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ diff0_raw;
+ const int64_t distortion0 = (int64_t)diff0 * diff0;
+
+ // Compute the distortion for the second candidate
+ const int sign = -(x < 0); // -1 if x is negative and 0 otherwise.
+ const int x1 = x - 2 * sign - 1; // abs(x1) = abs(x) - 1.
+ int64_t distortion1;
+ if (x1 != 0) {
+ const int dqv_step =
+#if CONFIG_VP9_HIGHBITDEPTH
+ (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? dqv >> (xd->bd - 8)
+ :
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ dqv;
+ const int diff_step = (dqv_step + sign) ^ sign;
+ const int diff1 = diff0 - diff_step;
+ assert(dqv > 0); // We aren't right shifting a negative number above.
+ distortion1 = (int64_t)diff1 * diff1;
+ } else {
+ distortion1 = distortion_for_zero;
+ }
+ {
+ // Calculate RDCost for current coeff for the two candidates.
+ const int64_t base_bits0 = vp9_get_token_cost(x, &t0, cat6_high_cost);
+ const int64_t base_bits1 = vp9_get_token_cost(x1, &t1, cat6_high_cost);
+ rate0 =
+ base_bits0 + (*token_costs_cur)[token_tree_sel_cur][ctx_cur][t0];
+ rate1 =
+ base_bits1 + (*token_costs_cur)[token_tree_sel_cur][ctx_cur][t1];
+ }
+ {
+ int rdcost_better_for_x1, eob_rdcost_better_for_x1;
+ int dqc0, dqc1;
+ int64_t best_eob_cost_cur;
+ int use_x1;
+
+ // Calculate RD Cost effect on the next coeff for the two candidates.
+ int64_t next_bits0 = 0;
+ int64_t next_bits1 = 0;
+ int64_t next_eob_bits0 = 0;
+ int64_t next_eob_bits1 = 0;
+ if (i < default_eob - 1) {
+ int ctx_next, token_tree_sel_next;
+ const int band_next = band_translate[i + 1];
+ const int token_next =
+ (i + 1 != eob) ? vp9_get_token(qcoeff[scan[i + 1]]) : EOB_TOKEN;
+ unsigned int(*const token_costs_next)[2][COEFF_CONTEXTS]
+ [ENTROPY_TOKENS] =
+ token_costs + band_next;
+ token_cache[rc] = vp9_pt_energy_class[t0];
+ ctx_next = get_coef_context(nb, token_cache, i + 1);
+ token_tree_sel_next = (x == 0);
+ next_bits0 =
+ (*token_costs_next)[token_tree_sel_next][ctx_next][token_next];
+ next_eob_bits0 =
+ (*token_costs_next)[token_tree_sel_next][ctx_next][EOB_TOKEN];
+ token_cache[rc] = vp9_pt_energy_class[t1];
+ ctx_next = get_coef_context(nb, token_cache, i + 1);
+ token_tree_sel_next = (x1 == 0);
+ next_bits1 =
+ (*token_costs_next)[token_tree_sel_next][ctx_next][token_next];
+ if (x1 != 0) {
+ next_eob_bits1 =
+ (*token_costs_next)[token_tree_sel_next][ctx_next][EOB_TOKEN];
+ }
+ }
+
+ // Compare the total RD costs for two candidates.
+ rd_cost0 = RDCOST(rdmult, rddiv, (rate0 + next_bits0), distortion0);
+ rd_cost1 = RDCOST(rdmult, rddiv, (rate1 + next_bits1), distortion1);
+ rdcost_better_for_x1 = (rd_cost1 < rd_cost0);
+ eob_cost0 = RDCOST(rdmult, rddiv, (accu_rate + rate0 + next_eob_bits0),
+ (accu_error + distortion0 - distortion_for_zero));
+ eob_cost1 = eob_cost0;
+ if (x1 != 0) {
+ eob_cost1 =
+ RDCOST(rdmult, rddiv, (accu_rate + rate1 + next_eob_bits1),
+ (accu_error + distortion1 - distortion_for_zero));
+ eob_rdcost_better_for_x1 = (eob_cost1 < eob_cost0);
+ } else {
+ eob_rdcost_better_for_x1 = 0;
+ }
+
+ // Calculate the two candidate de-quantized values.
+ dqc0 = dqcoeff[rc];
+ dqc1 = 0;
+ if (rdcost_better_for_x1 + eob_rdcost_better_for_x1) {
+ if (x1 != 0) {
+ dqc1 = RIGHT_SHIFT_POSSIBLY_NEGATIVE(x1 * dqv, shift);
+ } else {
+ dqc1 = 0;
+ }
+ }
+
+ // Pick and record the better quantized and de-quantized values.
+ if (rdcost_better_for_x1) {
+ qcoeff[rc] = x1;
+ dqcoeff[rc] = dqc1;
+ accu_rate += rate1;
+ accu_error += distortion1 - distortion_for_zero;
+ assert(distortion1 <= distortion_for_zero);
+ token_cache[rc] = vp9_pt_energy_class[t1];
+ } else {
+ accu_rate += rate0;
+ accu_error += distortion0 - distortion_for_zero;
+ assert(distortion0 <= distortion_for_zero);
+ token_cache[rc] = vp9_pt_energy_class[t0];
+ }
+ if (sharpness > 0 && abs(qcoeff[rc]) > 1) count_high_values_after_eob++;
+ assert(accu_error >= 0);
+ x_prev = qcoeff[rc]; // Update based on selected quantized value.
+
+ use_x1 = (x1 != 0) && eob_rdcost_better_for_x1;
+ best_eob_cost_cur = use_x1 ? eob_cost1 : eob_cost0;
+
+ // Determine whether to move the eob position to i+1
+ if (best_eob_cost_cur < best_block_rd_cost) {
+ best_block_rd_cost = best_eob_cost_cur;
+ final_eob = i + 1;
+ count_high_values_after_eob = 0;
+ if (use_x1) {
+ before_best_eob_qc = x1;
+ before_best_eob_dqc = dqc1;
+ } else {
+ before_best_eob_qc = x;
+ before_best_eob_dqc = dqc0;
+ }
+ }
+ }
+ }
+ }
+ if (count_high_values_after_eob > 0) {
+ final_eob = eob - 1;
+ for (; final_eob >= 0; final_eob--) {
+ const int rc = scan[final_eob];
+ const int x = qcoeff[rc];
+ if (x) {
+ break;
+ }
+ }
+ final_eob++;
+ } else {
+ assert(final_eob <= eob);
+ if (final_eob > 0) {
+ int rc;
+ assert(before_best_eob_qc != 0);
+ i = final_eob - 1;
+ rc = scan[i];
+ qcoeff[rc] = before_best_eob_qc;
+ dqcoeff[rc] = before_best_eob_dqc;
+ }
+ for (i = final_eob; i < eob; i++) {
+ int rc = scan[i];
+ qcoeff[rc] = 0;
+ dqcoeff[rc] = 0;
+ }
+ }
+ mb->plane[plane].eobs[block] = final_eob;
+ return final_eob;
+}
+#undef RIGHT_SHIFT_POSSIBLY_NEGATIVE
+
+static INLINE void fdct32x32(int rd_transform, const int16_t *src,
+ tran_low_t *dst, int src_stride) {
+ if (rd_transform)
+ vpx_fdct32x32_rd(src, dst, src_stride);
+ else
+ vpx_fdct32x32(src, dst, src_stride);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE void highbd_fdct32x32(int rd_transform, const int16_t *src,
+ tran_low_t *dst, int src_stride) {
+ if (rd_transform)
+ vpx_highbd_fdct32x32_rd(src, dst, src_stride);
+ else
+ vpx_highbd_fdct32x32(src, dst, src_stride);
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const ScanOrder *const scan_order = &vp9_default_scan_orders[tx_size];
+ tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+ tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ uint16_t *const eob = &p->eobs[block];
+ const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+ const int16_t *src_diff;
+ src_diff = &p->src_diff[4 * (row * diff_stride + col)];
+  // The skip-block condition must be handled before this function is called.
+ assert(!x->skip_block);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ switch (tx_size) {
+ case TX_32X32:
+ highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
+ vp9_highbd_quantize_fp_32x32(coeff, 1024, p->round_fp, p->quant_fp,
+ qcoeff, dqcoeff, pd->dequant, eob,
+ scan_order->scan, scan_order->iscan);
+ break;
+ case TX_16X16:
+ vpx_highbd_fdct16x16(src_diff, coeff, diff_stride);
+ vp9_highbd_quantize_fp(coeff, 256, p->round_fp, p->quant_fp, qcoeff,
+ dqcoeff, pd->dequant, eob, scan_order->scan,
+ scan_order->iscan);
+ break;
+ case TX_8X8:
+ vpx_highbd_fdct8x8(src_diff, coeff, diff_stride);
+ vp9_highbd_quantize_fp(coeff, 64, p->round_fp, p->quant_fp, qcoeff,
+ dqcoeff, pd->dequant, eob, scan_order->scan,
+ scan_order->iscan);
+ break;
+ default:
+ assert(tx_size == TX_4X4);
+ x->fwd_txfm4x4(src_diff, coeff, diff_stride);
+ vp9_highbd_quantize_fp(coeff, 16, p->round_fp, p->quant_fp, qcoeff,
+ dqcoeff, pd->dequant, eob, scan_order->scan,
+ scan_order->iscan);
+ break;
+ }
+ return;
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ switch (tx_size) {
+ case TX_32X32:
+ fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
+ vp9_quantize_fp_32x32(coeff, 1024, p->round_fp, p->quant_fp, qcoeff,
+ dqcoeff, pd->dequant, eob, scan_order->scan,
+ scan_order->iscan);
+ break;
+ case TX_16X16:
+ vpx_fdct16x16(src_diff, coeff, diff_stride);
+ vp9_quantize_fp(coeff, 256, p->round_fp, p->quant_fp, qcoeff, dqcoeff,
+ pd->dequant, eob, scan_order->scan, scan_order->iscan);
+ break;
+ case TX_8X8:
+ vpx_fdct8x8(src_diff, coeff, diff_stride);
+ vp9_quantize_fp(coeff, 64, p->round_fp, p->quant_fp, qcoeff, dqcoeff,
+ pd->dequant, eob, scan_order->scan, scan_order->iscan);
+
+ break;
+ default:
+ assert(tx_size == TX_4X4);
+ x->fwd_txfm4x4(src_diff, coeff, diff_stride);
+ vp9_quantize_fp(coeff, 16, p->round_fp, p->quant_fp, qcoeff, dqcoeff,
+ pd->dequant, eob, scan_order->scan, scan_order->iscan);
+ break;
+ }
+}
+
+void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block, int row, int col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+ tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ uint16_t *const eob = &p->eobs[block];
+ const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+ const int16_t *src_diff;
+ src_diff = &p->src_diff[4 * (row * diff_stride + col)];
+  // The skip-block condition must be handled before this function is called.
+ assert(!x->skip_block);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ switch (tx_size) {
+ case TX_32X32:
+ vpx_highbd_fdct32x32_1(src_diff, coeff, diff_stride);
+ vpx_highbd_quantize_dc_32x32(coeff, p->round, p->quant_fp[0], qcoeff,
+ dqcoeff, pd->dequant[0], eob);
+ break;
+ case TX_16X16:
+ vpx_highbd_fdct16x16_1(src_diff, coeff, diff_stride);
+ vpx_highbd_quantize_dc(coeff, 256, p->round, p->quant_fp[0], qcoeff,
+ dqcoeff, pd->dequant[0], eob);
+ break;
+ case TX_8X8:
+ vpx_highbd_fdct8x8_1(src_diff, coeff, diff_stride);
+ vpx_highbd_quantize_dc(coeff, 64, p->round, p->quant_fp[0], qcoeff,
+ dqcoeff, pd->dequant[0], eob);
+ break;
+ default:
+ assert(tx_size == TX_4X4);
+ x->fwd_txfm4x4(src_diff, coeff, diff_stride);
+ vpx_highbd_quantize_dc(coeff, 16, p->round, p->quant_fp[0], qcoeff,
+ dqcoeff, pd->dequant[0], eob);
+ break;
+ }
+ return;
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ switch (tx_size) {
+ case TX_32X32:
+ vpx_fdct32x32_1(src_diff, coeff, diff_stride);
+ vpx_quantize_dc_32x32(coeff, p->round, p->quant_fp[0], qcoeff, dqcoeff,
+ pd->dequant[0], eob);
+ break;
+ case TX_16X16:
+ vpx_fdct16x16_1(src_diff, coeff, diff_stride);
+ vpx_quantize_dc(coeff, 256, p->round, p->quant_fp[0], qcoeff, dqcoeff,
+ pd->dequant[0], eob);
+ break;
+ case TX_8X8:
+ vpx_fdct8x8_1(src_diff, coeff, diff_stride);
+ vpx_quantize_dc(coeff, 64, p->round, p->quant_fp[0], qcoeff, dqcoeff,
+ pd->dequant[0], eob);
+ break;
+ default:
+ assert(tx_size == TX_4X4);
+ x->fwd_txfm4x4(src_diff, coeff, diff_stride);
+ vpx_quantize_dc(coeff, 16, p->round, p->quant_fp[0], qcoeff, dqcoeff,
+ pd->dequant[0], eob);
+ break;
+ }
+}
+
+void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const ScanOrder *const scan_order = &vp9_default_scan_orders[tx_size];
+ tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+ tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ uint16_t *const eob = &p->eobs[block];
+ const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+ const int16_t *src_diff;
+ src_diff = &p->src_diff[4 * (row * diff_stride + col)];
+  // The skip-block condition must be handled before this function is called.
+ assert(!x->skip_block);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ switch (tx_size) {
+ case TX_32X32:
+ highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
+ vpx_highbd_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob,
+ scan_order);
+ break;
+ case TX_16X16:
+ vpx_highbd_fdct16x16(src_diff, coeff, diff_stride);
+ vpx_highbd_quantize_b(coeff, 256, p->zbin, p->round, p->quant,
+ p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
+ scan_order->scan, scan_order->iscan);
+ break;
+ case TX_8X8:
+ vpx_highbd_fdct8x8(src_diff, coeff, diff_stride);
+ vpx_highbd_quantize_b(coeff, 64, p->zbin, p->round, p->quant,
+ p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
+ scan_order->scan, scan_order->iscan);
+ break;
+ default:
+ assert(tx_size == TX_4X4);
+ x->fwd_txfm4x4(src_diff, coeff, diff_stride);
+ vpx_highbd_quantize_b(coeff, 16, p->zbin, p->round, p->quant,
+ p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
+ scan_order->scan, scan_order->iscan);
+ break;
+ }
+ return;
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ switch (tx_size) {
+ case TX_32X32:
+ fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
+ vpx_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob,
+ scan_order);
+ break;
+ case TX_16X16:
+ vpx_fdct16x16(src_diff, coeff, diff_stride);
+ vpx_quantize_b(coeff, 256, p->zbin, p->round, p->quant, p->quant_shift,
+ qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan,
+ scan_order->iscan);
+ break;
+ case TX_8X8:
+ vpx_fdct8x8(src_diff, coeff, diff_stride);
+ vpx_quantize_b(coeff, 64, p->zbin, p->round, p->quant, p->quant_shift,
+ qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan,
+ scan_order->iscan);
+ break;
+ default:
+ assert(tx_size == TX_4X4);
+ x->fwd_txfm4x4(src_diff, coeff, diff_stride);
+ vpx_quantize_b(coeff, 16, p->zbin, p->round, p->quant, p->quant_shift,
+ qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan,
+ scan_order->iscan);
+ break;
+ }
+}
+
+static void encode_block(int plane, int block, int row, int col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) {
+ struct encode_b_args *const args = arg;
+#if CONFIG_MISMATCH_DEBUG
+ int mi_row = args->mi_row;
+ int mi_col = args->mi_col;
+ int output_enabled = args->output_enabled;
+#endif
+ MACROBLOCK *const x = args->x;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblock_plane *const p = &x->plane[plane];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ uint8_t *dst;
+ ENTROPY_CONTEXT *a, *l;
+ dst = &pd->dst.buf[4 * row * pd->dst.stride + 4 * col];
+ a = &args->ta[col];
+ l = &args->tl[row];
+
+  // TODO(jingning): Per-transform-block zero forcing is only enabled for the
+  // luma component; chroma components will be integrated as well.
+ if (x->zcoeff_blk[tx_size][block] && plane == 0) {
+ p->eobs[block] = 0;
+ *a = *l = 0;
+#if CONFIG_MISMATCH_DEBUG
+ goto encode_block_end;
+#else
+ return;
+#endif
+ }
+
+ if (!x->skip_recode) {
+ if (x->quant_fp) {
+ // Encoding process for rtc mode
+ if (x->skip_txfm[0] == SKIP_TXFM_AC_DC && plane == 0) {
+ // skip forward transform
+ p->eobs[block] = 0;
+ *a = *l = 0;
+#if CONFIG_MISMATCH_DEBUG
+ goto encode_block_end;
+#else
+ return;
+#endif
+ } else {
+ vp9_xform_quant_fp(x, plane, block, row, col, plane_bsize, tx_size);
+ }
+ } else {
+ if (max_txsize_lookup[plane_bsize] == tx_size) {
+ int txfm_blk_index = (plane << 2) + (block >> (tx_size << 1));
+ if (x->skip_txfm[txfm_blk_index] == SKIP_TXFM_NONE) {
+ // full forward transform and quantization
+ vp9_xform_quant(x, plane, block, row, col, plane_bsize, tx_size);
+ } else if (x->skip_txfm[txfm_blk_index] == SKIP_TXFM_AC_ONLY) {
+ // fast path forward transform and quantization
+ vp9_xform_quant_dc(x, plane, block, row, col, plane_bsize, tx_size);
+ } else {
+ // skip forward transform
+ p->eobs[block] = 0;
+ *a = *l = 0;
+#if CONFIG_MISMATCH_DEBUG
+ goto encode_block_end;
+#else
+ return;
+#endif
+ }
+ } else {
+ vp9_xform_quant(x, plane, block, row, col, plane_bsize, tx_size);
+ }
+ }
+ }
+
+ if (x->optimize && (!x->skip_recode || !x->skip_optimize)) {
+ const int ctx = combine_entropy_contexts(*a, *l);
+ *a = *l = vp9_optimize_b(x, plane, block, tx_size, ctx) > 0;
+ } else {
+ *a = *l = p->eobs[block] > 0;
+ }
+
+ if (p->eobs[block]) *(args->skip) = 0;
+
+ if (x->skip_encode || p->eobs[block] == 0) {
+#if CONFIG_MISMATCH_DEBUG
+ goto encode_block_end;
+#else
+ return;
+#endif
+ }
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst);
+ switch (tx_size) {
+ case TX_32X32:
+ vp9_highbd_idct32x32_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block],
+ xd->bd);
+ break;
+ case TX_16X16:
+ vp9_highbd_idct16x16_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block],
+ xd->bd);
+ break;
+ case TX_8X8:
+ vp9_highbd_idct8x8_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block],
+ xd->bd);
+ break;
+ default:
+ assert(tx_size == TX_4X4);
+ // this is like vp9_short_idct4x4 but has a special case around eob<=1
+ // which is significant (not just an optimization) for the lossless
+ // case.
+ x->highbd_inv_txfm_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block],
+ xd->bd);
+ break;
+ }
+#if CONFIG_MISMATCH_DEBUG
+ goto encode_block_end;
+#else
+ return;
+#endif
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ switch (tx_size) {
+ case TX_32X32:
+ vp9_idct32x32_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
+ break;
+ case TX_16X16:
+ vp9_idct16x16_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
+ break;
+ case TX_8X8:
+ vp9_idct8x8_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
+ break;
+ default:
+ assert(tx_size == TX_4X4);
+ // this is like vp9_short_idct4x4 but has a special case around eob<=1
+ // which is significant (not just an optimization) for the lossless
+ // case.
+ x->inv_txfm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
+ break;
+ }
+#if CONFIG_MISMATCH_DEBUG
+encode_block_end:
+ if (output_enabled) {
+ int pixel_c, pixel_r;
+ int blk_w = 1 << (tx_size + TX_UNIT_SIZE_LOG2);
+ int blk_h = 1 << (tx_size + TX_UNIT_SIZE_LOG2);
+ mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, col, row,
+ pd->subsampling_x, pd->subsampling_y);
+ mismatch_record_block_tx(dst, pd->dst.stride, plane, pixel_c, pixel_r,
+ blk_w, blk_h,
+ xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH);
+ }
+#endif
+}
+
+static void encode_block_pass1(int plane, int block, int row, int col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ void *arg) {
+ MACROBLOCK *const x = (MACROBLOCK *)arg;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblock_plane *const p = &x->plane[plane];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ uint8_t *dst;
+ dst = &pd->dst.buf[4 * row * pd->dst.stride + 4 * col];
+
+ vp9_xform_quant(x, plane, block, row, col, plane_bsize, tx_size);
+
+ if (p->eobs[block] > 0) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ x->highbd_inv_txfm_add(dqcoeff, CONVERT_TO_SHORTPTR(dst), pd->dst.stride,
+ p->eobs[block], xd->bd);
+ return;
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ x->inv_txfm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
+ }
+}
+
+void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize) {
+ vp9_subtract_plane(x, bsize, 0);
+ vp9_foreach_transformed_block_in_plane(&x->e_mbd, bsize, 0,
+ encode_block_pass1, x);
+}
+
+void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col,
+ int output_enabled) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct optimize_ctx ctx;
+ MODE_INFO *mi = xd->mi[0];
+ int plane;
+#if CONFIG_MISMATCH_DEBUG
+ struct encode_b_args arg = { x,
+ 1, // enable_trellis_opt
+ 0.0, // trellis_opt_thresh
+ NULL, // &sse_calc_done
+ NULL, // &sse
+ NULL, // above entropy context
+ NULL, // left entropy context
+ &mi->skip, mi_row, mi_col, output_enabled };
+#else
+ struct encode_b_args arg = { x,
+ 1, // enable_trellis_opt
+ 0.0, // trellis_opt_thresh
+ NULL, // &sse_calc_done
+ NULL, // &sse
+ NULL, // above entropy context
+ NULL, // left entropy context
+ &mi->skip };
+ (void)mi_row;
+ (void)mi_col;
+ (void)output_enabled;
+#endif
+
+ mi->skip = 1;
+
+ if (x->skip) return;
+
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+ if (!x->skip_recode) vp9_subtract_plane(x, bsize, plane);
+
+ if (x->optimize && (!x->skip_recode || !x->skip_optimize)) {
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const TX_SIZE tx_size = plane ? get_uv_tx_size(mi, pd) : mi->tx_size;
+ vp9_get_entropy_contexts(bsize, tx_size, pd, ctx.ta[plane],
+ ctx.tl[plane]);
+ arg.enable_trellis_opt = 1;
+ } else {
+ arg.enable_trellis_opt = 0;
+ }
+ arg.ta = ctx.ta[plane];
+ arg.tl = ctx.tl[plane];
+
+ vp9_foreach_transformed_block_in_plane(xd, bsize, plane, encode_block,
+ &arg);
+ }
+}
+
+void vp9_encode_block_intra(int plane, int block, int row, int col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ void *arg) {
+ struct encode_b_args *const args = arg;
+ MACROBLOCK *const x = args->x;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MODE_INFO *mi = xd->mi[0];
+ struct macroblock_plane *const p = &x->plane[plane];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ tran_low_t *coeff = BLOCK_OFFSET(p->coeff, block);
+ tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ const ScanOrder *scan_order;
+ TX_TYPE tx_type = DCT_DCT;
+ PREDICTION_MODE mode;
+ const int bwl = b_width_log2_lookup[plane_bsize];
+ const int diff_stride = 4 * (1 << bwl);
+ uint8_t *src, *dst;
+ int16_t *src_diff;
+ uint16_t *eob = &p->eobs[block];
+ const int src_stride = p->src.stride;
+ const int dst_stride = pd->dst.stride;
+ int enable_trellis_opt = !x->skip_recode;
+ ENTROPY_CONTEXT *a = NULL;
+ ENTROPY_CONTEXT *l = NULL;
+ int entropy_ctx = 0;
+ dst = &pd->dst.buf[4 * (row * dst_stride + col)];
+ src = &p->src.buf[4 * (row * src_stride + col)];
+ src_diff = &p->src_diff[4 * (row * diff_stride + col)];
+
+ if (tx_size == TX_4X4) {
+ tx_type = get_tx_type_4x4(get_plane_type(plane), xd, block);
+ scan_order = &vp9_scan_orders[TX_4X4][tx_type];
+ mode = plane == 0 ? get_y_mode(xd->mi[0], block) : mi->uv_mode;
+ } else {
+ mode = plane == 0 ? mi->mode : mi->uv_mode;
+ if (tx_size == TX_32X32) {
+ scan_order = &vp9_default_scan_orders[TX_32X32];
+ } else {
+ tx_type = get_tx_type(get_plane_type(plane), xd);
+ scan_order = &vp9_scan_orders[tx_size][tx_type];
+ }
+ }
+
+ vp9_predict_intra_block(
+ xd, bwl, tx_size, mode, (x->skip_encode || x->fp_src_pred) ? src : dst,
+ (x->skip_encode || x->fp_src_pred) ? src_stride : dst_stride, dst,
+ dst_stride, col, row, plane);
+
+  // The skip-block condition must be handled before this function is called.
+ assert(!x->skip_block);
+
+ if (!x->skip_recode) {
+ const int tx_size_in_pixels = (1 << tx_size) << 2;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ vpx_highbd_subtract_block(tx_size_in_pixels, tx_size_in_pixels, src_diff,
+ diff_stride, src, src_stride, dst, dst_stride,
+ xd->bd);
+ } else {
+ vpx_subtract_block(tx_size_in_pixels, tx_size_in_pixels, src_diff,
+ diff_stride, src, src_stride, dst, dst_stride);
+ }
+#else
+ vpx_subtract_block(tx_size_in_pixels, tx_size_in_pixels, src_diff,
+ diff_stride, src, src_stride, dst, dst_stride);
+#endif
+ enable_trellis_opt = do_trellis_opt(pd, src_diff, diff_stride, row, col,
+ plane_bsize, tx_size, args);
+ }
+
+ if (enable_trellis_opt) {
+ a = &args->ta[col];
+ l = &args->tl[row];
+ entropy_ctx = combine_entropy_contexts(*a, *l);
+ }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst);
+ switch (tx_size) {
+ case TX_32X32:
+ if (!x->skip_recode) {
+ highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
+ vpx_highbd_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant,
+ eob, scan_order);
+ }
+ if (enable_trellis_opt) {
+ *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
+ }
+ if (!x->skip_encode && *eob) {
+ vp9_highbd_idct32x32_add(dqcoeff, dst16, dst_stride, *eob, xd->bd);
+ }
+ break;
+ case TX_16X16:
+ if (!x->skip_recode) {
+ if (tx_type == DCT_DCT)
+ vpx_highbd_fdct16x16(src_diff, coeff, diff_stride);
+ else
+ vp9_highbd_fht16x16(src_diff, coeff, diff_stride, tx_type);
+ vpx_highbd_quantize_b(coeff, 256, p->zbin, p->round, p->quant,
+ p->quant_shift, qcoeff, dqcoeff, pd->dequant,
+ eob, scan_order->scan, scan_order->iscan);
+ }
+ if (enable_trellis_opt) {
+ *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
+ }
+ if (!x->skip_encode && *eob) {
+ vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst16, dst_stride, *eob,
+ xd->bd);
+ }
+ break;
+ case TX_8X8:
+ if (!x->skip_recode) {
+ if (tx_type == DCT_DCT)
+ vpx_highbd_fdct8x8(src_diff, coeff, diff_stride);
+ else
+ vp9_highbd_fht8x8(src_diff, coeff, diff_stride, tx_type);
+ vpx_highbd_quantize_b(coeff, 64, p->zbin, p->round, p->quant,
+ p->quant_shift, qcoeff, dqcoeff, pd->dequant,
+ eob, scan_order->scan, scan_order->iscan);
+ }
+ if (enable_trellis_opt) {
+ *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
+ }
+ if (!x->skip_encode && *eob) {
+ vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst16, dst_stride, *eob,
+ xd->bd);
+ }
+ break;
+ default:
+ assert(tx_size == TX_4X4);
+ if (!x->skip_recode) {
+ if (tx_type != DCT_DCT)
+ vp9_highbd_fht4x4(src_diff, coeff, diff_stride, tx_type);
+ else
+ x->fwd_txfm4x4(src_diff, coeff, diff_stride);
+ vpx_highbd_quantize_b(coeff, 16, p->zbin, p->round, p->quant,
+ p->quant_shift, qcoeff, dqcoeff, pd->dequant,
+ eob, scan_order->scan, scan_order->iscan);
+ }
+ if (enable_trellis_opt) {
+ *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
+ }
+ if (!x->skip_encode && *eob) {
+ if (tx_type == DCT_DCT) {
+ // this is like vp9_short_idct4x4 but has a special case around
+ // eob<=1 which is significant (not just an optimization) for the
+ // lossless case.
+ x->highbd_inv_txfm_add(dqcoeff, dst16, dst_stride, *eob, xd->bd);
+ } else {
+ vp9_highbd_iht4x4_16_add(dqcoeff, dst16, dst_stride, tx_type,
+ xd->bd);
+ }
+ }
+ break;
+ }
+ if (*eob) *(args->skip) = 0;
+ return;
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ switch (tx_size) {
+ case TX_32X32:
+ if (!x->skip_recode) {
+ fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
+ vpx_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob,
+ scan_order);
+ }
+ if (enable_trellis_opt) {
+ *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
+ }
+ if (!x->skip_encode && *eob)
+ vp9_idct32x32_add(dqcoeff, dst, dst_stride, *eob);
+ break;
+ case TX_16X16:
+ if (!x->skip_recode) {
+ vp9_fht16x16(src_diff, coeff, diff_stride, tx_type);
+ vpx_quantize_b(coeff, 256, p->zbin, p->round, p->quant, p->quant_shift,
+ qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan,
+ scan_order->iscan);
+ }
+ if (enable_trellis_opt) {
+ *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
+ }
+ if (!x->skip_encode && *eob)
+ vp9_iht16x16_add(tx_type, dqcoeff, dst, dst_stride, *eob);
+ break;
+ case TX_8X8:
+ if (!x->skip_recode) {
+ vp9_fht8x8(src_diff, coeff, diff_stride, tx_type);
+ vpx_quantize_b(coeff, 64, p->zbin, p->round, p->quant, p->quant_shift,
+ qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan,
+ scan_order->iscan);
+ }
+ if (enable_trellis_opt) {
+ *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
+ }
+ if (!x->skip_encode && *eob)
+ vp9_iht8x8_add(tx_type, dqcoeff, dst, dst_stride, *eob);
+ break;
+ default:
+ assert(tx_size == TX_4X4);
+ if (!x->skip_recode) {
+ if (tx_type != DCT_DCT)
+ vp9_fht4x4(src_diff, coeff, diff_stride, tx_type);
+ else
+ x->fwd_txfm4x4(src_diff, coeff, diff_stride);
+ vpx_quantize_b(coeff, 16, p->zbin, p->round, p->quant, p->quant_shift,
+ qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan,
+ scan_order->iscan);
+ }
+ if (enable_trellis_opt) {
+ *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
+ }
+ if (!x->skip_encode && *eob) {
+ if (tx_type == DCT_DCT)
+ // this is like vp9_short_idct4x4 but has a special case around eob<=1
+ // which is significant (not just an optimization) for the lossless
+ // case.
+ x->inv_txfm_add(dqcoeff, dst, dst_stride, *eob);
+ else
+ vp9_iht4x4_16_add(dqcoeff, dst, dst_stride, tx_type);
+ }
+ break;
+ }
+ if (*eob) *(args->skip) = 0;
+}
+
+void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane,
+ int enable_trellis_opt) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ struct optimize_ctx ctx;
+#if CONFIG_MISMATCH_DEBUG
+ // TODO(angiebird): make mismatch_debug support intra mode
+ struct encode_b_args arg = {
+ x,
+ enable_trellis_opt,
+ 0.0, // trellis_opt_thresh
+ NULL, // &sse_calc_done
+ NULL, // &sse
+ ctx.ta[plane],
+ ctx.tl[plane],
+ &xd->mi[0]->skip,
+ 0, // mi_row
+ 0, // mi_col
+ 0 // output_enabled
+ };
+#else
+ struct encode_b_args arg = { x,
+ enable_trellis_opt,
+ 0.0, // trellis_opt_thresh
+ NULL, // &sse_calc_done
+ NULL, // &sse
+ ctx.ta[plane],
+ ctx.tl[plane],
+ &xd->mi[0]->skip };
+#endif
+
+ if (enable_trellis_opt && x->optimize &&
+ (!x->skip_recode || !x->skip_optimize)) {
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const TX_SIZE tx_size =
+ plane ? get_uv_tx_size(xd->mi[0], pd) : xd->mi[0]->tx_size;
+ vp9_get_entropy_contexts(bsize, tx_size, pd, ctx.ta[plane], ctx.tl[plane]);
+ } else {
+ arg.enable_trellis_opt = 0;
+ }
+
+ vp9_foreach_transformed_block_in_plane(xd, bsize, plane,
+ vp9_encode_block_intra, &arg);
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_encodemb.h b/media/libvpx/libvpx/vp9/encoder/vp9_encodemb.h
new file mode 100644
index 0000000000..1391446bed
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_encodemb.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_VP9_ENCODEMB_H_
+#define VPX_VP9_ENCODER_VP9_ENCODEMB_H_
+
+#include "./vpx_config.h"
+#include "vp9/encoder/vp9_block.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct encode_b_args {
+ MACROBLOCK *x;
+ int enable_trellis_opt;
+ double trellis_opt_thresh;
+ int *sse_calc_done;
+ int64_t *sse;
+ ENTROPY_CONTEXT *ta;
+ ENTROPY_CONTEXT *tl;
+ int8_t *skip;
+#if CONFIG_MISMATCH_DEBUG
+ int mi_row;
+ int mi_col;
+ int output_enabled;
+#endif
+};
+int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
+ int ctx);
+void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col,
+ int output_enabled);
+void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize);
+void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
+void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block, int row, int col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
+void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
+
+void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
+
+void vp9_encode_block_intra(int plane, int block, int row, int col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg);
+
+void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane,
+ int enable_trellis_opt);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP9_ENCODER_VP9_ENCODEMB_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_encodemv.c b/media/libvpx/libvpx/vp9/encoder/vp9_encodemv.c
new file mode 100644
index 0000000000..023d087c2c
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_encodemv.c
@@ -0,0 +1,271 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_entropymode.h"
+
+#include "vp9/encoder/vp9_cost.h"
+#include "vp9/encoder/vp9_encodemv.h"
+
+#include "vpx_dsp/vpx_dsp_common.h"
+
+static struct vp9_token mv_joint_encodings[MV_JOINTS];
+static struct vp9_token mv_class_encodings[MV_CLASSES];
+static struct vp9_token mv_fp_encodings[MV_FP_SIZE];
+
+void vp9_entropy_mv_init(void) {
+ vp9_tokens_from_tree(mv_joint_encodings, vp9_mv_joint_tree);
+ vp9_tokens_from_tree(mv_class_encodings, vp9_mv_class_tree);
+ vp9_tokens_from_tree(mv_fp_encodings, vp9_mv_fp_tree);
+}
+
+static void encode_mv_component(vpx_writer *w, int comp,
+ const nmv_component *mvcomp, int usehp) {
+ int offset;
+ const int sign = comp < 0;
+ const int mag = sign ? -comp : comp;
+ const int mv_class = vp9_get_mv_class(mag - 1, &offset);
+ const int d = offset >> 3; // int mv data
+ const int fr = (offset >> 1) & 3; // fractional mv data
+ const int hp = offset & 1; // high precision mv data
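+  // Illustrative example: a magnitude of 6 (in 1/8-pel units) falls in
+  // MV_CLASS_0 with offset = 5, giving d = 0, fr = 2 and hp = 1.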
+
+ assert(comp != 0);
+
+ // Sign
+ vpx_write(w, sign, mvcomp->sign);
+
+ // Class
+ vp9_write_token(w, vp9_mv_class_tree, mvcomp->classes,
+ &mv_class_encodings[mv_class]);
+
+ // Integer bits
+ if (mv_class == MV_CLASS_0) {
+ vpx_write(w, d, mvcomp->class0[0]);
+ } else {
+ int i;
+ const int n = mv_class + CLASS0_BITS - 1; // number of bits
+ for (i = 0; i < n; ++i) vpx_write(w, (d >> i) & 1, mvcomp->bits[i]);
+ }
+
+ // Fractional bits
+ vp9_write_token(w, vp9_mv_fp_tree,
+ mv_class == MV_CLASS_0 ? mvcomp->class0_fp[d] : mvcomp->fp,
+ &mv_fp_encodings[fr]);
+
+ // High precision bit
+ if (usehp)
+ vpx_write(w, hp, mv_class == MV_CLASS_0 ? mvcomp->class0_hp : mvcomp->hp);
+}
+
+static void build_nmv_component_cost_table(int *mvcost,
+ const nmv_component *const mvcomp,
+ int usehp) {
+ int sign_cost[2], class_cost[MV_CLASSES], class0_cost[CLASS0_SIZE];
+ int bits_cost[MV_OFFSET_BITS][2];
+ int class0_fp_cost[CLASS0_SIZE][MV_FP_SIZE], fp_cost[MV_FP_SIZE];
+ int class0_hp_cost[2], hp_cost[2];
+ int i;
+ int c, o;
+
+ sign_cost[0] = vp9_cost_zero(mvcomp->sign);
+ sign_cost[1] = vp9_cost_one(mvcomp->sign);
+ vp9_cost_tokens(class_cost, mvcomp->classes, vp9_mv_class_tree);
+ vp9_cost_tokens(class0_cost, mvcomp->class0, vp9_mv_class0_tree);
+ for (i = 0; i < MV_OFFSET_BITS; ++i) {
+ bits_cost[i][0] = vp9_cost_zero(mvcomp->bits[i]);
+ bits_cost[i][1] = vp9_cost_one(mvcomp->bits[i]);
+ }
+
+ for (i = 0; i < CLASS0_SIZE; ++i)
+ vp9_cost_tokens(class0_fp_cost[i], mvcomp->class0_fp[i], vp9_mv_fp_tree);
+ vp9_cost_tokens(fp_cost, mvcomp->fp, vp9_mv_fp_tree);
+
+ // Always build the hp costs to avoid an uninitialized warning from gcc
+ class0_hp_cost[0] = vp9_cost_zero(mvcomp->class0_hp);
+ class0_hp_cost[1] = vp9_cost_one(mvcomp->class0_hp);
+ hp_cost[0] = vp9_cost_zero(mvcomp->hp);
+ hp_cost[1] = vp9_cost_one(mvcomp->hp);
+
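+  // mvcost[] is indexed by the signed component value in 1/8-pel units, so
+  // both the positive and the negative entry are filled for every magnitude;
+  // the caller is expected to pass a pointer into the middle of its table so
+  // that negative indices are valid.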
+ mvcost[0] = 0;
+ // MV_CLASS_0
+ for (o = 0; o < (CLASS0_SIZE << 3); ++o) {
+ int d, e, f;
+ int cost = class_cost[MV_CLASS_0];
+ int v = o + 1;
+ d = (o >> 3); /* int mv data */
+ f = (o >> 1) & 3; /* fractional pel mv data */
+ cost += class0_cost[d];
+ cost += class0_fp_cost[d][f];
+ if (usehp) {
+ e = (o & 1); /* high precision mv data */
+ cost += class0_hp_cost[e];
+ }
+ mvcost[v] = cost + sign_cost[0];
+ mvcost[-v] = cost + sign_cost[1];
+ }
+ for (c = MV_CLASS_1; c < MV_CLASSES; ++c) {
+ int d;
+ for (d = 0; d < (1 << c); ++d) {
+ int f;
+ int whole_cost = class_cost[c];
+ int b = c + CLASS0_BITS - 1; /* number of bits */
+ for (i = 0; i < b; ++i) whole_cost += bits_cost[i][((d >> i) & 1)];
+ for (f = 0; f < 4; ++f) {
+ int cost = whole_cost + fp_cost[f];
+ int v = (CLASS0_SIZE << (c + 2)) + d * 8 + f * 2 /* + e */ + 1;
+ if (usehp) {
+ mvcost[v] = cost + hp_cost[0] + sign_cost[0];
+ mvcost[-v] = cost + hp_cost[0] + sign_cost[1];
+ if (v + 1 > MV_MAX) break;
+ mvcost[v + 1] = cost + hp_cost[1] + sign_cost[0];
+ mvcost[-v - 1] = cost + hp_cost[1] + sign_cost[1];
+ } else {
+ mvcost[v] = cost + sign_cost[0];
+ mvcost[-v] = cost + sign_cost[1];
+ if (v + 1 > MV_MAX) break;
+ mvcost[v + 1] = cost + sign_cost[0];
+ mvcost[-v - 1] = cost + sign_cost[1];
+ }
+ }
+ }
+ }
+}
+
+static int update_mv(vpx_writer *w, const unsigned int ct[2], vpx_prob *cur_p,
+ vpx_prob upd_p) {
+ const vpx_prob new_p = get_binary_prob(ct[0], ct[1]) | 1;
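+  // Switch to new_p only if the expected branch-cost saving outweighs the
+  // signaling overhead: the cost of the update flag plus roughly 7 bits
+  // (the 7 << VP9_PROB_COST_SHIFT term) for the explicit 7-bit literal.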
+ const int update = cost_branch256(ct, *cur_p) + vp9_cost_zero(upd_p) >
+ cost_branch256(ct, new_p) + vp9_cost_one(upd_p) +
+ (7 << VP9_PROB_COST_SHIFT);
+ vpx_write(w, update, upd_p);
+ if (update) {
+ *cur_p = new_p;
+ vpx_write_literal(w, new_p >> 1, 7);
+ }
+ return update;
+}
+
+static void write_mv_update(const vpx_tree_index *tree,
+ vpx_prob probs[/*n - 1*/],
+ const unsigned int counts[/*n - 1*/], int n,
+ vpx_writer *w) {
+ int i;
+ unsigned int branch_ct[32][2];
+
+ // Assuming max number of probabilities <= 32
+ assert(n <= 32);
+
+ vp9_tree_probs_from_distribution(tree, branch_ct, counts);
+ for (i = 0; i < n - 1; ++i)
+ update_mv(w, branch_ct[i], &probs[i], MV_UPDATE_PROB);
+}
+
+void vp9_write_nmv_probs(VP9_COMMON *cm, int usehp, vpx_writer *w,
+ nmv_context_counts *const counts) {
+ int i, j;
+ nmv_context *const mvc = &cm->fc->nmvc;
+
+ write_mv_update(vp9_mv_joint_tree, mvc->joints, counts->joints, MV_JOINTS, w);
+
+ for (i = 0; i < 2; ++i) {
+ nmv_component *comp = &mvc->comps[i];
+ nmv_component_counts *comp_counts = &counts->comps[i];
+
+ update_mv(w, comp_counts->sign, &comp->sign, MV_UPDATE_PROB);
+ write_mv_update(vp9_mv_class_tree, comp->classes, comp_counts->classes,
+ MV_CLASSES, w);
+ write_mv_update(vp9_mv_class0_tree, comp->class0, comp_counts->class0,
+ CLASS0_SIZE, w);
+ for (j = 0; j < MV_OFFSET_BITS; ++j)
+ update_mv(w, comp_counts->bits[j], &comp->bits[j], MV_UPDATE_PROB);
+ }
+
+ for (i = 0; i < 2; ++i) {
+ for (j = 0; j < CLASS0_SIZE; ++j)
+ write_mv_update(vp9_mv_fp_tree, mvc->comps[i].class0_fp[j],
+ counts->comps[i].class0_fp[j], MV_FP_SIZE, w);
+
+ write_mv_update(vp9_mv_fp_tree, mvc->comps[i].fp, counts->comps[i].fp,
+ MV_FP_SIZE, w);
+ }
+
+ if (usehp) {
+ for (i = 0; i < 2; ++i) {
+ update_mv(w, counts->comps[i].class0_hp, &mvc->comps[i].class0_hp,
+ MV_UPDATE_PROB);
+ update_mv(w, counts->comps[i].hp, &mvc->comps[i].hp, MV_UPDATE_PROB);
+ }
+ }
+}
+
+void vp9_encode_mv(VP9_COMP *cpi, vpx_writer *w, const MV *mv, const MV *ref,
+ const nmv_context *mvctx, int usehp,
+ unsigned int *const max_mv_magnitude) {
+ const MV diff = { mv->row - ref->row, mv->col - ref->col };
+ const MV_JOINT_TYPE j = vp9_get_mv_joint(&diff);
+ usehp = usehp && use_mv_hp(ref);
+
+ vp9_write_token(w, vp9_mv_joint_tree, mvctx->joints, &mv_joint_encodings[j]);
+ if (mv_joint_vertical(j))
+ encode_mv_component(w, diff.row, &mvctx->comps[0], usehp);
+
+ if (mv_joint_horizontal(j))
+ encode_mv_component(w, diff.col, &mvctx->comps[1], usehp);
+
+ // If auto_mv_step_size is enabled then keep track of the largest
+ // motion vector component used.
+ if (cpi->sf.mv.auto_mv_step_size) {
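+    // Motion vector components are in 1/8-pel units; >> 3 converts the
+    // largest component to whole pixels before tracking it.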
+ const unsigned int maxv = VPXMAX(abs(mv->row), abs(mv->col)) >> 3;
+ *max_mv_magnitude = VPXMAX(maxv, *max_mv_magnitude);
+ }
+}
+
+void vp9_build_nmv_cost_table(int *mvjoint, int *mvcost[2],
+ const nmv_context *ctx, int usehp) {
+ vp9_cost_tokens(mvjoint, ctx->joints, vp9_mv_joint_tree);
+ build_nmv_component_cost_table(mvcost[0], &ctx->comps[0], usehp);
+ build_nmv_component_cost_table(mvcost[1], &ctx->comps[1], usehp);
+}
+
+static void inc_mvs(const MODE_INFO *mi, const MB_MODE_INFO_EXT *mbmi_ext,
+ const int_mv mvs[2], nmv_context_counts *counts) {
+ int i;
+
+ for (i = 0; i < 1 + has_second_ref(mi); ++i) {
+ const MV *ref = &mbmi_ext->ref_mvs[mi->ref_frame[i]][0].as_mv;
+ const MV diff = { mvs[i].as_mv.row - ref->row,
+ mvs[i].as_mv.col - ref->col };
+ vp9_inc_mv(&diff, counts);
+ }
+}
+
+void vp9_update_mv_count(ThreadData *td) {
+ const MACROBLOCKD *xd = &td->mb.e_mbd;
+ const MODE_INFO *mi = xd->mi[0];
+ const MB_MODE_INFO_EXT *mbmi_ext = td->mb.mbmi_ext;
+
+ if (mi->sb_type < BLOCK_8X8) {
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[mi->sb_type];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[mi->sb_type];
+ int idx, idy;
+
+ for (idy = 0; idy < 2; idy += num_4x4_h) {
+ for (idx = 0; idx < 2; idx += num_4x4_w) {
+ const int i = idy * 2 + idx;
+ if (mi->bmi[i].as_mode == NEWMV)
+ inc_mvs(mi, mbmi_ext, mi->bmi[i].as_mv, &td->counts->mv);
+ }
+ }
+ } else {
+ if (mi->mode == NEWMV) inc_mvs(mi, mbmi_ext, mi->mv, &td->counts->mv);
+ }
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_encodemv.h b/media/libvpx/libvpx/vp9/encoder/vp9_encodemv.h
new file mode 100644
index 0000000000..2f1be4b233
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_encodemv.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_VP9_ENCODEMV_H_
+#define VPX_VP9_ENCODER_VP9_ENCODEMV_H_
+
+#include "vp9/encoder/vp9_encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp9_entropy_mv_init(void);
+
+void vp9_write_nmv_probs(VP9_COMMON *cm, int usehp, vpx_writer *w,
+ nmv_context_counts *const counts);
+
+void vp9_encode_mv(VP9_COMP *cpi, vpx_writer *w, const MV *mv, const MV *ref,
+ const nmv_context *mvctx, int usehp,
+ unsigned int *const max_mv_magnitude);
+
+void vp9_build_nmv_cost_table(int *mvjoint, int *mvcost[2],
+ const nmv_context *ctx, int usehp);
+
+void vp9_update_mv_count(ThreadData *td);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP9_ENCODER_VP9_ENCODEMV_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_encoder.c b/media/libvpx/libvpx/vp9/encoder/vp9_encoder.c
new file mode 100644
index 0000000000..9d5c0030a2
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_encoder.c
@@ -0,0 +1,7030 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_scale_rtcd.h"
+#include "vpx_dsp/psnr.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
+#if CONFIG_INTERNAL_STATS
+#include "vpx_dsp/ssim.h"
+#endif
+#include "vpx_ports/mem.h"
+#include "vpx_ports/system_state.h"
+#include "vpx_ports/vpx_once.h"
+#include "vpx_ports/vpx_timer.h"
+#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
+#include "vpx_util/vpx_debug_util.h"
+#endif // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
+
+#include "vp9/common/vp9_alloccommon.h"
+#include "vp9/common/vp9_filter.h"
+#include "vp9/common/vp9_idct.h"
+#if CONFIG_VP9_POSTPROC
+#include "vp9/common/vp9_postproc.h"
+#endif
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/common/vp9_reconintra.h"
+#include "vp9/common/vp9_tile_common.h"
+
+#if !CONFIG_REALTIME_ONLY
+#include "vp9/encoder/vp9_alt_ref_aq.h"
+#include "vp9/encoder/vp9_aq_360.h"
+#include "vp9/encoder/vp9_aq_complexity.h"
+#endif
+#include "vp9/encoder/vp9_aq_cyclicrefresh.h"
+#if !CONFIG_REALTIME_ONLY
+#include "vp9/encoder/vp9_aq_variance.h"
+#endif
+#include "vp9/encoder/vp9_bitstream.h"
+#if CONFIG_INTERNAL_STATS
+#include "vp9/encoder/vp9_blockiness.h"
+#endif
+#include "vp9/encoder/vp9_context_tree.h"
+#include "vp9/encoder/vp9_encodeframe.h"
+#include "vp9/encoder/vp9_encodemb.h"
+#include "vp9/encoder/vp9_encodemv.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_ethread.h"
+#include "vp9/encoder/vp9_extend.h"
+#include "vp9/encoder/vp9_firstpass.h"
+#include "vp9/encoder/vp9_mbgraph.h"
+#if CONFIG_NON_GREEDY_MV
+#include "vp9/encoder/vp9_mcomp.h"
+#endif
+#include "vp9/encoder/vp9_multi_thread.h"
+#include "vp9/encoder/vp9_noise_estimate.h"
+#include "vp9/encoder/vp9_picklpf.h"
+#include "vp9/encoder/vp9_ratectrl.h"
+#include "vp9/encoder/vp9_rd.h"
+#include "vp9/encoder/vp9_resize.h"
+#include "vp9/encoder/vp9_segmentation.h"
+#include "vp9/encoder/vp9_skin_detection.h"
+#include "vp9/encoder/vp9_speed_features.h"
+#include "vp9/encoder/vp9_svc_layercontext.h"
+#include "vp9/encoder/vp9_temporal_filter.h"
+#include "vp9/encoder/vp9_tpl_model.h"
+#include "vp9/vp9_cx_iface.h"
+
+#define AM_SEGMENT_ID_INACTIVE 7
+#define AM_SEGMENT_ID_ACTIVE 0
+
+// Whether to use high precision mv for altref computation.
+#define ALTREF_HIGH_PRECISION_MV 1
+
+// Q threshold for high precision mv. Choose a very high value for now so that
+// HIGH_PRECISION is always chosen.
+#define HIGH_PRECISION_MV_QTHRESH 200
+
+#define FRAME_SIZE_FACTOR 128 // empirical params for context model threshold
+#define FRAME_RATE_FACTOR 8
+
+#ifdef OUTPUT_YUV_DENOISED
+FILE *yuv_denoised_file = NULL;
+#endif
+#ifdef OUTPUT_YUV_SKINMAP
+static FILE *yuv_skinmap_file = NULL;
+#endif
+#ifdef OUTPUT_YUV_REC
+FILE *yuv_rec_file;
+#endif
+#ifdef OUTPUT_YUV_SVC_SRC
+FILE *yuv_svc_src[3] = { NULL, NULL, NULL };
+#endif
+
+#if 0
+FILE *framepsnr;
+FILE *kf_list;
+FILE *keyfile;
+#endif
+
+#ifdef ENABLE_KF_DENOISE
+// Test condition for spatial denoising of the source.
+static int is_spatial_denoise_enabled(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+
+ return (oxcf->pass != 1) && !is_lossless_requested(&cpi->oxcf) &&
+ frame_is_intra_only(cm);
+}
+#endif
+
+#if !CONFIG_REALTIME_ONLY
+// Compute the adaptive threshold for skipping the recode pass.
+static int compute_context_model_thresh(const VP9_COMP *const cpi) {
+ const VP9_COMMON *const cm = &cpi->common;
+ const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+ const int frame_size = (cm->width * cm->height) >> 10;
+ const int bitrate = (int)(oxcf->target_bandwidth >> 10);
+ const int qindex_factor = cm->base_qindex + (MAXQ >> 1);
+
+  // This equation makes the threshold adaptive to the frame size.
+  // The coding gain from recoding comes from alternate frames with large
+  // content changes; recoding is skipped when the difference between the
+  // previous and current frame context probability models is below this
+  // threshold. The first component is the most critical part for guaranteeing
+  // adaptivity. The other parameters are estimated from typical HD-resolution
+  // settings, e.g. frame_size = 1920x1080, bitrate = 8000, qindex_factor < 50.
+ const int thresh =
+ ((FRAME_SIZE_FACTOR * frame_size - FRAME_RATE_FACTOR * bitrate) *
+ qindex_factor) >>
+ 9;
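+  // Illustrative evaluation with the HD numbers above: frame_size =
+  // (1920 * 1080) >> 10 = 2025 and bitrate = 8000 give roughly
+  // (128 * 2025 - 8 * 8000) * qindex_factor >> 9, i.e. about 19000 when
+  // qindex_factor is 50.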
+
+ return thresh;
+}
+
+// Compute the total cost difference between the current and previous frame
+// context probability models.
+static int compute_context_model_diff(const VP9_COMMON *const cm) {
+ const FRAME_CONTEXT *const pre_fc =
+ &cm->frame_contexts[cm->frame_context_idx];
+ const FRAME_CONTEXT *const cur_fc = cm->fc;
+ const FRAME_COUNTS *counts = &cm->counts;
+ vpx_prob pre_last_prob, cur_last_prob;
+ int diff = 0;
+ int i, j, k, l, m, n;
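+  // Each term below weights a per-symbol probability change by how often the
+  // symbol was observed, so the accumulated sum serves as a rough first-order
+  // proxy for the bit-cost difference between reusing the previous context
+  // model and adopting the newly measured one.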
+
+ // y_mode_prob
+ for (i = 0; i < BLOCK_SIZE_GROUPS; ++i) {
+ for (j = 0; j < INTRA_MODES - 1; ++j) {
+ diff += (int)counts->y_mode[i][j] *
+ (pre_fc->y_mode_prob[i][j] - cur_fc->y_mode_prob[i][j]);
+ }
+ pre_last_prob = MAX_PROB - pre_fc->y_mode_prob[i][INTRA_MODES - 2];
+ cur_last_prob = MAX_PROB - cur_fc->y_mode_prob[i][INTRA_MODES - 2];
+
+ diff += (int)counts->y_mode[i][INTRA_MODES - 1] *
+ (pre_last_prob - cur_last_prob);
+ }
+
+ // uv_mode_prob
+ for (i = 0; i < INTRA_MODES; ++i) {
+ for (j = 0; j < INTRA_MODES - 1; ++j) {
+ diff += (int)counts->uv_mode[i][j] *
+ (pre_fc->uv_mode_prob[i][j] - cur_fc->uv_mode_prob[i][j]);
+ }
+ pre_last_prob = MAX_PROB - pre_fc->uv_mode_prob[i][INTRA_MODES - 2];
+ cur_last_prob = MAX_PROB - cur_fc->uv_mode_prob[i][INTRA_MODES - 2];
+
+ diff += (int)counts->uv_mode[i][INTRA_MODES - 1] *
+ (pre_last_prob - cur_last_prob);
+ }
+
+ // partition_prob
+ for (i = 0; i < PARTITION_CONTEXTS; ++i) {
+ for (j = 0; j < PARTITION_TYPES - 1; ++j) {
+ diff += (int)counts->partition[i][j] *
+ (pre_fc->partition_prob[i][j] - cur_fc->partition_prob[i][j]);
+ }
+ pre_last_prob = MAX_PROB - pre_fc->partition_prob[i][PARTITION_TYPES - 2];
+ cur_last_prob = MAX_PROB - cur_fc->partition_prob[i][PARTITION_TYPES - 2];
+
+ diff += (int)counts->partition[i][PARTITION_TYPES - 1] *
+ (pre_last_prob - cur_last_prob);
+ }
+
+ // coef_probs
+ for (i = 0; i < TX_SIZES; ++i) {
+ for (j = 0; j < PLANE_TYPES; ++j) {
+ for (k = 0; k < REF_TYPES; ++k) {
+ for (l = 0; l < COEF_BANDS; ++l) {
+ for (m = 0; m < BAND_COEFF_CONTEXTS(l); ++m) {
+ for (n = 0; n < UNCONSTRAINED_NODES; ++n) {
+ diff += (int)counts->coef[i][j][k][l][m][n] *
+ (pre_fc->coef_probs[i][j][k][l][m][n] -
+ cur_fc->coef_probs[i][j][k][l][m][n]);
+ }
+
+ pre_last_prob =
+ MAX_PROB -
+ pre_fc->coef_probs[i][j][k][l][m][UNCONSTRAINED_NODES - 1];
+ cur_last_prob =
+ MAX_PROB -
+ cur_fc->coef_probs[i][j][k][l][m][UNCONSTRAINED_NODES - 1];
+
+ diff += (int)counts->coef[i][j][k][l][m][UNCONSTRAINED_NODES] *
+ (pre_last_prob - cur_last_prob);
+ }
+ }
+ }
+ }
+ }
+
+ // switchable_interp_prob
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) {
+ for (j = 0; j < SWITCHABLE_FILTERS - 1; ++j) {
+ diff += (int)counts->switchable_interp[i][j] *
+ (pre_fc->switchable_interp_prob[i][j] -
+ cur_fc->switchable_interp_prob[i][j]);
+ }
+ pre_last_prob =
+ MAX_PROB - pre_fc->switchable_interp_prob[i][SWITCHABLE_FILTERS - 2];
+ cur_last_prob =
+ MAX_PROB - cur_fc->switchable_interp_prob[i][SWITCHABLE_FILTERS - 2];
+
+ diff += (int)counts->switchable_interp[i][SWITCHABLE_FILTERS - 1] *
+ (pre_last_prob - cur_last_prob);
+ }
+
+ // inter_mode_probs
+ for (i = 0; i < INTER_MODE_CONTEXTS; ++i) {
+ for (j = 0; j < INTER_MODES - 1; ++j) {
+ diff += (int)counts->inter_mode[i][j] *
+ (pre_fc->inter_mode_probs[i][j] - cur_fc->inter_mode_probs[i][j]);
+ }
+ pre_last_prob = MAX_PROB - pre_fc->inter_mode_probs[i][INTER_MODES - 2];
+ cur_last_prob = MAX_PROB - cur_fc->inter_mode_probs[i][INTER_MODES - 2];
+
+ diff += (int)counts->inter_mode[i][INTER_MODES - 1] *
+ (pre_last_prob - cur_last_prob);
+ }
+
+ // intra_inter_prob
+ for (i = 0; i < INTRA_INTER_CONTEXTS; ++i) {
+ diff += (int)counts->intra_inter[i][0] *
+ (pre_fc->intra_inter_prob[i] - cur_fc->intra_inter_prob[i]);
+
+ pre_last_prob = MAX_PROB - pre_fc->intra_inter_prob[i];
+ cur_last_prob = MAX_PROB - cur_fc->intra_inter_prob[i];
+
+ diff += (int)counts->intra_inter[i][1] * (pre_last_prob - cur_last_prob);
+ }
+
+ // comp_inter_prob
+ for (i = 0; i < COMP_INTER_CONTEXTS; ++i) {
+ diff += (int)counts->comp_inter[i][0] *
+ (pre_fc->comp_inter_prob[i] - cur_fc->comp_inter_prob[i]);
+
+ pre_last_prob = MAX_PROB - pre_fc->comp_inter_prob[i];
+ cur_last_prob = MAX_PROB - cur_fc->comp_inter_prob[i];
+
+ diff += (int)counts->comp_inter[i][1] * (pre_last_prob - cur_last_prob);
+ }
+
+ // single_ref_prob
+ for (i = 0; i < REF_CONTEXTS; ++i) {
+ for (j = 0; j < 2; ++j) {
+ diff += (int)counts->single_ref[i][j][0] *
+ (pre_fc->single_ref_prob[i][j] - cur_fc->single_ref_prob[i][j]);
+
+ pre_last_prob = MAX_PROB - pre_fc->single_ref_prob[i][j];
+ cur_last_prob = MAX_PROB - cur_fc->single_ref_prob[i][j];
+
+ diff +=
+ (int)counts->single_ref[i][j][1] * (pre_last_prob - cur_last_prob);
+ }
+ }
+
+ // comp_ref_prob
+ for (i = 0; i < REF_CONTEXTS; ++i) {
+ diff += (int)counts->comp_ref[i][0] *
+ (pre_fc->comp_ref_prob[i] - cur_fc->comp_ref_prob[i]);
+
+ pre_last_prob = MAX_PROB - pre_fc->comp_ref_prob[i];
+ cur_last_prob = MAX_PROB - cur_fc->comp_ref_prob[i];
+
+ diff += (int)counts->comp_ref[i][1] * (pre_last_prob - cur_last_prob);
+ }
+
+ // tx_probs
+ for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
+ // p32x32
+ for (j = 0; j < TX_SIZES - 1; ++j) {
+ diff += (int)counts->tx.p32x32[i][j] *
+ (pre_fc->tx_probs.p32x32[i][j] - cur_fc->tx_probs.p32x32[i][j]);
+ }
+ pre_last_prob = MAX_PROB - pre_fc->tx_probs.p32x32[i][TX_SIZES - 2];
+ cur_last_prob = MAX_PROB - cur_fc->tx_probs.p32x32[i][TX_SIZES - 2];
+
+ diff += (int)counts->tx.p32x32[i][TX_SIZES - 1] *
+ (pre_last_prob - cur_last_prob);
+
+ // p16x16
+ for (j = 0; j < TX_SIZES - 2; ++j) {
+ diff += (int)counts->tx.p16x16[i][j] *
+ (pre_fc->tx_probs.p16x16[i][j] - cur_fc->tx_probs.p16x16[i][j]);
+ }
+ pre_last_prob = MAX_PROB - pre_fc->tx_probs.p16x16[i][TX_SIZES - 3];
+ cur_last_prob = MAX_PROB - cur_fc->tx_probs.p16x16[i][TX_SIZES - 3];
+
+ diff += (int)counts->tx.p16x16[i][TX_SIZES - 2] *
+ (pre_last_prob - cur_last_prob);
+
+ // p8x8
+ for (j = 0; j < TX_SIZES - 3; ++j) {
+ diff += (int)counts->tx.p8x8[i][j] *
+ (pre_fc->tx_probs.p8x8[i][j] - cur_fc->tx_probs.p8x8[i][j]);
+ }
+ pre_last_prob = MAX_PROB - pre_fc->tx_probs.p8x8[i][TX_SIZES - 4];
+ cur_last_prob = MAX_PROB - cur_fc->tx_probs.p8x8[i][TX_SIZES - 4];
+
+ diff +=
+ (int)counts->tx.p8x8[i][TX_SIZES - 3] * (pre_last_prob - cur_last_prob);
+ }
+
+ // skip_probs
+ for (i = 0; i < SKIP_CONTEXTS; ++i) {
+ diff += (int)counts->skip[i][0] *
+ (pre_fc->skip_probs[i] - cur_fc->skip_probs[i]);
+
+ pre_last_prob = MAX_PROB - pre_fc->skip_probs[i];
+ cur_last_prob = MAX_PROB - cur_fc->skip_probs[i];
+
+ diff += (int)counts->skip[i][1] * (pre_last_prob - cur_last_prob);
+ }
+
+ // mv
+ for (i = 0; i < MV_JOINTS - 1; ++i) {
+ diff += (int)counts->mv.joints[i] *
+ (pre_fc->nmvc.joints[i] - cur_fc->nmvc.joints[i]);
+ }
+ pre_last_prob = MAX_PROB - pre_fc->nmvc.joints[MV_JOINTS - 2];
+ cur_last_prob = MAX_PROB - cur_fc->nmvc.joints[MV_JOINTS - 2];
+
+ diff +=
+ (int)counts->mv.joints[MV_JOINTS - 1] * (pre_last_prob - cur_last_prob);
+
+ for (i = 0; i < 2; ++i) {
+ const nmv_component_counts *nmv_count = &counts->mv.comps[i];
+ const nmv_component *pre_nmv_prob = &pre_fc->nmvc.comps[i];
+ const nmv_component *cur_nmv_prob = &cur_fc->nmvc.comps[i];
+
+ // sign
+ diff += (int)nmv_count->sign[0] * (pre_nmv_prob->sign - cur_nmv_prob->sign);
+
+ pre_last_prob = MAX_PROB - pre_nmv_prob->sign;
+ cur_last_prob = MAX_PROB - cur_nmv_prob->sign;
+
+ diff += (int)nmv_count->sign[1] * (pre_last_prob - cur_last_prob);
+
+ // classes
+ for (j = 0; j < MV_CLASSES - 1; ++j) {
+ diff += (int)nmv_count->classes[j] *
+ (pre_nmv_prob->classes[j] - cur_nmv_prob->classes[j]);
+ }
+ pre_last_prob = MAX_PROB - pre_nmv_prob->classes[MV_CLASSES - 2];
+ cur_last_prob = MAX_PROB - cur_nmv_prob->classes[MV_CLASSES - 2];
+
+ diff += (int)nmv_count->classes[MV_CLASSES - 1] *
+ (pre_last_prob - cur_last_prob);
+
+ // class0
+ for (j = 0; j < CLASS0_SIZE - 1; ++j) {
+ diff += (int)nmv_count->class0[j] *
+ (pre_nmv_prob->class0[j] - cur_nmv_prob->class0[j]);
+ }
+ pre_last_prob = MAX_PROB - pre_nmv_prob->class0[CLASS0_SIZE - 2];
+ cur_last_prob = MAX_PROB - cur_nmv_prob->class0[CLASS0_SIZE - 2];
+
+ diff += (int)nmv_count->class0[CLASS0_SIZE - 1] *
+ (pre_last_prob - cur_last_prob);
+
+ // bits
+ for (j = 0; j < MV_OFFSET_BITS; ++j) {
+ diff += (int)nmv_count->bits[j][0] *
+ (pre_nmv_prob->bits[j] - cur_nmv_prob->bits[j]);
+
+ pre_last_prob = MAX_PROB - pre_nmv_prob->bits[j];
+ cur_last_prob = MAX_PROB - cur_nmv_prob->bits[j];
+
+ diff += (int)nmv_count->bits[j][1] * (pre_last_prob - cur_last_prob);
+ }
+
+ // class0_fp
+ for (j = 0; j < CLASS0_SIZE; ++j) {
+ for (k = 0; k < MV_FP_SIZE - 1; ++k) {
+ diff += (int)nmv_count->class0_fp[j][k] *
+ (pre_nmv_prob->class0_fp[j][k] - cur_nmv_prob->class0_fp[j][k]);
+ }
+ pre_last_prob = MAX_PROB - pre_nmv_prob->class0_fp[j][MV_FP_SIZE - 2];
+ cur_last_prob = MAX_PROB - cur_nmv_prob->class0_fp[j][MV_FP_SIZE - 2];
+
+ diff += (int)nmv_count->class0_fp[j][MV_FP_SIZE - 1] *
+ (pre_last_prob - cur_last_prob);
+ }
+
+ // fp
+ for (j = 0; j < MV_FP_SIZE - 1; ++j) {
+ diff +=
+ (int)nmv_count->fp[j] * (pre_nmv_prob->fp[j] - cur_nmv_prob->fp[j]);
+ }
+ pre_last_prob = MAX_PROB - pre_nmv_prob->fp[MV_FP_SIZE - 2];
+ cur_last_prob = MAX_PROB - cur_nmv_prob->fp[MV_FP_SIZE - 2];
+
+ diff +=
+ (int)nmv_count->fp[MV_FP_SIZE - 1] * (pre_last_prob - cur_last_prob);
+
+ // class0_hp
+ diff += (int)nmv_count->class0_hp[0] *
+ (pre_nmv_prob->class0_hp - cur_nmv_prob->class0_hp);
+
+ pre_last_prob = MAX_PROB - pre_nmv_prob->class0_hp;
+ cur_last_prob = MAX_PROB - cur_nmv_prob->class0_hp;
+
+ diff += (int)nmv_count->class0_hp[1] * (pre_last_prob - cur_last_prob);
+
+ // hp
+ diff += (int)nmv_count->hp[0] * (pre_nmv_prob->hp - cur_nmv_prob->hp);
+
+ pre_last_prob = MAX_PROB - pre_nmv_prob->hp;
+ cur_last_prob = MAX_PROB - cur_nmv_prob->hp;
+
+ diff += (int)nmv_count->hp[1] * (pre_last_prob - cur_last_prob);
+ }
+
+ return -diff;
+}
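+
+// The loops above all follow the same pattern for a binary tree node: with a
+// node probability p (out of MAX_PROB), symbol 0 is weighted by the change in
+// p and symbol 1 by the change in the complementary probability
+// (MAX_PROB - p). A minimal sketch of that per-node step, added here for
+// exposition only (the helper name is hypothetical and nothing in this change
+// calls it):
+static INLINE int binary_node_prob_diff(unsigned int count0,
+                                        unsigned int count1, int pre_prob,
+                                        int cur_prob) {
+  int diff = (int)count0 * (pre_prob - cur_prob);
+  diff += (int)count1 * ((MAX_PROB - pre_prob) - (MAX_PROB - cur_prob));
+  return diff;
+}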
+#endif // !CONFIG_REALTIME_ONLY
+
+// Test for whether to calculate metrics for the frame.
+static int is_psnr_calc_enabled(const VP9_COMP *cpi) {
+ const VP9_COMMON *const cm = &cpi->common;
+ const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+
+ return cpi->b_calculate_psnr && (oxcf->pass != 1) && cm->show_frame;
+}
+
+/* clang-format off */
+const Vp9LevelSpec vp9_level_defs[VP9_LEVELS] = {
+  // Columns: level, max luma sample rate, max luma picture size, max luma
+  // picture breadth, average bitrate, max CPB size, compression ratio,
+  // max column tiles, min altref distance, max reference frame buffers.
+ { LEVEL_1, 829440, 36864, 512, 200, 400, 2, 1, 4, 8 },
+ { LEVEL_1_1, 2764800, 73728, 768, 800, 1000, 2, 1, 4, 8 },
+ { LEVEL_2, 4608000, 122880, 960, 1800, 1500, 2, 1, 4, 8 },
+ { LEVEL_2_1, 9216000, 245760, 1344, 3600, 2800, 2, 2, 4, 8 },
+ { LEVEL_3, 20736000, 552960, 2048, 7200, 6000, 2, 4, 4, 8 },
+ { LEVEL_3_1, 36864000, 983040, 2752, 12000, 10000, 2, 4, 4, 8 },
+ { LEVEL_4, 83558400, 2228224, 4160, 18000, 16000, 4, 4, 4, 8 },
+ { LEVEL_4_1, 160432128, 2228224, 4160, 30000, 18000, 4, 4, 5, 6 },
+ { LEVEL_5, 311951360, 8912896, 8384, 60000, 36000, 6, 8, 6, 4 },
+ { LEVEL_5_1, 588251136, 8912896, 8384, 120000, 46000, 8, 8, 10, 4 },
+ // TODO(huisu): update max_cpb_size for level 5_2 ~ 6_2 when
+ // they are finalized (currently tentative).
+ { LEVEL_5_2, 1176502272, 8912896, 8384, 180000, 90000, 8, 8, 10, 4 },
+ { LEVEL_6, 1176502272, 35651584, 16832, 180000, 90000, 8, 16, 10, 4 },
+ { LEVEL_6_1, 2353004544u, 35651584, 16832, 240000, 180000, 8, 16, 10, 4 },
+ { LEVEL_6_2, 4706009088u, 35651584, 16832, 480000, 360000, 8, 16, 10, 4 },
+};
+/* clang-format on */
+
+static const char *level_fail_messages[TARGET_LEVEL_FAIL_IDS] = {
+ "The average bit-rate is too high.",
+ "The picture size is too large.",
+ "The picture width/height is too large.",
+ "The luma sample rate is too large.",
+ "The CPB size is too large.",
+ "The compression ratio is too small",
+ "Too many column tiles are used.",
+ "The alt-ref distance is too small.",
+ "Too many reference buffers are used."
+};
+
+static INLINE void Scale2Ratio(VPX_SCALING_MODE mode, int *hr, int *hs) {
+ switch (mode) {
+ case VP8E_NORMAL:
+ *hr = 1;
+ *hs = 1;
+ break;
+ case VP8E_FOURFIVE:
+ *hr = 4;
+ *hs = 5;
+ break;
+ case VP8E_THREEFIVE:
+ *hr = 3;
+ *hs = 5;
+ break;
+ default:
+ assert(mode == VP8E_ONETWO);
+ *hr = 1;
+ *hs = 2;
+ break;
+ }
+}
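+
+// Illustrative sketch, added for exposition only (the helper name is
+// hypothetical and nothing in this change calls it): the hr/hs pair returned
+// by Scale2Ratio is applied as a numerator/denominator to derive a scaled
+// dimension, e.g. 1280 with VP8E_FOURFIVE gives 1280 * 4 / 5 = 1024.
+static INLINE int scale_dim_example(int dim, VPX_SCALING_MODE mode) {
+  int hr, hs;
+  Scale2Ratio(mode, &hr, &hs);
+  return dim * hr / hs;
+}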
+
+// Mark all inactive blocks as active. Other segmentation features may be set,
+// so memset cannot be used; instead, only inactive blocks should be reset.
+static void suppress_active_map(VP9_COMP *cpi) {
+ unsigned char *const seg_map = cpi->segmentation_map;
+
+ if (cpi->active_map.enabled || cpi->active_map.update) {
+ const int rows = cpi->common.mi_rows;
+ const int cols = cpi->common.mi_cols;
+ int i;
+
+ for (i = 0; i < rows * cols; ++i)
+ if (seg_map[i] == AM_SEGMENT_ID_INACTIVE)
+ seg_map[i] = AM_SEGMENT_ID_ACTIVE;
+ }
+}
+
+static void apply_active_map(VP9_COMP *cpi) {
+ struct segmentation *const seg = &cpi->common.seg;
+ unsigned char *const seg_map = cpi->segmentation_map;
+ const unsigned char *const active_map = cpi->active_map.map;
+ int i;
+
+ assert(AM_SEGMENT_ID_ACTIVE == CR_SEGMENT_ID_BASE);
+
+ if (frame_is_intra_only(&cpi->common)) {
+ cpi->active_map.enabled = 0;
+ cpi->active_map.update = 1;
+ }
+
+ if (cpi->active_map.update) {
+ if (cpi->active_map.enabled) {
+ for (i = 0; i < cpi->common.mi_rows * cpi->common.mi_cols; ++i)
+ if (seg_map[i] == AM_SEGMENT_ID_ACTIVE) seg_map[i] = active_map[i];
+ vp9_enable_segmentation(seg);
+ vp9_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_SKIP);
+ vp9_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF);
+ // Setting the data to -MAX_LOOP_FILTER will result in the computed loop
+ // filter level being zero regardless of the value of seg->abs_delta.
+ vp9_set_segdata(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF,
+ -MAX_LOOP_FILTER);
+ } else {
+ vp9_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_SKIP);
+ vp9_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF);
+ if (seg->enabled) {
+ seg->update_data = 1;
+ seg->update_map = 1;
+ }
+ }
+ cpi->active_map.update = 0;
+ }
+}
+
+static void apply_roi_map(VP9_COMP *cpi) {
+ VP9_COMMON *cm = &cpi->common;
+ struct segmentation *const seg = &cm->seg;
+ vpx_roi_map_t *roi = &cpi->roi;
+ const int *delta_q = roi->delta_q;
+ const int *delta_lf = roi->delta_lf;
+ const int *skip = roi->skip;
+ int ref_frame[8];
+ int internal_delta_q[MAX_SEGMENTS];
+ int i;
+
+  // TODO(jianj): Investigate why ROI is not working at speed < 5 or in
+  // non-realtime mode.
+ if (cpi->oxcf.mode != REALTIME || cpi->oxcf.speed < 5) return;
+ if (!roi->enabled) return;
+
+ memcpy(&ref_frame, roi->ref_frame, sizeof(ref_frame));
+
+ vp9_enable_segmentation(seg);
+ vp9_clearall_segfeatures(seg);
+  // Select delta coding method.
+ seg->abs_delta = SEGMENT_DELTADATA;
+
+ memcpy(cpi->segmentation_map, roi->roi_map, (cm->mi_rows * cm->mi_cols));
+
+ for (i = 0; i < MAX_SEGMENTS; ++i) {
+ // Translate the external delta q values to internal values.
+ internal_delta_q[i] = vp9_quantizer_to_qindex(abs(delta_q[i]));
+ if (delta_q[i] < 0) internal_delta_q[i] = -internal_delta_q[i];
+ vp9_disable_segfeature(seg, i, SEG_LVL_ALT_Q);
+ vp9_disable_segfeature(seg, i, SEG_LVL_ALT_LF);
+ if (internal_delta_q[i] != 0) {
+ vp9_enable_segfeature(seg, i, SEG_LVL_ALT_Q);
+ vp9_set_segdata(seg, i, SEG_LVL_ALT_Q, internal_delta_q[i]);
+ }
+ if (delta_lf[i] != 0) {
+ vp9_enable_segfeature(seg, i, SEG_LVL_ALT_LF);
+ vp9_set_segdata(seg, i, SEG_LVL_ALT_LF, delta_lf[i]);
+ }
+ if (skip[i] != 0) {
+ vp9_enable_segfeature(seg, i, SEG_LVL_SKIP);
+ vp9_set_segdata(seg, i, SEG_LVL_SKIP, 0);
+ }
+ if (ref_frame[i] >= 0) {
+ int valid_ref = 1;
+ // ALTREF is not used as reference for nonrd_pickmode with 0 lag.
+ if (ref_frame[i] == ALTREF_FRAME && cpi->sf.use_nonrd_pick_mode)
+ valid_ref = 0;
+ // If GOLDEN is selected, make sure it's set as reference.
+ if (ref_frame[i] == GOLDEN_FRAME &&
+ !(cpi->ref_frame_flags & ref_frame_to_flag(ref_frame[i]))) {
+ valid_ref = 0;
+ }
+      // GOLDEN was updated in the previous encoded frame, so GOLDEN and LAST
+      // are the same reference.
+ if (ref_frame[i] == GOLDEN_FRAME && cpi->rc.frames_since_golden == 0)
+ ref_frame[i] = LAST_FRAME;
+ if (valid_ref) {
+ vp9_enable_segfeature(seg, i, SEG_LVL_REF_FRAME);
+ vp9_set_segdata(seg, i, SEG_LVL_REF_FRAME, ref_frame[i]);
+ }
+ }
+ }
+ roi->enabled = 1;
+}
+
+static void init_level_info(Vp9LevelInfo *level_info) {
+ Vp9LevelStats *const level_stats = &level_info->level_stats;
+ Vp9LevelSpec *const level_spec = &level_info->level_spec;
+
+ memset(level_stats, 0, sizeof(*level_stats));
+ memset(level_spec, 0, sizeof(*level_spec));
+ level_spec->level = LEVEL_UNKNOWN;
+ level_spec->min_altref_distance = INT_MAX;
+}
+
+static int check_seg_range(int seg_data[8], int range) {
+ int i;
+ for (i = 0; i < 8; ++i) {
+ // Note abs() alone can't be used as the behavior of abs(INT_MIN) is
+ // undefined.
+ if (seg_data[i] > range || seg_data[i] < -range) {
+ return 0;
+ }
+ }
+ return 1;
+}
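+
+// For exposition (not part of this change): with range = 63 the check above
+// accepts -63..63 and rejects -64 or 64; writing it as a two-sided comparison
+// also means an input of INT_MIN is rejected without ever evaluating
+// abs(INT_MIN), which would be undefined.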
+
+VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec) {
+ int i;
+ const Vp9LevelSpec *this_level;
+
+ vpx_clear_system_state();
+
+ for (i = 0; i < VP9_LEVELS; ++i) {
+ this_level = &vp9_level_defs[i];
+ if ((double)level_spec->max_luma_sample_rate >
+ (double)this_level->max_luma_sample_rate *
+ (1 + SAMPLE_RATE_GRACE_P) ||
+ level_spec->max_luma_picture_size > this_level->max_luma_picture_size ||
+ level_spec->max_luma_picture_breadth >
+ this_level->max_luma_picture_breadth ||
+ level_spec->average_bitrate > this_level->average_bitrate ||
+ level_spec->max_cpb_size > this_level->max_cpb_size ||
+ level_spec->compression_ratio < this_level->compression_ratio ||
+ level_spec->max_col_tiles > this_level->max_col_tiles ||
+ level_spec->min_altref_distance < this_level->min_altref_distance ||
+ level_spec->max_ref_frame_buffers > this_level->max_ref_frame_buffers)
+ continue;
+ break;
+ }
+ return (i == VP9_LEVELS) ? LEVEL_UNKNOWN : vp9_level_defs[i].level;
+}
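+
+// Illustrative usage sketch, added for exposition only (the helper name and
+// the example figures are hypothetical; nothing in this change calls it):
+// a 1920x1080 stream at 30 fps has a luma sample rate of 62208000 and a
+// picture size of 2073600, which is over the LEVEL_3_1 limits but inside the
+// LEVEL_4 row of vp9_level_defs, so vp9_get_level() is expected to report
+// LEVEL_4 for stats like these.
+static INLINE VP9_LEVEL example_level_for_1080p30(void) {
+  Vp9LevelSpec spec;
+  memset(&spec, 0, sizeof(spec));
+  spec.max_luma_sample_rate = 62208000;  // 1920 * 1080 * 30
+  spec.max_luma_picture_size = 2073600;  // 1920 * 1080
+  spec.max_luma_picture_breadth = 1920;
+  spec.average_bitrate = 8000;         // under the LEVEL_4 cap of 18000
+  spec.max_cpb_size = 12000;           // under the LEVEL_4 cap of 16000
+  spec.compression_ratio = 10;         // above the LEVEL_4 minimum of 4
+  spec.max_col_tiles = 2;
+  spec.min_altref_distance = INT_MAX;  // matches init_level_info()
+  spec.max_ref_frame_buffers = 3;
+  return vp9_get_level(&spec);
+}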
+
+vpx_codec_err_t vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map,
+ unsigned int rows, unsigned int cols,
+ int delta_q[8], int delta_lf[8], int skip[8],
+ int ref_frame[8]) {
+ VP9_COMMON *cm = &cpi->common;
+ vpx_roi_map_t *roi = &cpi->roi;
+ const int range = 63;
+ const int ref_frame_range = 3; // Alt-ref
+ const int skip_range = 1;
+ const int frame_rows = cpi->common.mi_rows;
+ const int frame_cols = cpi->common.mi_cols;
+
+  // Check that the number of rows and columns match.
+ if (frame_rows != (int)rows || frame_cols != (int)cols) {
+ return VPX_CODEC_INVALID_PARAM;
+ }
+
+ if (!check_seg_range(delta_q, range) || !check_seg_range(delta_lf, range) ||
+ !check_seg_range(ref_frame, ref_frame_range) ||
+ !check_seg_range(skip, skip_range))
+ return VPX_CODEC_INVALID_PARAM;
+
+ // Also disable segmentation if no deltas are specified.
+ if (!map ||
+ (!(delta_q[0] | delta_q[1] | delta_q[2] | delta_q[3] | delta_q[4] |
+ delta_q[5] | delta_q[6] | delta_q[7] | delta_lf[0] | delta_lf[1] |
+ delta_lf[2] | delta_lf[3] | delta_lf[4] | delta_lf[5] | delta_lf[6] |
+ delta_lf[7] | skip[0] | skip[1] | skip[2] | skip[3] | skip[4] |
+ skip[5] | skip[6] | skip[7]) &&
+ (ref_frame[0] == -1 && ref_frame[1] == -1 && ref_frame[2] == -1 &&
+ ref_frame[3] == -1 && ref_frame[4] == -1 && ref_frame[5] == -1 &&
+ ref_frame[6] == -1 && ref_frame[7] == -1))) {
+ vp9_disable_segmentation(&cm->seg);
+ cpi->roi.enabled = 0;
+ return VPX_CODEC_OK;
+ }
+
+ if (roi->roi_map) {
+ vpx_free(roi->roi_map);
+ roi->roi_map = NULL;
+ }
+ roi->roi_map = vpx_malloc(rows * cols);
+ if (!roi->roi_map) return VPX_CODEC_MEM_ERROR;
+
+ // Copy to ROI structure in the compressor.
+ memcpy(roi->roi_map, map, rows * cols);
+ memcpy(&roi->delta_q, delta_q, MAX_SEGMENTS * sizeof(delta_q[0]));
+ memcpy(&roi->delta_lf, delta_lf, MAX_SEGMENTS * sizeof(delta_lf[0]));
+ memcpy(&roi->skip, skip, MAX_SEGMENTS * sizeof(skip[0]));
+ memcpy(&roi->ref_frame, ref_frame, MAX_SEGMENTS * sizeof(ref_frame[0]));
+ roi->enabled = 1;
+ roi->rows = rows;
+ roi->cols = cols;
+
+ return VPX_CODEC_OK;
+}
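+
+// Illustrative caller-side sketch, added for exposition only (the helper name
+// is hypothetical and nothing in this change calls it): the map holds one
+// segment id per 8x8 mi unit, and segment 1 is given a quantizer drop of 10
+// with no loop-filter, skip or reference-frame overrides.
+static INLINE vpx_codec_err_t example_set_roi(VP9_COMP *cpi,
+                                              unsigned char *mi_map) {
+  int delta_q[8] = { 0, -10, 0, 0, 0, 0, 0, 0 };
+  int delta_lf[8] = { 0 };
+  int skip[8] = { 0 };
+  int ref_frame[8] = { -1, -1, -1, -1, -1, -1, -1, -1 };
+  // mi_map is assumed to be pre-filled with segment ids (0 or 1) by the
+  // caller, sized cpi->common.mi_rows * cpi->common.mi_cols.
+  return vp9_set_roi_map(cpi, mi_map, cpi->common.mi_rows,
+                         cpi->common.mi_cols, delta_q, delta_lf, skip,
+                         ref_frame);
+}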
+
+int vp9_set_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows,
+ int cols) {
+ if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols) {
+ unsigned char *const active_map_8x8 = cpi->active_map.map;
+ const int mi_rows = cpi->common.mi_rows;
+ const int mi_cols = cpi->common.mi_cols;
+ cpi->active_map.update = 1;
+ if (new_map_16x16) {
+ int r, c;
+ for (r = 0; r < mi_rows; ++r) {
+ for (c = 0; c < mi_cols; ++c) {
+ active_map_8x8[r * mi_cols + c] =
+ new_map_16x16[(r >> 1) * cols + (c >> 1)]
+ ? AM_SEGMENT_ID_ACTIVE
+ : AM_SEGMENT_ID_INACTIVE;
+ }
+ }
+ cpi->active_map.enabled = 1;
+ } else {
+ cpi->active_map.enabled = 0;
+ }
+ return 0;
+ } else {
+ return -1;
+ }
+}
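+
+// Illustrative caller-side sketch, added for exposition only (the helper name
+// is hypothetical and nothing in this change calls it): the caller passes one
+// byte per 16x16 macroblock, non-zero meaning active; passing NULL for the
+// map instead disables the active map so the whole frame is coded as active.
+static INLINE int example_mark_all_active(VP9_COMP *cpi,
+                                          unsigned char *map_16x16) {
+  memset(map_16x16, 1, cpi->common.mb_rows * cpi->common.mb_cols);
+  return vp9_set_active_map(cpi, map_16x16, cpi->common.mb_rows,
+                            cpi->common.mb_cols);
+}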
+
+int vp9_get_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows,
+ int cols) {
+ if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols &&
+ new_map_16x16) {
+ unsigned char *const seg_map_8x8 = cpi->segmentation_map;
+ const int mi_rows = cpi->common.mi_rows;
+ const int mi_cols = cpi->common.mi_cols;
+ memset(new_map_16x16, !cpi->active_map.enabled, rows * cols);
+ if (cpi->active_map.enabled) {
+ int r, c;
+ for (r = 0; r < mi_rows; ++r) {
+ for (c = 0; c < mi_cols; ++c) {
+ // Cyclic refresh segments are considered active despite not having
+ // AM_SEGMENT_ID_ACTIVE
+ new_map_16x16[(r >> 1) * cols + (c >> 1)] |=
+ seg_map_8x8[r * mi_cols + c] != AM_SEGMENT_ID_INACTIVE;
+ }
+ }
+ }
+ return 0;
+ } else {
+ return -1;
+ }
+}
+
+void vp9_set_high_precision_mv(VP9_COMP *cpi, int allow_high_precision_mv) {
+ MACROBLOCK *const mb = &cpi->td.mb;
+ cpi->common.allow_high_precision_mv = allow_high_precision_mv;
+ if (cpi->common.allow_high_precision_mv) {
+ mb->mvcost = mb->nmvcost_hp;
+ mb->mvsadcost = mb->nmvsadcost_hp;
+ } else {
+ mb->mvcost = mb->nmvcost;
+ mb->mvsadcost = mb->nmvsadcost;
+ }
+}
+
+static void setup_frame(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ // Set up entropy context depending on frame type. The decoder mandates
+ // the use of the default context, index 0, for keyframes and inter
+ // frames where the error_resilient_mode or intra_only flag is set. For
+ // other inter-frames the encoder currently uses only two contexts;
+ // context 1 for ALTREF frames and context 0 for the others.
+ if (frame_is_intra_only(cm) || cm->error_resilient_mode) {
+ vp9_setup_past_independence(cm);
+ } else {
+ if (!cpi->use_svc) cm->frame_context_idx = cpi->refresh_alt_ref_frame;
+ }
+
+  // TODO(jingning): Overwrite the frame_context_idx index in the multi-layer
+  // ARF case. Further investigation is needed on whether this could be
+  // applied to the single-layer ARF case as well.
+ if (cpi->multi_layer_arf && !cpi->use_svc) {
+ GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+ const int gf_group_index = gf_group->index;
+ const int boost_frame =
+ !cpi->rc.is_src_frame_alt_ref &&
+ (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame);
+
+ // frame_context_idx Frame Type
+ // 0 Intra only frame, base layer ARF
+ // 1 ARFs with layer depth = 2,3
+ // 2 ARFs with layer depth > 3
+ // 3 Non-boosted frames
+ if (frame_is_intra_only(cm)) {
+ cm->frame_context_idx = 0;
+ } else if (boost_frame) {
+ if (gf_group->rf_level[gf_group_index] == GF_ARF_STD)
+ cm->frame_context_idx = 0;
+ else if (gf_group->layer_depth[gf_group_index] <= 3)
+ cm->frame_context_idx = 1;
+ else
+ cm->frame_context_idx = 2;
+ } else {
+ cm->frame_context_idx = 3;
+ }
+ }
+
+ if (cm->frame_type == KEY_FRAME) {
+ cpi->refresh_golden_frame = 1;
+ cpi->refresh_alt_ref_frame = 1;
+ vp9_zero(cpi->interp_filter_selected);
+ } else {
+ *cm->fc = cm->frame_contexts[cm->frame_context_idx];
+ vp9_zero(cpi->interp_filter_selected[0]);
+ }
+}
+
+static void vp9_enc_setup_mi(VP9_COMMON *cm) {
+ int i;
+ cm->mi = cm->mip + cm->mi_stride + 1;
+ memset(cm->mip, 0, cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mip));
+ cm->prev_mi = cm->prev_mip + cm->mi_stride + 1;
+ // Clear top border row
+ memset(cm->prev_mip, 0, sizeof(*cm->prev_mip) * cm->mi_stride);
+ // Clear left border column
+ for (i = 1; i < cm->mi_rows + 1; ++i)
+ memset(&cm->prev_mip[i * cm->mi_stride], 0, sizeof(*cm->prev_mip));
+
+ cm->mi_grid_visible = cm->mi_grid_base + cm->mi_stride + 1;
+ cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mi_stride + 1;
+
+ memset(cm->mi_grid_base, 0,
+ cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mi_grid_base));
+}
+
+static int vp9_enc_alloc_mi(VP9_COMMON *cm, int mi_size) {
+ cm->mip = vpx_calloc(mi_size, sizeof(*cm->mip));
+ if (!cm->mip) return 1;
+ cm->prev_mip = vpx_calloc(mi_size, sizeof(*cm->prev_mip));
+ if (!cm->prev_mip) return 1;
+ cm->mi_alloc_size = mi_size;
+
+ cm->mi_grid_base = (MODE_INFO **)vpx_calloc(mi_size, sizeof(MODE_INFO *));
+ if (!cm->mi_grid_base) return 1;
+ cm->prev_mi_grid_base =
+ (MODE_INFO **)vpx_calloc(mi_size, sizeof(MODE_INFO *));
+ if (!cm->prev_mi_grid_base) return 1;
+
+ return 0;
+}
+
+static void vp9_enc_free_mi(VP9_COMMON *cm) {
+ vpx_free(cm->mip);
+ cm->mip = NULL;
+ vpx_free(cm->prev_mip);
+ cm->prev_mip = NULL;
+ vpx_free(cm->mi_grid_base);
+ cm->mi_grid_base = NULL;
+ vpx_free(cm->prev_mi_grid_base);
+ cm->prev_mi_grid_base = NULL;
+ cm->mi_alloc_size = 0;
+}
+
+static void vp9_swap_mi_and_prev_mi(VP9_COMMON *cm) {
+ // Current mip will be the prev_mip for the next frame.
+ MODE_INFO **temp_base = cm->prev_mi_grid_base;
+ MODE_INFO *temp = cm->prev_mip;
+
+  // Skip updating prev_mi in show_existing_frame mode.
+ if (cm->show_existing_frame) return;
+
+ cm->prev_mip = cm->mip;
+ cm->mip = temp;
+
+ // Update the upper left visible macroblock ptrs.
+ cm->mi = cm->mip + cm->mi_stride + 1;
+ cm->prev_mi = cm->prev_mip + cm->mi_stride + 1;
+
+ cm->prev_mi_grid_base = cm->mi_grid_base;
+ cm->mi_grid_base = temp_base;
+ cm->mi_grid_visible = cm->mi_grid_base + cm->mi_stride + 1;
+ cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mi_stride + 1;
+}
+
+static void initialize_enc(void) {
+ vp9_rtcd();
+ vpx_dsp_rtcd();
+ vpx_scale_rtcd();
+ vp9_init_intra_predictors();
+ vp9_init_me_luts();
+ vp9_rc_init_minq_luts();
+ vp9_entropy_mv_init();
+#if !CONFIG_REALTIME_ONLY
+ vp9_temporal_filter_init();
+#endif
+}
+
+void vp9_initialize_enc(void) { once(initialize_enc); }
+
+static void dealloc_compressor_data(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ int i;
+
+ vpx_free(cpi->mbmi_ext_base);
+ cpi->mbmi_ext_base = NULL;
+
+ vpx_free(cpi->tile_data);
+ cpi->tile_data = NULL;
+
+ vpx_free(cpi->segmentation_map);
+ cpi->segmentation_map = NULL;
+ vpx_free(cpi->coding_context.last_frame_seg_map_copy);
+ cpi->coding_context.last_frame_seg_map_copy = NULL;
+
+ vpx_free(cpi->nmvcosts[0]);
+ vpx_free(cpi->nmvcosts[1]);
+ cpi->nmvcosts[0] = NULL;
+ cpi->nmvcosts[1] = NULL;
+
+ vpx_free(cpi->nmvcosts_hp[0]);
+ vpx_free(cpi->nmvcosts_hp[1]);
+ cpi->nmvcosts_hp[0] = NULL;
+ cpi->nmvcosts_hp[1] = NULL;
+
+ vpx_free(cpi->nmvsadcosts[0]);
+ vpx_free(cpi->nmvsadcosts[1]);
+ cpi->nmvsadcosts[0] = NULL;
+ cpi->nmvsadcosts[1] = NULL;
+
+ vpx_free(cpi->nmvsadcosts_hp[0]);
+ vpx_free(cpi->nmvsadcosts_hp[1]);
+ cpi->nmvsadcosts_hp[0] = NULL;
+ cpi->nmvsadcosts_hp[1] = NULL;
+
+ vpx_free(cpi->skin_map);
+ cpi->skin_map = NULL;
+
+ vpx_free(cpi->prev_partition);
+ cpi->prev_partition = NULL;
+
+ vpx_free(cpi->svc.prev_partition_svc);
+ cpi->svc.prev_partition_svc = NULL;
+
+ vpx_free(cpi->prev_segment_id);
+ cpi->prev_segment_id = NULL;
+
+ vpx_free(cpi->prev_variance_low);
+ cpi->prev_variance_low = NULL;
+
+ vpx_free(cpi->copied_frame_cnt);
+ cpi->copied_frame_cnt = NULL;
+
+ vpx_free(cpi->content_state_sb_fd);
+ cpi->content_state_sb_fd = NULL;
+
+ vpx_free(cpi->count_arf_frame_usage);
+ cpi->count_arf_frame_usage = NULL;
+ vpx_free(cpi->count_lastgolden_frame_usage);
+ cpi->count_lastgolden_frame_usage = NULL;
+
+ vp9_cyclic_refresh_free(cpi->cyclic_refresh);
+ cpi->cyclic_refresh = NULL;
+
+ vpx_free(cpi->active_map.map);
+ cpi->active_map.map = NULL;
+
+ vpx_free(cpi->roi.roi_map);
+ cpi->roi.roi_map = NULL;
+
+ vpx_free(cpi->consec_zero_mv);
+ cpi->consec_zero_mv = NULL;
+
+ vpx_free(cpi->mb_wiener_variance);
+ cpi->mb_wiener_variance = NULL;
+
+ vpx_free(cpi->mi_ssim_rdmult_scaling_factors);
+ cpi->mi_ssim_rdmult_scaling_factors = NULL;
+
+#if CONFIG_RATE_CTRL
+ if (cpi->oxcf.use_simple_encode_api) {
+ free_partition_info(cpi);
+ free_motion_vector_info(cpi);
+ free_fp_motion_vector_info(cpi);
+ free_tpl_stats_info(cpi);
+ }
+#endif
+
+ vp9_free_ref_frame_buffers(cm->buffer_pool);
+#if CONFIG_VP9_POSTPROC
+ vp9_free_postproc_buffers(cm);
+#endif
+ vp9_free_context_buffers(cm);
+
+ vpx_free_frame_buffer(&cpi->last_frame_uf);
+ vpx_free_frame_buffer(&cpi->scaled_source);
+ vpx_free_frame_buffer(&cpi->scaled_last_source);
+ vpx_free_frame_buffer(&cpi->alt_ref_buffer);
+#ifdef ENABLE_KF_DENOISE
+ vpx_free_frame_buffer(&cpi->raw_unscaled_source);
+ vpx_free_frame_buffer(&cpi->raw_scaled_source);
+#endif
+
+ vp9_lookahead_destroy(cpi->lookahead);
+
+ vpx_free(cpi->tile_tok[0][0]);
+ cpi->tile_tok[0][0] = 0;
+
+ vpx_free(cpi->tplist[0][0]);
+ cpi->tplist[0][0] = NULL;
+
+ vp9_free_pc_tree(&cpi->td);
+
+ for (i = 0; i < cpi->svc.number_spatial_layers; ++i) {
+ LAYER_CONTEXT *const lc = &cpi->svc.layer_context[i];
+ vpx_free(lc->rc_twopass_stats_in.buf);
+ lc->rc_twopass_stats_in.buf = NULL;
+ lc->rc_twopass_stats_in.sz = 0;
+ }
+
+ if (cpi->source_diff_var != NULL) {
+ vpx_free(cpi->source_diff_var);
+ cpi->source_diff_var = NULL;
+ }
+
+ for (i = 0; i < MAX_LAG_BUFFERS; ++i) {
+ vpx_free_frame_buffer(&cpi->svc.scaled_frames[i]);
+ }
+ memset(&cpi->svc.scaled_frames[0], 0,
+ MAX_LAG_BUFFERS * sizeof(cpi->svc.scaled_frames[0]));
+
+ vpx_free_frame_buffer(&cpi->svc.scaled_temp);
+ memset(&cpi->svc.scaled_temp, 0, sizeof(cpi->svc.scaled_temp));
+
+ vpx_free_frame_buffer(&cpi->svc.empty_frame.img);
+ memset(&cpi->svc.empty_frame, 0, sizeof(cpi->svc.empty_frame));
+
+ vp9_free_svc_cyclic_refresh(cpi);
+}
+
+static void save_coding_context(VP9_COMP *cpi) {
+ CODING_CONTEXT *const cc = &cpi->coding_context;
+ VP9_COMMON *cm = &cpi->common;
+
+ // Stores a snapshot of key state variables which can subsequently be
+ // restored with a call to vp9_restore_coding_context. These functions are
+ // intended for use in a re-code loop in vp9_compress_frame where the
+ // quantizer value is adjusted between loop iterations.
+ vp9_copy(cc->nmvjointcost, cpi->td.mb.nmvjointcost);
+
+ memcpy(cc->nmvcosts[0], cpi->nmvcosts[0],
+ MV_VALS * sizeof(*cpi->nmvcosts[0]));
+ memcpy(cc->nmvcosts[1], cpi->nmvcosts[1],
+ MV_VALS * sizeof(*cpi->nmvcosts[1]));
+ memcpy(cc->nmvcosts_hp[0], cpi->nmvcosts_hp[0],
+ MV_VALS * sizeof(*cpi->nmvcosts_hp[0]));
+ memcpy(cc->nmvcosts_hp[1], cpi->nmvcosts_hp[1],
+ MV_VALS * sizeof(*cpi->nmvcosts_hp[1]));
+
+ vp9_copy(cc->segment_pred_probs, cm->seg.pred_probs);
+
+ memcpy(cpi->coding_context.last_frame_seg_map_copy, cm->last_frame_seg_map,
+ (cm->mi_rows * cm->mi_cols));
+
+ vp9_copy(cc->last_ref_lf_deltas, cm->lf.last_ref_deltas);
+ vp9_copy(cc->last_mode_lf_deltas, cm->lf.last_mode_deltas);
+
+ cc->fc = *cm->fc;
+}
+
+static void restore_coding_context(VP9_COMP *cpi) {
+ CODING_CONTEXT *const cc = &cpi->coding_context;
+ VP9_COMMON *cm = &cpi->common;
+
+ // Restore key state variables to the snapshot state stored in the
+ // previous call to vp9_save_coding_context.
+ vp9_copy(cpi->td.mb.nmvjointcost, cc->nmvjointcost);
+
+ memcpy(cpi->nmvcosts[0], cc->nmvcosts[0], MV_VALS * sizeof(*cc->nmvcosts[0]));
+ memcpy(cpi->nmvcosts[1], cc->nmvcosts[1], MV_VALS * sizeof(*cc->nmvcosts[1]));
+ memcpy(cpi->nmvcosts_hp[0], cc->nmvcosts_hp[0],
+ MV_VALS * sizeof(*cc->nmvcosts_hp[0]));
+ memcpy(cpi->nmvcosts_hp[1], cc->nmvcosts_hp[1],
+ MV_VALS * sizeof(*cc->nmvcosts_hp[1]));
+
+ vp9_copy(cm->seg.pred_probs, cc->segment_pred_probs);
+
+ memcpy(cm->last_frame_seg_map, cpi->coding_context.last_frame_seg_map_copy,
+ (cm->mi_rows * cm->mi_cols));
+
+ vp9_copy(cm->lf.last_ref_deltas, cc->last_ref_lf_deltas);
+ vp9_copy(cm->lf.last_mode_deltas, cc->last_mode_lf_deltas);
+
+ *cm->fc = cc->fc;
+}
+
+#if !CONFIG_REALTIME_ONLY
+static void configure_static_seg_features(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ struct segmentation *const seg = &cm->seg;
+
+ int high_q = (int)(rc->avg_q > 48.0);
+ int qi_delta;
+
+ // Disable and clear down for KF
+ if (cm->frame_type == KEY_FRAME) {
+ // Clear down the global segmentation map
+ memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
+ seg->update_map = 0;
+ seg->update_data = 0;
+ cpi->static_mb_pct = 0;
+
+ // Disable segmentation
+ vp9_disable_segmentation(seg);
+
+ // Clear down the segment features.
+ vp9_clearall_segfeatures(seg);
+ } else if (cpi->refresh_alt_ref_frame) {
+ // If this is an alt ref frame
+ // Clear down the global segmentation map
+ memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
+ seg->update_map = 0;
+ seg->update_data = 0;
+ cpi->static_mb_pct = 0;
+
+ // Disable segmentation and individual segment features by default
+ vp9_disable_segmentation(seg);
+ vp9_clearall_segfeatures(seg);
+
+ // Scan frames from current to arf frame.
+ // This function re-enables segmentation if appropriate.
+ vp9_update_mbgraph_stats(cpi);
+
+ // If segmentation was enabled set those features needed for the
+ // arf itself.
+ if (seg->enabled) {
+ seg->update_map = 1;
+ seg->update_data = 1;
+
+ qi_delta =
+ vp9_compute_qdelta(rc, rc->avg_q, rc->avg_q * 0.875, cm->bit_depth);
+ vp9_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta - 2);
+ vp9_set_segdata(seg, 1, SEG_LVL_ALT_LF, -2);
+
+ vp9_enable_segfeature(seg, 1, SEG_LVL_ALT_Q);
+ vp9_enable_segfeature(seg, 1, SEG_LVL_ALT_LF);
+
+ // Where relevant assume segment data is delta data
+ seg->abs_delta = SEGMENT_DELTADATA;
+ }
+ } else if (seg->enabled) {
+ // All other frames if segmentation has been enabled
+
+ // First normal frame in a valid gf or alt ref group
+ if (rc->frames_since_golden == 0) {
+ // Set up segment features for normal frames in an arf group
+ if (rc->source_alt_ref_active) {
+ seg->update_map = 0;
+ seg->update_data = 1;
+ seg->abs_delta = SEGMENT_DELTADATA;
+
+ qi_delta =
+ vp9_compute_qdelta(rc, rc->avg_q, rc->avg_q * 1.125, cm->bit_depth);
+ vp9_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta + 2);
+ vp9_enable_segfeature(seg, 1, SEG_LVL_ALT_Q);
+
+ vp9_set_segdata(seg, 1, SEG_LVL_ALT_LF, -2);
+ vp9_enable_segfeature(seg, 1, SEG_LVL_ALT_LF);
+
+ // Segment coding disabled for compred testing
+ if (high_q || (cpi->static_mb_pct == 100)) {
+ vp9_set_segdata(seg, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME);
+ vp9_enable_segfeature(seg, 1, SEG_LVL_REF_FRAME);
+ vp9_enable_segfeature(seg, 1, SEG_LVL_SKIP);
+ }
+ } else {
+ // Disable segmentation and clear down features if alt ref
+ // is not active for this group
+
+ vp9_disable_segmentation(seg);
+
+ memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
+
+ seg->update_map = 0;
+ seg->update_data = 0;
+
+ vp9_clearall_segfeatures(seg);
+ }
+ } else if (rc->is_src_frame_alt_ref) {
+ // Special case where we are coding over the top of a previous
+ // alt ref frame.
+ // Segment coding disabled for compred testing
+
+ // Enable ref frame features for segment 0 as well
+ vp9_enable_segfeature(seg, 0, SEG_LVL_REF_FRAME);
+ vp9_enable_segfeature(seg, 1, SEG_LVL_REF_FRAME);
+
+ // All mbs should use ALTREF_FRAME
+ vp9_clear_segdata(seg, 0, SEG_LVL_REF_FRAME);
+ vp9_set_segdata(seg, 0, SEG_LVL_REF_FRAME, ALTREF_FRAME);
+ vp9_clear_segdata(seg, 1, SEG_LVL_REF_FRAME);
+ vp9_set_segdata(seg, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME);
+
+ // Skip all MBs if high Q (0,0 mv and skip coeffs)
+ if (high_q) {
+ vp9_enable_segfeature(seg, 0, SEG_LVL_SKIP);
+ vp9_enable_segfeature(seg, 1, SEG_LVL_SKIP);
+ }
+ // Enable data update
+ seg->update_data = 1;
+ } else {
+ // All other frames.
+
+      // No updates. Leave things as they are.
+ seg->update_map = 0;
+ seg->update_data = 0;
+ }
+ }
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+static void update_reference_segmentation_map(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ MODE_INFO **mi_8x8_ptr = cm->mi_grid_visible;
+ uint8_t *cache_ptr = cm->last_frame_seg_map;
+ int row, col;
+
+ for (row = 0; row < cm->mi_rows; row++) {
+ MODE_INFO **mi_8x8 = mi_8x8_ptr;
+ uint8_t *cache = cache_ptr;
+ for (col = 0; col < cm->mi_cols; col++, mi_8x8++, cache++)
+ cache[0] = mi_8x8[0]->segment_id;
+ mi_8x8_ptr += cm->mi_stride;
+ cache_ptr += cm->mi_cols;
+ }
+}
+
+static void alloc_raw_frame_buffers(VP9_COMP *cpi) {
+ VP9_COMMON *cm = &cpi->common;
+ const VP9EncoderConfig *oxcf = &cpi->oxcf;
+
+ if (!cpi->lookahead)
+ cpi->lookahead = vp9_lookahead_init(oxcf->width, oxcf->height,
+ cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
+ oxcf->lag_in_frames);
+ if (!cpi->lookahead)
+ vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate lag buffers");
+
+ // TODO(agrange) Check if ARF is enabled and skip allocation if not.
+ if (vpx_realloc_frame_buffer(&cpi->alt_ref_buffer, oxcf->width, oxcf->height,
+ cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
+ VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+ NULL, NULL, NULL))
+ vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate altref buffer");
+}
+
+static void alloc_util_frame_buffers(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ if (vpx_realloc_frame_buffer(&cpi->last_frame_uf, cm->width, cm->height,
+ cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
+ VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+ NULL, NULL, NULL))
+ vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate last frame buffer");
+
+ if (vpx_realloc_frame_buffer(&cpi->scaled_source, cm->width, cm->height,
+ cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
+ VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+ NULL, NULL, NULL))
+ vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate scaled source buffer");
+
+  // For 1-pass CBR: allocate svc.scaled_temp, which may be used as an
+  // intermediate buffer for a two-stage down-sampling: two stages of 1:2
+  // down-sampling for a target of 1/4x1/4. number_spatial_layers must be
+  // greater than 2.
+ if (is_one_pass_svc(cpi) && !cpi->svc.scaled_temp_is_alloc &&
+ cpi->svc.number_spatial_layers > 2) {
+ cpi->svc.scaled_temp_is_alloc = 1;
+ if (vpx_realloc_frame_buffer(
+ &cpi->svc.scaled_temp, cm->width >> 1, cm->height >> 1,
+ cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
+ VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment, NULL, NULL, NULL))
+ vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate scaled_frame for svc ");
+ }
+
+ if (vpx_realloc_frame_buffer(&cpi->scaled_last_source, cm->width, cm->height,
+ cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
+ VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+ NULL, NULL, NULL))
+ vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate scaled last source buffer");
+#ifdef ENABLE_KF_DENOISE
+ if (vpx_realloc_frame_buffer(&cpi->raw_unscaled_source, cm->width, cm->height,
+ cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
+ VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+ NULL, NULL, NULL))
+ vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate unscaled raw source frame buffer");
+
+ if (vpx_realloc_frame_buffer(&cpi->raw_scaled_source, cm->width, cm->height,
+ cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
+ VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+ NULL, NULL, NULL))
+ vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate scaled raw source frame buffer");
+#endif
+}
+
+static void alloc_context_buffers_ext(VP9_COMP *cpi) {
+ VP9_COMMON *cm = &cpi->common;
+ int mi_size = cm->mi_cols * cm->mi_rows;
+
+ CHECK_MEM_ERROR(&cm->error, cpi->mbmi_ext_base,
+ vpx_calloc(mi_size, sizeof(*cpi->mbmi_ext_base)));
+}
+
+static void alloc_compressor_data(VP9_COMP *cpi) {
+ VP9_COMMON *cm = &cpi->common;
+ int sb_rows;
+
+ if (vp9_alloc_context_buffers(cm, cm->width, cm->height)) {
+ vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate context buffers");
+ }
+
+ alloc_context_buffers_ext(cpi);
+
+ vpx_free(cpi->tile_tok[0][0]);
+
+ {
+ unsigned int tokens = get_token_alloc(cm->mb_rows, cm->mb_cols);
+ CHECK_MEM_ERROR(&cm->error, cpi->tile_tok[0][0],
+ vpx_calloc(tokens, sizeof(*cpi->tile_tok[0][0])));
+ }
+
+ sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
+ vpx_free(cpi->tplist[0][0]);
+ CHECK_MEM_ERROR(
+ &cm->error, cpi->tplist[0][0],
+ vpx_calloc(sb_rows * 4 * (1 << 6), sizeof(*cpi->tplist[0][0])));
+
+ vp9_setup_pc_tree(&cpi->common, &cpi->td);
+}
+
+void vp9_new_framerate(VP9_COMP *cpi, double framerate) {
+ cpi->framerate = framerate < 0.1 ? 30 : framerate;
+ vp9_rc_update_framerate(cpi);
+}
+
+static void set_tile_limits(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+
+ int min_log2_tile_cols, max_log2_tile_cols;
+ vp9_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols);
+
+ cm->log2_tile_cols =
+ clamp(cpi->oxcf.tile_columns, min_log2_tile_cols, max_log2_tile_cols);
+ cm->log2_tile_rows = cpi->oxcf.tile_rows;
+
+ if (cpi->oxcf.target_level == LEVEL_AUTO) {
+ const int level_tile_cols =
+ log_tile_cols_from_picsize_level(cpi->common.width, cpi->common.height);
+ if (cm->log2_tile_cols > level_tile_cols) {
+ cm->log2_tile_cols = VPXMAX(level_tile_cols, min_log2_tile_cols);
+ }
+ }
+}
+
+static void update_frame_size(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+
+ vp9_set_mb_mi(cm, cm->width, cm->height);
+ vp9_init_context_buffers(cm);
+ vp9_init_macroblockd(cm, xd, NULL);
+ cpi->td.mb.mbmi_ext_base = cpi->mbmi_ext_base;
+ memset(cpi->mbmi_ext_base, 0,
+ cm->mi_rows * cm->mi_cols * sizeof(*cpi->mbmi_ext_base));
+
+ set_tile_limits(cpi);
+}
+
+static void init_buffer_indices(VP9_COMP *cpi) {
+ int ref_frame;
+
+ for (ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame)
+ cpi->ref_fb_idx[ref_frame] = ref_frame;
+
+ cpi->lst_fb_idx = cpi->ref_fb_idx[LAST_FRAME - 1];
+ cpi->gld_fb_idx = cpi->ref_fb_idx[GOLDEN_FRAME - 1];
+ cpi->alt_fb_idx = cpi->ref_fb_idx[ALTREF_FRAME - 1];
+}
+
+static void init_level_constraint(LevelConstraint *lc) {
+ lc->level_index = -1;
+ lc->max_cpb_size = INT_MAX;
+ lc->max_frame_size = INT_MAX;
+ lc->fail_flag = 0;
+}
+
+static void set_level_constraint(LevelConstraint *ls, int8_t level_index) {
+ vpx_clear_system_state();
+ ls->level_index = level_index;
+ if (level_index >= 0) {
+ ls->max_cpb_size = vp9_level_defs[level_index].max_cpb_size * (double)1000;
+ }
+}
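+
+// For exposition (not part of this change): max_cpb_size in vp9_level_defs is
+// given in kilobits, so the * 1000 above converts it to bits; e.g. the
+// LEVEL_4 entry of 16000 becomes a 16000000-bit CPB constraint.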
+
+static void init_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
+ VP9_COMMON *const cm = &cpi->common;
+
+ cpi->oxcf = *oxcf;
+ cpi->framerate = oxcf->init_framerate;
+ cm->profile = oxcf->profile;
+ cm->bit_depth = oxcf->bit_depth;
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth = oxcf->use_highbitdepth;
+#endif
+ cm->color_space = oxcf->color_space;
+ cm->color_range = oxcf->color_range;
+
+ cpi->target_level = oxcf->target_level;
+ cpi->keep_level_stats = oxcf->target_level != LEVEL_MAX;
+ set_level_constraint(&cpi->level_constraint,
+ get_level_index(cpi->target_level));
+
+ cm->width = oxcf->width;
+ cm->height = oxcf->height;
+ alloc_compressor_data(cpi);
+
+ cpi->svc.temporal_layering_mode = oxcf->temporal_layering_mode;
+
+ // Single thread case: use counts in common.
+ cpi->td.counts = &cm->counts;
+
+ // Spatial scalability.
+ cpi->svc.number_spatial_layers = oxcf->ss_number_layers;
+ // Temporal scalability.
+ cpi->svc.number_temporal_layers = oxcf->ts_number_layers;
+
+ if ((cpi->svc.number_temporal_layers > 1) ||
+ ((cpi->svc.number_temporal_layers > 1 ||
+ cpi->svc.number_spatial_layers > 1) &&
+ cpi->oxcf.pass != 1)) {
+ vp9_init_layer_context(cpi);
+ }
+
+  // vp9_change_config() includes all joint functionality.
+ vp9_change_config(cpi, oxcf);
+
+ cpi->static_mb_pct = 0;
+ cpi->ref_frame_flags = 0;
+
+ init_buffer_indices(cpi);
+
+ vp9_noise_estimate_init(&cpi->noise_estimate, cm->width, cm->height);
+ cpi->fixed_qp_onepass = 0;
+}
+
+void vp9_check_reset_rc_flag(VP9_COMP *cpi) {
+ RATE_CONTROL *rc = &cpi->rc;
+
+ if (cpi->common.current_video_frame >
+ (unsigned int)cpi->svc.number_spatial_layers) {
+ if (cpi->use_svc) {
+ vp9_svc_check_reset_layer_rc_flag(cpi);
+ } else {
+ if (rc->avg_frame_bandwidth > (3 * rc->last_avg_frame_bandwidth >> 1) ||
+ rc->avg_frame_bandwidth < (rc->last_avg_frame_bandwidth >> 1)) {
+ rc->rc_1_frame = 0;
+ rc->rc_2_frame = 0;
+ rc->bits_off_target = rc->optimal_buffer_level;
+ rc->buffer_level = rc->optimal_buffer_level;
+ }
+ }
+ }
+}
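+
+// Worked example (added for exposition, not part of this change): with
+// last_avg_frame_bandwidth = 20000, the reset above triggers when
+// avg_frame_bandwidth exceeds 3 * 20000 >> 1 = 30000 (a 1.5x rise) or drops
+// below 20000 >> 1 = 10000 (a 0.5x fall).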
+
+void vp9_set_rc_buffer_sizes(VP9_COMP *cpi) {
+ RATE_CONTROL *rc = &cpi->rc;
+ const VP9EncoderConfig *oxcf = &cpi->oxcf;
+
+ const int64_t bandwidth = oxcf->target_bandwidth;
+ const int64_t starting = oxcf->starting_buffer_level_ms;
+ const int64_t optimal = oxcf->optimal_buffer_level_ms;
+ const int64_t maximum = oxcf->maximum_buffer_size_ms;
+
+ rc->starting_buffer_level = starting * bandwidth / 1000;
+ rc->optimal_buffer_level =
+ (optimal == 0) ? bandwidth / 8 : optimal * bandwidth / 1000;
+ rc->maximum_buffer_size =
+ (maximum == 0) ? bandwidth / 8 : maximum * bandwidth / 1000;
+
+ // Under a configuration change, where maximum_buffer_size may change,
+ // keep buffer level clipped to the maximum allowed buffer size.
+ rc->bits_off_target = VPXMIN(rc->bits_off_target, rc->maximum_buffer_size);
+ rc->buffer_level = VPXMIN(rc->buffer_level, rc->maximum_buffer_size);
+}
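+
+// Worked example (added for exposition, not part of this change): with
+// target_bandwidth = 1000000 bits/s and starting/optimal/maximum levels of
+// 4000/5000/6000 ms, the buffers become 4000000, 5000000 and 6000000 bits;
+// an optimal or maximum level left at 0 defaults to bandwidth / 8 = 125000
+// bits.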
+
+#if CONFIG_VP9_HIGHBITDEPTH
+#define HIGHBD_BFP(BT, SDF, SDSF, SDAF, VF, SVF, SVAF, SDX4DF, SDSX4DF) \
+ cpi->fn_ptr[BT].sdf = SDF; \
+ cpi->fn_ptr[BT].sdsf = SDSF; \
+ cpi->fn_ptr[BT].sdaf = SDAF; \
+ cpi->fn_ptr[BT].vf = VF; \
+ cpi->fn_ptr[BT].svf = SVF; \
+ cpi->fn_ptr[BT].svaf = SVAF; \
+ cpi->fn_ptr[BT].sdx4df = SDX4DF; \
+ cpi->fn_ptr[BT].sdsx4df = SDSX4DF;
+
+#define MAKE_BFP_SAD_WRAPPER(fnname) \
+ static unsigned int fnname##_bits8(const uint8_t *src_ptr, \
+ int source_stride, \
+ const uint8_t *ref_ptr, int ref_stride) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride); \
+ } \
+ static unsigned int fnname##_bits10( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride) >> 2; \
+ } \
+ static unsigned int fnname##_bits12( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride) >> 4; \
+ }
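+
+// For exposition (not part of this change): invoking
+// MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x16) below produces three thin
+// wrappers; the 10- and 12-bit variants shift the SAD right by 2 and 4 bits
+// so high-bitdepth distortion stays on the same scale as 8-bit input. The
+// 10-bit expansion, for example, is equivalent to:
+//
+//   static unsigned int vpx_highbd_sad32x16_bits10(
+//       const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr,
+//       int ref_stride) {
+//     return vpx_highbd_sad32x16(src_ptr, source_stride, ref_ptr,
+//                                ref_stride) >> 2;
+//   }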
+
+#define MAKE_BFP_SADAVG_WRAPPER(fnname) \
+ static unsigned int fnname##_bits8( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred); \
+ } \
+ static unsigned int fnname##_bits10( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred) >> \
+ 2; \
+ } \
+ static unsigned int fnname##_bits12( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred) >> \
+ 4; \
+ }
+
+#define MAKE_BFP_SAD4D_WRAPPER(fnname) \
+ static void fnname##_bits8(const uint8_t *src_ptr, int source_stride, \
+ const uint8_t *const ref_ptr[], int ref_stride, \
+ unsigned int *sad_array) { \
+ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+ } \
+ static void fnname##_bits10(const uint8_t *src_ptr, int source_stride, \
+ const uint8_t *const ref_ptr[], int ref_stride, \
+ unsigned int *sad_array) { \
+ int i; \
+ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+ for (i = 0; i < 4; i++) sad_array[i] >>= 2; \
+ } \
+ static void fnname##_bits12(const uint8_t *src_ptr, int source_stride, \
+ const uint8_t *const ref_ptr[], int ref_stride, \
+ unsigned int *sad_array) { \
+ int i; \
+ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+ for (i = 0; i < 4; i++) sad_array[i] >>= 4; \
+ }
+
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x16)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_32x16)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x16_avg)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x16x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_32x16x4d)
+
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x32)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_16x32)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x32_avg)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x32x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_16x32x4d)
+
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad64x32)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_64x32)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad64x32_avg)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad64x32x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_64x32x4d)
+
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x64)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_32x64)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x64_avg)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x64x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_32x64x4d)
+
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x32)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_32x32)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x32_avg)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x32x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_32x32x4d)
+
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad64x64)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_64x64)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad64x64_avg)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad64x64x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_64x64x4d)
+
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x16)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_16x16)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x16_avg)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x16x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_16x16x4d)
+
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x8)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_16x8)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x8_avg)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x8x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_16x8x4d)
+
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x16)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_8x16)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x16_avg)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x16x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_8x16x4d)
+
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x8)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_8x8)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x8_avg)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x8x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_8x8x4d)
+
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x4)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_8x4)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x4_avg)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x4x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_8x4x4d)
+
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad4x8)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_4x8)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad4x8_avg)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad4x8x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_4x8x4d)
+
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad4x4)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_4x4)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad4x4_avg)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad4x4x4d)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_4x4x4d)
+
+static void highbd_set_var_fns(VP9_COMP *const cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ if (cm->use_highbitdepth) {
+ switch (cm->bit_depth) {
+ case VPX_BITS_8:
+ HIGHBD_BFP(
+ BLOCK_32X16, vpx_highbd_sad32x16_bits8,
+ vpx_highbd_sad_skip_32x16_bits8, vpx_highbd_sad32x16_avg_bits8,
+ vpx_highbd_8_variance32x16, vpx_highbd_8_sub_pixel_variance32x16,
+ vpx_highbd_8_sub_pixel_avg_variance32x16,
+ vpx_highbd_sad32x16x4d_bits8, vpx_highbd_sad_skip_32x16x4d_bits8)
+
+ HIGHBD_BFP(
+ BLOCK_16X32, vpx_highbd_sad16x32_bits8,
+ vpx_highbd_sad_skip_16x32_bits8, vpx_highbd_sad16x32_avg_bits8,
+ vpx_highbd_8_variance16x32, vpx_highbd_8_sub_pixel_variance16x32,
+ vpx_highbd_8_sub_pixel_avg_variance16x32,
+ vpx_highbd_sad16x32x4d_bits8, vpx_highbd_sad_skip_16x32x4d_bits8)
+
+ HIGHBD_BFP(
+ BLOCK_64X32, vpx_highbd_sad64x32_bits8,
+ vpx_highbd_sad_skip_64x32_bits8, vpx_highbd_sad64x32_avg_bits8,
+ vpx_highbd_8_variance64x32, vpx_highbd_8_sub_pixel_variance64x32,
+ vpx_highbd_8_sub_pixel_avg_variance64x32,
+ vpx_highbd_sad64x32x4d_bits8, vpx_highbd_sad_skip_64x32x4d_bits8)
+
+ HIGHBD_BFP(
+ BLOCK_32X64, vpx_highbd_sad32x64_bits8,
+ vpx_highbd_sad_skip_32x64_bits8, vpx_highbd_sad32x64_avg_bits8,
+ vpx_highbd_8_variance32x64, vpx_highbd_8_sub_pixel_variance32x64,
+ vpx_highbd_8_sub_pixel_avg_variance32x64,
+ vpx_highbd_sad32x64x4d_bits8, vpx_highbd_sad_skip_32x64x4d_bits8)
+
+ HIGHBD_BFP(
+ BLOCK_32X32, vpx_highbd_sad32x32_bits8,
+ vpx_highbd_sad_skip_32x32_bits8, vpx_highbd_sad32x32_avg_bits8,
+ vpx_highbd_8_variance32x32, vpx_highbd_8_sub_pixel_variance32x32,
+ vpx_highbd_8_sub_pixel_avg_variance32x32,
+ vpx_highbd_sad32x32x4d_bits8, vpx_highbd_sad_skip_32x32x4d_bits8)
+
+ HIGHBD_BFP(
+ BLOCK_64X64, vpx_highbd_sad64x64_bits8,
+ vpx_highbd_sad_skip_64x64_bits8, vpx_highbd_sad64x64_avg_bits8,
+ vpx_highbd_8_variance64x64, vpx_highbd_8_sub_pixel_variance64x64,
+ vpx_highbd_8_sub_pixel_avg_variance64x64,
+ vpx_highbd_sad64x64x4d_bits8, vpx_highbd_sad_skip_64x64x4d_bits8)
+
+ HIGHBD_BFP(
+ BLOCK_16X16, vpx_highbd_sad16x16_bits8,
+ vpx_highbd_sad_skip_16x16_bits8, vpx_highbd_sad16x16_avg_bits8,
+ vpx_highbd_8_variance16x16, vpx_highbd_8_sub_pixel_variance16x16,
+ vpx_highbd_8_sub_pixel_avg_variance16x16,
+ vpx_highbd_sad16x16x4d_bits8, vpx_highbd_sad_skip_16x16x4d_bits8)
+
+ HIGHBD_BFP(
+ BLOCK_16X8, vpx_highbd_sad16x8_bits8,
+ vpx_highbd_sad_skip_16x8_bits8, vpx_highbd_sad16x8_avg_bits8,
+ vpx_highbd_8_variance16x8, vpx_highbd_8_sub_pixel_variance16x8,
+ vpx_highbd_8_sub_pixel_avg_variance16x8,
+ vpx_highbd_sad16x8x4d_bits8, vpx_highbd_sad_skip_16x8x4d_bits8)
+
+ HIGHBD_BFP(
+ BLOCK_8X16, vpx_highbd_sad8x16_bits8,
+ vpx_highbd_sad_skip_8x16_bits8, vpx_highbd_sad8x16_avg_bits8,
+ vpx_highbd_8_variance8x16, vpx_highbd_8_sub_pixel_variance8x16,
+ vpx_highbd_8_sub_pixel_avg_variance8x16,
+ vpx_highbd_sad8x16x4d_bits8, vpx_highbd_sad_skip_8x16x4d_bits8)
+
+ HIGHBD_BFP(BLOCK_8X8, vpx_highbd_sad8x8_bits8,
+ vpx_highbd_sad_skip_8x8_bits8, vpx_highbd_sad8x8_avg_bits8,
+ vpx_highbd_8_variance8x8, vpx_highbd_8_sub_pixel_variance8x8,
+ vpx_highbd_8_sub_pixel_avg_variance8x8,
+ vpx_highbd_sad8x8x4d_bits8, vpx_highbd_sad_skip_8x8x4d_bits8)
+
+ HIGHBD_BFP(BLOCK_8X4, vpx_highbd_sad8x4_bits8,
+ vpx_highbd_sad_skip_8x4_bits8, vpx_highbd_sad8x4_avg_bits8,
+ vpx_highbd_8_variance8x4, vpx_highbd_8_sub_pixel_variance8x4,
+ vpx_highbd_8_sub_pixel_avg_variance8x4,
+ vpx_highbd_sad8x4x4d_bits8, vpx_highbd_sad_skip_8x4x4d_bits8)
+
+ HIGHBD_BFP(BLOCK_4X8, vpx_highbd_sad4x8_bits8,
+ vpx_highbd_sad_skip_4x8_bits8, vpx_highbd_sad4x8_avg_bits8,
+ vpx_highbd_8_variance4x8, vpx_highbd_8_sub_pixel_variance4x8,
+ vpx_highbd_8_sub_pixel_avg_variance4x8,
+ vpx_highbd_sad4x8x4d_bits8, vpx_highbd_sad_skip_4x8x4d_bits8)
+
+ HIGHBD_BFP(BLOCK_4X4, vpx_highbd_sad4x4_bits8,
+ vpx_highbd_sad_skip_4x4_bits8, vpx_highbd_sad4x4_avg_bits8,
+ vpx_highbd_8_variance4x4, vpx_highbd_8_sub_pixel_variance4x4,
+ vpx_highbd_8_sub_pixel_avg_variance4x4,
+ vpx_highbd_sad4x4x4d_bits8, vpx_highbd_sad_skip_4x4x4d_bits8)
+ break;
+
+ case VPX_BITS_10:
+ HIGHBD_BFP(
+ BLOCK_32X16, vpx_highbd_sad32x16_bits10,
+ vpx_highbd_sad_skip_32x16_bits10, vpx_highbd_sad32x16_avg_bits10,
+ vpx_highbd_10_variance32x16, vpx_highbd_10_sub_pixel_variance32x16,
+ vpx_highbd_10_sub_pixel_avg_variance32x16,
+ vpx_highbd_sad32x16x4d_bits10, vpx_highbd_sad_skip_32x16x4d_bits10)
+
+ HIGHBD_BFP(
+ BLOCK_16X32, vpx_highbd_sad16x32_bits10,
+ vpx_highbd_sad_skip_16x32_bits10, vpx_highbd_sad16x32_avg_bits10,
+ vpx_highbd_10_variance16x32, vpx_highbd_10_sub_pixel_variance16x32,
+ vpx_highbd_10_sub_pixel_avg_variance16x32,
+ vpx_highbd_sad16x32x4d_bits10, vpx_highbd_sad_skip_16x32x4d_bits10)
+
+ HIGHBD_BFP(
+ BLOCK_64X32, vpx_highbd_sad64x32_bits10,
+ vpx_highbd_sad_skip_64x32_bits10, vpx_highbd_sad64x32_avg_bits10,
+ vpx_highbd_10_variance64x32, vpx_highbd_10_sub_pixel_variance64x32,
+ vpx_highbd_10_sub_pixel_avg_variance64x32,
+ vpx_highbd_sad64x32x4d_bits10, vpx_highbd_sad_skip_64x32x4d_bits10)
+
+ HIGHBD_BFP(
+ BLOCK_32X64, vpx_highbd_sad32x64_bits10,
+ vpx_highbd_sad_skip_32x64_bits10, vpx_highbd_sad32x64_avg_bits10,
+ vpx_highbd_10_variance32x64, vpx_highbd_10_sub_pixel_variance32x64,
+ vpx_highbd_10_sub_pixel_avg_variance32x64,
+ vpx_highbd_sad32x64x4d_bits10, vpx_highbd_sad_skip_32x64x4d_bits10)
+
+ HIGHBD_BFP(
+ BLOCK_32X32, vpx_highbd_sad32x32_bits10,
+ vpx_highbd_sad_skip_32x32_bits10, vpx_highbd_sad32x32_avg_bits10,
+ vpx_highbd_10_variance32x32, vpx_highbd_10_sub_pixel_variance32x32,
+ vpx_highbd_10_sub_pixel_avg_variance32x32,
+ vpx_highbd_sad32x32x4d_bits10, vpx_highbd_sad_skip_32x32x4d_bits10)
+
+ HIGHBD_BFP(
+ BLOCK_64X64, vpx_highbd_sad64x64_bits10,
+ vpx_highbd_sad_skip_64x64_bits10, vpx_highbd_sad64x64_avg_bits10,
+ vpx_highbd_10_variance64x64, vpx_highbd_10_sub_pixel_variance64x64,
+ vpx_highbd_10_sub_pixel_avg_variance64x64,
+ vpx_highbd_sad64x64x4d_bits10, vpx_highbd_sad_skip_64x64x4d_bits10)
+
+ HIGHBD_BFP(
+ BLOCK_16X16, vpx_highbd_sad16x16_bits10,
+ vpx_highbd_sad_skip_16x16_bits10, vpx_highbd_sad16x16_avg_bits10,
+ vpx_highbd_10_variance16x16, vpx_highbd_10_sub_pixel_variance16x16,
+ vpx_highbd_10_sub_pixel_avg_variance16x16,
+ vpx_highbd_sad16x16x4d_bits10, vpx_highbd_sad_skip_16x16x4d_bits10)
+
+ HIGHBD_BFP(
+ BLOCK_16X8, vpx_highbd_sad16x8_bits10,
+ vpx_highbd_sad_skip_16x8_bits10, vpx_highbd_sad16x8_avg_bits10,
+ vpx_highbd_10_variance16x8, vpx_highbd_10_sub_pixel_variance16x8,
+ vpx_highbd_10_sub_pixel_avg_variance16x8,
+ vpx_highbd_sad16x8x4d_bits10, vpx_highbd_sad_skip_16x8x4d_bits10)
+
+ HIGHBD_BFP(
+ BLOCK_8X16, vpx_highbd_sad8x16_bits10,
+ vpx_highbd_sad_skip_8x16_bits10, vpx_highbd_sad8x16_avg_bits10,
+ vpx_highbd_10_variance8x16, vpx_highbd_10_sub_pixel_variance8x16,
+ vpx_highbd_10_sub_pixel_avg_variance8x16,
+ vpx_highbd_sad8x16x4d_bits10, vpx_highbd_sad_skip_8x16x4d_bits10)
+
+ HIGHBD_BFP(
+ BLOCK_8X8, vpx_highbd_sad8x8_bits10, vpx_highbd_sad_skip_8x8_bits10,
+ vpx_highbd_sad8x8_avg_bits10, vpx_highbd_10_variance8x8,
+ vpx_highbd_10_sub_pixel_variance8x8,
+ vpx_highbd_10_sub_pixel_avg_variance8x8,
+ vpx_highbd_sad8x8x4d_bits10, vpx_highbd_sad_skip_8x8x4d_bits10)
+
+ HIGHBD_BFP(
+ BLOCK_8X4, vpx_highbd_sad8x4_bits10, vpx_highbd_sad_skip_8x4_bits10,
+ vpx_highbd_sad8x4_avg_bits10, vpx_highbd_10_variance8x4,
+ vpx_highbd_10_sub_pixel_variance8x4,
+ vpx_highbd_10_sub_pixel_avg_variance8x4,
+ vpx_highbd_sad8x4x4d_bits10, vpx_highbd_sad_skip_8x4x4d_bits10)
+
+ HIGHBD_BFP(
+ BLOCK_4X8, vpx_highbd_sad4x8_bits10, vpx_highbd_sad_skip_4x8_bits10,
+ vpx_highbd_sad4x8_avg_bits10, vpx_highbd_10_variance4x8,
+ vpx_highbd_10_sub_pixel_variance4x8,
+ vpx_highbd_10_sub_pixel_avg_variance4x8,
+ vpx_highbd_sad4x8x4d_bits10, vpx_highbd_sad_skip_4x8x4d_bits10)
+
+ HIGHBD_BFP(
+ BLOCK_4X4, vpx_highbd_sad4x4_bits10, vpx_highbd_sad_skip_4x4_bits10,
+ vpx_highbd_sad4x4_avg_bits10, vpx_highbd_10_variance4x4,
+ vpx_highbd_10_sub_pixel_variance4x4,
+ vpx_highbd_10_sub_pixel_avg_variance4x4,
+ vpx_highbd_sad4x4x4d_bits10, vpx_highbd_sad_skip_4x4x4d_bits10)
+ break;
+
+ default:
+ assert(cm->bit_depth == VPX_BITS_12);
+ HIGHBD_BFP(
+ BLOCK_32X16, vpx_highbd_sad32x16_bits12,
+ vpx_highbd_sad_skip_32x16_bits12, vpx_highbd_sad32x16_avg_bits12,
+ vpx_highbd_12_variance32x16, vpx_highbd_12_sub_pixel_variance32x16,
+ vpx_highbd_12_sub_pixel_avg_variance32x16,
+ vpx_highbd_sad32x16x4d_bits12, vpx_highbd_sad_skip_32x16x4d_bits12)
+
+ HIGHBD_BFP(
+ BLOCK_16X32, vpx_highbd_sad16x32_bits12,
+ vpx_highbd_sad_skip_16x32_bits12, vpx_highbd_sad16x32_avg_bits12,
+ vpx_highbd_12_variance16x32, vpx_highbd_12_sub_pixel_variance16x32,
+ vpx_highbd_12_sub_pixel_avg_variance16x32,
+ vpx_highbd_sad16x32x4d_bits12, vpx_highbd_sad_skip_16x32x4d_bits12)
+
+ HIGHBD_BFP(
+ BLOCK_64X32, vpx_highbd_sad64x32_bits12,
+ vpx_highbd_sad_skip_64x32_bits12, vpx_highbd_sad64x32_avg_bits12,
+ vpx_highbd_12_variance64x32, vpx_highbd_12_sub_pixel_variance64x32,
+ vpx_highbd_12_sub_pixel_avg_variance64x32,
+ vpx_highbd_sad64x32x4d_bits12, vpx_highbd_sad_skip_64x32x4d_bits12)
+
+ HIGHBD_BFP(
+ BLOCK_32X64, vpx_highbd_sad32x64_bits12,
+ vpx_highbd_sad_skip_32x64_bits12, vpx_highbd_sad32x64_avg_bits12,
+ vpx_highbd_12_variance32x64, vpx_highbd_12_sub_pixel_variance32x64,
+ vpx_highbd_12_sub_pixel_avg_variance32x64,
+ vpx_highbd_sad32x64x4d_bits12, vpx_highbd_sad_skip_32x64x4d_bits12)
+
+ HIGHBD_BFP(
+ BLOCK_32X32, vpx_highbd_sad32x32_bits12,
+ vpx_highbd_sad_skip_32x32_bits12, vpx_highbd_sad32x32_avg_bits12,
+ vpx_highbd_12_variance32x32, vpx_highbd_12_sub_pixel_variance32x32,
+ vpx_highbd_12_sub_pixel_avg_variance32x32,
+ vpx_highbd_sad32x32x4d_bits12, vpx_highbd_sad_skip_32x32x4d_bits12)
+
+ HIGHBD_BFP(
+ BLOCK_64X64, vpx_highbd_sad64x64_bits12,
+ vpx_highbd_sad_skip_64x64_bits12, vpx_highbd_sad64x64_avg_bits12,
+ vpx_highbd_12_variance64x64, vpx_highbd_12_sub_pixel_variance64x64,
+ vpx_highbd_12_sub_pixel_avg_variance64x64,
+ vpx_highbd_sad64x64x4d_bits12, vpx_highbd_sad_skip_64x64x4d_bits12)
+
+ HIGHBD_BFP(
+ BLOCK_16X16, vpx_highbd_sad16x16_bits12,
+ vpx_highbd_sad_skip_16x16_bits12, vpx_highbd_sad16x16_avg_bits12,
+ vpx_highbd_12_variance16x16, vpx_highbd_12_sub_pixel_variance16x16,
+ vpx_highbd_12_sub_pixel_avg_variance16x16,
+ vpx_highbd_sad16x16x4d_bits12, vpx_highbd_sad_skip_16x16x4d_bits12)
+
+ HIGHBD_BFP(
+ BLOCK_16X8, vpx_highbd_sad16x8_bits12,
+ vpx_highbd_sad_skip_16x8_bits12, vpx_highbd_sad16x8_avg_bits12,
+ vpx_highbd_12_variance16x8, vpx_highbd_12_sub_pixel_variance16x8,
+ vpx_highbd_12_sub_pixel_avg_variance16x8,
+ vpx_highbd_sad16x8x4d_bits12, vpx_highbd_sad_skip_16x8x4d_bits12)
+
+ HIGHBD_BFP(
+ BLOCK_8X16, vpx_highbd_sad8x16_bits12,
+ vpx_highbd_sad_skip_8x16_bits12, vpx_highbd_sad8x16_avg_bits12,
+ vpx_highbd_12_variance8x16, vpx_highbd_12_sub_pixel_variance8x16,
+ vpx_highbd_12_sub_pixel_avg_variance8x16,
+ vpx_highbd_sad8x16x4d_bits12, vpx_highbd_sad_skip_8x16x4d_bits12)
+
+ HIGHBD_BFP(
+ BLOCK_8X8, vpx_highbd_sad8x8_bits12, vpx_highbd_sad_skip_8x8_bits12,
+ vpx_highbd_sad8x8_avg_bits12, vpx_highbd_12_variance8x8,
+ vpx_highbd_12_sub_pixel_variance8x8,
+ vpx_highbd_12_sub_pixel_avg_variance8x8,
+ vpx_highbd_sad8x8x4d_bits12, vpx_highbd_sad_skip_8x8x4d_bits12)
+
+ HIGHBD_BFP(
+ BLOCK_8X4, vpx_highbd_sad8x4_bits12, vpx_highbd_sad_skip_8x4_bits12,
+ vpx_highbd_sad8x4_avg_bits12, vpx_highbd_12_variance8x4,
+ vpx_highbd_12_sub_pixel_variance8x4,
+ vpx_highbd_12_sub_pixel_avg_variance8x4,
+ vpx_highbd_sad8x4x4d_bits12, vpx_highbd_sad_skip_8x4x4d_bits12)
+
+ HIGHBD_BFP(
+ BLOCK_4X8, vpx_highbd_sad4x8_bits12, vpx_highbd_sad_skip_4x8_bits12,
+ vpx_highbd_sad4x8_avg_bits12, vpx_highbd_12_variance4x8,
+ vpx_highbd_12_sub_pixel_variance4x8,
+ vpx_highbd_12_sub_pixel_avg_variance4x8,
+ vpx_highbd_sad4x8x4d_bits12, vpx_highbd_sad_skip_4x8x4d_bits12)
+
+ HIGHBD_BFP(
+ BLOCK_4X4, vpx_highbd_sad4x4_bits12, vpx_highbd_sad_skip_4x4_bits12,
+ vpx_highbd_sad4x4_avg_bits12, vpx_highbd_12_variance4x4,
+ vpx_highbd_12_sub_pixel_variance4x4,
+ vpx_highbd_12_sub_pixel_avg_variance4x4,
+ vpx_highbd_sad4x4x4d_bits12, vpx_highbd_sad_skip_4x4x4d_bits12)
+ break;
+ }
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+static void realloc_segmentation_maps(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+
+ // Create the encoder segmentation map and set all entries to 0
+ vpx_free(cpi->segmentation_map);
+ CHECK_MEM_ERROR(&cm->error, cpi->segmentation_map,
+ vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
+
+ // Create a map used for cyclic background refresh.
+ if (cpi->cyclic_refresh) vp9_cyclic_refresh_free(cpi->cyclic_refresh);
+ CHECK_MEM_ERROR(&cm->error, cpi->cyclic_refresh,
+ vp9_cyclic_refresh_alloc(cm->mi_rows, cm->mi_cols));
+
+ // Create a map used to mark inactive areas.
+ vpx_free(cpi->active_map.map);
+ CHECK_MEM_ERROR(&cm->error, cpi->active_map.map,
+ vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
+
+  // And a placeholder structure for the coding context,
+  // for use if we want to save and restore it.
+ vpx_free(cpi->coding_context.last_frame_seg_map_copy);
+ CHECK_MEM_ERROR(&cm->error, cpi->coding_context.last_frame_seg_map_copy,
+ vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
+}
+
+static void alloc_copy_partition_data(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ if (cpi->prev_partition == NULL) {
+ CHECK_MEM_ERROR(&cm->error, cpi->prev_partition,
+ (BLOCK_SIZE *)vpx_calloc(cm->mi_stride * cm->mi_rows,
+ sizeof(*cpi->prev_partition)));
+ }
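+  // The remaining maps are sized per 64x64 superblock: mi_stride and mi_rows
+  // count 8x8 mode-info units, so shifting by 3 gives superblock columns and
+  // rows (the "+ 1" presumably covers a partial superblock row at the frame
+  // edge).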
+ if (cpi->prev_segment_id == NULL) {
+ CHECK_MEM_ERROR(
+ &cm->error, cpi->prev_segment_id,
+ (int8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1),
+ sizeof(*cpi->prev_segment_id)));
+ }
+ if (cpi->prev_variance_low == NULL) {
+ CHECK_MEM_ERROR(&cm->error, cpi->prev_variance_low,
+ (uint8_t *)vpx_calloc(
+ (cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1) * 25,
+ sizeof(*cpi->prev_variance_low)));
+ }
+ if (cpi->copied_frame_cnt == NULL) {
+ CHECK_MEM_ERROR(
+ &cm->error, cpi->copied_frame_cnt,
+ (uint8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1),
+ sizeof(*cpi->copied_frame_cnt)));
+ }
+}
+
+void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
+ VP9_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ int last_w = cpi->oxcf.width;
+ int last_h = cpi->oxcf.height;
+
+ vp9_init_quantizer(cpi);
+ if (cm->profile != oxcf->profile) cm->profile = oxcf->profile;
+ cm->bit_depth = oxcf->bit_depth;
+ cm->color_space = oxcf->color_space;
+ cm->color_range = oxcf->color_range;
+
+ cpi->target_level = oxcf->target_level;
+ cpi->keep_level_stats = oxcf->target_level != LEVEL_MAX;
+ set_level_constraint(&cpi->level_constraint,
+ get_level_index(cpi->target_level));
+
+ if (cm->profile <= PROFILE_1)
+ assert(cm->bit_depth == VPX_BITS_8);
+ else
+ assert(cm->bit_depth > VPX_BITS_8);
+
+ cpi->oxcf = *oxcf;
+#if CONFIG_VP9_HIGHBITDEPTH
+ cpi->td.mb.e_mbd.bd = (int)cm->bit_depth;
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ if ((oxcf->pass == 0) && (oxcf->rc_mode == VPX_Q)) {
+ rc->baseline_gf_interval = FIXED_GF_INTERVAL;
+ } else {
+ rc->baseline_gf_interval = (MIN_GF_INTERVAL + MAX_GF_INTERVAL) / 2;
+ }
+
+ cpi->refresh_golden_frame = 0;
+ cpi->refresh_last_frame = 1;
+ cm->refresh_frame_context = 1;
+ cm->reset_frame_context = 0;
+
+ vp9_reset_segment_features(&cm->seg);
+ vp9_set_high_precision_mv(cpi, 0);
+
+ {
+ int i;
+
+ for (i = 0; i < MAX_SEGMENTS; i++)
+ cpi->segment_encode_breakout[i] = cpi->oxcf.encode_breakout;
+ }
+ cpi->encode_breakout = cpi->oxcf.encode_breakout;
+
+ vp9_set_rc_buffer_sizes(cpi);
+
+  // Set up the frame rate and the related rate control parameter values.
+ vp9_new_framerate(cpi, cpi->framerate);
+
+ // Set absolute upper and lower quality limits
+ rc->worst_quality = cpi->oxcf.worst_allowed_q;
+ rc->best_quality = cpi->oxcf.best_allowed_q;
+
+ cm->interp_filter = cpi->sf.default_interp_filter;
+
+ if (cpi->oxcf.render_width > 0 && cpi->oxcf.render_height > 0) {
+ cm->render_width = cpi->oxcf.render_width;
+ cm->render_height = cpi->oxcf.render_height;
+ } else {
+ cm->render_width = cpi->oxcf.width;
+ cm->render_height = cpi->oxcf.height;
+ }
+ if (last_w != cpi->oxcf.width || last_h != cpi->oxcf.height) {
+ cm->width = cpi->oxcf.width;
+ cm->height = cpi->oxcf.height;
+ cpi->external_resize = 1;
+ }
+
+ if (cpi->initial_width) {
+ int new_mi_size = 0;
+ vp9_set_mb_mi(cm, cm->width, cm->height);
+ new_mi_size = cm->mi_stride * calc_mi_size(cm->mi_rows);
+ if (cm->mi_alloc_size < new_mi_size) {
+ vp9_free_context_buffers(cm);
+ alloc_compressor_data(cpi);
+ realloc_segmentation_maps(cpi);
+ cpi->initial_width = cpi->initial_height = 0;
+ cpi->external_resize = 0;
+ } else if (cm->mi_alloc_size == new_mi_size &&
+ (cpi->oxcf.width > last_w || cpi->oxcf.height > last_h)) {
+ if (vp9_alloc_loop_filter(cm)) {
+ vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate loop filter data");
+ }
+ }
+ }
+
+ if (cm->current_video_frame == 0 || last_w != cpi->oxcf.width ||
+ last_h != cpi->oxcf.height)
+ update_frame_size(cpi);
+
+ if (last_w != cpi->oxcf.width || last_h != cpi->oxcf.height) {
+ memset(cpi->consec_zero_mv, 0,
+ cm->mi_rows * cm->mi_cols * sizeof(*cpi->consec_zero_mv));
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+ vp9_cyclic_refresh_reset_resize(cpi);
+ rc->rc_1_frame = 0;
+ rc->rc_2_frame = 0;
+ }
+
+ if ((cpi->svc.number_temporal_layers > 1) ||
+ ((cpi->svc.number_temporal_layers > 1 ||
+ cpi->svc.number_spatial_layers > 1) &&
+ cpi->oxcf.pass != 1)) {
+ vp9_update_layer_context_change_config(cpi,
+ (int)cpi->oxcf.target_bandwidth);
+ }
+
+ vp9_check_reset_rc_flag(cpi);
+
+ cpi->alt_ref_source = NULL;
+ rc->is_src_frame_alt_ref = 0;
+
+#if 0
+ // Experimental RD Code
+ cpi->frame_distortion = 0;
+ cpi->last_frame_distortion = 0;
+#endif
+
+ set_tile_limits(cpi);
+
+ cpi->ext_refresh_frame_flags_pending = 0;
+ cpi->ext_refresh_frame_context_pending = 0;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ highbd_set_var_fns(cpi);
+#endif
+
+ vp9_set_row_mt(cpi);
+}
+
+/***********************************************************************
+ * Read before modifying 'cal_nmvjointsadcost' or 'cal_nmvsadcosts' *
+ ***********************************************************************
+ * The following 2 functions ('cal_nmvjointsadcost' and *
+ * 'cal_nmvsadcosts') are used to calculate cost lookup tables *
+ * used by 'vp9_diamond_search_sad'. The C implementation of the *
+ * function is generic, but the AVX intrinsics optimised version *
+ * relies on the following properties of the computed tables: *
+ * For cal_nmvjointsadcost: *
+ * - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3] *
+ * For cal_nmvsadcosts: *
+ * - For all i: mvsadcost[0][i] == mvsadcost[1][i] *
+ * (Equal costs for both components) *
+ * - For all i: mvsadcost[0][i] == mvsadcost[0][-i] *
+ * (Cost function is even) *
+ * If these do not hold, then the AVX optimised version of the *
+ * 'vp9_diamond_search_sad' function cannot be used as it is, in which *
+ * case you can revert to using the C function instead. *
+ ***********************************************************************/
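+
+/* Illustrative note (not from the original source): the properties listed
+ * above could, hypothetically, be verified with checks along the lines of
+ *
+ *   assert(mvjointsadcost[1] == mvjointsadcost[2] &&
+ *          mvjointsadcost[2] == mvjointsadcost[3]);
+ *   for (i = 0; i <= MV_MAX; ++i) {
+ *     assert(mvsadcost[0][i] == mvsadcost[1][i]);   // equal components
+ *     assert(mvsadcost[0][i] == mvsadcost[0][-i]);  // even cost function
+ *   }
+ *
+ * The two table-building functions below satisfy these by construction.
+ */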
+
+static void cal_nmvjointsadcost(int *mvjointsadcost) {
+ /*********************************************************************
+ * Warning: Read the comments above before modifying this function *
+ *********************************************************************/
+ mvjointsadcost[0] = 600;
+ mvjointsadcost[1] = 300;
+ mvjointsadcost[2] = 300;
+ mvjointsadcost[3] = 300;
+}
+
+static void cal_nmvsadcosts(int *mvsadcost[2]) {
+ /*********************************************************************
+ * Warning: Read the comments above before modifying this function *
+ *********************************************************************/
+ int i = 1;
+
+ mvsadcost[0][0] = 0;
+ mvsadcost[1][0] = 0;
+
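+  // The loop below fills the table symmetrically: both MV components share
+  // the same values and cost(i) == cost(-i), matching the properties
+  // documented above. Each entry is roughly a scaled bit cost,
+  // 256 * 2 * (log2(8 * |mv|) + 0.6).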
+ do {
+ double z = 256 * (2 * (log2f(8 * i) + .6));
+ mvsadcost[0][i] = (int)z;
+ mvsadcost[1][i] = (int)z;
+ mvsadcost[0][-i] = (int)z;
+ mvsadcost[1][-i] = (int)z;
+ } while (++i <= MV_MAX);
+}
+
+static void cal_nmvsadcosts_hp(int *mvsadcost[2]) {
+ int i = 1;
+
+ mvsadcost[0][0] = 0;
+ mvsadcost[1][0] = 0;
+
+ do {
+ double z = 256 * (2 * (log2f(8 * i) + .6));
+ mvsadcost[0][i] = (int)z;
+ mvsadcost[1][i] = (int)z;
+ mvsadcost[0][-i] = (int)z;
+ mvsadcost[1][-i] = (int)z;
+ } while (++i <= MV_MAX);
+}
+
+static void init_ref_frame_bufs(VP9_COMMON *cm) {
+ int i;
+ BufferPool *const pool = cm->buffer_pool;
+ cm->new_fb_idx = INVALID_IDX;
+ for (i = 0; i < REF_FRAMES; ++i) {
+ cm->ref_frame_map[i] = INVALID_IDX;
+ }
+ for (i = 0; i < FRAME_BUFFERS; ++i) {
+ pool->frame_bufs[i].ref_count = 0;
+ }
+}
+
+static void update_initial_width(VP9_COMP *cpi, int use_highbitdepth,
+ int subsampling_x, int subsampling_y) {
+ VP9_COMMON *const cm = &cpi->common;
+#if !CONFIG_VP9_HIGHBITDEPTH
+ (void)use_highbitdepth;
+ assert(use_highbitdepth == 0);
+#endif
+
+ if (!cpi->initial_width ||
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth != use_highbitdepth ||
+#endif
+ cm->subsampling_x != subsampling_x ||
+ cm->subsampling_y != subsampling_y) {
+ cm->subsampling_x = subsampling_x;
+ cm->subsampling_y = subsampling_y;
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth = use_highbitdepth;
+#endif
+ alloc_util_frame_buffers(cpi);
+ cpi->initial_width = cm->width;
+ cpi->initial_height = cm->height;
+ cpi->initial_mbs = cm->MBs;
+ }
+}
+
+// TODO(angiebird): Check whether we can move this function to vpx_image.c
+static INLINE void vpx_img_chroma_subsampling(vpx_img_fmt_t fmt,
+ unsigned int *subsampling_x,
+ unsigned int *subsampling_y) {
+ switch (fmt) {
+ case VPX_IMG_FMT_I420:
+ case VPX_IMG_FMT_YV12:
+ case VPX_IMG_FMT_NV12:
+ case VPX_IMG_FMT_I422:
+ case VPX_IMG_FMT_I42016:
+ case VPX_IMG_FMT_I42216: *subsampling_x = 1; break;
+ default: *subsampling_x = 0; break;
+ }
+
+ switch (fmt) {
+ case VPX_IMG_FMT_I420:
+ case VPX_IMG_FMT_I440:
+ case VPX_IMG_FMT_YV12:
+ case VPX_IMG_FMT_NV12:
+ case VPX_IMG_FMT_I42016:
+ case VPX_IMG_FMT_I44016: *subsampling_y = 1; break;
+ default: *subsampling_y = 0; break;
+ }
+}
+
+// TODO(angiebird): Check whether we can move this function to vpx_image.c
+static INLINE int vpx_img_use_highbitdepth(vpx_img_fmt_t fmt) {
+ return fmt & VPX_IMG_FMT_HIGHBITDEPTH;
+}
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+static void setup_denoiser_buffer(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ if (cpi->oxcf.noise_sensitivity > 0 &&
+ !cpi->denoiser.frame_buffer_initialized) {
+ if (vp9_denoiser_alloc(cm, &cpi->svc, &cpi->denoiser, cpi->use_svc,
+ cpi->oxcf.noise_sensitivity, cm->width, cm->height,
+ cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
+ VP9_ENC_BORDER_IN_PIXELS))
+ vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate denoiser");
+ }
+}
+#endif
+
+void vp9_update_compressor_with_img_fmt(VP9_COMP *cpi, vpx_img_fmt_t img_fmt) {
+ const VP9EncoderConfig *oxcf = &cpi->oxcf;
+ unsigned int subsampling_x, subsampling_y;
+ const int use_highbitdepth = vpx_img_use_highbitdepth(img_fmt);
+ vpx_img_chroma_subsampling(img_fmt, &subsampling_x, &subsampling_y);
+
+ update_initial_width(cpi, use_highbitdepth, subsampling_x, subsampling_y);
+#if CONFIG_VP9_TEMPORAL_DENOISING
+ setup_denoiser_buffer(cpi);
+#endif
+
+ assert(cpi->lookahead == NULL);
+ cpi->lookahead = vp9_lookahead_init(oxcf->width, oxcf->height, subsampling_x,
+ subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ use_highbitdepth,
+#endif
+ oxcf->lag_in_frames);
+ alloc_raw_frame_buffers(cpi);
+}
+
+VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf,
+ BufferPool *const pool) {
+ unsigned int i;
+ VP9_COMP *volatile const cpi = vpx_memalign(32, sizeof(VP9_COMP));
+ VP9_COMMON *volatile const cm = cpi != NULL ? &cpi->common : NULL;
+
+ if (!cm) return NULL;
+
+ vp9_zero(*cpi);
+
+ if (setjmp(cm->error.jmp)) {
+ cm->error.setjmp = 0;
+ vp9_remove_compressor(cpi);
+ return 0;
+ }
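+  // Note: vpx_internal_error() longjmp()s back to the setjmp() above, so any
+  // CHECK_MEM_ERROR() failure in the allocations below unwinds to that point
+  // and tears down the partially built compressor via vp9_remove_compressor().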
+
+ cm->error.setjmp = 1;
+ cm->alloc_mi = vp9_enc_alloc_mi;
+ cm->free_mi = vp9_enc_free_mi;
+ cm->setup_mi = vp9_enc_setup_mi;
+
+ CHECK_MEM_ERROR(&cm->error, cm->fc,
+ (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc)));
+ CHECK_MEM_ERROR(
+ &cm->error, cm->frame_contexts,
+ (FRAME_CONTEXT *)vpx_calloc(FRAME_CONTEXTS, sizeof(*cm->frame_contexts)));
+
+ cpi->compute_frame_low_motion_onepass = 1;
+ cpi->use_svc = 0;
+ cpi->resize_state = ORIG;
+ cpi->external_resize = 0;
+ cpi->resize_avg_qp = 0;
+ cpi->resize_buffer_underflow = 0;
+ cpi->use_skin_detection = 0;
+ cpi->common.buffer_pool = pool;
+ init_ref_frame_bufs(cm);
+
+ cpi->force_update_segmentation = 0;
+
+ init_config(cpi, oxcf);
+ cpi->frame_info = vp9_get_frame_info(oxcf);
+
+ vp9_rc_init(&cpi->oxcf, oxcf->pass, &cpi->rc);
+ vp9_init_rd_parameters(cpi);
+
+ init_frame_indexes(cm);
+ cpi->tile_data = NULL;
+
+ realloc_segmentation_maps(cpi);
+
+ CHECK_MEM_ERROR(
+ &cm->error, cpi->skin_map,
+ vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(cpi->skin_map[0])));
+
+#if !CONFIG_REALTIME_ONLY
+ CHECK_MEM_ERROR(&cm->error, cpi->alt_ref_aq, vp9_alt_ref_aq_create());
+#endif
+
+ CHECK_MEM_ERROR(
+ &cm->error, cpi->consec_zero_mv,
+ vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(*cpi->consec_zero_mv)));
+
+ CHECK_MEM_ERROR(&cm->error, cpi->nmvcosts[0],
+ vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts[0])));
+ CHECK_MEM_ERROR(&cm->error, cpi->nmvcosts[1],
+ vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts[1])));
+ CHECK_MEM_ERROR(&cm->error, cpi->nmvcosts_hp[0],
+ vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts_hp[0])));
+ CHECK_MEM_ERROR(&cm->error, cpi->nmvcosts_hp[1],
+ vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts_hp[1])));
+ CHECK_MEM_ERROR(&cm->error, cpi->nmvsadcosts[0],
+ vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts[0])));
+ CHECK_MEM_ERROR(&cm->error, cpi->nmvsadcosts[1],
+ vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts[1])));
+ CHECK_MEM_ERROR(&cm->error, cpi->nmvsadcosts_hp[0],
+ vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts_hp[0])));
+ CHECK_MEM_ERROR(&cm->error, cpi->nmvsadcosts_hp[1],
+ vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts_hp[1])));
+
+ for (i = 0; i < (sizeof(cpi->mbgraph_stats) / sizeof(cpi->mbgraph_stats[0]));
+ i++) {
+ CHECK_MEM_ERROR(
+ &cm->error, cpi->mbgraph_stats[i].mb_stats,
+ vpx_calloc(cm->MBs * sizeof(*cpi->mbgraph_stats[i].mb_stats), 1));
+ }
+
+ cpi->refresh_alt_ref_frame = 0;
+ cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS;
+
+ init_level_info(&cpi->level_info);
+ init_level_constraint(&cpi->level_constraint);
+
+#if CONFIG_INTERNAL_STATS
+ cpi->b_calculate_blockiness = 1;
+ cpi->b_calculate_consistency = 1;
+ cpi->total_inconsistency = 0;
+ cpi->psnr.worst = 100.0;
+ cpi->worst_ssim = 100.0;
+
+ cpi->count = 0;
+ cpi->bytes = 0;
+
+ if (cpi->b_calculate_psnr) {
+ cpi->total_sq_error = 0;
+ cpi->total_samples = 0;
+
+ cpi->totalp_sq_error = 0;
+ cpi->totalp_samples = 0;
+
+ cpi->tot_recode_hits = 0;
+ cpi->summed_quality = 0;
+ cpi->summed_weights = 0;
+ cpi->summedp_quality = 0;
+ cpi->summedp_weights = 0;
+ }
+
+ cpi->fastssim.worst = 100.0;
+
+ cpi->psnrhvs.worst = 100.0;
+
+ if (cpi->b_calculate_blockiness) {
+ cpi->total_blockiness = 0;
+ cpi->worst_blockiness = 0.0;
+ }
+
+ if (cpi->b_calculate_consistency) {
+ CHECK_MEM_ERROR(&cm->error, cpi->ssim_vars,
+ vpx_calloc(cpi->common.mi_rows * cpi->common.mi_cols,
+ sizeof(*cpi->ssim_vars) * 4));
+ cpi->worst_consistency = 100.0;
+ } else {
+ cpi->ssim_vars = NULL;
+ }
+
+#endif
+
+ cpi->first_time_stamp_ever = INT64_MAX;
+
+ /*********************************************************************
+ * Warning: Read the comments around 'cal_nmvjointsadcost' and *
+ * 'cal_nmvsadcosts' before modifying how these tables are computed. *
+ *********************************************************************/
+ cal_nmvjointsadcost(cpi->td.mb.nmvjointsadcost);
+ cpi->td.mb.nmvcost[0] = &cpi->nmvcosts[0][MV_MAX];
+ cpi->td.mb.nmvcost[1] = &cpi->nmvcosts[1][MV_MAX];
+ cpi->td.mb.nmvsadcost[0] = &cpi->nmvsadcosts[0][MV_MAX];
+ cpi->td.mb.nmvsadcost[1] = &cpi->nmvsadcosts[1][MV_MAX];
+ cal_nmvsadcosts(cpi->td.mb.nmvsadcost);
+
+ cpi->td.mb.nmvcost_hp[0] = &cpi->nmvcosts_hp[0][MV_MAX];
+ cpi->td.mb.nmvcost_hp[1] = &cpi->nmvcosts_hp[1][MV_MAX];
+ cpi->td.mb.nmvsadcost_hp[0] = &cpi->nmvsadcosts_hp[0][MV_MAX];
+ cpi->td.mb.nmvsadcost_hp[1] = &cpi->nmvsadcosts_hp[1][MV_MAX];
+ cal_nmvsadcosts_hp(cpi->td.mb.nmvsadcost_hp);
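+  // The nmv cost buffers are allocated with MV_VALS (2 * MV_MAX + 1) entries;
+  // the pointers above are offset by MV_MAX so they can be indexed by signed
+  // MV components in the range [-MV_MAX, MV_MAX].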
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+#ifdef OUTPUT_YUV_DENOISED
+ yuv_denoised_file = fopen("denoised.yuv", "ab");
+#endif
+#endif
+#ifdef OUTPUT_YUV_SKINMAP
+ yuv_skinmap_file = fopen("skinmap.yuv", "wb");
+#endif
+#ifdef OUTPUT_YUV_REC
+ yuv_rec_file = fopen("rec.yuv", "wb");
+#endif
+#ifdef OUTPUT_YUV_SVC_SRC
+ yuv_svc_src[0] = fopen("svc_src_0.yuv", "wb");
+ yuv_svc_src[1] = fopen("svc_src_1.yuv", "wb");
+ yuv_svc_src[2] = fopen("svc_src_2.yuv", "wb");
+#endif
+
+#if 0
+ framepsnr = fopen("framepsnr.stt", "a");
+ kf_list = fopen("kf_list.stt", "w");
+#endif
+
+ cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED;
+
+ {
+ vpx_codec_err_t codec_status = vp9_extrc_init(&cpi->ext_ratectrl);
+ if (codec_status != VPX_CODEC_OK) {
+ vpx_internal_error(&cm->error, codec_status, "vp9_extrc_init() failed");
+ }
+ }
+
+#if !CONFIG_REALTIME_ONLY
+ if (oxcf->pass == 1) {
+ vp9_init_first_pass(cpi);
+ } else if (oxcf->pass == 2) {
+ const size_t packet_sz = sizeof(FIRSTPASS_STATS);
+ const int packets = (int)(oxcf->two_pass_stats_in.sz / packet_sz);
+
+ if (cpi->svc.number_spatial_layers > 1 ||
+ cpi->svc.number_temporal_layers > 1) {
+ FIRSTPASS_STATS *const stats = oxcf->two_pass_stats_in.buf;
+ FIRSTPASS_STATS *stats_copy[VPX_SS_MAX_LAYERS] = { 0 };
+ int n;
+
+ for (n = 0; n < oxcf->ss_number_layers; ++n) {
+ FIRSTPASS_STATS *const last_packet_for_layer =
+ &stats[packets - oxcf->ss_number_layers + n];
+ const int layer_id = (int)last_packet_for_layer->spatial_layer_id;
+ const int packets_in_layer = (int)last_packet_for_layer->count + 1;
+ if (layer_id >= 0 && layer_id < oxcf->ss_number_layers) {
+ int num_frames;
+ LAYER_CONTEXT *const lc = &cpi->svc.layer_context[layer_id];
+
+ vpx_free(lc->rc_twopass_stats_in.buf);
+
+ lc->rc_twopass_stats_in.sz = packets_in_layer * packet_sz;
+ CHECK_MEM_ERROR(&cm->error, lc->rc_twopass_stats_in.buf,
+ vpx_malloc(lc->rc_twopass_stats_in.sz));
+ lc->twopass.stats_in_start = lc->rc_twopass_stats_in.buf;
+ lc->twopass.stats_in = lc->twopass.stats_in_start;
+ lc->twopass.stats_in_end =
+ lc->twopass.stats_in_start + packets_in_layer - 1;
+          // Note: the last packet holds the cumulative first pass stats,
+          // so the number of frames is the packet count minus one.
+ num_frames = packets_in_layer - 1;
+ fps_init_first_pass_info(&lc->twopass.first_pass_info,
+ lc->rc_twopass_stats_in.buf, num_frames);
+ stats_copy[layer_id] = lc->rc_twopass_stats_in.buf;
+ }
+ }
+
+ for (n = 0; n < packets; ++n) {
+ const int layer_id = (int)stats[n].spatial_layer_id;
+ if (layer_id >= 0 && layer_id < oxcf->ss_number_layers &&
+ stats_copy[layer_id] != NULL) {
+ *stats_copy[layer_id] = stats[n];
+ ++stats_copy[layer_id];
+ }
+ }
+
+ vp9_init_second_pass_spatial_svc(cpi);
+ } else {
+ int num_frames;
+
+ cpi->twopass.stats_in_start = oxcf->two_pass_stats_in.buf;
+ cpi->twopass.stats_in = cpi->twopass.stats_in_start;
+ cpi->twopass.stats_in_end = &cpi->twopass.stats_in[packets - 1];
+      // Note: the last packet holds the cumulative first pass stats,
+      // so the number of frames is the packet count minus one.
+ num_frames = packets - 1;
+ fps_init_first_pass_info(&cpi->twopass.first_pass_info,
+ oxcf->two_pass_stats_in.buf, num_frames);
+
+ vp9_init_second_pass(cpi);
+ }
+ }
+#endif // !CONFIG_REALTIME_ONLY
+
+ cpi->mb_wiener_var_cols = 0;
+ cpi->mb_wiener_var_rows = 0;
+ cpi->mb_wiener_variance = NULL;
+
+ vp9_set_speed_features_framesize_independent(cpi, oxcf->speed);
+ vp9_set_speed_features_framesize_dependent(cpi, oxcf->speed);
+
+ {
+ const int bsize = BLOCK_16X16;
+ const int w = num_8x8_blocks_wide_lookup[bsize];
+ const int h = num_8x8_blocks_high_lookup[bsize];
+ const int num_cols = (cm->mi_cols + w - 1) / w;
+ const int num_rows = (cm->mi_rows + h - 1) / h;
+ CHECK_MEM_ERROR(&cm->error, cpi->mi_ssim_rdmult_scaling_factors,
+ vpx_calloc(num_rows * num_cols,
+ sizeof(*cpi->mi_ssim_rdmult_scaling_factors)));
+ }
+
+ cpi->kmeans_data_arr_alloc = 0;
+#if CONFIG_NON_GREEDY_MV
+ cpi->tpl_ready = 0;
+#endif // CONFIG_NON_GREEDY_MV
+ for (i = 0; i < MAX_ARF_GOP_SIZE; ++i) {
+ cpi->tpl_stats[i].tpl_stats_ptr = NULL;
+ }
+
+ // Allocate memory to store variances for a frame.
+ CHECK_MEM_ERROR(&cm->error, cpi->source_diff_var,
+ vpx_calloc(cm->MBs, sizeof(cpi->source_diff_var)));
+ cpi->source_var_thresh = 0;
+ cpi->frames_till_next_var_check = 0;
+#define BFP(BT, SDF, SDSF, SDAF, VF, SVF, SVAF, SDX4DF, SDSX4DF) \
+ cpi->fn_ptr[BT].sdf = SDF; \
+ cpi->fn_ptr[BT].sdsf = SDSF; \
+ cpi->fn_ptr[BT].sdaf = SDAF; \
+ cpi->fn_ptr[BT].vf = VF; \
+ cpi->fn_ptr[BT].svf = SVF; \
+ cpi->fn_ptr[BT].svaf = SVAF; \
+ cpi->fn_ptr[BT].sdx4df = SDX4DF; \
+ cpi->fn_ptr[BT].sdsx4df = SDSX4DF;
+
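+  // A note on the BFP table entries filled in below: sdf/sdaf are the
+  // whole-block SAD and SAD-with-second-predictor, sdsf the row-skipping
+  // ("skip") SAD, vf/svf/svaf the variance, sub-pixel variance and sub-pixel
+  // average variance, and sdx4df/sdsx4df the 4-candidate SAD variants used by
+  // motion search.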
+ BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad_skip_32x16, vpx_sad32x16_avg,
+ vpx_variance32x16, vpx_sub_pixel_variance32x16,
+ vpx_sub_pixel_avg_variance32x16, vpx_sad32x16x4d, vpx_sad_skip_32x16x4d)
+
+ BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad_skip_16x32, vpx_sad16x32_avg,
+ vpx_variance16x32, vpx_sub_pixel_variance16x32,
+ vpx_sub_pixel_avg_variance16x32, vpx_sad16x32x4d, vpx_sad_skip_16x32x4d)
+
+ BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad_skip_64x32, vpx_sad64x32_avg,
+ vpx_variance64x32, vpx_sub_pixel_variance64x32,
+ vpx_sub_pixel_avg_variance64x32, vpx_sad64x32x4d, vpx_sad_skip_64x32x4d)
+
+ BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad_skip_32x64, vpx_sad32x64_avg,
+ vpx_variance32x64, vpx_sub_pixel_variance32x64,
+ vpx_sub_pixel_avg_variance32x64, vpx_sad32x64x4d, vpx_sad_skip_32x64x4d)
+
+ BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad_skip_32x32, vpx_sad32x32_avg,
+ vpx_variance32x32, vpx_sub_pixel_variance32x32,
+ vpx_sub_pixel_avg_variance32x32, vpx_sad32x32x4d, vpx_sad_skip_32x32x4d)
+
+ BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad_skip_64x64, vpx_sad64x64_avg,
+ vpx_variance64x64, vpx_sub_pixel_variance64x64,
+ vpx_sub_pixel_avg_variance64x64, vpx_sad64x64x4d, vpx_sad_skip_64x64x4d)
+
+ BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad_skip_16x16, vpx_sad16x16_avg,
+ vpx_variance16x16, vpx_sub_pixel_variance16x16,
+ vpx_sub_pixel_avg_variance16x16, vpx_sad16x16x4d, vpx_sad_skip_16x16x4d)
+
+ BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad_skip_16x8, vpx_sad16x8_avg,
+ vpx_variance16x8, vpx_sub_pixel_variance16x8,
+ vpx_sub_pixel_avg_variance16x8, vpx_sad16x8x4d, vpx_sad_skip_16x8x4d)
+
+ BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad_skip_8x16, vpx_sad8x16_avg,
+ vpx_variance8x16, vpx_sub_pixel_variance8x16,
+ vpx_sub_pixel_avg_variance8x16, vpx_sad8x16x4d, vpx_sad_skip_8x16x4d)
+
+ BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad_skip_8x8, vpx_sad8x8_avg, vpx_variance8x8,
+ vpx_sub_pixel_variance8x8, vpx_sub_pixel_avg_variance8x8, vpx_sad8x8x4d,
+ vpx_sad_skip_8x8x4d)
+
+ BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad_skip_8x4, vpx_sad8x4_avg, vpx_variance8x4,
+ vpx_sub_pixel_variance8x4, vpx_sub_pixel_avg_variance8x4, vpx_sad8x4x4d,
+ vpx_sad_skip_8x4x4d)
+
+ BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad_skip_4x8, vpx_sad4x8_avg, vpx_variance4x8,
+ vpx_sub_pixel_variance4x8, vpx_sub_pixel_avg_variance4x8, vpx_sad4x8x4d,
+ vpx_sad_skip_4x8x4d)
+
+ BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad_skip_4x4, vpx_sad4x4_avg, vpx_variance4x4,
+ vpx_sub_pixel_variance4x4, vpx_sub_pixel_avg_variance4x4, vpx_sad4x4x4d,
+ vpx_sad_skip_4x4x4d)
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ highbd_set_var_fns(cpi);
+#endif
+
+  /* vp9_init_quantizer() is first called here. Add a check in
+   * vp9_frame_init_quantizer() so that vp9_init_quantizer() is only
+   * called again later when needed. This avoids unnecessary calls of
+   * vp9_init_quantizer() for every frame.
+   */
+ vp9_init_quantizer(cpi);
+
+ vp9_loop_filter_init(cm);
+
+ // Set up the unit scaling factor used during motion search.
+#if CONFIG_VP9_HIGHBITDEPTH
+ vp9_setup_scale_factors_for_frame(&cpi->me_sf, cm->width, cm->height,
+ cm->width, cm->height,
+ cm->use_highbitdepth);
+#else
+ vp9_setup_scale_factors_for_frame(&cpi->me_sf, cm->width, cm->height,
+ cm->width, cm->height);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ cpi->td.mb.me_sf = &cpi->me_sf;
+
+ cm->error.setjmp = 0;
+
+#if CONFIG_RATE_CTRL
+ encode_command_init(&cpi->encode_command);
+ if (oxcf->use_simple_encode_api) {
+ partition_info_init(cpi);
+ motion_vector_info_init(cpi);
+ fp_motion_vector_info_init(cpi);
+ tpl_stats_info_init(cpi);
+ }
+#endif
+
+ return cpi;
+}
+
+#if CONFIG_INTERNAL_STATS
+#define SNPRINT(H, T) snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T))
+
+#define SNPRINT2(H, T, V) \
+ snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T), (V))
+#endif // CONFIG_INTERNAL_STATS
+
+void vp9_remove_compressor(VP9_COMP *cpi) {
+ VP9_COMMON *cm;
+ unsigned int i;
+
+ if (!cpi) return;
+
+#if CONFIG_INTERNAL_STATS
+ vpx_free(cpi->ssim_vars);
+#endif
+
+ cm = &cpi->common;
+ if (cm->current_video_frame > 0) {
+#if CONFIG_INTERNAL_STATS
+ vpx_clear_system_state();
+
+ if (cpi->oxcf.pass != 1) {
+ char headings[512] = { 0 };
+ char results[512] = { 0 };
+ FILE *f = fopen("opsnr.stt", "a");
+ double time_encoded =
+ (cpi->last_end_time_stamp_seen - cpi->first_time_stamp_ever) /
+ 10000000.000;
+ double total_encode_time =
+ (cpi->time_receive_data + cpi->time_compress_data) / 1000.000;
+ const double dr =
+ (double)cpi->bytes * (double)8 / (double)1000 / time_encoded;
+ const double peak = (double)((1 << cpi->oxcf.input_bit_depth) - 1);
+ const double target_rate = (double)cpi->oxcf.target_bandwidth / 1000;
+ const double rate_err = ((100.0 * (dr - target_rate)) / target_rate);
+
+ if (cpi->b_calculate_psnr) {
+ const double total_psnr = vpx_sse_to_psnr(
+ (double)cpi->total_samples, peak, (double)cpi->total_sq_error);
+ const double totalp_psnr = vpx_sse_to_psnr(
+ (double)cpi->totalp_samples, peak, (double)cpi->totalp_sq_error);
+ const double total_ssim =
+ 100 * pow(cpi->summed_quality / cpi->summed_weights, 8.0);
+ const double totalp_ssim =
+ 100 * pow(cpi->summedp_quality / cpi->summedp_weights, 8.0);
+
+ snprintf(headings, sizeof(headings),
+ "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\t"
+ "VPXSSIM\tVPSSIMP\tFASTSIM\tPSNRHVS\t"
+ "WstPsnr\tWstSsim\tWstFast\tWstHVS\t"
+ "AVPsnrY\tAPsnrCb\tAPsnrCr");
+ snprintf(results, sizeof(results),
+ "%7.2f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
+ "%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
+ "%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
+ "%7.3f\t%7.3f\t%7.3f",
+ dr, cpi->psnr.stat[ALL] / cpi->count, total_psnr,
+ cpi->psnrp.stat[ALL] / cpi->count, totalp_psnr, total_ssim,
+ totalp_ssim, cpi->fastssim.stat[ALL] / cpi->count,
+ cpi->psnrhvs.stat[ALL] / cpi->count, cpi->psnr.worst,
+ cpi->worst_ssim, cpi->fastssim.worst, cpi->psnrhvs.worst,
+ cpi->psnr.stat[Y] / cpi->count, cpi->psnr.stat[U] / cpi->count,
+ cpi->psnr.stat[V] / cpi->count);
+
+ if (cpi->b_calculate_blockiness) {
+ SNPRINT(headings, "\t Block\tWstBlck");
+ SNPRINT2(results, "\t%7.3f", cpi->total_blockiness / cpi->count);
+ SNPRINT2(results, "\t%7.3f", cpi->worst_blockiness);
+ }
+
+ if (cpi->b_calculate_consistency) {
+ double consistency =
+ vpx_sse_to_psnr((double)cpi->totalp_samples, peak,
+ (double)cpi->total_inconsistency);
+
+ SNPRINT(headings, "\tConsist\tWstCons");
+ SNPRINT2(results, "\t%7.3f", consistency);
+ SNPRINT2(results, "\t%7.3f", cpi->worst_consistency);
+ }
+
+ SNPRINT(headings, "\t Time\tRcErr\tAbsErr");
+ SNPRINT2(results, "\t%8.0f", total_encode_time);
+ SNPRINT2(results, "\t%7.2f", rate_err);
+ SNPRINT2(results, "\t%7.2f", fabs(rate_err));
+
+ fprintf(f, "%s\tAPsnr611\n", headings);
+ fprintf(
+ f, "%s\t%7.3f\n", results,
+ (6 * cpi->psnr.stat[Y] + cpi->psnr.stat[U] + cpi->psnr.stat[V]) /
+ (cpi->count * 8));
+ }
+
+ fclose(f);
+ }
+#endif
+
+#if 0
+ {
+ printf("\n_pick_loop_filter_level:%d\n", cpi->time_pick_lpf / 1000);
+    printf("\n_frames receive_data encode_mb_row compress_frame Total\n");
+ printf("%6d %10ld %10ld %10ld %10ld\n", cpi->common.current_video_frame,
+ cpi->time_receive_data / 1000, cpi->time_encode_sb_row / 1000,
+ cpi->time_compress_data / 1000,
+ (cpi->time_receive_data + cpi->time_compress_data) / 1000);
+ }
+#endif
+ }
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+ vp9_denoiser_free(&(cpi->denoiser));
+#endif
+
+ if (cpi->kmeans_data_arr_alloc) {
+#if CONFIG_MULTITHREAD
+ pthread_mutex_destroy(&cpi->kmeans_mutex);
+#endif
+ vpx_free(cpi->kmeans_data_arr);
+ }
+
+ vp9_free_tpl_buffer(cpi);
+
+ vp9_loop_filter_dealloc(&cpi->lf_row_sync);
+ vp9_bitstream_encode_tiles_buffer_dealloc(cpi);
+ vp9_row_mt_mem_dealloc(cpi);
+ vp9_encode_free_mt_data(cpi);
+
+#if !CONFIG_REALTIME_ONLY
+ vp9_alt_ref_aq_destroy(cpi->alt_ref_aq);
+#endif
+
+ dealloc_compressor_data(cpi);
+
+ for (i = 0; i < sizeof(cpi->mbgraph_stats) / sizeof(cpi->mbgraph_stats[0]);
+ ++i) {
+ vpx_free(cpi->mbgraph_stats[i].mb_stats);
+ }
+
+ vp9_extrc_delete(&cpi->ext_ratectrl);
+
+ // Help detect use after free of the error detail string.
+ memset(cm->error.detail, 'A', sizeof(cm->error.detail) - 1);
+ cm->error.detail[sizeof(cm->error.detail) - 1] = '\0';
+
+ vp9_remove_common(cm);
+ vp9_free_ref_frame_buffers(cm->buffer_pool);
+#if CONFIG_VP9_POSTPROC
+ vp9_free_postproc_buffers(cm);
+#endif
+ vpx_free(cpi);
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+#ifdef OUTPUT_YUV_DENOISED
+ fclose(yuv_denoised_file);
+#endif
+#endif
+#ifdef OUTPUT_YUV_SKINMAP
+ fclose(yuv_skinmap_file);
+#endif
+#ifdef OUTPUT_YUV_REC
+ fclose(yuv_rec_file);
+#endif
+#ifdef OUTPUT_YUV_SVC_SRC
+ fclose(yuv_svc_src[0]);
+ fclose(yuv_svc_src[1]);
+ fclose(yuv_svc_src[2]);
+#endif
+
+#if 0
+
+ if (keyfile)
+ fclose(keyfile);
+
+ if (framepsnr)
+ fclose(framepsnr);
+
+ if (kf_list)
+ fclose(kf_list);
+
+#endif
+}
+
+int vp9_get_psnr(const VP9_COMP *cpi, PSNR_STATS *psnr) {
+ if (is_psnr_calc_enabled(cpi)) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ vpx_calc_highbd_psnr(cpi->raw_source_frame, cpi->common.frame_to_show, psnr,
+ cpi->td.mb.e_mbd.bd, cpi->oxcf.input_bit_depth);
+#else
+ vpx_calc_psnr(cpi->raw_source_frame, cpi->common.frame_to_show, psnr);
+#endif
+ return 1;
+ } else {
+ vp9_zero(*psnr);
+ return 0;
+ }
+}
+
+int vp9_use_as_reference(VP9_COMP *cpi, int ref_frame_flags) {
+ if (ref_frame_flags > 7) return -1;
+
+ cpi->ref_frame_flags = ref_frame_flags;
+ return 0;
+}
+
+void vp9_update_reference(VP9_COMP *cpi, int ref_frame_flags) {
+ cpi->ext_refresh_golden_frame = (ref_frame_flags & VP9_GOLD_FLAG) != 0;
+ cpi->ext_refresh_alt_ref_frame = (ref_frame_flags & VP9_ALT_FLAG) != 0;
+ cpi->ext_refresh_last_frame = (ref_frame_flags & VP9_LAST_FLAG) != 0;
+ cpi->ext_refresh_frame_flags_pending = 1;
+}
+
+static YV12_BUFFER_CONFIG *get_vp9_ref_frame_buffer(
+ VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag) {
+ MV_REFERENCE_FRAME ref_frame = NONE;
+ if (ref_frame_flag == VP9_LAST_FLAG)
+ ref_frame = LAST_FRAME;
+ else if (ref_frame_flag == VP9_GOLD_FLAG)
+ ref_frame = GOLDEN_FRAME;
+ else if (ref_frame_flag == VP9_ALT_FLAG)
+ ref_frame = ALTREF_FRAME;
+
+ return ref_frame == NONE ? NULL : get_ref_frame_buffer(cpi, ref_frame);
+}
+
+int vp9_copy_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag,
+ YV12_BUFFER_CONFIG *sd) {
+ YV12_BUFFER_CONFIG *cfg = get_vp9_ref_frame_buffer(cpi, ref_frame_flag);
+ if (cfg) {
+ vpx_yv12_copy_frame(cfg, sd);
+ return 0;
+ } else {
+ return -1;
+ }
+}
+
+int vp9_set_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag,
+ YV12_BUFFER_CONFIG *sd) {
+ YV12_BUFFER_CONFIG *cfg = get_vp9_ref_frame_buffer(cpi, ref_frame_flag);
+ if (cfg) {
+ vpx_yv12_copy_frame(sd, cfg);
+ return 0;
+ } else {
+ return -1;
+ }
+}
+
+int vp9_update_entropy(VP9_COMP *cpi, int update) {
+ cpi->ext_refresh_frame_context = update;
+ cpi->ext_refresh_frame_context_pending = 1;
+ return 0;
+}
+
+#ifdef OUTPUT_YUV_REC
+void vp9_write_yuv_rec_frame(VP9_COMMON *cm) {
+ YV12_BUFFER_CONFIG *s = cm->frame_to_show;
+ uint8_t *src = s->y_buffer;
+ int h = cm->height;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (s->flags & YV12_FLAG_HIGHBITDEPTH) {
+ uint16_t *src16 = CONVERT_TO_SHORTPTR(s->y_buffer);
+
+ do {
+ fwrite(src16, s->y_width, 2, yuv_rec_file);
+ src16 += s->y_stride;
+ } while (--h);
+
+ src16 = CONVERT_TO_SHORTPTR(s->u_buffer);
+ h = s->uv_height;
+
+ do {
+ fwrite(src16, s->uv_width, 2, yuv_rec_file);
+ src16 += s->uv_stride;
+ } while (--h);
+
+ src16 = CONVERT_TO_SHORTPTR(s->v_buffer);
+ h = s->uv_height;
+
+ do {
+ fwrite(src16, s->uv_width, 2, yuv_rec_file);
+ src16 += s->uv_stride;
+ } while (--h);
+
+ fflush(yuv_rec_file);
+ return;
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ do {
+ fwrite(src, s->y_width, 1, yuv_rec_file);
+ src += s->y_stride;
+ } while (--h);
+
+ src = s->u_buffer;
+ h = s->uv_height;
+
+ do {
+ fwrite(src, s->uv_width, 1, yuv_rec_file);
+ src += s->uv_stride;
+ } while (--h);
+
+ src = s->v_buffer;
+ h = s->uv_height;
+
+ do {
+ fwrite(src, s->uv_width, 1, yuv_rec_file);
+ src += s->uv_stride;
+ } while (--h);
+
+ fflush(yuv_rec_file);
+}
+#endif
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static void scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst,
+ int bd) {
+#else
+static void scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst) {
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ // TODO(dkovalev): replace YV12_BUFFER_CONFIG with vpx_image_t
+ int i;
+ const uint8_t *const srcs[3] = { src->y_buffer, src->u_buffer,
+ src->v_buffer };
+ const int src_strides[3] = { src->y_stride, src->uv_stride, src->uv_stride };
+ const int src_widths[3] = { src->y_crop_width, src->uv_crop_width,
+ src->uv_crop_width };
+ const int src_heights[3] = { src->y_crop_height, src->uv_crop_height,
+ src->uv_crop_height };
+ uint8_t *const dsts[3] = { dst->y_buffer, dst->u_buffer, dst->v_buffer };
+ const int dst_strides[3] = { dst->y_stride, dst->uv_stride, dst->uv_stride };
+ const int dst_widths[3] = { dst->y_crop_width, dst->uv_crop_width,
+ dst->uv_crop_width };
+ const int dst_heights[3] = { dst->y_crop_height, dst->uv_crop_height,
+ dst->uv_crop_height };
+
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
+ vp9_highbd_resize_plane(srcs[i], src_heights[i], src_widths[i],
+ src_strides[i], dsts[i], dst_heights[i],
+ dst_widths[i], dst_strides[i], bd);
+ } else {
+ vp9_resize_plane(srcs[i], src_heights[i], src_widths[i], src_strides[i],
+ dsts[i], dst_heights[i], dst_widths[i], dst_strides[i]);
+ }
+#else
+ vp9_resize_plane(srcs[i], src_heights[i], src_widths[i], src_strides[i],
+ dsts[i], dst_heights[i], dst_widths[i], dst_strides[i]);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ }
+ vpx_extend_frame_borders(dst);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst, int bd,
+ INTERP_FILTER filter_type,
+ int phase_scaler) {
+ const int src_w = src->y_crop_width;
+ const int src_h = src->y_crop_height;
+ const int dst_w = dst->y_crop_width;
+ const int dst_h = dst->y_crop_height;
+ const uint8_t *const srcs[3] = { src->y_buffer, src->u_buffer,
+ src->v_buffer };
+ const int src_strides[3] = { src->y_stride, src->uv_stride, src->uv_stride };
+ uint8_t *const dsts[3] = { dst->y_buffer, dst->u_buffer, dst->v_buffer };
+ const int dst_strides[3] = { dst->y_stride, dst->uv_stride, dst->uv_stride };
+ const InterpKernel *const kernel = vp9_filter_kernels[filter_type];
+ int x, y, i;
+
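+  // The frame is scaled in 16x16 output tiles; x_q4/y_q4 are the matching
+  // source positions in 1/16-pel units (phase_scaler adds a sub-pel offset),
+  // and the chroma planes use factor == 2 to account for subsampling.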
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ const int factor = (i == 0 || i == 3 ? 1 : 2);
+ const int src_stride = src_strides[i];
+ const int dst_stride = dst_strides[i];
+ for (y = 0; y < dst_h; y += 16) {
+ const int y_q4 = y * (16 / factor) * src_h / dst_h + phase_scaler;
+ for (x = 0; x < dst_w; x += 16) {
+ const int x_q4 = x * (16 / factor) * src_w / dst_w + phase_scaler;
+ const uint8_t *src_ptr = srcs[i] +
+ (y / factor) * src_h / dst_h * src_stride +
+ (x / factor) * src_w / dst_w;
+ uint8_t *dst_ptr = dsts[i] + (y / factor) * dst_stride + (x / factor);
+
+ if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
+ vpx_highbd_convolve8(CONVERT_TO_SHORTPTR(src_ptr), src_stride,
+ CONVERT_TO_SHORTPTR(dst_ptr), dst_stride, kernel,
+ x_q4 & 0xf, 16 * src_w / dst_w, y_q4 & 0xf,
+ 16 * src_h / dst_h, 16 / factor, 16 / factor,
+ bd);
+ } else {
+ vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride, kernel,
+ x_q4 & 0xf, 16 * src_w / dst_w, y_q4 & 0xf,
+ 16 * src_h / dst_h, 16 / factor, 16 / factor);
+ }
+ }
+ }
+ }
+
+ vpx_extend_frame_borders(dst);
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+#if !CONFIG_REALTIME_ONLY
+static int scale_down(VP9_COMP *cpi, int q) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+ int scale = 0;
+ assert(frame_is_kf_gf_arf(cpi));
+
+ if (rc->frame_size_selector == UNSCALED &&
+ q >= rc->rf_level_maxq[gf_group->rf_level[gf_group->index]]) {
+ const int max_size_thresh =
+ (int)(rate_thresh_mult[SCALE_STEP1] *
+ VPXMAX(rc->this_frame_target, rc->avg_frame_bandwidth));
+ scale = rc->projected_frame_size > max_size_thresh ? 1 : 0;
+ }
+ return scale;
+}
+
+static int big_rate_miss_high_threshold(VP9_COMP *cpi) {
+ const RATE_CONTROL *const rc = &cpi->rc;
+ int big_miss_high;
+
+ if (frame_is_kf_gf_arf(cpi))
+ big_miss_high = rc->this_frame_target * 3 / 2;
+ else
+ big_miss_high = rc->this_frame_target * 2;
+
+ return big_miss_high;
+}
+
+static int big_rate_miss(VP9_COMP *cpi) {
+ const RATE_CONTROL *const rc = &cpi->rc;
+ int big_miss_high;
+ int big_miss_low;
+
+ // Ignore for overlay frames
+ if (rc->is_src_frame_alt_ref) {
+ return 0;
+ } else {
+ big_miss_low = (rc->this_frame_target / 2);
+ big_miss_high = big_rate_miss_high_threshold(cpi);
+
+ return (rc->projected_frame_size > big_miss_high) ||
+ (rc->projected_frame_size < big_miss_low);
+ }
+}
+
+// In two pass encoding, test whether this is the first normal inter frame
+// (LF_UPDATE) of a GF group, i.e. one not preceded by another LF_UPDATE.
+static int two_pass_first_group_inter(VP9_COMP *cpi) {
+ if (cpi->oxcf.pass == 2) {
+ TWO_PASS *const twopass = &cpi->twopass;
+ GF_GROUP *const gf_group = &twopass->gf_group;
+ const int gfg_index = gf_group->index;
+
+ if (gfg_index == 0) return gf_group->update_type[gfg_index] == LF_UPDATE;
+ return gf_group->update_type[gfg_index - 1] != LF_UPDATE &&
+ gf_group->update_type[gfg_index] == LF_UPDATE;
+ } else {
+ return 0;
+ }
+}
+
+// Function to test for conditions that indicate we should loop
+// back and recode a frame.
+static int recode_loop_test(VP9_COMP *cpi, int high_limit, int low_limit, int q,
+ int maxq, int minq) {
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+ const int frame_is_kfgfarf = frame_is_kf_gf_arf(cpi);
+ int force_recode = 0;
+
+ if ((rc->projected_frame_size >= rc->max_frame_bandwidth) ||
+ big_rate_miss(cpi) || (cpi->sf.recode_loop == ALLOW_RECODE) ||
+ (two_pass_first_group_inter(cpi) &&
+ (cpi->sf.recode_loop == ALLOW_RECODE_FIRST)) ||
+ (frame_is_kfgfarf && (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF))) {
+ if (frame_is_kfgfarf && (oxcf->resize_mode == RESIZE_DYNAMIC) &&
+ scale_down(cpi, q)) {
+ // Code this group at a lower resolution.
+ cpi->resize_pending = 1;
+ return 1;
+ }
+
+ // Force recode for extreme overshoot.
+ if ((rc->projected_frame_size >= rc->max_frame_bandwidth) ||
+ (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF &&
+ rc->projected_frame_size >= big_rate_miss_high_threshold(cpi))) {
+ return 1;
+ }
+
+ // TODO(agrange) high_limit could be greater than the scale-down threshold.
+ if ((rc->projected_frame_size > high_limit && q < maxq) ||
+ (rc->projected_frame_size < low_limit && q > minq)) {
+ force_recode = 1;
+ } else if (cpi->oxcf.rc_mode == VPX_CQ) {
+ // Deal with frame undershoot and whether or not we are
+ // below the automatically set cq level.
+ if (q > oxcf->cq_level &&
+ rc->projected_frame_size < ((rc->this_frame_target * 7) >> 3)) {
+ force_recode = 1;
+ }
+ }
+ }
+ return force_recode;
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+static void update_ref_frames(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ BufferPool *const pool = cm->buffer_pool;
+ GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+
+ if (cpi->rc.show_arf_as_gld) {
+ int tmp = cpi->alt_fb_idx;
+ cpi->alt_fb_idx = cpi->gld_fb_idx;
+ cpi->gld_fb_idx = tmp;
+ } else if (cm->show_existing_frame) {
+ // Pop ARF.
+ cpi->lst_fb_idx = cpi->alt_fb_idx;
+ cpi->alt_fb_idx =
+ stack_pop(gf_group->arf_index_stack, gf_group->stack_size);
+ --gf_group->stack_size;
+ }
+
+ // At this point the new frame has been encoded.
+ // If any buffer copy / swapping is signaled it should be done here.
+ if (cm->frame_type == KEY_FRAME) {
+ ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx],
+ cm->new_fb_idx);
+ ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->alt_fb_idx],
+ cm->new_fb_idx);
+ } else if (vp9_preserve_existing_gf(cpi)) {
+    // We have decided to preserve the previously existing golden frame as our
+    // new ARF frame. However, in the short term, vp9_get_refresh_mask() left
+    // it in the GF slot and, if we're updating the GF with the current decoded
+    // frame, saved it to the ARF slot instead.
+    // We now have to update the ARF with the current frame and swap gld_fb_idx
+    // and alt_fb_idx so that, overall, we've stored the old GF in the new ARF
+    // slot and, if we're updating the GF, the current frame becomes the new
+    // GF.
+ int tmp;
+
+ ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->alt_fb_idx],
+ cm->new_fb_idx);
+
+ tmp = cpi->alt_fb_idx;
+ cpi->alt_fb_idx = cpi->gld_fb_idx;
+ cpi->gld_fb_idx = tmp;
+ } else { /* For non key/golden frames */
+ if (cpi->refresh_alt_ref_frame) {
+ int arf_idx = gf_group->top_arf_idx;
+
+ // Push new ARF into stack.
+ stack_push(gf_group->arf_index_stack, cpi->alt_fb_idx,
+ gf_group->stack_size);
+ ++gf_group->stack_size;
+
+ assert(arf_idx < REF_FRAMES);
+
+ ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[arf_idx], cm->new_fb_idx);
+ memcpy(cpi->interp_filter_selected[ALTREF_FRAME],
+ cpi->interp_filter_selected[0],
+ sizeof(cpi->interp_filter_selected[0]));
+
+ cpi->alt_fb_idx = arf_idx;
+ }
+
+ if (cpi->refresh_golden_frame) {
+ ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx],
+ cm->new_fb_idx);
+ if (!cpi->rc.is_src_frame_alt_ref)
+ memcpy(cpi->interp_filter_selected[GOLDEN_FRAME],
+ cpi->interp_filter_selected[0],
+ sizeof(cpi->interp_filter_selected[0]));
+ else
+ memcpy(cpi->interp_filter_selected[GOLDEN_FRAME],
+ cpi->interp_filter_selected[ALTREF_FRAME],
+ sizeof(cpi->interp_filter_selected[ALTREF_FRAME]));
+ }
+ }
+
+ if (cpi->refresh_last_frame) {
+ ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->lst_fb_idx],
+ cm->new_fb_idx);
+ if (!cpi->rc.is_src_frame_alt_ref)
+ memcpy(cpi->interp_filter_selected[LAST_FRAME],
+ cpi->interp_filter_selected[0],
+ sizeof(cpi->interp_filter_selected[0]));
+ }
+
+ if (gf_group->update_type[gf_group->index] == MID_OVERLAY_UPDATE) {
+ cpi->alt_fb_idx =
+ stack_pop(gf_group->arf_index_stack, gf_group->stack_size);
+ --gf_group->stack_size;
+ }
+}
+
+void vp9_update_reference_frames(VP9_COMP *cpi) {
+ update_ref_frames(cpi);
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+ vp9_denoiser_update_ref_frame(cpi);
+#endif
+
+ if (is_one_pass_svc(cpi)) vp9_svc_update_ref_frame(cpi);
+}
+
+static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) {
+ MACROBLOCKD *xd = &cpi->td.mb.e_mbd;
+ struct loopfilter *lf = &cm->lf;
+ int is_reference_frame =
+ (cm->frame_type == KEY_FRAME || cpi->refresh_last_frame ||
+ cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame);
+ if (cpi->use_svc &&
+ cpi->svc.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS)
+ is_reference_frame = !cpi->svc.non_reference_frame;
+
+ // Skip loop filter in show_existing_frame mode.
+ if (cm->show_existing_frame) {
+ lf->filter_level = 0;
+ return;
+ }
+
+ if (cpi->loopfilter_ctrl == NO_LOOPFILTER ||
+ (!is_reference_frame && cpi->loopfilter_ctrl == LOOPFILTER_REFERENCE)) {
+ lf->filter_level = 0;
+ vpx_extend_frame_inner_borders(cm->frame_to_show);
+ return;
+ }
+
+ if (xd->lossless) {
+ lf->filter_level = 0;
+ lf->last_filt_level = 0;
+ } else {
+ struct vpx_usec_timer timer;
+
+ vpx_clear_system_state();
+
+ vpx_usec_timer_start(&timer);
+
+ if (!cpi->rc.is_src_frame_alt_ref) {
+ if ((cpi->common.frame_type == KEY_FRAME) &&
+ (!cpi->rc.this_key_frame_forced)) {
+ lf->last_filt_level = 0;
+ }
+ vp9_pick_filter_level(cpi->Source, cpi, cpi->sf.lpf_pick);
+ lf->last_filt_level = lf->filter_level;
+ } else {
+ lf->filter_level = 0;
+ }
+
+ vpx_usec_timer_mark(&timer);
+ cpi->time_pick_lpf += vpx_usec_timer_elapsed(&timer);
+ }
+
+ if (lf->filter_level > 0 && is_reference_frame) {
+ vp9_build_mask_frame(cm, lf->filter_level, 0);
+
+ if (cpi->num_workers > 1)
+ vp9_loop_filter_frame_mt(cm->frame_to_show, cm, xd->plane,
+ lf->filter_level, 0, 0, cpi->workers,
+ cpi->num_workers, &cpi->lf_row_sync);
+ else
+ vp9_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0);
+ }
+
+ vpx_extend_frame_inner_borders(cm->frame_to_show);
+}
+
+void vp9_scale_references(VP9_COMP *cpi) {
+ VP9_COMMON *cm = &cpi->common;
+ MV_REFERENCE_FRAME ref_frame;
+ const VP9_REFFRAME ref_mask[3] = { VP9_LAST_FLAG, VP9_GOLD_FLAG,
+ VP9_ALT_FLAG };
+
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ // Need to convert from VP9_REFFRAME to index into ref_mask (subtract 1).
+ if (cpi->ref_frame_flags & ref_mask[ref_frame - 1]) {
+ BufferPool *const pool = cm->buffer_pool;
+ const YV12_BUFFER_CONFIG *const ref =
+ get_ref_frame_buffer(cpi, ref_frame);
+
+ if (ref == NULL) {
+ cpi->scaled_ref_idx[ref_frame - 1] = INVALID_IDX;
+ continue;
+ }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) {
+ RefCntBuffer *new_fb_ptr = NULL;
+ int force_scaling = 0;
+ int new_fb = cpi->scaled_ref_idx[ref_frame - 1];
+ if (new_fb == INVALID_IDX) {
+ new_fb = get_free_fb(cm);
+ force_scaling = 1;
+ }
+ if (new_fb == INVALID_IDX) return;
+ new_fb_ptr = &pool->frame_bufs[new_fb];
+ if (force_scaling || new_fb_ptr->buf.y_crop_width != cm->width ||
+ new_fb_ptr->buf.y_crop_height != cm->height) {
+ if (vpx_realloc_frame_buffer(&new_fb_ptr->buf, cm->width, cm->height,
+ cm->subsampling_x, cm->subsampling_y,
+ cm->use_highbitdepth,
+ VP9_ENC_BORDER_IN_PIXELS,
+ cm->byte_alignment, NULL, NULL, NULL))
+ vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate frame buffer");
+ scale_and_extend_frame(ref, &new_fb_ptr->buf, (int)cm->bit_depth,
+ EIGHTTAP, 0);
+ cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
+ alloc_frame_mvs(cm, new_fb);
+ }
+#else
+ if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) {
+ RefCntBuffer *new_fb_ptr = NULL;
+ int force_scaling = 0;
+ int new_fb = cpi->scaled_ref_idx[ref_frame - 1];
+ if (new_fb == INVALID_IDX) {
+ new_fb = get_free_fb(cm);
+ force_scaling = 1;
+ }
+ if (new_fb == INVALID_IDX) return;
+ new_fb_ptr = &pool->frame_bufs[new_fb];
+ if (force_scaling || new_fb_ptr->buf.y_crop_width != cm->width ||
+ new_fb_ptr->buf.y_crop_height != cm->height) {
+ if (vpx_realloc_frame_buffer(&new_fb_ptr->buf, cm->width, cm->height,
+ cm->subsampling_x, cm->subsampling_y,
+ VP9_ENC_BORDER_IN_PIXELS,
+ cm->byte_alignment, NULL, NULL, NULL))
+ vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate frame buffer");
+ vp9_scale_and_extend_frame(ref, &new_fb_ptr->buf, EIGHTTAP, 0);
+ cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
+ alloc_frame_mvs(cm, new_fb);
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ } else {
+ int buf_idx;
+ RefCntBuffer *buf = NULL;
+ if (cpi->oxcf.pass == 0 && !cpi->use_svc) {
+ // Check for release of scaled reference.
+ buf_idx = cpi->scaled_ref_idx[ref_frame - 1];
+ if (buf_idx != INVALID_IDX) {
+ buf = &pool->frame_bufs[buf_idx];
+ --buf->ref_count;
+ cpi->scaled_ref_idx[ref_frame - 1] = INVALID_IDX;
+ }
+ }
+ buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
+ buf = &pool->frame_bufs[buf_idx];
+ buf->buf.y_crop_width = ref->y_crop_width;
+ buf->buf.y_crop_height = ref->y_crop_height;
+ cpi->scaled_ref_idx[ref_frame - 1] = buf_idx;
+ ++buf->ref_count;
+ }
+ } else {
+ if (cpi->oxcf.pass != 0 || cpi->use_svc)
+ cpi->scaled_ref_idx[ref_frame - 1] = INVALID_IDX;
+ }
+ }
+}
+
+static void release_scaled_references(VP9_COMP *cpi) {
+ VP9_COMMON *cm = &cpi->common;
+ int i;
+ if (cpi->oxcf.pass == 0 && !cpi->use_svc) {
+    // Only release scaled references under certain conditions: if the
+    // reference will be updated, or if the scaled reference has the same
+    // resolution as the current frame.
+ int refresh[3];
+ refresh[0] = (cpi->refresh_last_frame) ? 1 : 0;
+ refresh[1] = (cpi->refresh_golden_frame) ? 1 : 0;
+ refresh[2] = (cpi->refresh_alt_ref_frame) ? 1 : 0;
+ for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
+ const int idx = cpi->scaled_ref_idx[i - 1];
+ if (idx != INVALID_IDX) {
+ RefCntBuffer *const buf = &cm->buffer_pool->frame_bufs[idx];
+ const YV12_BUFFER_CONFIG *const ref = get_ref_frame_buffer(cpi, i);
+ if (refresh[i - 1] || (buf->buf.y_crop_width == ref->y_crop_width &&
+ buf->buf.y_crop_height == ref->y_crop_height)) {
+ --buf->ref_count;
+ cpi->scaled_ref_idx[i - 1] = INVALID_IDX;
+ }
+ }
+ }
+ } else {
+ for (i = 0; i < REFS_PER_FRAME; ++i) {
+ const int idx = cpi->scaled_ref_idx[i];
+ if (idx != INVALID_IDX) {
+ RefCntBuffer *const buf = &cm->buffer_pool->frame_bufs[idx];
+ --buf->ref_count;
+ cpi->scaled_ref_idx[i] = INVALID_IDX;
+ }
+ }
+ }
+}
+
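+// Fold the full per-token counts into the model-token bins: counts for
+// THREE_TOKEN and above (up to, but not including, EOB_TOKEN) are accumulated
+// into the TWO_TOKEN bin, since the coefficient probability model treats them
+// as a single "two or more" category, while EOB_TOKEN maps to its own
+// EOB_MODEL_TOKEN bin.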
+static void full_to_model_count(unsigned int *model_count,
+ unsigned int *full_count) {
+ int n;
+ model_count[ZERO_TOKEN] = full_count[ZERO_TOKEN];
+ model_count[ONE_TOKEN] = full_count[ONE_TOKEN];
+ model_count[TWO_TOKEN] = full_count[TWO_TOKEN];
+ for (n = THREE_TOKEN; n < EOB_TOKEN; ++n)
+ model_count[TWO_TOKEN] += full_count[n];
+ model_count[EOB_MODEL_TOKEN] = full_count[EOB_TOKEN];
+}
+
+static void full_to_model_counts(vp9_coeff_count_model *model_count,
+ vp9_coeff_count *full_count) {
+ int i, j, k, l;
+
+ for (i = 0; i < PLANE_TYPES; ++i)
+ for (j = 0; j < REF_TYPES; ++j)
+ for (k = 0; k < COEF_BANDS; ++k)
+ for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l)
+ full_to_model_count(model_count[i][j][k][l], full_count[i][j][k][l]);
+}
+
+#if 0 && CONFIG_INTERNAL_STATS
+static void output_frame_level_debug_stats(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ FILE *const f = fopen("tmp.stt", cm->current_video_frame ? "a" : "w");
+ int64_t recon_err;
+
+ vpx_clear_system_state();
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cm->use_highbitdepth) {
+ recon_err = vpx_highbd_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+ } else {
+ recon_err = vpx_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+ }
+#else
+ recon_err = vpx_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+
+ if (cpi->twopass.total_left_stats.coded_error != 0.0) {
+ double dc_quant_devisor;
+#if CONFIG_VP9_HIGHBITDEPTH
+ switch (cm->bit_depth) {
+ case VPX_BITS_8:
+ dc_quant_devisor = 4.0;
+ break;
+ case VPX_BITS_10:
+ dc_quant_devisor = 16.0;
+ break;
+ default:
+ assert(cm->bit_depth == VPX_BITS_12);
+ dc_quant_devisor = 64.0;
+ break;
+ }
+#else
+ dc_quant_devisor = 4.0;
+#endif
+
+ if (!cm->current_video_frame) {
+ fprintf(f, "frame, width, height, last ts, last end ts, "
+ "source_alt_ref_pending, source_alt_ref_active, "
+ "this_frame_target, projected_frame_size, "
+ "projected_frame_size / MBs, "
+ "projected_frame_size - this_frame_target, "
+ "vbr_bits_off_target, vbr_bits_off_target_fast, "
+ "twopass.extend_minq, twopass.extend_minq_fast, "
+ "total_target_vs_actual, "
+ "starting_buffer_level - bits_off_target, "
+ "total_actual_bits, base_qindex, q for base_qindex, "
+ "dc quant, q for active_worst_quality, avg_q, q for oxcf.cq_level, "
+ "refresh_last_frame, refresh_golden_frame, refresh_alt_ref_frame, "
+ "frame_type, gfu_boost, "
+ "twopass.bits_left, "
+ "twopass.total_left_stats.coded_error, "
+ "twopass.bits_left / (1 + twopass.total_left_stats.coded_error), "
+ "tot_recode_hits, recon_err, kf_boost, "
+ "twopass.kf_zeromotion_pct, twopass.fr_content_type, "
+ "filter_level, seg.aq_av_offset\n");
+ }
+
+ fprintf(f, "%10u, %d, %d, %10"PRId64", %10"PRId64", %d, %d, %10d, %10d, "
+ "%10d, %10d, %10"PRId64", %10"PRId64", %5d, %5d, %10"PRId64", "
+ "%10"PRId64", %10"PRId64", %10d, %7.2lf, %7.2lf, %7.2lf, %7.2lf, "
+ "%7.2lf, %6d, %6d, %5d, %5d, %5d, %10"PRId64", %10.3lf, %10lf, %8u, "
+ "%10"PRId64", %10d, %10d, %10d, %10d, %10d\n",
+ cpi->common.current_video_frame,
+ cm->width, cm->height,
+ cpi->last_time_stamp_seen,
+ cpi->last_end_time_stamp_seen,
+ cpi->rc.source_alt_ref_pending,
+ cpi->rc.source_alt_ref_active,
+ cpi->rc.this_frame_target,
+ cpi->rc.projected_frame_size,
+ cpi->rc.projected_frame_size / cpi->common.MBs,
+ (cpi->rc.projected_frame_size - cpi->rc.this_frame_target),
+ cpi->rc.vbr_bits_off_target,
+ cpi->rc.vbr_bits_off_target_fast,
+ cpi->twopass.extend_minq,
+ cpi->twopass.extend_minq_fast,
+ cpi->rc.total_target_vs_actual,
+ (cpi->rc.starting_buffer_level - cpi->rc.bits_off_target),
+ cpi->rc.total_actual_bits, cm->base_qindex,
+ vp9_convert_qindex_to_q(cm->base_qindex, cm->bit_depth),
+ (double)vp9_dc_quant(cm->base_qindex, 0, cm->bit_depth) /
+ dc_quant_devisor,
+ vp9_convert_qindex_to_q(cpi->twopass.active_worst_quality,
+ cm->bit_depth),
+ cpi->rc.avg_q,
+ vp9_convert_qindex_to_q(cpi->oxcf.cq_level, cm->bit_depth),
+ cpi->refresh_last_frame, cpi->refresh_golden_frame,
+ cpi->refresh_alt_ref_frame, cm->frame_type, cpi->rc.gfu_boost,
+ cpi->twopass.bits_left,
+ cpi->twopass.total_left_stats.coded_error,
+ cpi->twopass.bits_left /
+ (1 + cpi->twopass.total_left_stats.coded_error),
+ cpi->tot_recode_hits, recon_err, cpi->rc.kf_boost,
+ cpi->twopass.kf_zeromotion_pct,
+ cpi->twopass.fr_content_type,
+ cm->lf.filter_level,
+ cm->seg.aq_av_offset);
+ }
+ fclose(f);
+
+ if (0) {
+ FILE *const fmodes = fopen("Modes.stt", "a");
+ int i;
+
+ fprintf(fmodes, "%6d:%1d:%1d:%1d ", cpi->common.current_video_frame,
+ cm->frame_type, cpi->refresh_golden_frame,
+ cpi->refresh_alt_ref_frame);
+
+ for (i = 0; i < MAX_MODES; ++i)
+ fprintf(fmodes, "%5d ", cpi->mode_chosen_counts[i]);
+
+ fprintf(fmodes, "\n");
+
+ fclose(fmodes);
+ }
+}
+#endif
+
+static void set_mv_search_params(VP9_COMP *cpi) {
+ const VP9_COMMON *const cm = &cpi->common;
+ const unsigned int max_mv_def = VPXMIN(cm->width, cm->height);
+
+ // Default based on max resolution.
+ cpi->mv_step_param = vp9_init_search_range(max_mv_def);
+
+ if (cpi->sf.mv.auto_mv_step_size) {
+ if (frame_is_intra_only(cm)) {
+ // Initialize max_mv_magnitude for use in the first INTER frame
+ // after a key/intra-only frame.
+ cpi->max_mv_magnitude = max_mv_def;
+ } else {
+ if (cm->show_frame) {
+ // Allow mv_steps to correspond to twice the max mv magnitude found
+ // in the previous frame, capped by the default max_mv_magnitude based
+ // on resolution.
+ cpi->mv_step_param = vp9_init_search_range(
+ VPXMIN(max_mv_def, 2 * cpi->max_mv_magnitude));
+ }
+ cpi->max_mv_magnitude = 0;
+ }
+ }
+}
+
+static void set_size_independent_vars(VP9_COMP *cpi) {
+ vp9_set_speed_features_framesize_independent(cpi, cpi->oxcf.speed);
+ vp9_set_rd_speed_thresholds(cpi);
+ vp9_set_rd_speed_thresholds_sub8x8(cpi);
+ cpi->common.interp_filter = cpi->sf.default_interp_filter;
+}
+
+static void set_size_dependent_vars(VP9_COMP *cpi, int *q, int *bottom_index,
+ int *top_index) {
+ VP9_COMMON *const cm = &cpi->common;
+
+  // Set up variables that depend on the dimensions of the frame.
+ vp9_set_speed_features_framesize_dependent(cpi, cpi->oxcf.speed);
+
+ // Decide q and q bounds.
+ *q = vp9_rc_pick_q_and_bounds(cpi, bottom_index, top_index);
+
+ if (cpi->oxcf.rc_mode == VPX_CBR && cpi->rc.force_max_q) {
+ *q = cpi->rc.worst_quality;
+ cpi->rc.force_max_q = 0;
+ }
+
+ if (cpi->use_svc) {
+ cpi->svc.base_qindex[cpi->svc.spatial_layer_id] = *q;
+ }
+
+ if (!frame_is_intra_only(cm)) {
+ vp9_set_high_precision_mv(cpi, (*q) < HIGH_PRECISION_MV_QTHRESH);
+ }
+
+#if !CONFIG_REALTIME_ONLY
+ // Configure experimental use of segmentation for enhanced coding of
+ // static regions if indicated.
+ // Only allowed in the second pass of a two pass encode, as it requires
+ // lagged coding, and if the relevant speed feature flag is set.
+ if (cpi->oxcf.pass == 2 && cpi->sf.static_segmentation)
+ configure_static_seg_features(cpi);
+#endif // !CONFIG_REALTIME_ONLY
+
+#if CONFIG_VP9_POSTPROC && !(CONFIG_VP9_TEMPORAL_DENOISING)
+ if (cpi->oxcf.noise_sensitivity > 0) {
+ int l = 0;
+ switch (cpi->oxcf.noise_sensitivity) {
+ case 1: l = 20; break;
+ case 2: l = 40; break;
+ case 3: l = 60; break;
+ case 4:
+ case 5: l = 100; break;
+ case 6: l = 150; break;
+ }
+ if (!cpi->common.postproc_state.limits) {
+ CHECK_MEM_ERROR(&cm->error, cpi->common.postproc_state.limits,
+ vpx_calloc(cpi->un_scaled_source->y_width,
+ sizeof(*cpi->common.postproc_state.limits)));
+ }
+ vp9_denoise(&cpi->common, cpi->Source, cpi->Source, l,
+ cpi->common.postproc_state.limits);
+ }
+#endif // CONFIG_VP9_POSTPROC
+}
+
+static void init_motion_estimation(VP9_COMP *cpi) {
+ int y_stride = cpi->scaled_source.y_stride;
+
+ if (cpi->sf.mv.search_method == NSTEP) {
+ vp9_init3smotion_compensation(&cpi->ss_cfg, y_stride);
+ } else if (cpi->sf.mv.search_method == DIAMOND) {
+ vp9_init_dsmotion_compensation(&cpi->ss_cfg, y_stride);
+ }
+}
+
+static void set_frame_size(VP9_COMP *cpi) {
+ int ref_frame;
+ VP9_COMMON *const cm = &cpi->common;
+ VP9EncoderConfig *const oxcf = &cpi->oxcf;
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+
+#if !CONFIG_REALTIME_ONLY
+ if (oxcf->pass == 2 && oxcf->rc_mode == VPX_VBR &&
+ ((oxcf->resize_mode == RESIZE_FIXED && cm->current_video_frame == 0) ||
+ (oxcf->resize_mode == RESIZE_DYNAMIC && cpi->resize_pending))) {
+ calculate_coded_size(cpi, &oxcf->scaled_frame_width,
+ &oxcf->scaled_frame_height);
+
+ // There has been a change in frame size.
+ vp9_set_size_literal(cpi, oxcf->scaled_frame_width,
+ oxcf->scaled_frame_height);
+ }
+#endif // !CONFIG_REALTIME_ONLY
+
+ if (oxcf->pass == 0 && oxcf->rc_mode == VPX_CBR &&
+ oxcf->resize_mode == RESIZE_DYNAMIC && cpi->resize_pending != 0) {
+    // For SVC, scaled width/height will have been set (svc->resize_set = 1)
+    // in get_svc_params based on the layer width/height.
+ if (!cpi->use_svc || !cpi->svc.resize_set) {
+ oxcf->scaled_frame_width =
+ (oxcf->width * cpi->resize_scale_num) / cpi->resize_scale_den;
+ oxcf->scaled_frame_height =
+ (oxcf->height * cpi->resize_scale_num) / cpi->resize_scale_den;
+ // There has been a change in frame size.
+ vp9_set_size_literal(cpi, oxcf->scaled_frame_width,
+ oxcf->scaled_frame_height);
+ }
+
+ // TODO(agrange) Scale cpi->max_mv_magnitude if frame-size has changed.
+ set_mv_search_params(cpi);
+
+ vp9_noise_estimate_init(&cpi->noise_estimate, cm->width, cm->height);
+#if CONFIG_VP9_TEMPORAL_DENOISING
+ // Reset the denoiser on the resized frame.
+ if (cpi->oxcf.noise_sensitivity > 0) {
+ vp9_denoiser_free(&(cpi->denoiser));
+ setup_denoiser_buffer(cpi);
+      // Dynamic resize is only triggered for non-SVC, so we can force a
+      // golden frame update here as a temporary fix for the denoiser.
+ cpi->refresh_golden_frame = 1;
+ }
+#endif
+ }
+
+ if ((oxcf->pass == 2) && !cpi->use_svc) {
+ vp9_set_target_rate(cpi);
+ }
+
+ alloc_frame_mvs(cm, cm->new_fb_idx);
+
+ // Reset the frame pointers to the current frame size.
+ if (vpx_realloc_frame_buffer(get_frame_new_buffer(cm), cm->width, cm->height,
+ cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
+ VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+ NULL, NULL, NULL))
+ vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate frame buffer");
+
+ alloc_util_frame_buffers(cpi);
+ init_motion_estimation(cpi);
+
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ RefBuffer *const ref_buf = &cm->frame_refs[ref_frame - 1];
+ const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
+
+ ref_buf->idx = buf_idx;
+
+ if (buf_idx != INVALID_IDX) {
+ YV12_BUFFER_CONFIG *const buf = &cm->buffer_pool->frame_bufs[buf_idx].buf;
+ ref_buf->buf = buf;
+#if CONFIG_VP9_HIGHBITDEPTH
+ vp9_setup_scale_factors_for_frame(
+ &ref_buf->sf, buf->y_crop_width, buf->y_crop_height, cm->width,
+ cm->height, (buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0);
+#else
+ vp9_setup_scale_factors_for_frame(&ref_buf->sf, buf->y_crop_width,
+ buf->y_crop_height, cm->width,
+ cm->height);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ if (vp9_is_scaled(&ref_buf->sf)) vpx_extend_frame_borders(buf);
+ } else {
+ ref_buf->buf = NULL;
+ }
+ }
+
+ set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME);
+}
+
+static void save_encode_params(VP9_COMP *cpi) {
+ int tile_idx;
+ int i, j;
+ TileDataEnc *tile_data;
+ RD_OPT *rd_opt = &cpi->rd;
+ for (i = 0; i < MAX_REF_FRAMES; i++) {
+ for (j = 0; j < REFERENCE_MODES; j++)
+ rd_opt->prediction_type_threshes_prev[i][j] =
+ rd_opt->prediction_type_threshes[i][j];
+
+ for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; j++)
+ rd_opt->filter_threshes_prev[i][j] = rd_opt->filter_threshes[i][j];
+ }
+
+ for (tile_idx = 0; tile_idx < cpi->allocated_tiles; tile_idx++) {
+ assert(cpi->tile_data);
+ tile_data = &cpi->tile_data[tile_idx];
+ vp9_copy(tile_data->thresh_freq_fact_prev, tile_data->thresh_freq_fact);
+ }
+}
+
+static INLINE void set_raw_source_frame(VP9_COMP *cpi) {
+#ifdef ENABLE_KF_DENOISE
+ if (is_spatial_denoise_enabled(cpi)) {
+    cpi->raw_source_frame = vp9_scale_if_required(
+        &cpi->common, &cpi->raw_unscaled_source, &cpi->raw_scaled_source,
+        (cpi->oxcf.pass == 0), EIGHTTAP, 0);
+ } else {
+ cpi->raw_source_frame = cpi->Source;
+ }
+#else
+ cpi->raw_source_frame = cpi->Source;
+#endif
+}
+
+static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
+ uint8_t *dest) {
+ VP9_COMMON *const cm = &cpi->common;
+ SVC *const svc = &cpi->svc;
+ int q = 0, bottom_index = 0, top_index = 0;
+ int no_drop_scene_change = 0;
+ const INTERP_FILTER filter_scaler =
+ (is_one_pass_svc(cpi))
+ ? svc->downsample_filter_type[svc->spatial_layer_id]
+ : EIGHTTAP;
+ const int phase_scaler =
+ (is_one_pass_svc(cpi))
+ ? svc->downsample_filter_phase[svc->spatial_layer_id]
+ : 0;
+
+ if (cm->show_existing_frame) {
+ cpi->rc.this_frame_target = 0;
+ if (is_psnr_calc_enabled(cpi)) set_raw_source_frame(cpi);
+ return 1;
+ }
+
+ svc->time_stamp_prev[svc->spatial_layer_id] = svc->time_stamp_superframe;
+
+  // Flag to check if it is valid to compute the source sad (used for
+ // scene detection and for superblock content state in CBR mode).
+ // The flag may get reset below based on SVC or resizing state.
+ cpi->compute_source_sad_onepass = cpi->oxcf.mode == REALTIME;
+
+ vpx_clear_system_state();
+
+ set_frame_size(cpi);
+
+ if (is_one_pass_svc(cpi) &&
+ cpi->un_scaled_source->y_width == cm->width << 2 &&
+ cpi->un_scaled_source->y_height == cm->height << 2 &&
+ svc->scaled_temp.y_width == cm->width << 1 &&
+ svc->scaled_temp.y_height == cm->height << 1) {
+ // For svc, if it is a 1/4x1/4 downscaling, do a two-stage scaling to take
+ // advantage of the 1:2 optimized scaler. In the process, the 1/2x1/2
+ // result will be saved in scaled_temp and might be used later.
+ const INTERP_FILTER filter_scaler2 = svc->downsample_filter_type[1];
+ const int phase_scaler2 = svc->downsample_filter_phase[1];
+ cpi->Source = vp9_svc_twostage_scale(
+ cm, cpi->un_scaled_source, &cpi->scaled_source, &svc->scaled_temp,
+ filter_scaler, phase_scaler, filter_scaler2, phase_scaler2);
+ svc->scaled_one_half = 1;
+ } else if (is_one_pass_svc(cpi) &&
+ cpi->un_scaled_source->y_width == cm->width << 1 &&
+ cpi->un_scaled_source->y_height == cm->height << 1 &&
+ svc->scaled_one_half) {
+ // If the spatial layer is 1/2x1/2 and the scaling is already done in the
+ // two-stage scaling, use the result directly.
+ cpi->Source = &svc->scaled_temp;
+ svc->scaled_one_half = 0;
+ } else {
+ cpi->Source = vp9_scale_if_required(
+ cm, cpi->un_scaled_source, &cpi->scaled_source, (cpi->oxcf.pass == 0),
+ filter_scaler, phase_scaler);
+ }
+#ifdef OUTPUT_YUV_SVC_SRC
+ // Write out at most 3 spatial layers.
+ if (is_one_pass_svc(cpi) && svc->spatial_layer_id < 3) {
+ vpx_write_yuv_frame(yuv_svc_src[svc->spatial_layer_id], cpi->Source);
+ }
+#endif
+ // Unfiltered raw source used in metrics calculation if the source
+ // has been filtered.
+ if (is_psnr_calc_enabled(cpi)) {
+#ifdef ENABLE_KF_DENOISE
+ if (is_spatial_denoise_enabled(cpi)) {
+ cpi->raw_source_frame = vp9_scale_if_required(
+ cm, &cpi->raw_unscaled_source, &cpi->raw_scaled_source,
+ (cpi->oxcf.pass == 0), EIGHTTAP, phase_scaler);
+ } else {
+ cpi->raw_source_frame = cpi->Source;
+ }
+#else
+ cpi->raw_source_frame = cpi->Source;
+#endif
+ }
+
+ if ((cpi->use_svc &&
+ (svc->spatial_layer_id < svc->number_spatial_layers - 1 ||
+ svc->temporal_layer_id < svc->number_temporal_layers - 1 ||
+ svc->current_superframe < 1)) ||
+ cpi->resize_pending || cpi->resize_state || cpi->external_resize ||
+ cpi->resize_state != ORIG) {
+ cpi->compute_source_sad_onepass = 0;
+ if (cpi->content_state_sb_fd != NULL)
+ memset(cpi->content_state_sb_fd, 0,
+ (cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1) *
+ sizeof(*cpi->content_state_sb_fd));
+ }
+
+  // Avoid scaling last_source unless it is needed.
+ // Last source is needed if avg_source_sad() is used, or if
+ // partition_search_type == SOURCE_VAR_BASED_PARTITION, or if noise
+ // estimation is enabled.
+ if (cpi->unscaled_last_source != NULL &&
+ (cpi->oxcf.content == VP9E_CONTENT_SCREEN ||
+ (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_VBR &&
+ cpi->oxcf.mode == REALTIME && cpi->oxcf.speed >= 5) ||
+ cpi->sf.partition_search_type == SOURCE_VAR_BASED_PARTITION ||
+ (cpi->noise_estimate.enabled && !cpi->oxcf.noise_sensitivity) ||
+ cpi->compute_source_sad_onepass))
+ cpi->Last_Source = vp9_scale_if_required(
+ cm, cpi->unscaled_last_source, &cpi->scaled_last_source,
+ (cpi->oxcf.pass == 0), EIGHTTAP, 0);
+
+ if (cpi->Last_Source == NULL ||
+ cpi->Last_Source->y_width != cpi->Source->y_width ||
+ cpi->Last_Source->y_height != cpi->Source->y_height)
+ cpi->compute_source_sad_onepass = 0;
+
+ if (frame_is_intra_only(cm) || cpi->resize_pending != 0) {
+ memset(cpi->consec_zero_mv, 0,
+ cm->mi_rows * cm->mi_cols * sizeof(*cpi->consec_zero_mv));
+ }
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0 && cpi->use_svc)
+ vp9_denoiser_reset_on_first_frame(cpi);
+#endif
+
+ // Scene detection is always used for VBR mode or screen-content case.
+ // For other cases (e.g., CBR mode) use it for 5 <= speed < 8 for now
+ // (need to check encoding time cost for doing this for speed 8).
+ cpi->rc.high_source_sad = 0;
+ cpi->rc.hybrid_intra_scene_change = 0;
+ cpi->rc.re_encode_maxq_scene_change = 0;
+ if (cm->show_frame && cpi->oxcf.mode == REALTIME &&
+ (cpi->oxcf.rc_mode == VPX_VBR ||
+ cpi->oxcf.content == VP9E_CONTENT_SCREEN ||
+ (cpi->oxcf.speed >= 5 && cpi->oxcf.speed < 8)))
+ vp9_scene_detection_onepass(cpi);
+
+ if (svc->spatial_layer_id == svc->first_spatial_layer_to_encode) {
+ svc->high_source_sad_superframe = cpi->rc.high_source_sad;
+ svc->high_num_blocks_with_motion = cpi->rc.high_num_blocks_with_motion;
+ // On scene change reset temporal layer pattern to TL0.
+ // Note that if the base/lower spatial layers are skipped: instead of
+ // inserting base layer here, we force max-q for the next superframe
+ // with lower spatial layers: this is done in vp9_encodedframe_overshoot()
+ // when max-q is decided for the current layer.
+ // Only do this reset for bypass/flexible mode.
+ if (svc->high_source_sad_superframe && svc->temporal_layer_id > 0 &&
+ svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) {
+      // rc->high_source_sad will get reset, so copy it so it can be restored.
+ int tmp_high_source_sad = cpi->rc.high_source_sad;
+ vp9_svc_reset_temporal_layers(cpi, cm->frame_type == KEY_FRAME);
+ cpi->rc.high_source_sad = tmp_high_source_sad;
+ }
+ }
+
+ vp9_update_noise_estimate(cpi);
+
+  // For 1 pass CBR, check if we are dropping this frame.
+  // Never drop on a key frame, if the base layer is key for SVC,
+  // on a scene change, or if the superframe has layer sync.
+ if ((cpi->rc.high_source_sad || svc->high_source_sad_superframe) &&
+ !(cpi->rc.use_post_encode_drop && svc->last_layer_dropped[0]))
+ no_drop_scene_change = 1;
+ if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR &&
+ !frame_is_intra_only(cm) && !no_drop_scene_change &&
+ !svc->superframe_has_layer_sync &&
+ (!cpi->use_svc ||
+ !svc->layer_context[svc->temporal_layer_id].is_key_frame)) {
+ if (vp9_rc_drop_frame(cpi)) return 0;
+ }
+
+  // For 1 pass SVC, only ZEROMV is allowed for the spatial reference frame
+  // when svc->force_zero_mode_spatial_ref = 1. Under those conditions we can
+  // avoid this frame-level upsampling (for non-intra-only frames).
+  // For SVC single_layer mode, dynamic resize is allowed and we need to
+  // scale references for this case.
+ if (frame_is_intra_only(cm) == 0 &&
+ ((svc->single_layer_svc && cpi->oxcf.resize_mode == RESIZE_DYNAMIC) ||
+ !(is_one_pass_svc(cpi) && svc->force_zero_mode_spatial_ref))) {
+ vp9_scale_references(cpi);
+ }
+
+ set_size_independent_vars(cpi);
+ set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
+
+ // search method and step parameter might be changed in speed settings.
+ init_motion_estimation(cpi);
+
+ if (cpi->sf.copy_partition_flag) alloc_copy_partition_data(cpi);
+
+ if (cpi->sf.svc_use_lowres_part &&
+ svc->spatial_layer_id == svc->number_spatial_layers - 2) {
+ if (svc->prev_partition_svc == NULL) {
+ CHECK_MEM_ERROR(
+ &cm->error, svc->prev_partition_svc,
+ (BLOCK_SIZE *)vpx_calloc(cm->mi_stride * cm->mi_rows,
+ sizeof(*svc->prev_partition_svc)));
+ }
+ }
+
+ // TODO(jianj): Look into issue of skin detection with high bitdepth.
+ if (cm->bit_depth == 8 && cpi->oxcf.speed >= 5 && cpi->oxcf.pass == 0 &&
+ cpi->oxcf.rc_mode == VPX_CBR &&
+ cpi->oxcf.content != VP9E_CONTENT_SCREEN &&
+ cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
+ cpi->use_skin_detection = 1;
+ }
+
+  // Enable post-encode frame dropping for CBR on non-key frames, when
+  // ext_use_post_encode_drop is specified by the user.
+ cpi->rc.use_post_encode_drop = cpi->rc.ext_use_post_encode_drop &&
+ cpi->oxcf.rc_mode == VPX_CBR &&
+ cm->frame_type != KEY_FRAME;
+
+ vp9_set_quantizer(cpi, q);
+ vp9_set_variance_partition_thresholds(cpi, q, 0);
+
+ setup_frame(cpi);
+
+ suppress_active_map(cpi);
+
+ if (cpi->use_svc) {
+ // On non-zero spatial layer, check for disabling inter-layer
+ // prediction.
+ if (svc->spatial_layer_id > 0) vp9_svc_constrain_inter_layer_pred(cpi);
+ vp9_svc_assert_constraints_pattern(cpi);
+ }
+
+ if (cpi->rc.last_post_encode_dropped_scene_change) {
+ cpi->rc.high_source_sad = 1;
+ svc->high_source_sad_superframe = 1;
+    // For now disable use_source_sad since Last_Source will not be the
+    // previously encoded frame but the dropped one.
+ cpi->sf.use_source_sad = 0;
+ cpi->rc.last_post_encode_dropped_scene_change = 0;
+ }
+ // Check if this high_source_sad (scene/slide change) frame should be
+ // encoded at high/max QP, and if so, set the q and adjust some rate
+ // control parameters.
+ if (cpi->sf.overshoot_detection_cbr_rt == FAST_DETECTION_MAXQ &&
+ (cpi->rc.high_source_sad ||
+ (cpi->use_svc && svc->high_source_sad_superframe))) {
+ if (vp9_encodedframe_overshoot(cpi, -1, &q)) {
+ vp9_set_quantizer(cpi, q);
+ vp9_set_variance_partition_thresholds(cpi, q, 0);
+ }
+ }
+
+#if !CONFIG_REALTIME_ONLY
+ // Variance adaptive and in frame q adjustment experiments are mutually
+ // exclusive.
+ if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
+ vp9_vaq_frame_setup(cpi);
+ } else if (cpi->oxcf.aq_mode == EQUATOR360_AQ) {
+ vp9_360aq_frame_setup(cpi);
+ } else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) {
+ vp9_setup_in_frame_q_adj(cpi);
+ } else if (cpi->oxcf.aq_mode == LOOKAHEAD_AQ) {
+    // Note: this may interact badly with rate control and still needs to be
+    // handled properly.
+ vp9_alt_ref_aq_setup_map(cpi->alt_ref_aq, cpi);
+ } else {
+#endif
+  // If ROI is enabled and the skip feature is used for segmentation, apply
+  // cyclic refresh but do not apply the ROI skip for the first 20 frames
+  // (defined by FRAMES_NO_SKIPPING_AFTER_KEY) after a key frame, to improve
+  // quality.
+ if (cpi->roi.enabled && !frame_is_intra_only(cm)) {
+ if (cpi->roi.skip[BACKGROUND_SEG_SKIP_ID]) {
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+ vp9_cyclic_refresh_setup(cpi);
+ if (cpi->rc.frames_since_key > FRAMES_NO_SKIPPING_AFTER_KEY)
+ apply_roi_map(cpi);
+ } else {
+ apply_roi_map(cpi);
+ }
+ } else if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
+ vp9_cyclic_refresh_setup(cpi);
+ }
+
+#if !CONFIG_REALTIME_ONLY
+ }
+#endif
+
+ apply_active_map(cpi);
+
+ vp9_encode_frame(cpi);
+
+ // Check if we should re-encode this frame at high Q because of high
+ // overshoot based on the encoded frame size. Only for frames where
+ // high temporal-source SAD is detected.
+ // For SVC: all spatial layers are checked for re-encoding.
+ if (cpi->sf.overshoot_detection_cbr_rt == RE_ENCODE_MAXQ &&
+ (cpi->rc.high_source_sad ||
+ (cpi->use_svc && svc->high_source_sad_superframe))) {
+ int frame_size = 0;
+ // Get an estimate of the encoded frame size.
+ save_coding_context(cpi);
+ vp9_pack_bitstream(cpi, dest, size);
+ restore_coding_context(cpi);
+ frame_size = (int)(*size) << 3;
+ // Check if encoded frame will overshoot too much, and if so, set the q and
+ // adjust some rate control parameters, and return to re-encode the frame.
+ if (vp9_encodedframe_overshoot(cpi, frame_size, &q)) {
+ vpx_clear_system_state();
+ vp9_set_quantizer(cpi, q);
+ vp9_set_variance_partition_thresholds(cpi, q, 0);
+ suppress_active_map(cpi);
+ // Turn-off cyclic refresh for re-encoded frame.
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ unsigned char *const seg_map = cpi->segmentation_map;
+ memset(seg_map, 0, cm->mi_rows * cm->mi_cols);
+ memset(cr->last_coded_q_map, MAXQ,
+ cm->mi_rows * cm->mi_cols * sizeof(*cr->last_coded_q_map));
+ cr->sb_index = 0;
+ vp9_disable_segmentation(&cm->seg);
+ }
+ apply_active_map(cpi);
+ vp9_encode_frame(cpi);
+ }
+ }
+
+ // Update some stats from cyclic refresh, and check for golden frame update.
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled &&
+ !frame_is_intra_only(cm) && cpi->cyclic_refresh->content_mode)
+ vp9_cyclic_refresh_postencode(cpi);
+
+ // Update the skip mb flag probabilities based on the distribution
+ // seen in the last encoder iteration.
+ // update_base_skip_probs(cpi);
+ vpx_clear_system_state();
+ return 1;
+}
+
+static int get_ref_frame_flags(const VP9_COMP *cpi) {
+ const int *const map = cpi->common.ref_frame_map;
+ const int gold_is_last = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idx];
+ const int alt_is_last = map[cpi->alt_fb_idx] == map[cpi->lst_fb_idx];
+ const int gold_is_alt = map[cpi->gld_fb_idx] == map[cpi->alt_fb_idx];
+ int flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG;
+
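+  // When two reference slots point at the same buffer, the duplicate flag is
+  // cleared below so the encoder does not search the same reference twice.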
+ if (gold_is_last) flags &= ~VP9_GOLD_FLAG;
+
+ if (cpi->rc.frames_till_gf_update_due == INT_MAX &&
+ (cpi->svc.number_temporal_layers == 1 &&
+ cpi->svc.number_spatial_layers == 1))
+ flags &= ~VP9_GOLD_FLAG;
+
+ if (alt_is_last) flags &= ~VP9_ALT_FLAG;
+
+ if (gold_is_alt) flags &= ~VP9_ALT_FLAG;
+
+ return flags;
+}
+
+#if !CONFIG_REALTIME_ONLY
+#define MAX_QSTEP_ADJ 4
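+// Rounded ratio of rate_excess to rate_limit, capped at MAX_QSTEP_ADJ; e.g.
+// (hypothetical numbers) rate_excess = 3000 and rate_limit = 1000 give
+// (3000 + 500) / 1000 = 3 qindex steps.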
+static int get_qstep_adj(int rate_excess, int rate_limit) {
+ int qstep =
+ rate_limit ? ((rate_excess + rate_limit / 2) / rate_limit) : INT_MAX;
+ return VPXMIN(qstep, MAX_QSTEP_ADJ);
+}
+
+#if CONFIG_RATE_CTRL
+static void init_rq_history(RATE_QINDEX_HISTORY *rq_history) {
+ rq_history->recode_count = 0;
+ rq_history->q_index_high = 255;
+ rq_history->q_index_low = 0;
+}
+
+static void update_rq_history(RATE_QINDEX_HISTORY *rq_history, int target_bits,
+ int actual_bits, int q_index) {
+ rq_history->q_index_history[rq_history->recode_count] = q_index;
+ rq_history->rate_history[rq_history->recode_count] = actual_bits;
+ if (actual_bits <= target_bits) {
+ rq_history->q_index_high = q_index;
+ }
+ if (actual_bits >= target_bits) {
+ rq_history->q_index_low = q_index;
+ }
+ rq_history->recode_count += 1;
+}
+
+static int guess_q_index_from_model(const RATE_QSTEP_MODEL *rq_model,
+ int target_bits) {
+ // The model predicts bits as follows.
+ // target_bits = bias - ratio * log2(q_step)
+ // Given the target_bits, we compute the q_step as follows.
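+  //   q_step = 2^((bias - target_bits) / ratio)
+  // For example (hypothetical numbers): bias = 5000, ratio = 300 and
+  // target_bits = 3500 give q_step = 2^(1500 / 300) = 32, which is then
+  // mapped back to a qindex by vp9_convert_q_to_qindex().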
+ double q_step;
+ assert(rq_model->ratio > 0);
+ q_step = pow(2.0, (rq_model->bias - target_bits) / rq_model->ratio);
+ // TODO(angiebird): Make this function support highbitdepth.
+ return vp9_convert_q_to_qindex(q_step, VPX_BITS_8);
+}
+
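+// Simple linear probe used before the rate model is ready: step the qindex by
+// `gap` toward the target, lowering it when the last encode undershot the
+// target bits and raising it when it overshot.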
+static int guess_q_index_linear(int prev_q_index, int target_bits,
+ int actual_bits, int gap) {
+ int q_index = prev_q_index;
+ if (actual_bits < target_bits) {
+ q_index -= gap;
+ q_index = VPXMAX(q_index, 0);
+ } else {
+ q_index += gap;
+ q_index = VPXMIN(q_index, 255);
+ }
+ return q_index;
+}
+
+static double get_bits_percent_diff(int target_bits, int actual_bits) {
+ double diff;
+ target_bits = VPXMAX(target_bits, 1);
+ diff = abs(target_bits - actual_bits) * 1. / target_bits;
+ return diff * 100;
+}
+
+static int rq_model_predict_q_index(const RATE_QSTEP_MODEL *rq_model,
+ const RATE_QINDEX_HISTORY *rq_history,
+ int target_bits) {
+ int q_index = 128;
+ if (rq_history->recode_count > 0) {
+ const int actual_bits =
+ rq_history->rate_history[rq_history->recode_count - 1];
+ const int prev_q_index =
+ rq_history->q_index_history[rq_history->recode_count - 1];
+ const double percent_diff = get_bits_percent_diff(target_bits, actual_bits);
+ if (percent_diff > 50) {
+ // Binary search.
+ // When the actual_bits and target_bits are far apart, binary search
+ // q_index is faster.
+ q_index = (rq_history->q_index_low + rq_history->q_index_high) / 2;
+ } else {
+ if (rq_model->ready) {
+ q_index = guess_q_index_from_model(rq_model, target_bits);
+ } else {
+ // TODO(angiebird): Find a better way to set the gap.
+ q_index =
+ guess_q_index_linear(prev_q_index, target_bits, actual_bits, 20);
+ }
+ }
+ } else {
+ if (rq_model->ready) {
+ q_index = guess_q_index_from_model(rq_model, target_bits);
+ }
+ }
+
+ assert(rq_history->q_index_low <= rq_history->q_index_high);
+ if (q_index <= rq_history->q_index_low) {
+ q_index = rq_history->q_index_low + 1;
+ }
+ if (q_index >= rq_history->q_index_high) {
+ q_index = rq_history->q_index_high - 1;
+ }
+ return q_index;
+}
+
+static void rq_model_update(const RATE_QINDEX_HISTORY *rq_history,
+ int target_bits, RATE_QSTEP_MODEL *rq_model) {
+ const int recode_count = rq_history->recode_count;
+ const double delta = 0.00001;
+ if (recode_count >= 2) {
+ const int q_index1 = rq_history->q_index_history[recode_count - 2];
+ const int q_index2 = rq_history->q_index_history[recode_count - 1];
+ const int r1 = rq_history->rate_history[recode_count - 2];
+ const int r2 = rq_history->rate_history[recode_count - 1];
+ int valid = 0;
+ // lower q_index should yield higher bit rate
+ if (q_index1 < q_index2) {
+ valid = r1 > r2;
+ } else if (q_index1 > q_index2) {
+ valid = r1 < r2;
+ }
+ // Only update the model when the q_index and rate behave normally.
+ if (valid) {
+ // Fit the ratio and bias of rq_model based on last two recode histories.
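+      // With the model rate = bias - ratio * log2(q_step), the two recode
+      // points (s1, r1) and (s2, r2) give
+      //   ratio = (r2 - r1) / (log2(s1) - log2(s2))
+      //   bias  = r1 + ratio * log2(s1)
+      // which is what is computed below.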
+ const double s1 = vp9_convert_qindex_to_q(q_index1, VPX_BITS_8);
+ const double s2 = vp9_convert_qindex_to_q(q_index2, VPX_BITS_8);
+ if (fabs(log2(s1) - log2(s2)) > delta) {
+ rq_model->ratio = (r2 - r1) / (log2(s1) - log2(s2));
+ rq_model->bias = r1 + (rq_model->ratio) * log2(s1);
+ if (rq_model->ratio > delta && rq_model->bias > delta) {
+ rq_model->ready = 1;
+ }
+ }
+ }
+ } else if (recode_count == 1) {
+ if (rq_model->ready) {
+ // Update the ratio only when the initial model exists and we only have
+ // one recode history.
+ const int prev_q = rq_history->q_index_history[recode_count - 1];
+ const double prev_q_step = vp9_convert_qindex_to_q(prev_q, VPX_BITS_8);
+ if (fabs(log2(prev_q_step)) > delta) {
+ const int actual_bits = rq_history->rate_history[recode_count - 1];
+ rq_model->ratio =
+ rq_model->ratio + (target_bits - actual_bits) / log2(prev_q_step);
+ }
+ }
+ }
+}
+#endif // CONFIG_RATE_CTRL
+
+static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest
+#if CONFIG_RATE_CTRL
+ ,
+ RATE_QINDEX_HISTORY *rq_history
+#endif // CONFIG_RATE_CTRL
+) {
+ const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+ VP9_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ int bottom_index, top_index;
+ int loop_count = 0;
+ int loop_at_this_size = 0;
+ int loop = 0;
+ int overshoot_seen = 0;
+ int undershoot_seen = 0;
+ int frame_over_shoot_limit;
+ int frame_under_shoot_limit;
+ int q = 0, q_low = 0, q_high = 0;
+ int enable_acl;
+#ifdef AGGRESSIVE_VBR
+ int qrange_adj = 1;
+#endif
+
+ // A flag which indicates whether we are recoding the current frame
+ // when the current frame size is larger than the max frame size in the
+ // external rate control model.
+ // This flag doesn't have any impact when external rate control is not used.
+ int ext_rc_recode = 0;
+ // Maximal frame size allowed by the external rate control.
+  // Case 0: ignore the max frame size limit and encode with the qindex
+  //   passed in by the external rate control model.
+  //   If the external qindex is VPX_DEFAULT_Q, libvpx will pick a qindex
+  //   and may recode if undershoot/overshoot is seen.
+  //   If the external qindex is not VPX_DEFAULT_Q, we force no recode.
+  // Case -1: take libvpx's decision for the max frame size, as well as
+  //   the recode decision.
+  // Otherwise: if a specific size is given, libvpx's recode decision
+  //   will respect the given size.
+ int ext_rc_max_frame_size = 0;
+ // Use VP9's decision of qindex. This flag is in use only in external rate
+ // control model to help determine whether to recode when
+ // |ext_rc_max_frame_size| is 0.
+ int ext_rc_use_default_q = 1;
+ const int orig_rc_max_frame_bandwidth = rc->max_frame_bandwidth;
+
+#if CONFIG_RATE_CTRL
+ RATE_QSTEP_MODEL *rq_model;
+ {
+ const FRAME_UPDATE_TYPE update_type =
+ cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index];
+ const ENCODE_FRAME_TYPE frame_type = get_encode_frame_type(update_type);
+ rq_model = &cpi->rq_model[frame_type];
+ }
+ init_rq_history(rq_history);
+#endif // CONFIG_RATE_CTRL
+
+ if (cm->show_existing_frame) {
+ rc->this_frame_target = 0;
+ if (is_psnr_calc_enabled(cpi)) set_raw_source_frame(cpi);
+ return;
+ }
+
+ set_size_independent_vars(cpi);
+
+ enable_acl = cpi->sf.allow_acl ? (cm->frame_type == KEY_FRAME) ||
+ (cpi->twopass.gf_group.index == 1)
+ : 0;
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ printf("\n Encoding a frame: \n");
+#endif
+ do {
+ vpx_clear_system_state();
+
+ set_frame_size(cpi);
+
+ if (loop_count == 0 || cpi->resize_pending != 0) {
+ set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
+
+#ifdef AGGRESSIVE_VBR
+ if (two_pass_first_group_inter(cpi)) {
+ // Adjustment limits for min and max q
+ qrange_adj = VPXMAX(1, (top_index - bottom_index) / 2);
+
+ bottom_index =
+ VPXMAX(bottom_index - qrange_adj / 2, oxcf->best_allowed_q);
+ top_index = VPXMIN(oxcf->worst_allowed_q, top_index + qrange_adj / 2);
+ }
+#endif
+ // TODO(agrange) Scale cpi->max_mv_magnitude if frame-size has changed.
+ set_mv_search_params(cpi);
+
+ // Reset the loop state for new frame size.
+ overshoot_seen = 0;
+ undershoot_seen = 0;
+
+ // Reconfiguration for change in frame size has concluded.
+ cpi->resize_pending = 0;
+
+ q_low = bottom_index;
+ q_high = top_index;
+
+ loop_at_this_size = 0;
+ }
+
+ // Decide frame size bounds first time through.
+ if (loop_count == 0) {
+ vp9_rc_compute_frame_size_bounds(cpi, rc->this_frame_target,
+ &frame_under_shoot_limit,
+ &frame_over_shoot_limit);
+ }
+
+ cpi->Source =
+ vp9_scale_if_required(cm, cpi->un_scaled_source, &cpi->scaled_source,
+ (oxcf->pass == 0), EIGHTTAP, 0);
+
+ // Unfiltered raw source used in metrics calculation if the source
+ // has been filtered.
+ if (is_psnr_calc_enabled(cpi)) {
+#ifdef ENABLE_KF_DENOISE
+ if (is_spatial_denoise_enabled(cpi)) {
+ cpi->raw_source_frame = vp9_scale_if_required(
+ cm, &cpi->raw_unscaled_source, &cpi->raw_scaled_source,
+ (oxcf->pass == 0), EIGHTTAP, 0);
+ } else {
+ cpi->raw_source_frame = cpi->Source;
+ }
+#else
+ cpi->raw_source_frame = cpi->Source;
+#endif
+ }
+
+ if (cpi->unscaled_last_source != NULL)
+ cpi->Last_Source = vp9_scale_if_required(cm, cpi->unscaled_last_source,
+ &cpi->scaled_last_source,
+ (oxcf->pass == 0), EIGHTTAP, 0);
+
+ if (frame_is_intra_only(cm) == 0) {
+ if (loop_count > 0) {
+ release_scaled_references(cpi);
+ }
+ vp9_scale_references(cpi);
+ }
+
+#if CONFIG_RATE_CTRL
+    // TODO(angiebird): This is a hack for making sure the encoder uses the
+    // external_quantize_index exactly. Avoid this kind of hack later.
+ if (cpi->oxcf.use_simple_encode_api) {
+ if (cpi->encode_command.use_external_target_frame_bits) {
+ q = rq_model_predict_q_index(rq_model, rq_history,
+ rc->this_frame_target);
+ }
+ if (cpi->encode_command.use_external_quantize_index) {
+ q = cpi->encode_command.external_quantize_index;
+ }
+ }
+#endif // CONFIG_RATE_CTRL
+ if (cpi->ext_ratectrl.ready && !ext_rc_recode &&
+ (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0) {
+ vpx_codec_err_t codec_status;
+ const GF_GROUP *gf_group = &cpi->twopass.gf_group;
+ vpx_rc_encodeframe_decision_t encode_frame_decision;
+ FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_group->index];
+ const int ref_frame_flags = get_ref_frame_flags(cpi);
+ RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES];
+ const RefCntBuffer *curr_frame_buf =
+ get_ref_cnt_buffer(cm, cm->new_fb_idx);
+ // index 0 of a gf group is always KEY/OVERLAY/GOLDEN.
+ // index 1 refers to the first encoding frame in a gf group.
+ // Therefore if it is ARF_UPDATE, it means this gf group uses alt ref.
+ // See function define_gf_group_structure().
+ const int use_alt_ref = gf_group->update_type[1] == ARF_UPDATE;
+ get_ref_frame_bufs(cpi, ref_frame_bufs);
+ codec_status = vp9_extrc_get_encodeframe_decision(
+ &cpi->ext_ratectrl, curr_frame_buf->frame_index,
+ cm->current_frame_coding_index, gf_group->index, update_type,
+ gf_group->gf_group_size, use_alt_ref, ref_frame_bufs, ref_frame_flags,
+ &encode_frame_decision);
+ if (codec_status != VPX_CODEC_OK) {
+ vpx_internal_error(&cm->error, codec_status,
+ "vp9_extrc_get_encodeframe_decision() failed");
+ }
+ // If the external model recommends a reserved value, we use
+ // libvpx's default q.
+ if (encode_frame_decision.q_index != VPX_DEFAULT_Q) {
+ q = encode_frame_decision.q_index;
+ ext_rc_use_default_q = 0;
+ }
+ ext_rc_max_frame_size = encode_frame_decision.max_frame_size;
+ }
+
+ vp9_set_quantizer(cpi, q);
+
+ if (loop_count == 0) setup_frame(cpi);
+
+ // Variance adaptive and in frame q adjustment experiments are mutually
+ // exclusive.
+ if (oxcf->aq_mode == VARIANCE_AQ) {
+ vp9_vaq_frame_setup(cpi);
+ } else if (oxcf->aq_mode == EQUATOR360_AQ) {
+ vp9_360aq_frame_setup(cpi);
+ } else if (oxcf->aq_mode == COMPLEXITY_AQ) {
+ vp9_setup_in_frame_q_adj(cpi);
+ } else if (oxcf->aq_mode == LOOKAHEAD_AQ) {
+ vp9_alt_ref_aq_setup_map(cpi->alt_ref_aq, cpi);
+ } else if (oxcf->aq_mode == PSNR_AQ) {
+ vp9_psnr_aq_mode_setup(&cm->seg);
+ }
+
+ vp9_encode_frame(cpi);
+
+ // Update the skip mb flag probabilities based on the distribution
+ // seen in the last encoder iteration.
+ // update_base_skip_probs(cpi);
+
+ vpx_clear_system_state();
+
+ // Dummy pack of the bitstream using up to date stats to get an
+ // accurate estimate of output frame size to determine if we need
+ // to recode.
+ if (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF) {
+ save_coding_context(cpi);
+ if (!cpi->sf.use_nonrd_pick_mode) vp9_pack_bitstream(cpi, dest, size);
+
+ rc->projected_frame_size = (int)(*size) << 3;
+
+ if (frame_over_shoot_limit == 0) frame_over_shoot_limit = 1;
+ }
+
+ if (cpi->ext_ratectrl.ready &&
+ (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0) {
+ // In general, for the external rate control, we take the qindex provided
+ // as input and encode the frame with this qindex faithfully. However,
+ // in some extreme scenarios, the provided qindex leads to a massive
+ // overshoot of frame size. In this case, we fall back to VP9's decision
+ // to pick a new qindex and recode the frame. We return the new qindex
+ // through the API to the external model.
+ if (ext_rc_max_frame_size == 0) {
+ if (!ext_rc_use_default_q) break;
+ } else if (ext_rc_max_frame_size == -1) {
+ // Do nothing, fall back to libvpx's recode decision.
+ } else {
+ // Change the max frame size, used in libvpx's recode decision.
+ rc->max_frame_bandwidth = ext_rc_max_frame_size;
+ }
+ ext_rc_recode = 1;
+ }
+#if CONFIG_RATE_CTRL
+ if (cpi->oxcf.use_simple_encode_api) {
+ // This part needs to be after save_coding_context() because
+ // restore_coding_context will be called in the end of this function.
+      // TODO(angiebird): This is a hack for making sure the encoder uses the
+      // external_quantize_index exactly. Avoid this kind of hack later.
+ if (cpi->encode_command.use_external_quantize_index) {
+ break;
+ }
+
+ if (cpi->encode_command.use_external_target_frame_bits) {
+ const double percent_diff = get_bits_percent_diff(
+ rc->this_frame_target, rc->projected_frame_size);
+ update_rq_history(rq_history, rc->this_frame_target,
+ rc->projected_frame_size, q);
+ loop_count += 1;
+
+ rq_model_update(rq_history, rc->this_frame_target, rq_model);
+
+ // Check if we hit the target bitrate.
+ if (percent_diff <=
+ cpi->encode_command.target_frame_bits_error_percent ||
+ rq_history->recode_count >= RATE_CTRL_MAX_RECODE_NUM ||
+ rq_history->q_index_low >= rq_history->q_index_high) {
+ break;
+ }
+
+ loop = 1;
+ restore_coding_context(cpi);
+ continue;
+ }
+ }
+#endif // CONFIG_RATE_CTRL
+
+ if (oxcf->rc_mode == VPX_Q) {
+ loop = 0;
+ } else {
+ if ((cm->frame_type == KEY_FRAME) && rc->this_key_frame_forced &&
+ (rc->projected_frame_size < rc->max_frame_bandwidth)) {
+ int last_q = q;
+ int64_t kf_err;
+
+ int64_t high_err_target = cpi->ambient_err;
+ int64_t low_err_target = cpi->ambient_err >> 1;
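+        // The forced key frame is recoded until its Y SSE falls roughly
+        // between half and the full ambient error, provided the projected
+        // size stays within the shoot limits checked below.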
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cm->use_highbitdepth) {
+ kf_err = vpx_highbd_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+ } else {
+ kf_err = vpx_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+ }
+#else
+ kf_err = vpx_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ // Prevent possible divide by zero error below for perfect KF
+ kf_err += !kf_err;
+
+ // The key frame is not good enough or we can afford
+ // to make it better without undue risk of popping.
+ if ((kf_err > high_err_target &&
+ rc->projected_frame_size <= frame_over_shoot_limit) ||
+ (kf_err > low_err_target &&
+ rc->projected_frame_size <= frame_under_shoot_limit)) {
+ // Lower q_high
+ q_high = q > q_low ? q - 1 : q_low;
+
+ // Adjust Q
+ q = (int)((q * high_err_target) / kf_err);
+ q = VPXMIN(q, (q_high + q_low) >> 1);
+ } else if (kf_err < low_err_target &&
+ rc->projected_frame_size >= frame_under_shoot_limit) {
+ // The key frame is much better than the previous frame
+ // Raise q_low
+ q_low = q < q_high ? q + 1 : q_high;
+
+ // Adjust Q
+ q = (int)((q * low_err_target) / kf_err);
+ q = VPXMIN(q, (q_high + q_low + 1) >> 1);
+ }
+
+ // Clamp Q to upper and lower limits:
+ q = clamp(q, q_low, q_high);
+
+ loop = q != last_q;
+ } else if (recode_loop_test(cpi, frame_over_shoot_limit,
+ frame_under_shoot_limit, q,
+ VPXMAX(q_high, top_index), bottom_index)) {
+ // Is the projected frame size out of range and are we allowed
+ // to attempt to recode.
+ int last_q = q;
+ int retries = 0;
+ int qstep;
+
+ if (cpi->resize_pending == 1) {
+ // Change in frame size so go back around the recode loop.
+ cpi->rc.frame_size_selector =
+ SCALE_STEP1 - cpi->rc.frame_size_selector;
+ cpi->rc.next_frame_size_selector = cpi->rc.frame_size_selector;
+
+#if CONFIG_INTERNAL_STATS
+ ++cpi->tot_recode_hits;
+#endif
+ ++loop_count;
+ loop = 1;
+ continue;
+ }
+
+ // Frame size out of permitted range:
+ // Update correction factor & compute new Q to try...
+
+ // Frame is too large
+ if (rc->projected_frame_size > rc->this_frame_target) {
+ // Special case if the projected size is > the max allowed.
+ if ((q == q_high) &&
+ ((rc->projected_frame_size >= rc->max_frame_bandwidth) ||
+ (!rc->is_src_frame_alt_ref &&
+ (rc->projected_frame_size >=
+ big_rate_miss_high_threshold(cpi))))) {
+ int max_rate = VPXMAX(1, VPXMIN(rc->max_frame_bandwidth,
+ big_rate_miss_high_threshold(cpi)));
+ double q_val_high;
+ q_val_high = vp9_convert_qindex_to_q(q_high, cm->bit_depth);
+ q_val_high =
+ q_val_high * ((double)rc->projected_frame_size / max_rate);
+ q_high = vp9_convert_q_to_qindex(q_val_high, cm->bit_depth);
+ q_high = clamp(q_high, rc->best_quality, rc->worst_quality);
+ }
+
+          // Raise q_low to at least the current value.
+ qstep =
+ get_qstep_adj(rc->projected_frame_size, rc->this_frame_target);
+ q_low = VPXMIN(q + qstep, q_high);
+
+ if (undershoot_seen || loop_at_this_size > 1) {
+            // Update the rate correction factor, then bisect between
+            // q_low and q_high.
+ vp9_rc_update_rate_correction_factors(cpi);
+
+ q = (q_high + q_low + 1) / 2;
+ } else {
+            // Update the rate correction factor, then let rate control pick a
+            // new Q.
+ vp9_rc_update_rate_correction_factors(cpi);
+
+ q = vp9_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
+ VPXMAX(q_high, top_index));
+
+ while (q < q_low && retries < 10) {
+ vp9_rc_update_rate_correction_factors(cpi);
+ q = vp9_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
+ VPXMAX(q_high, top_index));
+ retries++;
+ }
+ }
+
+ overshoot_seen = 1;
+ } else {
+ // Frame is too small
+ qstep =
+ get_qstep_adj(rc->this_frame_target, rc->projected_frame_size);
+ q_high = VPXMAX(q - qstep, q_low);
+
+ if (overshoot_seen || loop_at_this_size > 1) {
+ vp9_rc_update_rate_correction_factors(cpi);
+ q = (q_high + q_low) / 2;
+ } else {
+ vp9_rc_update_rate_correction_factors(cpi);
+ q = vp9_rc_regulate_q(cpi, rc->this_frame_target,
+ VPXMIN(q_low, bottom_index), top_index);
+ // Special case reset for qlow for constrained quality.
+ // This should only trigger where there is very substantial
+ // undershoot on a frame and the auto cq level is above
+ // the user passed in value.
+ if (oxcf->rc_mode == VPX_CQ && q < q_low) {
+ q_low = q;
+ }
+
+ while (q > q_high && retries < 10) {
+ vp9_rc_update_rate_correction_factors(cpi);
+ q = vp9_rc_regulate_q(cpi, rc->this_frame_target,
+ VPXMIN(q_low, bottom_index), top_index);
+ retries++;
+ }
+ }
+ undershoot_seen = 1;
+ }
+
+ // Clamp Q to upper and lower limits:
+ q = clamp(q, q_low, q_high);
+
+ loop = (q != last_q);
+ } else {
+ loop = 0;
+ }
+ }
+
+ // Special case for overlay frame.
+ if (rc->is_src_frame_alt_ref &&
+ rc->projected_frame_size < rc->max_frame_bandwidth)
+ loop = 0;
+
+ if (loop) {
+ ++loop_count;
+ ++loop_at_this_size;
+
+#if CONFIG_INTERNAL_STATS
+ ++cpi->tot_recode_hits;
+#endif
+ }
+
+ if (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF)
+ if (loop) restore_coding_context(cpi);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ if (loop) printf("\n Recoding:");
+#endif
+ } while (loop);
+
+ rc->max_frame_bandwidth = orig_rc_max_frame_bandwidth;
+
+#ifdef AGGRESSIVE_VBR
+ if (two_pass_first_group_inter(cpi)) {
+ cpi->twopass.active_worst_quality =
+ VPXMIN(q + qrange_adj, oxcf->worst_allowed_q);
+ } else if (!frame_is_kf_gf_arf(cpi)) {
+#else
+ if (!frame_is_kf_gf_arf(cpi)) {
+#endif
+ // Have we been forced to adapt Q outside the expected range by an extreme
+ // rate miss. If so adjust the active maxQ for the subsequent frames.
+ if (!rc->is_src_frame_alt_ref && (q > cpi->twopass.active_worst_quality)) {
+ cpi->twopass.active_worst_quality = q;
+ } else if (oxcf->vbr_corpus_complexity && q == q_low &&
+ rc->projected_frame_size < rc->this_frame_target) {
+ cpi->twopass.active_worst_quality =
+ VPXMAX(q, cpi->twopass.active_worst_quality - 1);
+ }
+ }
+
+ if (enable_acl) {
+ // Skip recoding, if model diff is below threshold
+ const int thresh = compute_context_model_thresh(cpi);
+ const int diff = compute_context_model_diff(cm);
+ if (diff >= thresh) {
+ vp9_encode_frame(cpi);
+ }
+ }
+ if (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF) {
+ vpx_clear_system_state();
+ restore_coding_context(cpi);
+ }
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+static void set_ext_overrides(VP9_COMP *cpi) {
+  // Overrides the defaults with values supplied externally via the
+  // vp9_update_reference() and vp9_update_entropy() calls.
+  // Note: The overrides are valid only for the next frame passed
+  // to the encode_frame_to_data_rate() function.
+ if (cpi->ext_refresh_frame_context_pending) {
+ cpi->common.refresh_frame_context = cpi->ext_refresh_frame_context;
+ cpi->ext_refresh_frame_context_pending = 0;
+ }
+ if (cpi->ext_refresh_frame_flags_pending) {
+ cpi->refresh_last_frame = cpi->ext_refresh_last_frame;
+ cpi->refresh_golden_frame = cpi->ext_refresh_golden_frame;
+ cpi->refresh_alt_ref_frame = cpi->ext_refresh_alt_ref_frame;
+ }
+}
+
+YV12_BUFFER_CONFIG *vp9_svc_twostage_scale(
+ VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled,
+ YV12_BUFFER_CONFIG *scaled_temp, INTERP_FILTER filter_type,
+ int phase_scaler, INTERP_FILTER filter_type2, int phase_scaler2) {
+ if (cm->mi_cols * MI_SIZE != unscaled->y_width ||
+ cm->mi_rows * MI_SIZE != unscaled->y_height) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cm->bit_depth == VPX_BITS_8) {
+ vp9_scale_and_extend_frame(unscaled, scaled_temp, filter_type2,
+ phase_scaler2);
+ vp9_scale_and_extend_frame(scaled_temp, scaled, filter_type,
+ phase_scaler);
+ } else {
+ scale_and_extend_frame(unscaled, scaled_temp, (int)cm->bit_depth,
+ filter_type2, phase_scaler2);
+ scale_and_extend_frame(scaled_temp, scaled, (int)cm->bit_depth,
+ filter_type, phase_scaler);
+ }
+#else
+ vp9_scale_and_extend_frame(unscaled, scaled_temp, filter_type2,
+ phase_scaler2);
+ vp9_scale_and_extend_frame(scaled_temp, scaled, filter_type, phase_scaler);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ return scaled;
+ } else {
+ return unscaled;
+ }
+}
+
+YV12_BUFFER_CONFIG *vp9_scale_if_required(
+ VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled,
+ int use_normative_scaler, INTERP_FILTER filter_type, int phase_scaler) {
+ if (cm->mi_cols * MI_SIZE != unscaled->y_width ||
+ cm->mi_rows * MI_SIZE != unscaled->y_height) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (use_normative_scaler && unscaled->y_width <= (scaled->y_width << 1) &&
+ unscaled->y_height <= (scaled->y_height << 1))
+ if (cm->bit_depth == VPX_BITS_8)
+ vp9_scale_and_extend_frame(unscaled, scaled, filter_type, phase_scaler);
+ else
+ scale_and_extend_frame(unscaled, scaled, (int)cm->bit_depth,
+ filter_type, phase_scaler);
+ else
+ scale_and_extend_frame_nonnormative(unscaled, scaled, (int)cm->bit_depth);
+#else
+ if (use_normative_scaler && unscaled->y_width <= (scaled->y_width << 1) &&
+ unscaled->y_height <= (scaled->y_height << 1))
+ vp9_scale_and_extend_frame(unscaled, scaled, filter_type, phase_scaler);
+ else
+ scale_and_extend_frame_nonnormative(unscaled, scaled);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ return scaled;
+ } else {
+ return unscaled;
+ }
+}
+
+static void set_ref_sign_bias(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ RefCntBuffer *const ref_buffer = get_ref_cnt_buffer(cm, cm->new_fb_idx);
+ const int cur_frame_index = ref_buffer->frame_index;
+ MV_REFERENCE_FRAME ref_frame;
+
+ for (ref_frame = LAST_FRAME; ref_frame < MAX_REF_FRAMES; ++ref_frame) {
+ const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
+ const RefCntBuffer *const ref_cnt_buf =
+ get_ref_cnt_buffer(&cpi->common, buf_idx);
+ if (ref_cnt_buf) {
+ cm->ref_frame_sign_bias[ref_frame] =
+ cur_frame_index < ref_cnt_buf->frame_index;
+ }
+ }
+}
+
+static int setup_interp_filter_search_mask(VP9_COMP *cpi) {
+ INTERP_FILTER ifilter;
+ int ref_total[MAX_REF_FRAMES] = { 0 };
+ MV_REFERENCE_FRAME ref;
+ int mask = 0;
+ if (cpi->common.last_frame_type == KEY_FRAME || cpi->refresh_alt_ref_frame)
+ return mask;
+ for (ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref)
+ for (ifilter = EIGHTTAP; ifilter <= EIGHTTAP_SHARP; ++ifilter)
+ ref_total[ref] += cpi->interp_filter_selected[ref][ifilter];
+
+ for (ifilter = EIGHTTAP; ifilter <= EIGHTTAP_SHARP; ++ifilter) {
+ if ((ref_total[LAST_FRAME] &&
+ cpi->interp_filter_selected[LAST_FRAME][ifilter] == 0) &&
+ (ref_total[GOLDEN_FRAME] == 0 ||
+ cpi->interp_filter_selected[GOLDEN_FRAME][ifilter] * 50 <
+ ref_total[GOLDEN_FRAME]) &&
+ (ref_total[ALTREF_FRAME] == 0 ||
+ cpi->interp_filter_selected[ALTREF_FRAME][ifilter] * 50 <
+ ref_total[ALTREF_FRAME]))
+ mask |= 1 << ifilter;
+ }
+ return mask;
+}
+
+#ifdef ENABLE_KF_DENOISE
+// Baseline kernel weights for denoise
+static uint8_t dn_kernal_3[9] = { 1, 2, 1, 2, 4, 2, 1, 2, 1 };
+static uint8_t dn_kernal_5[25] = { 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 4,
+ 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1 };
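+// Laid out as 2-D kernels these weights are:
+//   3x3:  1 2 1          5x5:  1 1 1 1 1
+//         2 4 2                1 1 2 1 1
+//         1 2 1                1 2 4 2 1
+//                              1 1 2 1 1
+//                              1 1 1 1 1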
+
+static INLINE void add_denoise_point(int centre_val, int data_val, int thresh,
+ uint8_t point_weight, int *sum_val,
+ int *sum_weight) {
+ if (abs(centre_val - data_val) <= thresh) {
+ *sum_weight += point_weight;
+ *sum_val += (int)data_val * (int)point_weight;
+ }
+}
+
+static void spatial_denoise_point(uint8_t *src_ptr, const int stride,
+ const int strength) {
+ int sum_weight = 0;
+ int sum_val = 0;
+ int thresh = strength;
+ int kernal_size = 5;
+ int half_k_size = 2;
+ int i, j;
+ int max_diff = 0;
+ uint8_t *tmp_ptr;
+ uint8_t *kernal_ptr;
+
+ // Find the maximum deviation from the source point in the locale.
+ tmp_ptr = src_ptr - (stride * (half_k_size + 1)) - (half_k_size + 1);
+ for (i = 0; i < kernal_size + 2; ++i) {
+ for (j = 0; j < kernal_size + 2; ++j) {
+ max_diff = VPXMAX(max_diff, abs((int)*src_ptr - (int)tmp_ptr[j]));
+ }
+ tmp_ptr += stride;
+ }
+
+ // Select the kernel size.
+ if (max_diff > (strength + (strength >> 1))) {
+ kernal_size = 3;
+ half_k_size = 1;
+ thresh = thresh >> 1;
+ }
+ kernal_ptr = (kernal_size == 3) ? dn_kernal_3 : dn_kernal_5;
+
+ // Apply the kernel
+ tmp_ptr = src_ptr - (stride * half_k_size) - half_k_size;
+ for (i = 0; i < kernal_size; ++i) {
+ for (j = 0; j < kernal_size; ++j) {
+ add_denoise_point((int)*src_ptr, (int)tmp_ptr[j], thresh, *kernal_ptr,
+ &sum_val, &sum_weight);
+ ++kernal_ptr;
+ }
+ tmp_ptr += stride;
+ }
+
+ // Update the source value with the new filtered value
+ *src_ptr = (uint8_t)((sum_val + (sum_weight >> 1)) / sum_weight);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static void highbd_spatial_denoise_point(uint16_t *src_ptr, const int stride,
+ const int strength) {
+ int sum_weight = 0;
+ int sum_val = 0;
+ int thresh = strength;
+ int kernal_size = 5;
+ int half_k_size = 2;
+ int i, j;
+ int max_diff = 0;
+ uint16_t *tmp_ptr;
+ uint8_t *kernal_ptr;
+
+ // Find the maximum deviation from the source point in the locale.
+ tmp_ptr = src_ptr - (stride * (half_k_size + 1)) - (half_k_size + 1);
+ for (i = 0; i < kernal_size + 2; ++i) {
+ for (j = 0; j < kernal_size + 2; ++j) {
+      max_diff = VPXMAX(max_diff, abs((int)*src_ptr - (int)tmp_ptr[j]));
+ }
+ tmp_ptr += stride;
+ }
+
+ // Select the kernel size.
+ if (max_diff > (strength + (strength >> 1))) {
+ kernal_size = 3;
+ half_k_size = 1;
+ thresh = thresh >> 1;
+ }
+ kernal_ptr = (kernal_size == 3) ? dn_kernal_3 : dn_kernal_5;
+
+ // Apply the kernel
+ tmp_ptr = src_ptr - (stride * half_k_size) - half_k_size;
+ for (i = 0; i < kernal_size; ++i) {
+ for (j = 0; j < kernal_size; ++j) {
+ add_denoise_point((int)*src_ptr, (int)tmp_ptr[j], thresh, *kernal_ptr,
+ &sum_val, &sum_weight);
+ ++kernal_ptr;
+ }
+ tmp_ptr += stride;
+ }
+
+ // Update the source value with the new filtered value
+ *src_ptr = (uint16_t)((sum_val + (sum_weight >> 1)) / sum_weight);
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+// Apply thresholded spatial noise suppression to a given buffer.
+static void spatial_denoise_buffer(VP9_COMP *cpi, uint8_t *buffer,
+ const int stride, const int width,
+ const int height, const int strength) {
+ VP9_COMMON *const cm = &cpi->common;
+ uint8_t *src_ptr = buffer;
+ int row;
+ int col;
+
+ for (row = 0; row < height; ++row) {
+ for (col = 0; col < width; ++col) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cm->use_highbitdepth)
+ highbd_spatial_denoise_point(CONVERT_TO_SHORTPTR(&src_ptr[col]), stride,
+ strength);
+ else
+ spatial_denoise_point(&src_ptr[col], stride, strength);
+#else
+ spatial_denoise_point(&src_ptr[col], stride, strength);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ }
+ src_ptr += stride;
+ }
+}
+
+// Apply thresholded spatial noise suppression to source.
+static void spatial_denoise_frame(VP9_COMP *cpi) {
+ YV12_BUFFER_CONFIG *src = cpi->Source;
+ const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+ TWO_PASS *const twopass = &cpi->twopass;
+ VP9_COMMON *const cm = &cpi->common;
+
+ // Base the filter strength on the current active max Q.
+ const int q = (int)(vp9_convert_qindex_to_q(twopass->active_worst_quality,
+ cm->bit_depth));
+ int strength =
+ VPXMAX(oxcf->arnr_strength >> 2, VPXMIN(oxcf->arnr_strength, (q >> 4)));
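+  // For example (hypothetical values): arnr_strength = 5 and q = 48 give
+  // strength = VPXMAX(1, VPXMIN(5, 3)) = 3.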
+
+ // Denoise each of Y,U and V buffers.
+ spatial_denoise_buffer(cpi, src->y_buffer, src->y_stride, src->y_width,
+ src->y_height, strength);
+
+ strength += (strength >> 1);
+ spatial_denoise_buffer(cpi, src->u_buffer, src->uv_stride, src->uv_width,
+ src->uv_height, strength << 1);
+
+ spatial_denoise_buffer(cpi, src->v_buffer, src->uv_stride, src->uv_width,
+ src->uv_height, strength << 1);
+}
+#endif // ENABLE_KF_DENOISE
+
+#if !CONFIG_REALTIME_ONLY
+static void vp9_try_disable_lookahead_aq(VP9_COMP *cpi, size_t *size,
+ uint8_t *dest) {
+ if (cpi->common.seg.enabled)
+ if (ALT_REF_AQ_PROTECT_GAIN) {
+ size_t nsize = *size;
+ int overhead;
+
+ // TODO(yuryg): optimize this, as
+ // we don't really need to repack
+
+ save_coding_context(cpi);
+ vp9_disable_segmentation(&cpi->common.seg);
+ vp9_pack_bitstream(cpi, dest, &nsize);
+ restore_coding_context(cpi);
+
+ overhead = (int)*size - (int)nsize;
+
+ if (vp9_alt_ref_aq_disable_if(cpi->alt_ref_aq, overhead, (int)*size))
+ vp9_encode_frame(cpi);
+ else
+ vp9_enable_segmentation(&cpi->common.seg);
+ }
+}
+#endif
+
+static void set_frame_index(VP9_COMP *cpi, VP9_COMMON *cm) {
+ RefCntBuffer *const ref_buffer = get_ref_cnt_buffer(cm, cm->new_fb_idx);
+
+ if (ref_buffer) {
+ const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+ ref_buffer->frame_index =
+ cm->current_video_frame + gf_group->arf_src_offset[gf_group->index];
+ ref_buffer->frame_coding_index = cm->current_frame_coding_index;
+ }
+}
+
+static void set_mb_ssim_rdmult_scaling(VP9_COMP *cpi) {
+ VP9_COMMON *cm = &cpi->common;
+ ThreadData *td = &cpi->td;
+ MACROBLOCK *x = &td->mb;
+ MACROBLOCKD *xd = &x->e_mbd;
+ uint8_t *y_buffer = cpi->Source->y_buffer;
+ const int y_stride = cpi->Source->y_stride;
+ const int block_size = BLOCK_16X16;
+
+ const int num_8x8_w = num_8x8_blocks_wide_lookup[block_size];
+ const int num_8x8_h = num_8x8_blocks_high_lookup[block_size];
+ const int num_cols = (cm->mi_cols + num_8x8_w - 1) / num_8x8_w;
+ const int num_rows = (cm->mi_rows + num_8x8_h - 1) / num_8x8_h;
+ double log_sum = 0.0;
+ int row, col;
+
+  // Loop through each 16x16 block.
+ for (row = 0; row < num_rows; ++row) {
+ for (col = 0; col < num_cols; ++col) {
+ int mi_row, mi_col;
+ double var = 0.0, num_of_var = 0.0;
+ const int index = row * num_cols + col;
+
+ for (mi_row = row * num_8x8_h;
+ mi_row < cm->mi_rows && mi_row < (row + 1) * num_8x8_h; ++mi_row) {
+ for (mi_col = col * num_8x8_w;
+ mi_col < cm->mi_cols && mi_col < (col + 1) * num_8x8_w; ++mi_col) {
+ struct buf_2d buf;
+ const int row_offset_y = mi_row << 3;
+ const int col_offset_y = mi_col << 3;
+
+ buf.buf = y_buffer + row_offset_y * y_stride + col_offset_y;
+ buf.stride = y_stride;
+
+          // In order to keep SSIM_VAR_SCALE on the same scale for both 8 bit
+          // and high bit depth videos, the variance needs to be divided by
+          // 2.0 or 64.0 separately.
+          // TODO(sdeng): needs to be tuned for 12 bit videos.
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cpi->Source->flags & YV12_FLAG_HIGHBITDEPTH)
+ var += vp9_high_get_sby_variance(cpi, &buf, BLOCK_8X8, xd->bd);
+ else
+#endif
+ var += vp9_get_sby_variance(cpi, &buf, BLOCK_8X8);
+
+ num_of_var += 1.0;
+ }
+ }
+ var = var / num_of_var / 64.0;
+
+ // Curve fitting with an exponential model on all 16x16 blocks from the
+ // Midres dataset.
+ var = 67.035434 * (1 - exp(-0.0021489 * var)) + 17.492222;
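+      // With these constants a flat block (var == 0) maps to ~17.49 and very
+      // high-variance blocks saturate near 67.04 + 17.49 ~= 84.53; the
+      // factors are then normalized by their geometric mean below.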
+ cpi->mi_ssim_rdmult_scaling_factors[index] = var;
+ log_sum += log(var);
+ }
+ }
+ log_sum = exp(log_sum / (double)(num_rows * num_cols));
+
+ for (row = 0; row < num_rows; ++row) {
+ for (col = 0; col < num_cols; ++col) {
+ const int index = row * num_cols + col;
+ cpi->mi_ssim_rdmult_scaling_factors[index] /= log_sum;
+ }
+ }
+
+ (void)xd;
+}
+
+// Process the Wiener variance on a 16x16 block basis.
+static int qsort_comp(const void *elem1, const void *elem2) {
+ int a = *((const int *)elem1);
+ int b = *((const int *)elem2);
+ if (a > b) return 1;
+ if (a < b) return -1;
+ return 0;
+}
+
+static void init_mb_wiener_var_buffer(VP9_COMP *cpi) {
+ VP9_COMMON *cm = &cpi->common;
+
+ if (cpi->mb_wiener_variance && cpi->mb_wiener_var_rows >= cm->mb_rows &&
+ cpi->mb_wiener_var_cols >= cm->mb_cols)
+ return;
+
+ vpx_free(cpi->mb_wiener_variance);
+ cpi->mb_wiener_variance = NULL;
+
+ CHECK_MEM_ERROR(
+ &cm->error, cpi->mb_wiener_variance,
+ vpx_calloc(cm->mb_rows * cm->mb_cols, sizeof(*cpi->mb_wiener_variance)));
+ cpi->mb_wiener_var_rows = cm->mb_rows;
+ cpi->mb_wiener_var_cols = cm->mb_cols;
+}
+
+static void set_mb_wiener_variance(VP9_COMP *cpi) {
+ VP9_COMMON *cm = &cpi->common;
+ uint8_t *buffer = cpi->Source->y_buffer;
+ int buf_stride = cpi->Source->y_stride;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ ThreadData *td = &cpi->td;
+ MACROBLOCK *x = &td->mb;
+ MACROBLOCKD *xd = &x->e_mbd;
+ DECLARE_ALIGNED(16, uint16_t, zero_pred16[32 * 32]);
+ DECLARE_ALIGNED(16, uint8_t, zero_pred8[32 * 32]);
+ uint8_t *zero_pred;
+#else
+ DECLARE_ALIGNED(16, uint8_t, zero_pred[32 * 32]);
+#endif
+
+ DECLARE_ALIGNED(16, int16_t, src_diff[32 * 32]);
+ DECLARE_ALIGNED(16, tran_low_t, coeff[32 * 32]);
+
+ int mb_row, mb_col, count = 0;
+ // Hard coded operating block size
+ const int block_size = 16;
+ const int coeff_count = block_size * block_size;
+ const TX_SIZE tx_size = TX_16X16;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ xd->cur_buf = cpi->Source;
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ zero_pred = CONVERT_TO_BYTEPTR(zero_pred16);
+ memset(zero_pred16, 0, sizeof(*zero_pred16) * coeff_count);
+ } else {
+ zero_pred = zero_pred8;
+ memset(zero_pred8, 0, sizeof(*zero_pred8) * coeff_count);
+ }
+#else
+ memset(zero_pred, 0, sizeof(*zero_pred) * coeff_count);
+#endif
+
+ cpi->norm_wiener_variance = 0;
+
+ for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) {
+ for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) {
+ int idx;
+ int16_t median_val = 0;
+ uint8_t *mb_buffer =
+ buffer + mb_row * block_size * buf_stride + mb_col * block_size;
+ int64_t wiener_variance = 0;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ vpx_highbd_subtract_block(block_size, block_size, src_diff, block_size,
+ mb_buffer, buf_stride, zero_pred, block_size,
+ xd->bd);
+ vp9_highbd_wht_fwd_txfm(src_diff, block_size, coeff, tx_size);
+ } else {
+ vpx_subtract_block(block_size, block_size, src_diff, block_size,
+ mb_buffer, buf_stride, zero_pred, block_size);
+ vp9_wht_fwd_txfm(src_diff, block_size, coeff, tx_size);
+ }
+#else
+ vpx_subtract_block(block_size, block_size, src_diff, block_size,
+ mb_buffer, buf_stride, zero_pred, block_size);
+ vp9_wht_fwd_txfm(src_diff, block_size, coeff, tx_size);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ coeff[0] = 0;
+ for (idx = 1; idx < coeff_count; ++idx) coeff[idx] = abs(coeff[idx]);
+
+ qsort(coeff, coeff_count - 1, sizeof(*coeff), qsort_comp);
+
+ // Noise level estimation
+ median_val = coeff[coeff_count / 2];
+
+ // Wiener filter
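+      // Each AC coefficient c is shrunk by the classic Wiener factor
+      // c^2 / (c^2 + sigma^2), with the noise level sigma estimated from the
+      // median coefficient magnitude above.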
+ for (idx = 1; idx < coeff_count; ++idx) {
+ int64_t sqr_coeff = (int64_t)coeff[idx] * coeff[idx];
+ int64_t tmp_coeff = (int64_t)coeff[idx];
+ if (median_val) {
+ tmp_coeff = (sqr_coeff * coeff[idx]) /
+ (sqr_coeff + (int64_t)median_val * median_val);
+ }
+ wiener_variance += tmp_coeff * tmp_coeff;
+ }
+ cpi->mb_wiener_variance[mb_row * cm->mb_cols + mb_col] =
+ wiener_variance / coeff_count;
+ cpi->norm_wiener_variance +=
+ cpi->mb_wiener_variance[mb_row * cm->mb_cols + mb_col];
+ ++count;
+ }
+ }
+
+ if (count) cpi->norm_wiener_variance /= count;
+ cpi->norm_wiener_variance = VPXMAX(1, cpi->norm_wiener_variance);
+}
+
+#if !CONFIG_REALTIME_ONLY
+static void update_encode_frame_result_basic(
+ FRAME_UPDATE_TYPE update_type, int show_idx, int quantize_index,
+ ENCODE_FRAME_RESULT *encode_frame_result) {
+ encode_frame_result->show_idx = show_idx;
+ encode_frame_result->update_type = update_type;
+ encode_frame_result->quantize_index = quantize_index;
+}
+
+#if CONFIG_RATE_CTRL
+static void yv12_buffer_to_image_buffer(const YV12_BUFFER_CONFIG *yv12_buffer,
+ IMAGE_BUFFER *image_buffer) {
+ const uint8_t *src_buf_ls[3] = { yv12_buffer->y_buffer, yv12_buffer->u_buffer,
+ yv12_buffer->v_buffer };
+ const int src_stride_ls[3] = { yv12_buffer->y_stride, yv12_buffer->uv_stride,
+ yv12_buffer->uv_stride };
+ const int w_ls[3] = { yv12_buffer->y_crop_width, yv12_buffer->uv_crop_width,
+ yv12_buffer->uv_crop_width };
+ const int h_ls[3] = { yv12_buffer->y_crop_height, yv12_buffer->uv_crop_height,
+ yv12_buffer->uv_crop_height };
+ int plane;
+ for (plane = 0; plane < 3; ++plane) {
+ const int src_stride = src_stride_ls[plane];
+ const int w = w_ls[plane];
+ const int h = h_ls[plane];
+ const uint8_t *src_buf = src_buf_ls[plane];
+ uint8_t *dst_buf = image_buffer->plane_buffer[plane];
+ int r;
+ assert(image_buffer->plane_width[plane] == w);
+ assert(image_buffer->plane_height[plane] == h);
+ for (r = 0; r < h; ++r) {
+ memcpy(dst_buf, src_buf, sizeof(*src_buf) * w);
+ src_buf += src_stride;
+ dst_buf += w;
+ }
+ }
+}
+// This function will update extra information specific for simple_encode APIs
+static void update_encode_frame_result_simple_encode(
+ int ref_frame_flags, FRAME_UPDATE_TYPE update_type,
+ const YV12_BUFFER_CONFIG *source_frame, const RefCntBuffer *coded_frame_buf,
+ RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int quantize_index,
+ uint32_t bit_depth, uint32_t input_bit_depth, const FRAME_COUNTS *counts,
+ const PARTITION_INFO *partition_info,
+ const MOTION_VECTOR_INFO *motion_vector_info,
+ const TplDepStats *tpl_stats_info,
+ ENCODE_FRAME_RESULT *encode_frame_result) {
+ PSNR_STATS psnr;
+ update_encode_frame_result_basic(update_type, coded_frame_buf->frame_index,
+ quantize_index, encode_frame_result);
+#if CONFIG_VP9_HIGHBITDEPTH
+ vpx_calc_highbd_psnr(source_frame, &coded_frame_buf->buf, &psnr, bit_depth,
+ input_bit_depth);
+#else // CONFIG_VP9_HIGHBITDEPTH
+ (void)bit_depth;
+ (void)input_bit_depth;
+ vpx_calc_psnr(source_frame, &coded_frame_buf->buf, &psnr);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ encode_frame_result->frame_coding_index = coded_frame_buf->frame_coding_index;
+
+ vp9_get_ref_frame_info(update_type, ref_frame_flags, ref_frame_bufs,
+ encode_frame_result->ref_frame_coding_indexes,
+ encode_frame_result->ref_frame_valid_list);
+
+ encode_frame_result->psnr = psnr.psnr[0];
+ encode_frame_result->sse = psnr.sse[0];
+ encode_frame_result->frame_counts = *counts;
+ encode_frame_result->partition_info = partition_info;
+ encode_frame_result->motion_vector_info = motion_vector_info;
+ encode_frame_result->tpl_stats_info = tpl_stats_info;
+ if (encode_frame_result->coded_frame.allocated) {
+ yv12_buffer_to_image_buffer(&coded_frame_buf->buf,
+ &encode_frame_result->coded_frame);
+ }
+}
+#endif // CONFIG_RATE_CTRL
+#endif // !CONFIG_REALTIME_ONLY
+
+static void encode_frame_to_data_rate(
+ VP9_COMP *cpi, size_t *size, uint8_t *dest, unsigned int *frame_flags,
+ ENCODE_FRAME_RESULT *encode_frame_result) {
+ VP9_COMMON *const cm = &cpi->common;
+ const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+ struct segmentation *const seg = &cm->seg;
+ TX_SIZE t;
+
+ // SVC: skip encoding of enhancement layer if the layer target bandwidth = 0.
+  // No need to set svc.skip_enhancement_layer if the whole superframe will
+  // be dropped.
+ if (cpi->use_svc && cpi->svc.spatial_layer_id > 0 &&
+ cpi->oxcf.target_bandwidth == 0 &&
+ !(cpi->svc.framedrop_mode != LAYER_DROP &&
+ (cpi->svc.framedrop_mode != CONSTRAINED_FROM_ABOVE_DROP ||
+ cpi->svc
+ .force_drop_constrained_from_above[cpi->svc.number_spatial_layers -
+ 1]) &&
+ cpi->svc.drop_spatial_layer[0])) {
+ cpi->svc.skip_enhancement_layer = 1;
+ vp9_rc_postencode_update_drop_frame(cpi);
+ cpi->ext_refresh_frame_flags_pending = 0;
+ cpi->last_frame_dropped = 1;
+ cpi->svc.last_layer_dropped[cpi->svc.spatial_layer_id] = 1;
+ cpi->svc.drop_spatial_layer[cpi->svc.spatial_layer_id] = 1;
+ vp9_inc_frame_in_layer(cpi);
+ return;
+ }
+
+ set_ext_overrides(cpi);
+ vpx_clear_system_state();
+
+#ifdef ENABLE_KF_DENOISE
+ // Spatial denoise of key frame.
+ if (is_spatial_denoise_enabled(cpi)) spatial_denoise_frame(cpi);
+#endif
+
+ if (cm->show_existing_frame == 0) {
+ // Update frame index
+ set_frame_index(cpi, cm);
+
+ // Set the arf sign bias for this frame.
+ set_ref_sign_bias(cpi);
+ }
+
+ // Set default state for segment based loop filter update flags.
+ cm->lf.mode_ref_delta_update = 0;
+
+ if (cpi->oxcf.pass == 2 && cpi->sf.adaptive_interp_filter_search)
+ cpi->sf.interp_filter_search_mask = setup_interp_filter_search_mask(cpi);
+
+ // Set various flags etc to special state if it is a key frame.
+ if (frame_is_intra_only(cm)) {
+ // Reset the loop filter deltas and segmentation map.
+ vp9_reset_segment_features(&cm->seg);
+
+ // If segmentation is enabled force a map update for key frames.
+ if (seg->enabled) {
+ seg->update_map = 1;
+ seg->update_data = 1;
+ }
+
+ // The alternate reference frame cannot be active for a key frame.
+ cpi->rc.source_alt_ref_active = 0;
+
+ cm->error_resilient_mode = oxcf->error_resilient_mode;
+ cm->frame_parallel_decoding_mode = oxcf->frame_parallel_decoding_mode;
+
+ // By default, encoder assumes decoder can use prev_mi.
+ if (cm->error_resilient_mode) {
+ cm->frame_parallel_decoding_mode = 1;
+ cm->reset_frame_context = 0;
+ cm->refresh_frame_context = 0;
+ } else if (cm->intra_only) {
+ // Only reset the current context.
+ cm->reset_frame_context = 2;
+ }
+ }
+
+ if (oxcf->tuning == VP8_TUNE_SSIM) set_mb_ssim_rdmult_scaling(cpi);
+
+ if (oxcf->aq_mode == PERCEPTUAL_AQ) {
+ init_mb_wiener_var_buffer(cpi);
+ set_mb_wiener_variance(cpi);
+ }
+
+ vpx_clear_system_state();
+
+#if CONFIG_INTERNAL_STATS
+ memset(cpi->mode_chosen_counts, 0,
+ MAX_MODES * sizeof(*cpi->mode_chosen_counts));
+#endif
+ // Backup to ensure consistency between recodes
+ save_encode_params(cpi);
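+  // When an external rate controller is attached and provides RDMULT, query
+  // it for a frame-level rd multiplier override before encoding.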
+ if (cpi->ext_ratectrl.ready &&
+ (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_RDMULT) != 0) {
+ vpx_codec_err_t codec_status;
+ const GF_GROUP *gf_group = &cpi->twopass.gf_group;
+ FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_group->index];
+ const int ref_frame_flags = get_ref_frame_flags(cpi);
+ RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES];
+ const RefCntBuffer *curr_frame_buf = get_ref_cnt_buffer(cm, cm->new_fb_idx);
+    // Index 0 of a gf group is always KEY/OVERLAY/GOLDEN.
+    // Index 1 refers to the first frame encoded in a gf group.
+    // Therefore, if update_type[1] is ARF_UPDATE, this gf group uses an
+    // alt ref. See define_gf_group_structure().
+ const int use_alt_ref = gf_group->update_type[1] == ARF_UPDATE;
+ int ext_rdmult = VPX_DEFAULT_RDMULT;
+ get_ref_frame_bufs(cpi, ref_frame_bufs);
+ codec_status = vp9_extrc_get_frame_rdmult(
+ &cpi->ext_ratectrl, curr_frame_buf->frame_index,
+ cm->current_frame_coding_index, gf_group->index, update_type,
+ gf_group->gf_group_size, use_alt_ref, ref_frame_bufs, ref_frame_flags,
+ &ext_rdmult);
+ if (codec_status != VPX_CODEC_OK) {
+ vpx_internal_error(&cm->error, codec_status,
+ "vp9_extrc_get_frame_rdmult() failed");
+ }
+ cpi->ext_ratectrl.ext_rdmult = ext_rdmult;
+ }
+
+ if (cpi->sf.recode_loop == DISALLOW_RECODE) {
+ if (!encode_without_recode_loop(cpi, size, dest)) return;
+ } else {
+#if !CONFIG_REALTIME_ONLY
+#if CONFIG_RATE_CTRL
+ encode_with_recode_loop(cpi, size, dest, &encode_frame_result->rq_history);
+#else // CONFIG_RATE_CTRL
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, encode_with_recode_loop_time);
+#endif
+ encode_with_recode_loop(cpi, size, dest);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, encode_with_recode_loop_time);
+#endif
+#endif // CONFIG_RATE_CTRL
+#endif // !CONFIG_REALTIME_ONLY
+ }
+
+ // TODO(jingning): When using show existing frame mode, we assume that the
+ // current ARF will be directly used as the final reconstructed frame. This is
+ // an encoder control scheme. One could in principle explore other
+ // possibilities to arrange the reference frame buffer and their coding order.
+ if (cm->show_existing_frame) {
+ ref_cnt_fb(cm->buffer_pool->frame_bufs, &cm->new_fb_idx,
+ cm->ref_frame_map[cpi->alt_fb_idx]);
+ }
+
+#if !CONFIG_REALTIME_ONLY
+  // Disable segmentation if it decreases the rate/distortion ratio.
+ if (cpi->oxcf.aq_mode == LOOKAHEAD_AQ)
+ vp9_try_disable_lookahead_aq(cpi, size, dest);
+#endif
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+#ifdef OUTPUT_YUV_DENOISED
+ if (oxcf->noise_sensitivity > 0 && denoise_svc(cpi)) {
+ vpx_write_yuv_frame(yuv_denoised_file,
+ &cpi->denoiser.running_avg_y[INTRA_FRAME]);
+ }
+#endif
+#endif
+#ifdef OUTPUT_YUV_SKINMAP
+ if (cpi->common.current_video_frame > 1) {
+ vp9_output_skin_map(cpi, yuv_skinmap_file);
+ }
+#endif
+
+ // Special case code to reduce pulsing when key frames are forced at a
+  // fixed interval. Record the reconstruction error if this is the frame
+  // before the forced key frame.
+ if (cpi->rc.next_key_frame_forced && cpi->rc.frames_to_key == 1) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cm->use_highbitdepth) {
+ cpi->ambient_err =
+ vpx_highbd_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+ } else {
+ cpi->ambient_err = vpx_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+ }
+#else
+ cpi->ambient_err = vpx_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ }
+
+ // If the encoder forced a KEY_FRAME decision
+ if (cm->frame_type == KEY_FRAME) cpi->refresh_last_frame = 1;
+
+ cm->frame_to_show = get_frame_new_buffer(cm);
+ cm->frame_to_show->color_space = cm->color_space;
+ cm->frame_to_show->color_range = cm->color_range;
+ cm->frame_to_show->render_width = cm->render_width;
+ cm->frame_to_show->render_height = cm->render_height;
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, loopfilter_frame_time);
+#endif
+ // Pick the loop filter level for the frame.
+ loopfilter_frame(cpi, cm);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, loopfilter_frame_time);
+#endif
+
+ if (cpi->rc.use_post_encode_drop) save_coding_context(cpi);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, vp9_pack_bitstream_time);
+#endif
+ // build the bitstream
+ vp9_pack_bitstream(cpi, dest, size);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, vp9_pack_bitstream_time);
+#endif
+
+ if (cpi->ext_ratectrl.ready) {
+ const RefCntBuffer *coded_frame_buf =
+ get_ref_cnt_buffer(cm, cm->new_fb_idx);
+ vpx_codec_err_t codec_status = vp9_extrc_update_encodeframe_result(
+ &cpi->ext_ratectrl, (*size) << 3, cpi->Source, &coded_frame_buf->buf,
+ cm->bit_depth, cpi->oxcf.input_bit_depth, cm->base_qindex);
+ if (codec_status != VPX_CODEC_OK) {
+ vpx_internal_error(&cm->error, codec_status,
+ "vp9_extrc_update_encodeframe_result() failed");
+ }
+ }
+#if CONFIG_REALTIME_ONLY
+ (void)encode_frame_result;
+ assert(encode_frame_result == NULL);
+#else // CONFIG_REALTIME_ONLY
+ if (encode_frame_result != NULL) {
+ const RefCntBuffer *coded_frame_buf =
+ get_ref_cnt_buffer(cm, cm->new_fb_idx);
+ RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES];
+ FRAME_UPDATE_TYPE update_type =
+ cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index];
+ int quantize_index = vp9_get_quantizer(cpi);
+ get_ref_frame_bufs(cpi, ref_frame_bufs);
+    // update_encode_frame_result() depends on twopass.gf_group.index,
+    // cm->new_fb_idx, cpi->Source, cpi->lst_fb_idx, cpi->gld_fb_idx and
+    // cpi->alt_fb_idx, all of which are updated for the current frame and
+    // have not yet been updated for the next frame.
+ // The update locations are as follows.
+ // 1) twopass.gf_group.index is initialized at define_gf_group by vp9_zero()
+ // for the first frame in the gf_group and is updated for the next frame at
+ // vp9_twopass_postencode_update().
+ // 2) cpi->Source is updated at the beginning of vp9_get_compressed_data()
+ // 3) cm->new_fb_idx is updated at the beginning of
+ // vp9_get_compressed_data() by get_free_fb(cm).
+ // 4) cpi->lst_fb_idx/gld_fb_idx/alt_fb_idx will be updated for the next
+ // frame at vp9_update_reference_frames().
+ // This function needs to be called before vp9_update_reference_frames().
+ // TODO(angiebird): Improve the codebase to make the update of frame
+ // dependent variables more robust.
+
+ update_encode_frame_result_basic(update_type, coded_frame_buf->frame_index,
+ quantize_index, encode_frame_result);
+#if CONFIG_RATE_CTRL
+ if (cpi->oxcf.use_simple_encode_api) {
+ const int ref_frame_flags = get_ref_frame_flags(cpi);
+ update_encode_frame_result_simple_encode(
+ ref_frame_flags,
+ cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index],
+ cpi->Source, coded_frame_buf, ref_frame_bufs, quantize_index,
+ cm->bit_depth, cpi->oxcf.input_bit_depth, cpi->td.counts,
+ cpi->partition_info, cpi->motion_vector_info, cpi->tpl_stats_info,
+ encode_frame_result);
+ }
+#endif // CONFIG_RATE_CTRL
+ }
+#endif // CONFIG_REALTIME_ONLY
+
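+  // Post-encode frame dropping for CBR: if the drop test fires, roll back to
+  // the coding context saved before packing the bitstream and return without
+  // the usual post-encode updates.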
+ if (cpi->rc.use_post_encode_drop && cm->base_qindex < cpi->rc.worst_quality &&
+ cpi->svc.spatial_layer_id == 0 && post_encode_drop_cbr(cpi, size)) {
+ restore_coding_context(cpi);
+ return;
+ }
+
+ cpi->last_frame_dropped = 0;
+ cpi->svc.last_layer_dropped[cpi->svc.spatial_layer_id] = 0;
+ if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)
+ cpi->svc.num_encoded_top_layer++;
+
+ // Keep track of the frame buffer index updated/refreshed for the
+ // current encoded TL0 superframe.
+ if (cpi->svc.temporal_layer_id == 0) {
+ if (cpi->refresh_last_frame)
+ cpi->svc.fb_idx_upd_tl0[cpi->svc.spatial_layer_id] = cpi->lst_fb_idx;
+ else if (cpi->refresh_golden_frame)
+ cpi->svc.fb_idx_upd_tl0[cpi->svc.spatial_layer_id] = cpi->gld_fb_idx;
+ else if (cpi->refresh_alt_ref_frame)
+ cpi->svc.fb_idx_upd_tl0[cpi->svc.spatial_layer_id] = cpi->alt_fb_idx;
+ }
+
+ if (cm->seg.update_map) update_reference_segmentation_map(cpi);
+
+ if (frame_is_intra_only(cm) == 0) {
+ release_scaled_references(cpi);
+ }
+ vp9_update_reference_frames(cpi);
+
+ if (!cm->show_existing_frame) {
+ for (t = TX_4X4; t <= TX_32X32; ++t) {
+ full_to_model_counts(cpi->td.counts->coef[t],
+ cpi->td.rd_counts.coef_counts[t]);
+ }
+
+ if (!cm->error_resilient_mode && !cm->frame_parallel_decoding_mode) {
+ if (!frame_is_intra_only(cm)) {
+ vp9_adapt_mode_probs(cm);
+ vp9_adapt_mv_probs(cm, cm->allow_high_precision_mv);
+ }
+ vp9_adapt_coef_probs(cm);
+ }
+ }
+
+ cpi->ext_refresh_frame_flags_pending = 0;
+
+ if (cpi->refresh_golden_frame == 1)
+ cpi->frame_flags |= FRAMEFLAGS_GOLDEN;
+ else
+ cpi->frame_flags &= ~FRAMEFLAGS_GOLDEN;
+
+ if (cpi->refresh_alt_ref_frame == 1)
+ cpi->frame_flags |= FRAMEFLAGS_ALTREF;
+ else
+ cpi->frame_flags &= ~FRAMEFLAGS_ALTREF;
+
+ cpi->ref_frame_flags = get_ref_frame_flags(cpi);
+
+ cm->last_frame_type = cm->frame_type;
+
+ vp9_rc_postencode_update(cpi, *size);
+
+ if (cpi->compute_frame_low_motion_onepass && oxcf->pass == 0 &&
+ !frame_is_intra_only(cm) &&
+ (!cpi->use_svc ||
+ (cpi->use_svc &&
+ !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame &&
+ cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1))) {
+ vp9_compute_frame_low_motion(cpi);
+ }
+
+ *size = VPXMAX(1, *size);
+
+#if 0
+ output_frame_level_debug_stats(cpi);
+#endif
+
+ if (cm->frame_type == KEY_FRAME) {
+ // Tell the caller that the frame was coded as a key frame
+ *frame_flags = cpi->frame_flags | FRAMEFLAGS_KEY;
+ } else {
+ *frame_flags = cpi->frame_flags & ~FRAMEFLAGS_KEY;
+ }
+
+ // Clear the one shot update flags for segmentation map and mode/ref loop
+ // filter deltas.
+ cm->seg.update_map = 0;
+ cm->seg.update_data = 0;
+ cm->lf.mode_ref_delta_update = 0;
+
+ // keep track of the last coded dimensions
+ cm->last_width = cm->width;
+ cm->last_height = cm->height;
+
+ // reset to normal state now that we are done.
+ if (!cm->show_existing_frame) {
+ cm->last_show_frame = cm->show_frame;
+ cm->prev_frame = cm->cur_frame;
+ }
+
+ if (cm->show_frame) {
+ vp9_swap_mi_and_prev_mi(cm);
+ if (cpi->use_svc) vp9_inc_frame_in_layer(cpi);
+ }
+ update_frame_indexes(cm, cm->show_frame);
+
+ if (cpi->use_svc) {
+ cpi->svc
+ .layer_context[cpi->svc.spatial_layer_id *
+ cpi->svc.number_temporal_layers +
+ cpi->svc.temporal_layer_id]
+ .last_frame_type = cm->frame_type;
+ // Reset layer_sync back to 0 for next frame.
+ cpi->svc.spatial_layer_sync[cpi->svc.spatial_layer_id] = 0;
+ }
+
+ cpi->force_update_segmentation = 0;
+
+#if !CONFIG_REALTIME_ONLY
+ if (cpi->oxcf.aq_mode == LOOKAHEAD_AQ)
+ vp9_alt_ref_aq_unset_all(cpi->alt_ref_aq, cpi);
+#endif
+
+ cpi->svc.previous_frame_is_intra_only = cm->intra_only;
+ cpi->svc.set_intra_only_frame = 0;
+}
+
+static void SvcEncode(VP9_COMP *cpi, size_t *size, uint8_t *dest,
+ unsigned int *frame_flags) {
+ vp9_rc_get_svc_params(cpi);
+ encode_frame_to_data_rate(cpi, size, dest, frame_flags,
+ /*encode_frame_result = */ NULL);
+}
+
+static void Pass0Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest,
+ unsigned int *frame_flags) {
+ if (cpi->oxcf.rc_mode == VPX_CBR) {
+ vp9_rc_get_one_pass_cbr_params(cpi);
+ } else {
+ vp9_rc_get_one_pass_vbr_params(cpi);
+ }
+ encode_frame_to_data_rate(cpi, size, dest, frame_flags,
+ /*encode_frame_result = */ NULL);
+}
+
+#if !CONFIG_REALTIME_ONLY
+static void Pass2Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest,
+ unsigned int *frame_flags,
+ ENCODE_FRAME_RESULT *encode_frame_result) {
+ cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED;
+#if CONFIG_MISMATCH_DEBUG
+ mismatch_move_frame_idx_w();
+#endif
+ encode_frame_to_data_rate(cpi, size, dest, frame_flags, encode_frame_result);
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+int vp9_receive_raw_frame(VP9_COMP *cpi, vpx_enc_frame_flags_t frame_flags,
+ YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
+ int64_t end_time) {
+ VP9_COMMON *const cm = &cpi->common;
+ struct vpx_usec_timer timer;
+ int res = 0;
+ const int subsampling_x = sd->subsampling_x;
+ const int subsampling_y = sd->subsampling_y;
+#if CONFIG_VP9_HIGHBITDEPTH
+ const int use_highbitdepth = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
+#else
+ const int use_highbitdepth = 0;
+#endif
+
+ update_initial_width(cpi, use_highbitdepth, subsampling_x, subsampling_y);
+#if CONFIG_VP9_TEMPORAL_DENOISING
+ setup_denoiser_buffer(cpi);
+#endif
+
+ alloc_raw_frame_buffers(cpi);
+
+ vpx_usec_timer_start(&timer);
+
+ if (vp9_lookahead_push(cpi->lookahead, sd, time_stamp, end_time,
+ use_highbitdepth, frame_flags))
+ res = -1;
+ vpx_usec_timer_mark(&timer);
+ cpi->time_receive_data += vpx_usec_timer_elapsed(&timer);
+
+ if ((cm->profile == PROFILE_0 || cm->profile == PROFILE_2) &&
+ (subsampling_x != 1 || subsampling_y != 1)) {
+ vpx_internal_error(&cm->error, VPX_CODEC_INVALID_PARAM,
+ "Non-4:2:0 color format requires profile 1 or 3");
+ res = -1;
+ }
+ if ((cm->profile == PROFILE_1 || cm->profile == PROFILE_3) &&
+ (subsampling_x == 1 && subsampling_y == 1)) {
+ vpx_internal_error(&cm->error, VPX_CODEC_INVALID_PARAM,
+ "4:2:0 color format requires profile 0 or 2");
+ res = -1;
+ }
+
+ return res;
+}
+
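+// A frame is a reference if it refreshes any reference buffer or updates
+// state (frame context, loop filter deltas, segmentation) that later frames
+// depend on.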
+static int frame_is_reference(const VP9_COMP *cpi) {
+ const VP9_COMMON *cm = &cpi->common;
+
+ return cm->frame_type == KEY_FRAME || cpi->refresh_last_frame ||
+ cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame ||
+ cm->refresh_frame_context || cm->lf.mode_ref_delta_update ||
+ cm->seg.update_map || cm->seg.update_data;
+}
+
+static void adjust_frame_rate(VP9_COMP *cpi,
+ const struct lookahead_entry *source) {
+ int64_t this_duration;
+ int step = 0;
+
+ if (source->ts_start == cpi->first_time_stamp_ever) {
+ this_duration = source->ts_end - source->ts_start;
+ step = 1;
+ } else {
+ int64_t last_duration =
+ cpi->last_end_time_stamp_seen - cpi->last_time_stamp_seen;
+
+ this_duration = source->ts_end - cpi->last_end_time_stamp_seen;
+
+ // do a step update if the duration changes by 10%
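+    // The integer division truncates toward zero, so only a change of at
+    // least 10% of the previous duration yields a non-zero step.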
+ if (last_duration)
+ step = (int)((this_duration - last_duration) * 10 / last_duration);
+ }
+
+ if (this_duration) {
+ if (step) {
+ vp9_new_framerate(cpi, 10000000.0 / this_duration);
+ } else {
+ // Average this frame's rate into the last second's average
+ // frame rate. If we haven't seen 1 second yet, then average
+ // over the whole interval seen.
+ const double interval = VPXMIN(
+ (double)(source->ts_end - cpi->first_time_stamp_ever), 10000000.0);
+ double avg_duration = 10000000.0 / cpi->framerate;
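+      // This moves the average duration toward this_duration by a fraction
+      // avg_duration / interval of the gap between them.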
+ avg_duration *= (interval - avg_duration + this_duration);
+ avg_duration /= interval;
+
+ vp9_new_framerate(cpi, 10000000.0 / avg_duration);
+ }
+ }
+ cpi->last_time_stamp_seen = source->ts_start;
+ cpi->last_end_time_stamp_seen = source->ts_end;
+}
+
+// Returns 0 if this is not an alt ref, else the offset of the source frame
+// used as the arf midpoint.
+static int get_arf_src_index(VP9_COMP *cpi) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ int arf_src_index = 0;
+ if (is_altref_enabled(cpi)) {
+ if (cpi->oxcf.pass == 2) {
+ const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+ if (gf_group->update_type[gf_group->index] == ARF_UPDATE) {
+ arf_src_index = gf_group->arf_src_offset[gf_group->index];
+ }
+ } else if (rc->source_alt_ref_pending) {
+ arf_src_index = rc->frames_till_gf_update_due;
+ }
+ }
+ return arf_src_index;
+}
+
+static void check_src_altref(VP9_COMP *cpi,
+ const struct lookahead_entry *source) {
+ RATE_CONTROL *const rc = &cpi->rc;
+
+ if (cpi->oxcf.pass == 2) {
+ const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+ rc->is_src_frame_alt_ref =
+ (gf_group->update_type[gf_group->index] == OVERLAY_UPDATE);
+ } else {
+ rc->is_src_frame_alt_ref =
+ cpi->alt_ref_source && (source == cpi->alt_ref_source);
+ }
+
+ if (rc->is_src_frame_alt_ref) {
+ // Current frame is an ARF overlay frame.
+ cpi->alt_ref_source = NULL;
+
+ // Don't refresh the last buffer for an ARF overlay frame. It will
+ // become the GF so preserve last as an alternative prediction option.
+ cpi->refresh_last_frame = 0;
+ }
+}
+
+#if CONFIG_INTERNAL_STATS
+static void adjust_image_stat(double y, double u, double v, double all,
+ ImageStat *s) {
+ s->stat[Y] += y;
+ s->stat[U] += u;
+ s->stat[V] += v;
+ s->stat[ALL] += all;
+ s->worst = VPXMIN(s->worst, all);
+}
+#endif // CONFIG_INTERNAL_STATS
+
+// Adjust the maximum allowable frame size for the target level.
+static void level_rc_framerate(VP9_COMP *cpi, int arf_src_index) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ LevelConstraint *const ls = &cpi->level_constraint;
+ VP9_COMMON *const cm = &cpi->common;
+ const double max_cpb_size = ls->max_cpb_size;
+ vpx_clear_system_state();
+ rc->max_frame_bandwidth = VPXMIN(rc->max_frame_bandwidth, ls->max_frame_size);
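+  // Cap the next frame at a fraction of the CPB: half for intra-only frames,
+  // 40% for alt-ref frames and 20% for other inter frames.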
+ if (frame_is_intra_only(cm)) {
+ rc->max_frame_bandwidth =
+ VPXMIN(rc->max_frame_bandwidth, (int)(max_cpb_size * 0.5));
+ } else if (arf_src_index > 0) {
+ rc->max_frame_bandwidth =
+ VPXMIN(rc->max_frame_bandwidth, (int)(max_cpb_size * 0.4));
+ } else {
+ rc->max_frame_bandwidth =
+ VPXMIN(rc->max_frame_bandwidth, (int)(max_cpb_size * 0.2));
+ }
+}
+
+static void update_level_info(VP9_COMP *cpi, size_t *size, int arf_src_index) {
+ VP9_COMMON *const cm = &cpi->common;
+ Vp9LevelInfo *const level_info = &cpi->level_info;
+ Vp9LevelSpec *const level_spec = &level_info->level_spec;
+ Vp9LevelStats *const level_stats = &level_info->level_stats;
+ int i, idx;
+ uint64_t luma_samples, dur_end;
+ const uint32_t luma_pic_size = cm->width * cm->height;
+ const uint32_t luma_pic_breadth = VPXMAX(cm->width, cm->height);
+ LevelConstraint *const level_constraint = &cpi->level_constraint;
+ const int8_t level_index = level_constraint->level_index;
+ double cpb_data_size;
+
+ vpx_clear_system_state();
+
+ // update level_stats
+ level_stats->total_compressed_size += *size;
+ if (cm->show_frame) {
+ level_stats->total_uncompressed_size +=
+ luma_pic_size +
+ 2 * (luma_pic_size >> (cm->subsampling_x + cm->subsampling_y));
+ level_stats->time_encoded =
+ (cpi->last_end_time_stamp_seen - cpi->first_time_stamp_ever) /
+ (double)TICKS_PER_SEC;
+ }
+
+ if (arf_src_index > 0) {
+ if (!level_stats->seen_first_altref) {
+ level_stats->seen_first_altref = 1;
+ } else if (level_stats->frames_since_last_altref <
+ level_spec->min_altref_distance) {
+ level_spec->min_altref_distance = level_stats->frames_since_last_altref;
+ }
+ level_stats->frames_since_last_altref = 0;
+ } else {
+ ++level_stats->frames_since_last_altref;
+ }
+
+ if (level_stats->frame_window_buffer.len < FRAME_WINDOW_SIZE - 1) {
+ idx = (level_stats->frame_window_buffer.start +
+ level_stats->frame_window_buffer.len++) %
+ FRAME_WINDOW_SIZE;
+ } else {
+ idx = level_stats->frame_window_buffer.start;
+ level_stats->frame_window_buffer.start = (idx + 1) % FRAME_WINDOW_SIZE;
+ }
+ level_stats->frame_window_buffer.buf[idx].ts = cpi->last_time_stamp_seen;
+ level_stats->frame_window_buffer.buf[idx].size = (uint32_t)(*size);
+ level_stats->frame_window_buffer.buf[idx].luma_samples = luma_pic_size;
+
+ if (cm->frame_type == KEY_FRAME) {
+ level_stats->ref_refresh_map = 0;
+ } else {
+ int count = 0;
+ level_stats->ref_refresh_map |= vp9_get_refresh_mask(cpi);
+ // Also need to consider the case where the encoder refers to a buffer
+ // that has been implicitly refreshed after encoding a keyframe.
+ if (!cm->intra_only) {
+ level_stats->ref_refresh_map |= (1 << cpi->lst_fb_idx);
+ level_stats->ref_refresh_map |= (1 << cpi->gld_fb_idx);
+ level_stats->ref_refresh_map |= (1 << cpi->alt_fb_idx);
+ }
+ for (i = 0; i < REF_FRAMES; ++i) {
+ count += (level_stats->ref_refresh_map >> i) & 1;
+ }
+ if (count > level_spec->max_ref_frame_buffers) {
+ level_spec->max_ref_frame_buffers = count;
+ }
+ }
+
+ // update average_bitrate
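+  // total_compressed_size is in bytes; dividing by 125 converts bytes to
+  // kilobits (x 8 / 1000), so average_bitrate is in kilobits per second.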
+ level_spec->average_bitrate = (double)level_stats->total_compressed_size /
+ 125.0 / level_stats->time_encoded;
+
+ // update max_luma_sample_rate
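+  // Walk the frame window backwards from the most recent frame, summing luma
+  // samples of the frames that fall within one second of the most recent one.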
+ luma_samples = 0;
+ for (i = 0; i < level_stats->frame_window_buffer.len; ++i) {
+ idx = (level_stats->frame_window_buffer.start +
+ level_stats->frame_window_buffer.len - 1 - i) %
+ FRAME_WINDOW_SIZE;
+ if (i == 0) {
+ dur_end = level_stats->frame_window_buffer.buf[idx].ts;
+ }
+ if (dur_end - level_stats->frame_window_buffer.buf[idx].ts >=
+ TICKS_PER_SEC) {
+ break;
+ }
+ luma_samples += level_stats->frame_window_buffer.buf[idx].luma_samples;
+ }
+ if (luma_samples > level_spec->max_luma_sample_rate) {
+ level_spec->max_luma_sample_rate = luma_samples;
+ }
+
+ // update max_cpb_size
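+  // Sum the compressed sizes (bytes) of the most recent CPB_WINDOW_SIZE
+  // frames and convert the total to kilobits.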
+ cpb_data_size = 0;
+ for (i = 0; i < CPB_WINDOW_SIZE; ++i) {
+ if (i >= level_stats->frame_window_buffer.len) break;
+ idx = (level_stats->frame_window_buffer.start +
+ level_stats->frame_window_buffer.len - 1 - i) %
+ FRAME_WINDOW_SIZE;
+ cpb_data_size += level_stats->frame_window_buffer.buf[idx].size;
+ }
+ cpb_data_size = cpb_data_size / 125.0;
+ if (cpb_data_size > level_spec->max_cpb_size) {
+ level_spec->max_cpb_size = cpb_data_size;
+ }
+
+ // update max_luma_picture_size
+ if (luma_pic_size > level_spec->max_luma_picture_size) {
+ level_spec->max_luma_picture_size = luma_pic_size;
+ }
+
+ // update max_luma_picture_breadth
+ if (luma_pic_breadth > level_spec->max_luma_picture_breadth) {
+ level_spec->max_luma_picture_breadth = luma_pic_breadth;
+ }
+
+ // update compression_ratio
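+  // total_uncompressed_size is in samples; multiplying by bit_depth and
+  // dividing by 8 gives bytes, so the ratio is uncompressed over compressed
+  // bytes.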
+ level_spec->compression_ratio = (double)level_stats->total_uncompressed_size *
+ cm->bit_depth /
+ level_stats->total_compressed_size / 8.0;
+
+ // update max_col_tiles
+ if (level_spec->max_col_tiles < (1 << cm->log2_tile_cols)) {
+ level_spec->max_col_tiles = (1 << cm->log2_tile_cols);
+ }
+
+ if (level_index >= 0 && level_constraint->fail_flag == 0) {
+ if (level_spec->max_luma_picture_size >
+ vp9_level_defs[level_index].max_luma_picture_size) {
+ level_constraint->fail_flag |= (1 << LUMA_PIC_SIZE_TOO_LARGE);
+ vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+ "Failed to encode to the target level %d. %s",
+ vp9_level_defs[level_index].level,
+ level_fail_messages[LUMA_PIC_SIZE_TOO_LARGE]);
+ }
+
+ if (level_spec->max_luma_picture_breadth >
+ vp9_level_defs[level_index].max_luma_picture_breadth) {
+ level_constraint->fail_flag |= (1 << LUMA_PIC_BREADTH_TOO_LARGE);
+ vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+ "Failed to encode to the target level %d. %s",
+ vp9_level_defs[level_index].level,
+ level_fail_messages[LUMA_PIC_BREADTH_TOO_LARGE]);
+ }
+
+ if ((double)level_spec->max_luma_sample_rate >
+ (double)vp9_level_defs[level_index].max_luma_sample_rate *
+ (1 + SAMPLE_RATE_GRACE_P)) {
+ level_constraint->fail_flag |= (1 << LUMA_SAMPLE_RATE_TOO_LARGE);
+ vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+ "Failed to encode to the target level %d. %s",
+ vp9_level_defs[level_index].level,
+ level_fail_messages[LUMA_SAMPLE_RATE_TOO_LARGE]);
+ }
+
+ if (level_spec->max_col_tiles > vp9_level_defs[level_index].max_col_tiles) {
+ level_constraint->fail_flag |= (1 << TOO_MANY_COLUMN_TILE);
+ vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+ "Failed to encode to the target level %d. %s",
+ vp9_level_defs[level_index].level,
+ level_fail_messages[TOO_MANY_COLUMN_TILE]);
+ }
+
+ if (level_spec->min_altref_distance <
+ vp9_level_defs[level_index].min_altref_distance) {
+ level_constraint->fail_flag |= (1 << ALTREF_DIST_TOO_SMALL);
+ vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+ "Failed to encode to the target level %d. %s",
+ vp9_level_defs[level_index].level,
+ level_fail_messages[ALTREF_DIST_TOO_SMALL]);
+ }
+
+ if (level_spec->max_ref_frame_buffers >
+ vp9_level_defs[level_index].max_ref_frame_buffers) {
+ level_constraint->fail_flag |= (1 << TOO_MANY_REF_BUFFER);
+ vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+ "Failed to encode to the target level %d. %s",
+ vp9_level_defs[level_index].level,
+ level_fail_messages[TOO_MANY_REF_BUFFER]);
+ }
+
+ if (level_spec->max_cpb_size > vp9_level_defs[level_index].max_cpb_size) {
+ level_constraint->fail_flag |= (1 << CPB_TOO_LARGE);
+ vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+ "Failed to encode to the target level %d. %s",
+ vp9_level_defs[level_index].level,
+ level_fail_messages[CPB_TOO_LARGE]);
+ }
+
+ // Set an upper bound for the next frame size. It will be used in
+ // level_rc_framerate() before encoding the next frame.
+ cpb_data_size = 0;
+ for (i = 0; i < CPB_WINDOW_SIZE - 1; ++i) {
+ if (i >= level_stats->frame_window_buffer.len) break;
+ idx = (level_stats->frame_window_buffer.start +
+ level_stats->frame_window_buffer.len - 1 - i) %
+ FRAME_WINDOW_SIZE;
+ cpb_data_size += level_stats->frame_window_buffer.buf[idx].size;
+ }
+ cpb_data_size = cpb_data_size / 125.0;
+ level_constraint->max_frame_size =
+ (int)((vp9_level_defs[level_index].max_cpb_size - cpb_data_size) *
+ 1000.0);
+ if (level_stats->frame_window_buffer.len < CPB_WINDOW_SIZE - 1)
+ level_constraint->max_frame_size >>= 1;
+ }
+}
+
+void vp9_get_ref_frame_info(FRAME_UPDATE_TYPE update_type, int ref_frame_flags,
+ RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES],
+ int *ref_frame_coding_indexes,
+ int *ref_frame_valid_list) {
+ if (update_type != KF_UPDATE) {
+ const VP9_REFFRAME inter_ref_flags[MAX_INTER_REF_FRAMES] = { VP9_LAST_FLAG,
+ VP9_GOLD_FLAG,
+ VP9_ALT_FLAG };
+ int i;
+ for (i = 0; i < MAX_INTER_REF_FRAMES; ++i) {
+ assert(ref_frame_bufs[i] != NULL);
+ ref_frame_coding_indexes[i] = ref_frame_bufs[i]->frame_coding_index;
+ ref_frame_valid_list[i] = (ref_frame_flags & inter_ref_flags[i]) != 0;
+ }
+ } else {
+ // No reference frame is available when this is a key frame.
+ int i;
+ for (i = 0; i < MAX_INTER_REF_FRAMES; ++i) {
+ ref_frame_coding_indexes[i] = -1;
+ ref_frame_valid_list[i] = 0;
+ }
+ }
+}
+
+void vp9_init_encode_frame_result(ENCODE_FRAME_RESULT *encode_frame_result) {
+ encode_frame_result->show_idx = -1; // Actual encoding doesn't happen.
+#if CONFIG_RATE_CTRL
+ encode_frame_result->frame_coding_index = -1;
+ vp9_zero(encode_frame_result->coded_frame);
+ encode_frame_result->coded_frame.allocated = 0;
+ init_rq_history(&encode_frame_result->rq_history);
+#endif // CONFIG_RATE_CTRL
+}
+
+int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
+ size_t *size, uint8_t *dest, int64_t *time_stamp,
+ int64_t *time_end, int flush,
+ ENCODE_FRAME_RESULT *encode_frame_result) {
+ const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+ VP9_COMMON *const cm = &cpi->common;
+ BufferPool *const pool = cm->buffer_pool;
+ RATE_CONTROL *const rc = &cpi->rc;
+ struct vpx_usec_timer cmptimer;
+ YV12_BUFFER_CONFIG *force_src_buffer = NULL;
+ struct lookahead_entry *last_source = NULL;
+ struct lookahead_entry *source = NULL;
+ int arf_src_index;
+ const int gf_group_index = cpi->twopass.gf_group.index;
+ int i;
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ if (oxcf->pass == 2) start_timing(cpi, vp9_get_compressed_data_time);
+#endif
+
+ if (is_one_pass_svc(cpi)) {
+ vp9_one_pass_svc_start_layer(cpi);
+ }
+
+ vpx_usec_timer_start(&cmptimer);
+
+ vp9_set_high_precision_mv(cpi, ALTREF_HIGH_PRECISION_MV);
+
+  // Is multi-arf enabled?
+  // Note that at the moment multi_arf is only configured for 2 pass VBR and
+  // will not work properly with svc.
+  // Enable Jingning's new "multi_layer_arf" code when "enable_auto_arf"
+  // is greater than or equal to 2.
+ if ((oxcf->pass == 2) && !cpi->use_svc && (cpi->oxcf.enable_auto_arf >= 2))
+ cpi->multi_layer_arf = 1;
+ else
+ cpi->multi_layer_arf = 0;
+
+ // Normal defaults
+ cm->reset_frame_context = 0;
+ cm->refresh_frame_context = 1;
+ if (!is_one_pass_svc(cpi)) {
+ cpi->refresh_last_frame = 1;
+ cpi->refresh_golden_frame = 0;
+ cpi->refresh_alt_ref_frame = 0;
+ }
+
+ // Should we encode an arf frame.
+ arf_src_index = get_arf_src_index(cpi);
+
+ if (arf_src_index) {
+ for (i = 0; i <= arf_src_index; ++i) {
+ struct lookahead_entry *e = vp9_lookahead_peek(cpi->lookahead, i);
+ // Avoid creating an alt-ref if there's a forced keyframe pending.
+ if (e == NULL) {
+ break;
+ } else if (e->flags == VPX_EFLAG_FORCE_KF) {
+ arf_src_index = 0;
+ flush = 1;
+ break;
+ }
+ }
+ }
+
+ // Clear arf index stack before group of pictures processing starts.
+ if (gf_group_index == 1) {
+ stack_init(cpi->twopass.gf_group.arf_index_stack, MAX_LAG_BUFFERS * 2);
+ cpi->twopass.gf_group.stack_size = 0;
+ }
+
+ if (arf_src_index) {
+ assert(arf_src_index <= rc->frames_to_key);
+ if ((source = vp9_lookahead_peek(cpi->lookahead, arf_src_index)) != NULL) {
+ cpi->alt_ref_source = source;
+
+#if !CONFIG_REALTIME_ONLY
+ if ((oxcf->mode != REALTIME) && (oxcf->arnr_max_frames > 0) &&
+ (oxcf->arnr_strength > 0)) {
+ int bitrate = cpi->rc.avg_frame_bandwidth / 40;
+ int not_low_bitrate = bitrate > ALT_REF_AQ_LOW_BITRATE_BOUNDARY;
+
+ int not_last_frame = (cpi->lookahead->sz - arf_src_index > 1);
+ not_last_frame |= ALT_REF_AQ_APPLY_TO_LAST_FRAME;
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, vp9_temporal_filter_time);
+#endif
+ // Produce the filtered ARF frame.
+ vp9_temporal_filter(cpi, arf_src_index);
+ vpx_extend_frame_borders(&cpi->alt_ref_buffer);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, vp9_temporal_filter_time);
+#endif
+
+        // For small bitrates, segmentation overhead usually eats all of the
+        // bitrate gain from enabling delta quantizers.
+ if (cpi->oxcf.alt_ref_aq != 0 && not_low_bitrate && not_last_frame)
+ vp9_alt_ref_aq_setup_mode(cpi->alt_ref_aq, cpi);
+
+ force_src_buffer = &cpi->alt_ref_buffer;
+ }
+#endif
+ cm->show_frame = 0;
+ cm->intra_only = 0;
+ cpi->refresh_alt_ref_frame = 1;
+ cpi->refresh_golden_frame = 0;
+ cpi->refresh_last_frame = 0;
+ rc->is_src_frame_alt_ref = 0;
+ rc->source_alt_ref_pending = 0;
+ } else {
+ rc->source_alt_ref_pending = 0;
+ }
+ }
+
+ if (!source) {
+ // Get last frame source.
+ if (cm->current_video_frame > 0) {
+ if ((last_source = vp9_lookahead_peek(cpi->lookahead, -1)) == NULL)
+ return -1;
+ }
+
+ // Read in the source frame.
+ if (cpi->use_svc || cpi->svc.set_intra_only_frame)
+ source = vp9_svc_lookahead_pop(cpi, cpi->lookahead, flush);
+ else
+ source = vp9_lookahead_pop(cpi->lookahead, flush);
+
+ if (source != NULL) {
+ cm->show_frame = 1;
+ cm->intra_only = 0;
+      // If the flags indicate an intra frame but the current picture is for a
+      // spatial layer above first_spatial_layer_to_encode, it should not be
+      // an intra picture.
+ if ((source->flags & VPX_EFLAG_FORCE_KF) && cpi->use_svc &&
+ cpi->svc.spatial_layer_id > cpi->svc.first_spatial_layer_to_encode) {
+ source->flags &= ~(unsigned int)(VPX_EFLAG_FORCE_KF);
+ }
+
+ // Check to see if the frame should be encoded as an arf overlay.
+ check_src_altref(cpi, source);
+ }
+ }
+
+ if (source) {
+ cpi->un_scaled_source = cpi->Source =
+ force_src_buffer ? force_src_buffer : &source->img;
+
+#ifdef ENABLE_KF_DENOISE
+ // Copy of raw source for metrics calculation.
+ if (is_psnr_calc_enabled(cpi))
+ vp9_copy_and_extend_frame(cpi->Source, &cpi->raw_unscaled_source);
+#endif
+
+ cpi->unscaled_last_source = last_source != NULL ? &last_source->img : NULL;
+
+ *time_stamp = source->ts_start;
+ *time_end = source->ts_end;
+ *frame_flags = (source->flags & VPX_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0;
+ } else {
+ *size = 0;
+ return -1;
+ }
+
+ if (source->ts_start < cpi->first_time_stamp_ever) {
+ cpi->first_time_stamp_ever = source->ts_start;
+ cpi->last_end_time_stamp_seen = source->ts_start;
+ }
+
+ // Clear down mmx registers
+ vpx_clear_system_state();
+
+ // adjust frame rates based on timestamps given
+ if (cm->show_frame) {
+ if (cpi->use_svc && cpi->svc.use_set_ref_frame_config &&
+ cpi->svc.duration[cpi->svc.spatial_layer_id] > 0)
+ vp9_svc_adjust_frame_rate(cpi);
+ else
+ adjust_frame_rate(cpi, source);
+ }
+
+ if (is_one_pass_svc(cpi)) {
+ vp9_update_temporal_layer_framerate(cpi);
+ vp9_restore_layer_context(cpi);
+ }
+
+ // Find a free buffer for the new frame, releasing the reference previously
+ // held.
+ if (cm->new_fb_idx != INVALID_IDX) {
+ --pool->frame_bufs[cm->new_fb_idx].ref_count;
+ }
+ cm->new_fb_idx = get_free_fb(cm);
+
+ if (cm->new_fb_idx == INVALID_IDX) return -1;
+ cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx];
+  // If the frame buffer for the current frame is the same as the previous
+  // frame's, MVs from the base layer shouldn't be used, as that would cause a
+  // data race.
+ if (cpi->svc.spatial_layer_id > 0 && cm->cur_frame == cm->prev_frame) {
+ cpi->svc.use_base_mv = 0;
+ }
+ // Start with a 0 size frame.
+ *size = 0;
+
+ cpi->frame_flags = *frame_flags;
+
+#if !CONFIG_REALTIME_ONLY
+ if ((oxcf->pass == 2) && !cpi->use_svc) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, vp9_rc_get_second_pass_params_time);
+#endif
+ vp9_rc_get_second_pass_params(cpi);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, vp9_rc_get_second_pass_params_time);
+#endif
+ } else if (oxcf->pass == 1) {
+ set_frame_size(cpi);
+ }
+#endif // !CONFIG_REALTIME_ONLY
+
+ if (oxcf->pass != 1 && cpi->level_constraint.level_index >= 0 &&
+ cpi->level_constraint.fail_flag == 0)
+ level_rc_framerate(cpi, arf_src_index);
+
+ if (cpi->oxcf.pass != 0 || cpi->use_svc || frame_is_intra_only(cm) == 1) {
+ for (i = 0; i < REFS_PER_FRAME; ++i) cpi->scaled_ref_idx[i] = INVALID_IDX;
+ }
+
+ if (cpi->kmeans_data_arr_alloc == 0) {
+ const int mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
+ const int mi_rows = mi_cols_aligned_to_sb(cm->mi_rows);
+#if CONFIG_MULTITHREAD
+ pthread_mutex_init(&cpi->kmeans_mutex, NULL);
+#endif
+ CHECK_MEM_ERROR(
+ &cm->error, cpi->kmeans_data_arr,
+ vpx_calloc(mi_rows * mi_cols, sizeof(*cpi->kmeans_data_arr)));
+ cpi->kmeans_data_stride = mi_cols;
+ cpi->kmeans_data_arr_alloc = 1;
+ }
+
+#if CONFIG_NON_GREEDY_MV
+ {
+ const int mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
+ const int mi_rows = mi_cols_aligned_to_sb(cm->mi_rows);
+ Status status = vp9_alloc_motion_field_info(
+ &cpi->motion_field_info, MAX_ARF_GOP_SIZE, mi_rows, mi_cols);
+ if (status == STATUS_FAILED) {
+ vpx_internal_error(&(cm)->error, VPX_CODEC_MEM_ERROR,
+ "vp9_alloc_motion_field_info failed");
+ }
+ }
+#endif // CONFIG_NON_GREEDY_MV
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, setup_tpl_stats_time);
+#endif
+ if (gf_group_index == 1 &&
+ cpi->twopass.gf_group.update_type[gf_group_index] == ARF_UPDATE &&
+ cpi->sf.enable_tpl_model) {
+ vp9_init_tpl_buffer(cpi);
+ vp9_estimate_qp_gop(cpi);
+ vp9_setup_tpl_stats(cpi);
+ }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, setup_tpl_stats_time);
+#endif
+
+#if CONFIG_BITSTREAM_DEBUG
+ assert(cpi->oxcf.max_threads == 0 &&
+ "bitstream debug tool does not support multithreading");
+ bitstream_queue_record_write();
+#endif
+#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
+ bitstream_queue_set_frame_write(cm->current_video_frame * 2 + cm->show_frame);
+#endif
+
+ cpi->td.mb.fp_src_pred = 0;
+#if CONFIG_REALTIME_ONLY
+ (void)encode_frame_result;
+ if (cpi->use_svc) {
+ SvcEncode(cpi, size, dest, frame_flags);
+ } else {
+ // One pass encode
+ Pass0Encode(cpi, size, dest, frame_flags);
+ }
+#else // !CONFIG_REALTIME_ONLY
+ if (oxcf->pass == 1 && !cpi->use_svc) {
+ const int lossless = is_lossless_requested(oxcf);
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cpi->oxcf.use_highbitdepth)
+ cpi->td.mb.fwd_txfm4x4 =
+ lossless ? vp9_highbd_fwht4x4 : vpx_highbd_fdct4x4;
+ else
+ cpi->td.mb.fwd_txfm4x4 = lossless ? vp9_fwht4x4 : vpx_fdct4x4;
+ cpi->td.mb.highbd_inv_txfm_add =
+ lossless ? vp9_highbd_iwht4x4_add : vp9_highbd_idct4x4_add;
+#else
+ cpi->td.mb.fwd_txfm4x4 = lossless ? vp9_fwht4x4 : vpx_fdct4x4;
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ cpi->td.mb.inv_txfm_add = lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;
+ vp9_first_pass(cpi, source);
+ } else if (oxcf->pass == 2 && !cpi->use_svc) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ // Accumulate 2nd pass time in 2-pass case.
+ start_timing(cpi, Pass2Encode_time);
+#endif
+ Pass2Encode(cpi, size, dest, frame_flags, encode_frame_result);
+ vp9_twopass_postencode_update(cpi);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, Pass2Encode_time);
+#endif
+ } else if (cpi->use_svc) {
+ SvcEncode(cpi, size, dest, frame_flags);
+ } else {
+ // One pass encode
+ Pass0Encode(cpi, size, dest, frame_flags);
+ }
+#endif // CONFIG_REALTIME_ONLY
+
+ if (cm->show_frame) cm->cur_show_frame_fb_idx = cm->new_fb_idx;
+
+ if (cm->refresh_frame_context)
+ cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
+
+  // If no frame was encoded, or the frame was dropped, release the scaled
+  // references.
+ if ((*size == 0) && (frame_is_intra_only(cm) == 0)) {
+ release_scaled_references(cpi);
+ }
+
+ if (*size > 0) {
+ cpi->droppable = !frame_is_reference(cpi);
+ }
+
+ // Save layer specific state.
+ if (is_one_pass_svc(cpi) || ((cpi->svc.number_temporal_layers > 1 ||
+ cpi->svc.number_spatial_layers > 1) &&
+ oxcf->pass == 2)) {
+ vp9_save_layer_context(cpi);
+ }
+
+ if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)
+ cpi->fixed_qp_onepass = 0;
+
+ vpx_usec_timer_mark(&cmptimer);
+ cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer);
+
+ if (cpi->keep_level_stats && oxcf->pass != 1)
+ update_level_info(cpi, size, arf_src_index);
+
+#if CONFIG_INTERNAL_STATS
+
+ if (oxcf->pass != 1 && !cpi->last_frame_dropped) {
+ double samples = 0.0;
+ cpi->bytes += (int)(*size);
+
+ if (cm->show_frame) {
+ uint32_t bit_depth = 8;
+ uint32_t in_bit_depth = 8;
+ cpi->count++;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cm->use_highbitdepth) {
+ in_bit_depth = cpi->oxcf.input_bit_depth;
+ bit_depth = cm->bit_depth;
+ }
+#endif
+
+ if (cpi->b_calculate_psnr) {
+ YV12_BUFFER_CONFIG *orig = cpi->raw_source_frame;
+ YV12_BUFFER_CONFIG *recon = cpi->common.frame_to_show;
+ YV12_BUFFER_CONFIG *pp = &cm->post_proc_buffer;
+ PSNR_STATS psnr;
+#if CONFIG_VP9_HIGHBITDEPTH
+ vpx_calc_highbd_psnr(orig, recon, &psnr, cpi->td.mb.e_mbd.bd,
+ in_bit_depth);
+#else
+ vpx_calc_psnr(orig, recon, &psnr);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ adjust_image_stat(psnr.psnr[1], psnr.psnr[2], psnr.psnr[3],
+ psnr.psnr[0], &cpi->psnr);
+ cpi->total_sq_error += psnr.sse[0];
+ cpi->total_samples += psnr.samples[0];
+ samples = psnr.samples[0];
+
+ {
+ PSNR_STATS psnr2;
+ double frame_ssim2 = 0, weight = 0;
+#if CONFIG_VP9_POSTPROC
+ if (vpx_alloc_frame_buffer(
+ pp, recon->y_crop_width, recon->y_crop_height,
+ cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
+ VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment) < 0) {
+ vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate post processing buffer");
+ }
+ {
+ vp9_ppflags_t ppflags;
+ ppflags.post_proc_flag = VP9D_DEBLOCK;
+ ppflags.deblocking_level = 0; // not used in vp9_post_proc_frame()
+ ppflags.noise_level = 0; // not used in vp9_post_proc_frame()
+ vp9_post_proc_frame(cm, pp, &ppflags,
+ cpi->un_scaled_source->y_width);
+ }
+#endif
+ vpx_clear_system_state();
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ vpx_calc_highbd_psnr(orig, pp, &psnr2, cpi->td.mb.e_mbd.bd,
+ cpi->oxcf.input_bit_depth);
+#else
+ vpx_calc_psnr(orig, pp, &psnr2);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ cpi->totalp_sq_error += psnr2.sse[0];
+ cpi->totalp_samples += psnr2.samples[0];
+ adjust_image_stat(psnr2.psnr[1], psnr2.psnr[2], psnr2.psnr[3],
+ psnr2.psnr[0], &cpi->psnrp);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cm->use_highbitdepth) {
+ frame_ssim2 = vpx_highbd_calc_ssim(orig, recon, &weight, bit_depth,
+ in_bit_depth);
+ } else {
+ frame_ssim2 = vpx_calc_ssim(orig, recon, &weight);
+ }
+#else
+ frame_ssim2 = vpx_calc_ssim(orig, recon, &weight);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ cpi->worst_ssim = VPXMIN(cpi->worst_ssim, frame_ssim2);
+ cpi->summed_quality += frame_ssim2 * weight;
+ cpi->summed_weights += weight;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cm->use_highbitdepth) {
+ frame_ssim2 = vpx_highbd_calc_ssim(orig, pp, &weight, bit_depth,
+ in_bit_depth);
+ } else {
+ frame_ssim2 = vpx_calc_ssim(orig, pp, &weight);
+ }
+#else
+ frame_ssim2 = vpx_calc_ssim(orig, pp, &weight);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ cpi->summedp_quality += frame_ssim2 * weight;
+ cpi->summedp_weights += weight;
+#if 0
+ if (cm->show_frame) {
+ FILE *f = fopen("q_used.stt", "a");
+          fprintf(f, "%5d : Y%7.3f:U%7.3f:V%7.3f:F%7.3f:S%7.3f\n",
+ cpi->common.current_video_frame, psnr2.psnr[1],
+ psnr2.psnr[2], psnr2.psnr[3], psnr2.psnr[0], frame_ssim2);
+ fclose(f);
+ }
+#endif
+ }
+ }
+ if (cpi->b_calculate_blockiness) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (!cm->use_highbitdepth)
+#endif
+ {
+ double frame_blockiness = vp9_get_blockiness(
+ cpi->Source->y_buffer, cpi->Source->y_stride,
+ cm->frame_to_show->y_buffer, cm->frame_to_show->y_stride,
+ cpi->Source->y_width, cpi->Source->y_height);
+ cpi->worst_blockiness =
+ VPXMAX(cpi->worst_blockiness, frame_blockiness);
+ cpi->total_blockiness += frame_blockiness;
+ }
+ }
+
+ if (cpi->b_calculate_consistency) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (!cm->use_highbitdepth)
+#endif
+ {
+ double this_inconsistency = vpx_get_ssim_metrics(
+ cpi->Source->y_buffer, cpi->Source->y_stride,
+ cm->frame_to_show->y_buffer, cm->frame_to_show->y_stride,
+ cpi->Source->y_width, cpi->Source->y_height, cpi->ssim_vars,
+ &cpi->metrics, 1);
+
+ const double peak = (double)((1 << cpi->oxcf.input_bit_depth) - 1);
+ double consistency =
+ vpx_sse_to_psnr(samples, peak, (double)cpi->total_inconsistency);
+ if (consistency > 0.0)
+ cpi->worst_consistency =
+ VPXMIN(cpi->worst_consistency, consistency);
+ cpi->total_inconsistency += this_inconsistency;
+ }
+ }
+
+ {
+ double y, u, v, frame_all;
+ frame_all = vpx_calc_fastssim(cpi->Source, cm->frame_to_show, &y, &u,
+ &v, bit_depth, in_bit_depth);
+ adjust_image_stat(y, u, v, frame_all, &cpi->fastssim);
+ }
+ {
+ double y, u, v, frame_all;
+ frame_all = vpx_psnrhvs(cpi->Source, cm->frame_to_show, &y, &u, &v,
+ bit_depth, in_bit_depth);
+ adjust_image_stat(y, u, v, frame_all, &cpi->psnrhvs);
+ }
+ }
+ }
+
+#endif
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ if (oxcf->pass == 2) end_timing(cpi, vp9_get_compressed_data_time);
+
+ // Print out timing information.
+  // Note: use "cpi->frame_component_time[0] > 100 us" to avoid printing
+  // stats for show_existing_frame and lag-in-frames.
+ // if (cpi->frame_component_time[0] > 100)
+ if (oxcf->pass == 2) {
+ uint64_t frame_total = 0, total = 0;
+ int i;
+
+ fprintf(stderr,
+ "\n Frame number: %d, Frame type: %s, Show Frame: %d, Q: %d\n",
+ cm->current_video_frame, get_frame_type_enum(cm->frame_type),
+ cm->show_frame, cm->base_qindex);
+ for (i = 0; i < kTimingComponents; i++) {
+ cpi->component_time[i] += cpi->frame_component_time[i];
+ // Use vp9_get_compressed_data_time (i = 0) as the total time.
+ if (i == 0) {
+ frame_total = cpi->frame_component_time[0];
+ total = cpi->component_time[0];
+ }
+ fprintf(stderr,
+ " %50s: %15" PRId64 " us [%6.2f%%] (total: %15" PRId64
+ " us [%6.2f%%])\n",
+ get_component_name(i), cpi->frame_component_time[i],
+ (float)((float)cpi->frame_component_time[i] * 100.0 /
+ (float)frame_total),
+ cpi->component_time[i],
+ (float)((float)cpi->component_time[i] * 100.0 / (float)total));
+ cpi->frame_component_time[i] = 0;
+ }
+ }
+#endif
+
+ if (is_one_pass_svc(cpi)) {
+ if (cm->show_frame) {
+ ++cpi->svc.spatial_layer_to_encode;
+ if (cpi->svc.spatial_layer_to_encode >= cpi->svc.number_spatial_layers)
+ cpi->svc.spatial_layer_to_encode = 0;
+ }
+ }
+
+ vpx_clear_system_state();
+ return 0;
+}
+
+int vp9_get_preview_raw_frame(VP9_COMP *cpi, YV12_BUFFER_CONFIG *dest,
+ vp9_ppflags_t *flags) {
+ VP9_COMMON *cm = &cpi->common;
+#if !CONFIG_VP9_POSTPROC
+ (void)flags;
+#endif
+
+ if (!cm->show_frame) {
+ return -1;
+ } else {
+ int ret;
+#if CONFIG_VP9_POSTPROC
+ ret = vp9_post_proc_frame(cm, dest, flags, cpi->un_scaled_source->y_width);
+#else
+ if (cm->frame_to_show) {
+ *dest = *cm->frame_to_show;
+ dest->y_width = cm->width;
+ dest->y_height = cm->height;
+ dest->uv_width = cm->width >> cm->subsampling_x;
+ dest->uv_height = cm->height >> cm->subsampling_y;
+ ret = 0;
+ } else {
+ ret = -1;
+ }
+#endif // !CONFIG_VP9_POSTPROC
+ vpx_clear_system_state();
+ return ret;
+ }
+}
+
+int vp9_set_internal_size(VP9_COMP *cpi, VPX_SCALING_MODE horiz_mode,
+ VPX_SCALING_MODE vert_mode) {
+ VP9_COMMON *cm = &cpi->common;
+ int hr = 0, hs = 0, vr = 0, vs = 0;
+
+ if (horiz_mode > VP8E_ONETWO || vert_mode > VP8E_ONETWO) return -1;
+
+ Scale2Ratio(horiz_mode, &hr, &hs);
+ Scale2Ratio(vert_mode, &vr, &vs);
+
+  // Always round up to the next whole number (ceiling division).
+ cm->width = (hs - 1 + cpi->oxcf.width * hr) / hs;
+ cm->height = (vs - 1 + cpi->oxcf.height * vr) / vs;
+ if (cm->current_video_frame) {
+ assert(cm->width <= cpi->initial_width);
+ assert(cm->height <= cpi->initial_height);
+ }
+
+ update_frame_size(cpi);
+
+ return 0;
+}
+
+int vp9_set_size_literal(VP9_COMP *cpi, unsigned int width,
+ unsigned int height) {
+ VP9_COMMON *cm = &cpi->common;
+#if CONFIG_VP9_HIGHBITDEPTH
+ update_initial_width(cpi, cm->use_highbitdepth, cpi->common.subsampling_x,
+ cpi->common.subsampling_y);
+#else
+ update_initial_width(cpi, 0, cpi->common.subsampling_x,
+ cpi->common.subsampling_y);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+ setup_denoiser_buffer(cpi);
+#endif
+ alloc_raw_frame_buffers(cpi);
+ if (width) {
+ cm->width = width;
+ if (cm->width > cpi->initial_width) {
+ cm->width = cpi->initial_width;
+ printf("Warning: Desired width too large, changed to %d\n", cm->width);
+ }
+ }
+
+ if (height) {
+ cm->height = height;
+ if (cm->height > cpi->initial_height) {
+ cm->height = cpi->initial_height;
+ printf("Warning: Desired height too large, changed to %d\n", cm->height);
+ }
+ }
+ assert(cm->width <= cpi->initial_width);
+ assert(cm->height <= cpi->initial_height);
+
+ update_frame_size(cpi);
+
+ return 0;
+}
+
+void vp9_set_svc(VP9_COMP *cpi, int use_svc) {
+ cpi->use_svc = use_svc;
+ return;
+}
+
+int vp9_get_quantizer(const VP9_COMP *cpi) { return cpi->common.base_qindex; }
+
+void vp9_apply_encoding_flags(VP9_COMP *cpi, vpx_enc_frame_flags_t flags) {
+ if (flags &
+ (VP8_EFLAG_NO_REF_LAST | VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF)) {
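+    // Start with all three references enabled
+    // (VP9_LAST_FLAG | VP9_GOLD_FLAG | VP9_ALT_FLAG == 7) and clear those the
+    // NO_REF flags disable.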
+ int ref = 7;
+
+ if (flags & VP8_EFLAG_NO_REF_LAST) ref ^= VP9_LAST_FLAG;
+
+ if (flags & VP8_EFLAG_NO_REF_GF) ref ^= VP9_GOLD_FLAG;
+
+ if (flags & VP8_EFLAG_NO_REF_ARF) ref ^= VP9_ALT_FLAG;
+
+ vp9_use_as_reference(cpi, ref);
+ }
+
+ if (flags &
+ (VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF |
+ VP8_EFLAG_FORCE_GF | VP8_EFLAG_FORCE_ARF)) {
+ int upd = 7;
+
+ if (flags & VP8_EFLAG_NO_UPD_LAST) upd ^= VP9_LAST_FLAG;
+
+ if (flags & VP8_EFLAG_NO_UPD_GF) upd ^= VP9_GOLD_FLAG;
+
+ if (flags & VP8_EFLAG_NO_UPD_ARF) upd ^= VP9_ALT_FLAG;
+
+ vp9_update_reference(cpi, upd);
+ }
+
+ if (flags & VP8_EFLAG_NO_UPD_ENTROPY) {
+ vp9_update_entropy(cpi, 0);
+ }
+}
+
+void vp9_set_row_mt(VP9_COMP *cpi) {
+ // Enable row based multi-threading for supported modes of encoding
+ cpi->row_mt = 0;
+ if (((cpi->oxcf.mode == GOOD || cpi->oxcf.mode == BEST) &&
+ cpi->oxcf.speed < 5 && cpi->oxcf.pass == 1) &&
+ cpi->oxcf.row_mt && !cpi->use_svc)
+ cpi->row_mt = 1;
+
+ if (cpi->oxcf.mode == GOOD && cpi->oxcf.speed < 5 &&
+ (cpi->oxcf.pass == 0 || cpi->oxcf.pass == 2) && cpi->oxcf.row_mt &&
+ !cpi->use_svc)
+ cpi->row_mt = 1;
+
+ // In realtime mode, enable row based multi-threading for all the speed levels
+ // where non-rd path is used.
+ if (cpi->oxcf.mode == REALTIME && cpi->oxcf.speed >= 5 && cpi->oxcf.row_mt) {
+ cpi->row_mt = 1;
+ }
+
+ if (cpi->row_mt)
+ cpi->row_mt_bit_exact = 1;
+ else
+ cpi->row_mt_bit_exact = 0;
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_encoder.h b/media/libvpx/libvpx/vp9/encoder/vp9_encoder.h
new file mode 100644
index 0000000000..2528bc2316
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_encoder.h
@@ -0,0 +1,1642 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_VP9_ENCODER_H_
+#define VPX_VP9_ENCODER_VP9_ENCODER_H_
+
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/internal/vpx_codec_internal.h"
+#include "vpx/vpx_ext_ratectrl.h"
+#include "vpx/vp8cx.h"
+#if CONFIG_INTERNAL_STATS
+#include "vpx_dsp/ssim.h"
+#endif
+#include "vpx_dsp/variance.h"
+#include "vpx_dsp/psnr.h"
+#include "vpx_ports/system_state.h"
+#include "vpx_util/vpx_thread.h"
+#include "vpx_util/vpx_timestamp.h"
+
+#include "vp9/common/vp9_alloccommon.h"
+#include "vp9/common/vp9_ppflags.h"
+#include "vp9/common/vp9_entropymode.h"
+#include "vp9/common/vp9_thread_common.h"
+#include "vp9/common/vp9_onyxc_int.h"
+
+#if !CONFIG_REALTIME_ONLY
+#include "vp9/encoder/vp9_alt_ref_aq.h"
+#endif
+#include "vp9/encoder/vp9_aq_cyclicrefresh.h"
+#include "vp9/encoder/vp9_context_tree.h"
+#include "vp9/encoder/vp9_encodemb.h"
+#include "vp9/encoder/vp9_ethread.h"
+#include "vp9/encoder/vp9_ext_ratectrl.h"
+#include "vp9/encoder/vp9_firstpass.h"
+#include "vp9/encoder/vp9_job_queue.h"
+#include "vp9/encoder/vp9_lookahead.h"
+#include "vp9/encoder/vp9_mbgraph.h"
+#include "vp9/encoder/vp9_mcomp.h"
+#include "vp9/encoder/vp9_noise_estimate.h"
+#include "vp9/encoder/vp9_quantize.h"
+#include "vp9/encoder/vp9_ratectrl.h"
+#include "vp9/encoder/vp9_rd.h"
+#include "vp9/encoder/vp9_speed_features.h"
+#include "vp9/encoder/vp9_svc_layercontext.h"
+#include "vp9/encoder/vp9_tokenize.h"
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+#include "vp9/encoder/vp9_denoiser.h"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// vp9 uses 10,000,000 ticks/second as time stamp
+#define TICKS_PER_SEC 10000000
+
+typedef struct {
+ int nmvjointcost[MV_JOINTS];
+ int nmvcosts[2][MV_VALS];
+ int nmvcosts_hp[2][MV_VALS];
+
+ vpx_prob segment_pred_probs[PREDICTION_PROBS];
+
+ unsigned char *last_frame_seg_map_copy;
+
+ // 0 = Intra, Last, GF, ARF
+ signed char last_ref_lf_deltas[MAX_REF_LF_DELTAS];
+ // 0 = ZERO_MV, MV
+ signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS];
+
+ FRAME_CONTEXT fc;
+} CODING_CONTEXT;
+
+typedef enum {
+ // encode_breakout is disabled.
+ ENCODE_BREAKOUT_DISABLED = 0,
+ // encode_breakout is enabled.
+ ENCODE_BREAKOUT_ENABLED = 1,
+ // encode_breakout is enabled with small max_thresh limit.
+ ENCODE_BREAKOUT_LIMITED = 2
+} ENCODE_BREAKOUT_TYPE;
+
+typedef enum {
+ // Good Quality Fast Encoding. The encoder balances quality with the amount of
+ // time it takes to encode the output. Speed setting controls how fast.
+ GOOD,
+
+ // The encoder places priority on the quality of the output over encoding
+ // speed. The output is compressed at the highest possible quality. This
+ // option takes the longest amount of time to encode. Speed setting ignored.
+ BEST,
+
+ // Realtime/Live Encoding. This mode is optimized for realtime encoding (for
+ // example, capturing a television signal or feed from a live camera). Speed
+ // setting controls how fast.
+ REALTIME
+} MODE;
+
+typedef enum {
+ FRAMEFLAGS_KEY = 1 << 0,
+ FRAMEFLAGS_GOLDEN = 1 << 1,
+ FRAMEFLAGS_ALTREF = 1 << 2,
+} FRAMETYPE_FLAGS;
+
+typedef enum {
+ NO_AQ = 0,
+ VARIANCE_AQ = 1,
+ COMPLEXITY_AQ = 2,
+ CYCLIC_REFRESH_AQ = 3,
+ EQUATOR360_AQ = 4,
+ PERCEPTUAL_AQ = 5,
+ PSNR_AQ = 6,
+ // AQ based on lookahead temporal
+ // variance (only valid for altref frames)
+ LOOKAHEAD_AQ = 7,
+ AQ_MODE_COUNT // This should always be the last member of the enum
+} AQ_MODE;
+
+typedef enum {
+ RESIZE_NONE = 0, // No frame resizing allowed (except for SVC).
+ RESIZE_FIXED = 1, // All frames are coded at the specified dimension.
+ RESIZE_DYNAMIC = 2 // Coded size of each frame is determined by the codec.
+} RESIZE_TYPE;
+
+typedef enum {
+ kInvalid = 0,
+ kLowSadLowSumdiff = 1,
+ kLowSadHighSumdiff = 2,
+ kHighSadLowSumdiff = 3,
+ kHighSadHighSumdiff = 4,
+ kLowVarHighSumdiff = 5,
+ kVeryHighSad = 6,
+} CONTENT_STATE_SB;
+
+typedef enum {
+ LOOPFILTER_ALL = 0,
+ LOOPFILTER_REFERENCE = 1, // Disable loopfilter on non reference frames.
+ NO_LOOPFILTER = 2, // Disable loopfilter on all frames.
+} LOOPFILTER_CONTROL;
+
+typedef struct VP9EncoderConfig {
+ BITSTREAM_PROFILE profile;
+ vpx_bit_depth_t bit_depth; // Codec bit-depth.
+ int width; // width of data passed to the compressor
+ int height; // height of data passed to the compressor
+ unsigned int input_bit_depth; // Input bit depth.
+ double init_framerate; // set to passed in framerate
+ vpx_rational_t g_timebase; // equivalent to g_timebase in vpx_codec_enc_cfg_t
+ vpx_rational64_t g_timebase_in_ts; // g_timebase * TICKS_PER_SEC
+
+ int64_t target_bandwidth; // bandwidth to be used in bits per second
+
+ int noise_sensitivity; // pre processing blur: recommendation 0
+ int sharpness; // sharpening output: recommendation 0
+ int speed;
+ // maximum allowed bitrate for any intra frame in % of bitrate target.
+ unsigned int rc_max_intra_bitrate_pct;
+ // maximum allowed bitrate for any inter frame in % of bitrate target.
+ unsigned int rc_max_inter_bitrate_pct;
+ // percent of rate boost for golden frame in CBR mode.
+ unsigned int gf_cbr_boost_pct;
+
+ MODE mode;
+ int pass;
+
+ // Key Framing Operations
+ int auto_key; // autodetect cut scenes and set the keyframes
+ int key_freq; // maximum distance to key frame.
+
+ int lag_in_frames; // how many frames lag before we start encoding
+
+ // ----------------------------------------------------------------
+ // DATARATE CONTROL OPTIONS
+
+ // vbr, cbr, constrained quality or constant quality
+ enum vpx_rc_mode rc_mode;
+
+ // buffer targeting aggressiveness
+ int under_shoot_pct;
+ int over_shoot_pct;
+
+ // buffering parameters
+ int64_t starting_buffer_level_ms;
+ int64_t optimal_buffer_level_ms;
+ int64_t maximum_buffer_size_ms;
+
+ // Frame drop threshold.
+ int drop_frames_water_mark;
+
+ // controlling quality
+ int fixed_q;
+ int worst_allowed_q;
+ int best_allowed_q;
+ int cq_level;
+ AQ_MODE aq_mode; // Adaptive Quantization mode
+
+ // Special handling of Adaptive Quantization for AltRef frames
+ int alt_ref_aq;
+
+ // Internal frame size scaling.
+ RESIZE_TYPE resize_mode;
+ int scaled_frame_width;
+ int scaled_frame_height;
+
+ // Enable feature to reduce the frame quantization every x frames.
+ int frame_periodic_boost;
+
+ // two pass datarate control
+ int two_pass_vbrbias; // two pass datarate control tweaks
+ int two_pass_vbrmin_section;
+ int two_pass_vbrmax_section;
+ int vbr_corpus_complexity; // 0 indicates corpus vbr disabled
+ // END DATARATE CONTROL OPTIONS
+ // ----------------------------------------------------------------
+
+ // Spatial and temporal scalability.
+ int ss_number_layers; // Number of spatial layers.
+ int ts_number_layers; // Number of temporal layers.
+ // Bitrate allocation for spatial layers.
+ int layer_target_bitrate[VPX_MAX_LAYERS];
+ int ss_target_bitrate[VPX_SS_MAX_LAYERS];
+ int ss_enable_auto_arf[VPX_SS_MAX_LAYERS];
+ // Bitrate allocation (CBR mode) and framerate factor, for temporal layers.
+ int ts_rate_decimator[VPX_TS_MAX_LAYERS];
+
+ int enable_auto_arf;
+
+ int encode_breakout; // early breakout: 800 recommended for video conferencing
+
+ /* Bitfield defining the error resiliency features to enable.
+ * Can provide decodable frames after losses in previous
+ * frames and decodable partitions after losses in the same frame.
+ */
+ unsigned int error_resilient_mode;
+
+ /* Bitfield defining the parallel decoding mode where the
+ * decoding in successive frames may be conducted in parallel
+ * just by decoding the frame headers.
+ */
+ unsigned int frame_parallel_decoding_mode;
+
+ int arnr_max_frames;
+ int arnr_strength;
+
+ int min_gf_interval;
+ int max_gf_interval;
+
+ int tile_columns;
+ int tile_rows;
+
+ int enable_tpl_model;
+
+ int max_threads;
+
+ unsigned int target_level;
+
+ vpx_fixed_buf_t two_pass_stats_in;
+
+ vp8e_tuning tuning;
+ vp9e_tune_content content;
+#if CONFIG_VP9_HIGHBITDEPTH
+ int use_highbitdepth;
+#endif
+ vpx_color_space_t color_space;
+ vpx_color_range_t color_range;
+ int render_width;
+ int render_height;
+ VP9E_TEMPORAL_LAYERING_MODE temporal_layering_mode;
+
+ int row_mt;
+ unsigned int motion_vector_unit_test;
+ int delta_q_uv;
+ int use_simple_encode_api; // Use SimpleEncode APIs or not
+} VP9EncoderConfig;
+
+static INLINE int is_lossless_requested(const VP9EncoderConfig *cfg) {
+ return cfg->best_allowed_q == 0 && cfg->worst_allowed_q == 0;
+}
+
+typedef struct TplDepStats {
+ int64_t intra_cost;
+ int64_t inter_cost;
+ int64_t mc_flow;
+ int64_t mc_dep_cost;
+ int64_t mc_ref_cost;
+
+ int ref_frame_index;
+ int_mv mv;
+} TplDepStats;
+
+#if CONFIG_NON_GREEDY_MV
+
+#define ZERO_MV_MODE 0
+#define NEW_MV_MODE 1
+#define NEAREST_MV_MODE 2
+#define NEAR_MV_MODE 3
+#define MAX_MV_MODE 4
+#endif
+
+typedef struct TplDepFrame {
+ uint8_t is_valid;
+ TplDepStats *tpl_stats_ptr;
+ int stride;
+ int width;
+ int height;
+ int mi_rows;
+ int mi_cols;
+ int base_qindex;
+#if CONFIG_NON_GREEDY_MV
+ int lambda;
+ int *mv_mode_arr[3];
+ double *rd_diff_arr[3];
+#endif
+} TplDepFrame;
+
+#define TPL_DEP_COST_SCALE_LOG2 4
+
+// TODO(jingning) All spatially adaptive variables should go to TileDataEnc.
+typedef struct TileDataEnc {
+ TileInfo tile_info;
+ int thresh_freq_fact[BLOCK_SIZES][MAX_MODES];
+ int thresh_freq_fact_prev[BLOCK_SIZES][MAX_MODES];
+ int8_t mode_map[BLOCK_SIZES][MAX_MODES];
+ FIRSTPASS_DATA fp_data;
+ VP9RowMTSync row_mt_sync;
+
+ // Used for adaptive_rd_thresh with row multithreading
+ int *row_base_thresh_freq_fact;
+} TileDataEnc;
+
+typedef struct RowMTInfo {
+ JobQueueHandle job_queue_hdl;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t job_mutex;
+#endif
+} RowMTInfo;
+
+typedef struct {
+ TOKENEXTRA *start;
+ TOKENEXTRA *stop;
+ unsigned int count;
+} TOKENLIST;
+
+typedef struct MultiThreadHandle {
+ int allocated_tile_rows;
+ int allocated_tile_cols;
+ int allocated_vert_unit_rows;
+
+ // Frame level params
+ int num_tile_vert_sbs[MAX_NUM_TILE_ROWS];
+
+ // Job Queue structure and handles
+ JobQueue *job_queue;
+
+ int jobs_per_tile_col;
+
+ RowMTInfo row_mt_info[MAX_NUM_TILE_COLS];
+ int thread_id_to_tile_id[MAX_NUM_THREADS]; // Mapping of threads to tiles
+} MultiThreadHandle;
+
+typedef struct RD_COUNTS {
+ vp9_coeff_count coef_counts[TX_SIZES][PLANE_TYPES];
+ int64_t comp_pred_diff[REFERENCE_MODES];
+ int64_t filter_diff[SWITCHABLE_FILTER_CONTEXTS];
+} RD_COUNTS;
+
+typedef struct ThreadData {
+ MACROBLOCK mb;
+ RD_COUNTS rd_counts;
+ FRAME_COUNTS *counts;
+
+ PICK_MODE_CONTEXT *leaf_tree;
+ PC_TREE *pc_tree;
+ PC_TREE *pc_root;
+} ThreadData;
+
+struct EncWorkerData;
+
+typedef struct ActiveMap {
+ int enabled;
+ int update;
+ unsigned char *map;
+} ActiveMap;
+
+typedef enum { Y, U, V, ALL } STAT_TYPE;
+
+typedef struct IMAGE_STAT {
+ double stat[ALL + 1];
+ double worst;
+} ImageStat;
+
+// Kf noise filtering currently disabled by default in build.
+// #define ENABLE_KF_DENOISE 1
+
+#define CPB_WINDOW_SIZE 4
+#define FRAME_WINDOW_SIZE 128
+#define SAMPLE_RATE_GRACE_P 0.015
+#define VP9_LEVELS 14
+
+typedef enum {
+ LEVEL_UNKNOWN = 0,
+ LEVEL_AUTO = 1,
+ LEVEL_1 = 10,
+ LEVEL_1_1 = 11,
+ LEVEL_2 = 20,
+ LEVEL_2_1 = 21,
+ LEVEL_3 = 30,
+ LEVEL_3_1 = 31,
+ LEVEL_4 = 40,
+ LEVEL_4_1 = 41,
+ LEVEL_5 = 50,
+ LEVEL_5_1 = 51,
+ LEVEL_5_2 = 52,
+ LEVEL_6 = 60,
+ LEVEL_6_1 = 61,
+ LEVEL_6_2 = 62,
+ LEVEL_MAX = 255
+} VP9_LEVEL;
+
+typedef struct {
+ VP9_LEVEL level;
+ uint64_t max_luma_sample_rate;
+ uint32_t max_luma_picture_size;
+ uint32_t max_luma_picture_breadth;
+ double average_bitrate; // in kilobits per second
+ double max_cpb_size; // in kilobits
+ double compression_ratio;
+ uint8_t max_col_tiles;
+ uint32_t min_altref_distance;
+ uint8_t max_ref_frame_buffers;
+} Vp9LevelSpec;
+
+extern const Vp9LevelSpec vp9_level_defs[VP9_LEVELS];
+
+typedef struct {
+ int64_t ts; // timestamp
+ uint32_t luma_samples;
+ uint32_t size; // in bytes
+} FrameRecord;
+
+typedef struct {
+ FrameRecord buf[FRAME_WINDOW_SIZE];
+ uint8_t start;
+ uint8_t len;
+} FrameWindowBuffer;
+
+typedef struct {
+ uint8_t seen_first_altref;
+ uint32_t frames_since_last_altref;
+ uint64_t total_compressed_size;
+ uint64_t total_uncompressed_size;
+ double time_encoded; // in seconds
+ FrameWindowBuffer frame_window_buffer;
+ int ref_refresh_map;
+} Vp9LevelStats;
+
+typedef struct {
+ Vp9LevelStats level_stats;
+ Vp9LevelSpec level_spec;
+} Vp9LevelInfo;
+
+typedef enum {
+ BITRATE_TOO_LARGE = 0,
+ LUMA_PIC_SIZE_TOO_LARGE,
+ LUMA_PIC_BREADTH_TOO_LARGE,
+ LUMA_SAMPLE_RATE_TOO_LARGE,
+ CPB_TOO_LARGE,
+ COMPRESSION_RATIO_TOO_SMALL,
+ TOO_MANY_COLUMN_TILE,
+ ALTREF_DIST_TOO_SMALL,
+ TOO_MANY_REF_BUFFER,
+ TARGET_LEVEL_FAIL_IDS
+} TARGET_LEVEL_FAIL_ID;
+
+typedef struct {
+ int8_t level_index;
+ uint8_t fail_flag;
+ int max_frame_size; // in bits
+ double max_cpb_size; // in bits
+} LevelConstraint;
+
+typedef struct ARNRFilterData {
+ YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS];
+ int strength;
+ int frame_count;
+ int alt_ref_index;
+ struct scale_factors sf;
+} ARNRFilterData;
+
+typedef struct EncFrameBuf {
+ int mem_valid;
+ int released;
+ YV12_BUFFER_CONFIG frame;
+} EncFrameBuf;
+
+// Maximum operating frame buffer size needed for a GOP using ARF reference.
+// This is used to allocate the memory for TPL stats for a GOP.
+#define MAX_ARF_GOP_SIZE (2 * MAX_LAG_BUFFERS)
+#define MAX_KMEANS_GROUPS 8
+
+typedef struct KMEANS_DATA {
+ double value;
+ int pos;
+ int group_idx;
+} KMEANS_DATA;
+
+#if CONFIG_RATE_CTRL
+typedef struct PARTITION_INFO {
+ int row; // row pixel offset of current 4x4 block
+ int column; // column pixel offset of current 4x4 block
+ int row_start; // row pixel offset of the start of the prediction block
+ int column_start; // column pixel offset of the start of the prediction block
+ int width; // prediction block width
+ int height; // prediction block height
+} PARTITION_INFO;
+
+typedef struct MOTION_VECTOR_INFO {
+ MV_REFERENCE_FRAME ref_frame[2];
+ int_mv mv[2];
+} MOTION_VECTOR_INFO;
+
+typedef struct GOP_COMMAND {
+ int use; // Use this command to set the GOP or not. If not, use vp9's own decision.
+ int show_frame_count;
+ int use_alt_ref;
+} GOP_COMMAND;
+
+static INLINE void gop_command_on(GOP_COMMAND *gop_command,
+ int show_frame_count, int use_alt_ref) {
+ gop_command->use = 1;
+ gop_command->show_frame_count = show_frame_count;
+ gop_command->use_alt_ref = use_alt_ref;
+}
+
+static INLINE void gop_command_off(GOP_COMMAND *gop_command) {
+ gop_command->use = 0;
+ gop_command->show_frame_count = 0;
+ gop_command->use_alt_ref = 0;
+}
+
+static INLINE int gop_command_coding_frame_count(
+ const GOP_COMMAND *gop_command) {
+ if (gop_command->use == 0) {
+ assert(0);
+ return -1;
+ }
+ return gop_command->show_frame_count + gop_command->use_alt_ref;
+}
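+
+// Illustrative usage sketch (editor's note, not part of libvpx): requesting a
+// GOP of 6 shown frames plus an alt-ref frame, then reverting to vp9's own
+// GOP decision.
+//   GOP_COMMAND gop_command;
+//   gop_command_on(&gop_command, 6, /*use_alt_ref=*/1);
+//   assert(gop_command_coding_frame_count(&gop_command) == 7);
+//   gop_command_off(&gop_command);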
+
+// TODO(angiebird): See if we can merge this one with FrameType in
+// simple_encode.h
+typedef enum ENCODE_FRAME_TYPE {
+ ENCODE_FRAME_TYPE_KEY,
+ ENCODE_FRAME_TYPE_INTER,
+ ENCODE_FRAME_TYPE_ALTREF,
+ ENCODE_FRAME_TYPE_OVERLAY,
+ ENCODE_FRAME_TYPE_GOLDEN,
+ ENCODE_FRAME_TYPES,
+} ENCODE_FRAME_TYPE;
+
+// TODO(angiebird): Merge this function with get_frame_type_from_update_type()
+static INLINE ENCODE_FRAME_TYPE
+get_encode_frame_type(FRAME_UPDATE_TYPE update_type) {
+ switch (update_type) {
+ case KF_UPDATE: return ENCODE_FRAME_TYPE_KEY;
+ case ARF_UPDATE: return ENCODE_FRAME_TYPE_ALTREF;
+ case GF_UPDATE: return ENCODE_FRAME_TYPE_GOLDEN;
+ case OVERLAY_UPDATE: return ENCODE_FRAME_TYPE_OVERLAY;
+ case LF_UPDATE: return ENCODE_FRAME_TYPE_INTER;
+ default:
+ fprintf(stderr, "Unsupported update_type %d\n", update_type);
+ abort();
+ return ENCODE_FRAME_TYPE_INTER;
+ }
+}
+
+typedef struct RATE_QSTEP_MODEL {
+ // The rq model predicts the bit usage as follows.
+ // rate = bias - ratio * log2(q_step)
+ int ready;
+ double bias;
+ double ratio;
+} RATE_QSTEP_MODEL;
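+
+// Illustrative sketch (editor's note, not part of libvpx): once `bias` and
+// `ratio` have been fitted, the model above could be evaluated as
+//   double predict_rate_bits(const RATE_QSTEP_MODEL *m, double q_step) {
+//     assert(m->ready);
+//     return m->bias - m->ratio * log2(q_step);  // log2() needs <math.h>
+//   }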
+
+typedef struct ENCODE_COMMAND {
+ int use_external_quantize_index;
+ int external_quantize_index;
+
+ int use_external_target_frame_bits;
+ int target_frame_bits;
+ double target_frame_bits_error_percent;
+
+ GOP_COMMAND gop_command;
+} ENCODE_COMMAND;
+
+static INLINE void encode_command_set_gop_command(
+ ENCODE_COMMAND *encode_command, GOP_COMMAND gop_command) {
+ encode_command->gop_command = gop_command;
+}
+
+static INLINE void encode_command_set_external_quantize_index(
+ ENCODE_COMMAND *encode_command, int quantize_index) {
+ encode_command->use_external_quantize_index = 1;
+ encode_command->external_quantize_index = quantize_index;
+}
+
+static INLINE void encode_command_reset_external_quantize_index(
+ ENCODE_COMMAND *encode_command) {
+ encode_command->use_external_quantize_index = 0;
+ encode_command->external_quantize_index = -1;
+}
+
+static INLINE void encode_command_set_target_frame_bits(
+ ENCODE_COMMAND *encode_command, int target_frame_bits,
+ double target_frame_bits_error_percent) {
+ encode_command->use_external_target_frame_bits = 1;
+ encode_command->target_frame_bits = target_frame_bits;
+ encode_command->target_frame_bits_error_percent =
+ target_frame_bits_error_percent;
+}
+
+static INLINE void encode_command_reset_target_frame_bits(
+ ENCODE_COMMAND *encode_command) {
+ encode_command->use_external_target_frame_bits = 0;
+ encode_command->target_frame_bits = -1;
+ encode_command->target_frame_bits_error_percent = 0;
+}
+
+static INLINE void encode_command_init(ENCODE_COMMAND *encode_command) {
+ vp9_zero(*encode_command);
+ encode_command_reset_external_quantize_index(encode_command);
+ encode_command_reset_target_frame_bits(encode_command);
+ gop_command_off(&encode_command->gop_command);
+}
+
+// Returns the number of 4-pixel units covering |size|; if |size| is not a
+// multiple of 4, round up. For example, a size of 7 returns 2.
+static INLINE int get_num_unit_4x4(int size) { return (size + 3) >> 2; }
+// Returns the number of 16-pixel units covering |size|; if |size| is not a
+// multiple of 16, round up. For example, a size of 17 returns 2.
+static INLINE int get_num_unit_16x16(int size) { return (size + 15) >> 4; }
+#endif // CONFIG_RATE_CTRL
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+#include "vpx_ports/vpx_timer.h"
+// Adjust the following to add new components.
+typedef enum {
+ vp9_get_compressed_data_time,
+ vp9_temporal_filter_time,
+ vp9_rc_get_second_pass_params_time,
+ setup_tpl_stats_time,
+ Pass2Encode_time,
+
+ encode_with_recode_loop_time,
+ loopfilter_frame_time,
+ vp9_pack_bitstream_time,
+
+ encode_frame_internal_time,
+ rd_pick_partition_time,
+ rd_pick_sb_modes_time,
+ encode_sb_time,
+
+ vp9_rd_pick_inter_mode_sb_time,
+ vp9_rd_pick_inter_mode_sub8x8_time,
+
+ intra_mode_search_time,
+ handle_inter_mode_time,
+ single_motion_search_time,
+ joint_motion_search_time,
+ interp_filter_time,
+
+ kTimingComponents,
+} TIMING_COMPONENT;
+
+static INLINE char const *get_component_name(int index) {
+ switch (index) {
+ case vp9_get_compressed_data_time: return "vp9_get_compressed_data_time";
+ case vp9_temporal_filter_time: return "vp9_temporal_filter_time";
+ case vp9_rc_get_second_pass_params_time:
+ return "vp9_rc_get_second_pass_params_time";
+ case setup_tpl_stats_time: return "setup_tpl_stats_time";
+ case Pass2Encode_time: return "Pass2Encode_time";
+
+ case encode_with_recode_loop_time: return "encode_with_recode_loop_time";
+ case loopfilter_frame_time: return "loopfilter_frame_time";
+ case vp9_pack_bitstream_time: return "vp9_pack_bitstream_time";
+
+ case encode_frame_internal_time: return "encode_frame_internal_time";
+ case rd_pick_partition_time: return "rd_pick_partition_time";
+ case rd_pick_sb_modes_time: return "rd_pick_sb_modes_time";
+ case encode_sb_time: return "encode_sb_time";
+
+ case vp9_rd_pick_inter_mode_sb_time:
+ return "vp9_rd_pick_inter_mode_sb_time";
+ case vp9_rd_pick_inter_mode_sub8x8_time:
+ return "vp9_rd_pick_inter_mode_sub8x8_time";
+
+ case intra_mode_search_time: return "intra_mode_search_time";
+ case handle_inter_mode_time: return "handle_inter_mode_time";
+ case single_motion_search_time: return "single_motion_search_time";
+ case joint_motion_search_time: return "joint_motion_search_time";
+ case interp_filter_time: return "interp_filter_time";
+
+ default: assert(0);
+ }
+ return "error";
+}
+#endif
+
+typedef struct VP9_COMP {
+ FRAME_INFO frame_info;
+ QUANTS quants;
+ ThreadData td;
+ MB_MODE_INFO_EXT *mbmi_ext_base;
+ DECLARE_ALIGNED(16, int16_t, y_dequant[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, uv_dequant[QINDEX_RANGE][8]);
+ VP9_COMMON common;
+ VP9EncoderConfig oxcf;
+ struct lookahead_ctx *lookahead;
+ struct lookahead_entry *alt_ref_source;
+
+ YV12_BUFFER_CONFIG *Source;
+ YV12_BUFFER_CONFIG *Last_Source; // NULL for first frame and alt_ref frames
+ YV12_BUFFER_CONFIG *un_scaled_source;
+ YV12_BUFFER_CONFIG scaled_source;
+ YV12_BUFFER_CONFIG *unscaled_last_source;
+ YV12_BUFFER_CONFIG scaled_last_source;
+#ifdef ENABLE_KF_DENOISE
+ YV12_BUFFER_CONFIG raw_unscaled_source;
+ YV12_BUFFER_CONFIG raw_scaled_source;
+#endif
+ YV12_BUFFER_CONFIG *raw_source_frame;
+
+ BLOCK_SIZE tpl_bsize;
+ TplDepFrame tpl_stats[MAX_ARF_GOP_SIZE];
+ // Used to store TPL stats before propagation
+ VpxTplGopStats tpl_gop_stats;
+ YV12_BUFFER_CONFIG *tpl_recon_frames[REF_FRAMES];
+ EncFrameBuf enc_frame_buf[REF_FRAMES];
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t kmeans_mutex;
+#endif
+ int kmeans_data_arr_alloc;
+ KMEANS_DATA *kmeans_data_arr;
+ int kmeans_data_size;
+ int kmeans_data_stride;
+ double kmeans_ctr_ls[MAX_KMEANS_GROUPS];
+ double kmeans_boundary_ls[MAX_KMEANS_GROUPS];
+ int kmeans_count_ls[MAX_KMEANS_GROUPS];
+ int kmeans_ctr_num;
+#if CONFIG_NON_GREEDY_MV
+ MotionFieldInfo motion_field_info;
+ int tpl_ready;
+ int_mv *select_mv_arr;
+#endif
+
+ TileDataEnc *tile_data;
+ int allocated_tiles; // Keep track of memory allocated for tiles.
+
+ int scaled_ref_idx[REFS_PER_FRAME];
+ int lst_fb_idx;
+ int gld_fb_idx;
+ int alt_fb_idx;
+
+ int ref_fb_idx[REF_FRAMES];
+
+ int refresh_last_frame;
+ int refresh_golden_frame;
+ int refresh_alt_ref_frame;
+
+ int ext_refresh_frame_flags_pending;
+ int ext_refresh_last_frame;
+ int ext_refresh_golden_frame;
+ int ext_refresh_alt_ref_frame;
+
+ int ext_refresh_frame_context_pending;
+ int ext_refresh_frame_context;
+
+ int64_t norm_wiener_variance;
+ int64_t *mb_wiener_variance;
+ int mb_wiener_var_rows;
+ int mb_wiener_var_cols;
+ double *mi_ssim_rdmult_scaling_factors;
+
+ YV12_BUFFER_CONFIG last_frame_uf;
+
+ TOKENEXTRA *tile_tok[4][1 << 6];
+ TOKENLIST *tplist[4][1 << 6];
+
+ // Ambient reconstruction error target for forced key frames
+ int64_t ambient_err;
+
+ RD_CONTROL rd_ctrl;
+ RD_OPT rd;
+
+ CODING_CONTEXT coding_context;
+
+ int *nmvcosts[2];
+ int *nmvcosts_hp[2];
+ int *nmvsadcosts[2];
+ int *nmvsadcosts_hp[2];
+
+ int64_t last_time_stamp_seen;
+ int64_t last_end_time_stamp_seen;
+ int64_t first_time_stamp_ever;
+
+ RATE_CONTROL rc;
+ double framerate;
+
+ int interp_filter_selected[REF_FRAMES][SWITCHABLE];
+
+ struct vpx_codec_pkt_list *output_pkt_list;
+
+ MBGRAPH_FRAME_STATS mbgraph_stats[MAX_LAG_BUFFERS];
+ int mbgraph_n_frames; // number of frames filled in the above
+ int static_mb_pct; // % forced skip mbs by segmentation
+ int ref_frame_flags;
+
+ SPEED_FEATURES sf;
+
+ uint32_t max_mv_magnitude;
+ int mv_step_param;
+
+ int allow_comp_inter_inter;
+
+ // Default value is 1. From first pass stats, encode_breakout may be disabled.
+ ENCODE_BREAKOUT_TYPE allow_encode_breakout;
+
+ // Get threshold from external input. A suggested threshold is 800 for HD
+ // clips, and 300 for < HD clips.
+ int encode_breakout;
+
+ uint8_t *segmentation_map;
+
+ uint8_t *skin_map;
+
+ // segment threshold for encode breakout
+ int segment_encode_breakout[MAX_SEGMENTS];
+
+ CYCLIC_REFRESH *cyclic_refresh;
+ ActiveMap active_map;
+
+ fractional_mv_step_fp *find_fractional_mv_step;
+ struct scale_factors me_sf;
+ vp9_diamond_search_fn_t diamond_search_sad;
+ vp9_variance_fn_ptr_t fn_ptr[BLOCK_SIZES];
+ uint64_t time_receive_data;
+ uint64_t time_compress_data;
+ uint64_t time_pick_lpf;
+ uint64_t time_encode_sb_row;
+
+ TWO_PASS twopass;
+
+ // Force recalculation of segment_ids for each mode info
+ uint8_t force_update_segmentation;
+
+ YV12_BUFFER_CONFIG alt_ref_buffer;
+
+ // class responsible for adaptive
+ // quantization of altref frames
+ struct ALT_REF_AQ *alt_ref_aq;
+
+#if CONFIG_INTERNAL_STATS
+ unsigned int mode_chosen_counts[MAX_MODES];
+
+ int count;
+ uint64_t total_sq_error;
+ uint64_t total_samples;
+ ImageStat psnr;
+
+ uint64_t totalp_sq_error;
+ uint64_t totalp_samples;
+ ImageStat psnrp;
+
+ double total_blockiness;
+ double worst_blockiness;
+
+ int bytes;
+ double summed_quality;
+ double summed_weights;
+ double summedp_quality;
+ double summedp_weights;
+ unsigned int tot_recode_hits;
+ double worst_ssim;
+
+ ImageStat ssimg;
+ ImageStat fastssim;
+ ImageStat psnrhvs;
+
+ int b_calculate_ssimg;
+ int b_calculate_blockiness;
+
+ int b_calculate_consistency;
+
+ double total_inconsistency;
+ double worst_consistency;
+ Ssimv *ssim_vars;
+ Metrics metrics;
+#endif
+ int b_calculate_psnr;
+
+ int droppable;
+
+ int initial_width;
+ int initial_height;
+ int initial_mbs; // Number of MBs in the full-size frame; to be used to
+ // normalize the firstpass stats. This will differ from the
+ // number of MBs in the current frame when the frame is
+ // scaled.
+
+ int use_svc;
+
+ SVC svc;
+
+ // Store frame variance info in SOURCE_VAR_BASED_PARTITION search type.
+ Diff *source_diff_var;
+ // The threshold used in SOURCE_VAR_BASED_PARTITION search type.
+ unsigned int source_var_thresh;
+ int frames_till_next_var_check;
+
+ int frame_flags;
+
+ search_site_config ss_cfg;
+
+ int mbmode_cost[INTRA_MODES];
+ unsigned int inter_mode_cost[INTER_MODE_CONTEXTS][INTER_MODES];
+ int intra_uv_mode_cost[FRAME_TYPES][INTRA_MODES][INTRA_MODES];
+ int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES];
+ int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS];
+ int partition_cost[PARTITION_CONTEXTS][PARTITION_TYPES];
+ // Indices are: max_tx_size-1, tx_size_ctx, tx_size
+ int tx_size_cost[TX_SIZES - 1][TX_SIZE_CONTEXTS][TX_SIZES];
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+ VP9_DENOISER denoiser;
+#endif
+
+ int resize_pending;
+ RESIZE_STATE resize_state;
+ int external_resize;
+ int resize_scale_num;
+ int resize_scale_den;
+ int resize_avg_qp;
+ int resize_buffer_underflow;
+ int resize_count;
+
+ int use_skin_detection;
+
+ int target_level;
+
+ NOISE_ESTIMATE noise_estimate;
+
+ // Counts how many consecutive times a block uses small/zero MV for encoding.
+ uint8_t *consec_zero_mv;
+
+ // VAR_BASED_PARTITION thresholds
+ // 0 - threshold_64x64; 1 - threshold_32x32;
+ // 2 - threshold_16x16; 3 - vbp_threshold_8x8;
+ int64_t vbp_thresholds[4];
+ int64_t vbp_threshold_minmax;
+ int64_t vbp_threshold_sad;
+ // Threshold used for partition copy
+ int64_t vbp_threshold_copy;
+ BLOCK_SIZE vbp_bsize_min;
+
+ // Multi-threading
+ int num_workers;
+ VPxWorker *workers;
+ struct EncWorkerData *tile_thr_data;
+ VP9LfSync lf_row_sync;
+ struct VP9BitstreamWorkerData *vp9_bitstream_worker_data;
+
+ int keep_level_stats;
+ Vp9LevelInfo level_info;
+ MultiThreadHandle multi_thread_ctxt;
+ void (*row_mt_sync_read_ptr)(VP9RowMTSync *const, int, int);
+ void (*row_mt_sync_write_ptr)(VP9RowMTSync *const, int, int, const int);
+ ARNRFilterData arnr_filter_data;
+
+ int row_mt;
+ unsigned int row_mt_bit_exact;
+
+ // Previous Partition Info
+ BLOCK_SIZE *prev_partition;
+ int8_t *prev_segment_id;
+ // Used to save the status of whether a block has a low variance in
+ // choose_partitioning. 0 for 64x64, 1~2 for 64x32, 3~4 for 32x64, 5~8 for
+ // 32x32, 9~24 for 16x16.
+ // This is for the last frame and is copied to the current frame
+ // when partition copy happens.
+ uint8_t *prev_variance_low;
+ uint8_t *copied_frame_cnt;
+ uint8_t max_copied_frame;
+ // If the last frame is dropped, we don't copy partition.
+ uint8_t last_frame_dropped;
+
+ // For each superblock: keeps track of the last time (in frame distance) the
+ // superblock did not have low source sad.
+ uint8_t *content_state_sb_fd;
+
+ int compute_source_sad_onepass;
+
+ int compute_frame_low_motion_onepass;
+
+ LevelConstraint level_constraint;
+
+ uint8_t *count_arf_frame_usage;
+ uint8_t *count_lastgolden_frame_usage;
+
+ int multi_layer_arf;
+ vpx_roi_map_t roi;
+
+ LOOPFILTER_CONTROL loopfilter_ctrl;
+#if CONFIG_RATE_CTRL
+ ENCODE_COMMAND encode_command;
+ PARTITION_INFO *partition_info;
+ MOTION_VECTOR_INFO *motion_vector_info;
+ MOTION_VECTOR_INFO *fp_motion_vector_info;
+ TplDepStats *tpl_stats_info;
+
+ RATE_QSTEP_MODEL rq_model[ENCODE_FRAME_TYPES];
+#endif
+ EXT_RATECTRL ext_ratectrl;
+
+ int fixed_qp_onepass;
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ /*!
+ * component_time[] are initialized to zero when the encoder starts.
+ */
+ uint64_t component_time[kTimingComponents];
+ /*!
+ * Stores timing for individual components between calls of start_timing()
+ * and end_timing().
+ */
+ struct vpx_usec_timer component_timer[kTimingComponents];
+ /*!
+ * frame_component_time[] are initialized to zero at the beginning of each frame.
+ */
+ uint64_t frame_component_time[kTimingComponents];
+#endif
+} VP9_COMP;
+
+#if CONFIG_RATE_CTRL
+// Allocates memory for the partition information.
+// The unit size is each 4x4 block.
+// Only called once in vp9_create_compressor().
+static INLINE void partition_info_init(struct VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ const int unit_width = get_num_unit_4x4(cpi->frame_info.frame_width);
+ const int unit_height = get_num_unit_4x4(cpi->frame_info.frame_height);
+ CHECK_MEM_ERROR(&cm->error, cpi->partition_info,
+ (PARTITION_INFO *)vpx_calloc(unit_width * unit_height,
+ sizeof(PARTITION_INFO)));
+ memset(cpi->partition_info, 0,
+ unit_width * unit_height * sizeof(PARTITION_INFO));
+}
+
+// Frees memory of the partition information.
+// Only called once in dealloc_compressor_data().
+static INLINE void free_partition_info(struct VP9_COMP *cpi) {
+ vpx_free(cpi->partition_info);
+ cpi->partition_info = NULL;
+}
+
+static INLINE void reset_mv_info(MOTION_VECTOR_INFO *mv_info) {
+ mv_info->ref_frame[0] = NONE;
+ mv_info->ref_frame[1] = NONE;
+ mv_info->mv[0].as_int = INVALID_MV;
+ mv_info->mv[1].as_int = INVALID_MV;
+}
+
+// Allocates memory for the motion vector information.
+// The unit size is each 4x4 block.
+// Only called once in vp9_create_compressor().
+static INLINE void motion_vector_info_init(struct VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ const int unit_width = get_num_unit_4x4(cpi->frame_info.frame_width);
+ const int unit_height = get_num_unit_4x4(cpi->frame_info.frame_height);
+ CHECK_MEM_ERROR(&cm->error, cpi->motion_vector_info,
+ (MOTION_VECTOR_INFO *)vpx_calloc(unit_width * unit_height,
+ sizeof(MOTION_VECTOR_INFO)));
+ memset(cpi->motion_vector_info, 0,
+ unit_width * unit_height * sizeof(MOTION_VECTOR_INFO));
+}
+
+// Frees memory of the motion vector information.
+// Only called once in dealloc_compressor_data().
+static INLINE void free_motion_vector_info(struct VP9_COMP *cpi) {
+ vpx_free(cpi->motion_vector_info);
+ cpi->motion_vector_info = NULL;
+}
+
+// Allocates memory for the tpl stats information.
+// Only called once in vp9_create_compressor().
+static INLINE void tpl_stats_info_init(struct VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ CHECK_MEM_ERROR(
+ &cm->error, cpi->tpl_stats_info,
+ (TplDepStats *)vpx_calloc(MAX_LAG_BUFFERS, sizeof(TplDepStats)));
+ memset(cpi->tpl_stats_info, 0, MAX_LAG_BUFFERS * sizeof(TplDepStats));
+}
+
+// Frees memory of the tpl stats information.
+// Only called once in dealloc_compressor_data().
+static INLINE void free_tpl_stats_info(struct VP9_COMP *cpi) {
+ vpx_free(cpi->tpl_stats_info);
+ cpi->tpl_stats_info = NULL;
+}
+
+// Allocates memory for the first pass motion vector information.
+// The unit size is each 16x16 block.
+// Only called once in vp9_create_compressor().
+static INLINE void fp_motion_vector_info_init(struct VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ const int unit_width = get_num_unit_16x16(cpi->frame_info.frame_width);
+ const int unit_height = get_num_unit_16x16(cpi->frame_info.frame_height);
+ CHECK_MEM_ERROR(&cm->error, cpi->fp_motion_vector_info,
+ (MOTION_VECTOR_INFO *)vpx_calloc(unit_width * unit_height,
+ sizeof(MOTION_VECTOR_INFO)));
+}
+
+static INLINE void fp_motion_vector_info_reset(
+ int frame_width, int frame_height,
+ MOTION_VECTOR_INFO *fp_motion_vector_info) {
+ const int unit_width = get_num_unit_16x16(frame_width);
+ const int unit_height = get_num_unit_16x16(frame_height);
+ int i;
+ for (i = 0; i < unit_width * unit_height; ++i) {
+ reset_mv_info(fp_motion_vector_info + i);
+ }
+}
+
+// Frees memory of the first pass motion vector information.
+// Only called once in dealloc_compressor_data().
+static INLINE void free_fp_motion_vector_info(struct VP9_COMP *cpi) {
+ vpx_free(cpi->fp_motion_vector_info);
+ cpi->fp_motion_vector_info = NULL;
+}
+
+// This is the C-version counterpart of ImageBuffer
+typedef struct IMAGE_BUFFER {
+ int allocated;
+ int plane_width[3];
+ int plane_height[3];
+ uint8_t *plane_buffer[3];
+} IMAGE_BUFFER;
+
+#define RATE_CTRL_MAX_RECODE_NUM 7
+
+typedef struct RATE_QINDEX_HISTORY {
+ int recode_count;
+ int q_index_history[RATE_CTRL_MAX_RECODE_NUM];
+ int rate_history[RATE_CTRL_MAX_RECODE_NUM];
+ int q_index_high;
+ int q_index_low;
+} RATE_QINDEX_HISTORY;
+
+#endif // CONFIG_RATE_CTRL
+
+typedef struct ENCODE_FRAME_RESULT {
+ int show_idx;
+ FRAME_UPDATE_TYPE update_type;
+#if CONFIG_RATE_CTRL
+ int frame_coding_index;
+ int ref_frame_coding_indexes[MAX_INTER_REF_FRAMES];
+ int ref_frame_valid_list[MAX_INTER_REF_FRAMES];
+ double psnr;
+ uint64_t sse;
+ FRAME_COUNTS frame_counts;
+ const PARTITION_INFO *partition_info;
+ const MOTION_VECTOR_INFO *motion_vector_info;
+ const TplDepStats *tpl_stats_info;
+ IMAGE_BUFFER coded_frame;
+ RATE_QINDEX_HISTORY rq_history;
+#endif // CONFIG_RATE_CTRL
+ int quantize_index;
+} ENCODE_FRAME_RESULT;
+
+void vp9_init_encode_frame_result(ENCODE_FRAME_RESULT *encode_frame_result);
+
+void vp9_initialize_enc(void);
+
+void vp9_update_compressor_with_img_fmt(VP9_COMP *cpi, vpx_img_fmt_t img_fmt);
+struct VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf,
+ BufferPool *const pool);
+void vp9_remove_compressor(VP9_COMP *cpi);
+
+void vp9_change_config(VP9_COMP *cpi, const VP9EncoderConfig *oxcf);
+
+// Receive a frame's worth of data. The caller can assume that a copy of this
+// frame is made and not just a copy of the pointer.
+int vp9_receive_raw_frame(VP9_COMP *cpi, vpx_enc_frame_flags_t frame_flags,
+ YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
+ int64_t end_time);
+
+int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
+ size_t *size, uint8_t *dest, int64_t *time_stamp,
+ int64_t *time_end, int flush,
+ ENCODE_FRAME_RESULT *encode_frame_result);
+
+int vp9_get_preview_raw_frame(VP9_COMP *cpi, YV12_BUFFER_CONFIG *dest,
+ vp9_ppflags_t *flags);
+
+int vp9_use_as_reference(VP9_COMP *cpi, int ref_frame_flags);
+
+void vp9_update_reference(VP9_COMP *cpi, int ref_frame_flags);
+
+int vp9_copy_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag,
+ YV12_BUFFER_CONFIG *sd);
+
+int vp9_set_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag,
+ YV12_BUFFER_CONFIG *sd);
+
+int vp9_update_entropy(VP9_COMP *cpi, int update);
+
+int vp9_set_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows,
+ int cols);
+
+int vp9_get_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows,
+ int cols);
+
+int vp9_set_internal_size(VP9_COMP *cpi, VPX_SCALING_MODE horiz_mode,
+ VPX_SCALING_MODE vert_mode);
+
+int vp9_set_size_literal(VP9_COMP *cpi, unsigned int width,
+ unsigned int height);
+
+void vp9_set_svc(VP9_COMP *cpi, int use_svc);
+
+// Check for resetting the rc flags (rc_1_frame, rc_2_frame) if the
+// configuration change has a large change in avg_frame_bandwidth.
+// For SVC check for resetting based on spatial layer average bandwidth.
+// Also reset buffer level to optimal level.
+void vp9_check_reset_rc_flag(VP9_COMP *cpi);
+
+void vp9_set_rc_buffer_sizes(VP9_COMP *cpi);
+
+static INLINE int stack_pop(int *stack, int stack_size) {
+ int idx;
+ const int r = stack[0];
+ for (idx = 1; idx < stack_size; ++idx) stack[idx - 1] = stack[idx];
+
+ return r;
+}
+
+static INLINE int stack_top(const int *stack) { return stack[0]; }
+
+static INLINE void stack_push(int *stack, int new_item, int stack_size) {
+ int idx;
+ for (idx = stack_size; idx > 0; --idx) stack[idx] = stack[idx - 1];
+ stack[0] = new_item;
+}
+
+static INLINE void stack_init(int *stack, int length) {
+ int idx;
+ for (idx = 0; idx < length; ++idx) stack[idx] = -1;
+}
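+
+// Illustrative usage sketch (editor's note, not part of libvpx): the helpers
+// above maintain a small top-at-index-0 stack. Note that stack_push() also
+// writes index `stack_size`, so the backing array needs at least
+// stack_size + 1 elements.
+//   int stack[4];
+//   stack_init(stack, 4);              // { -1, -1, -1, -1 }
+//   stack_push(stack, 7, 3);           // { 7, -1, -1, -1 }
+//   assert(stack_top(stack) == 7);
+//   assert(stack_pop(stack, 4) == 7);  // { -1, -1, -1, -1 }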
+
+int vp9_get_quantizer(const VP9_COMP *cpi);
+
+static INLINE int frame_is_kf_gf_arf(const VP9_COMP *cpi) {
+ return frame_is_intra_only(&cpi->common) || cpi->refresh_alt_ref_frame ||
+ (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref);
+}
+
+static INLINE int ref_frame_to_flag(int8_t ref_frame) {
+ static const int kVp9RefFlagList[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
+ VP9_ALT_FLAG };
+ assert(ref_frame >= LAST_FRAME && ref_frame <= ALTREF_FRAME);
+ return kVp9RefFlagList[ref_frame];
+}
+
+static INLINE int get_ref_frame_map_idx(const VP9_COMP *cpi,
+ MV_REFERENCE_FRAME ref_frame) {
+ if (ref_frame == LAST_FRAME) {
+ return cpi->lst_fb_idx;
+ } else if (ref_frame == GOLDEN_FRAME) {
+ return cpi->gld_fb_idx;
+ } else {
+ return cpi->alt_fb_idx;
+ }
+}
+
+static INLINE int get_ref_frame_buf_idx(const VP9_COMP *const cpi,
+ int ref_frame) {
+ const VP9_COMMON *const cm = &cpi->common;
+ const int map_idx = get_ref_frame_map_idx(cpi, ref_frame);
+ return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : INVALID_IDX;
+}
+
+static INLINE RefCntBuffer *get_ref_cnt_buffer(const VP9_COMMON *cm,
+ int fb_idx) {
+ return fb_idx != INVALID_IDX ? &cm->buffer_pool->frame_bufs[fb_idx] : NULL;
+}
+
+static INLINE void get_ref_frame_bufs(
+ const VP9_COMP *cpi, RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES]) {
+ const VP9_COMMON *const cm = &cpi->common;
+ MV_REFERENCE_FRAME ref_frame;
+ for (ref_frame = LAST_FRAME; ref_frame < MAX_REF_FRAMES; ++ref_frame) {
+ int ref_frame_buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
+ int inter_ref_idx = mv_ref_frame_to_inter_ref_idx(ref_frame);
+ ref_frame_bufs[inter_ref_idx] = get_ref_cnt_buffer(cm, ref_frame_buf_idx);
+ }
+}
+
+static INLINE YV12_BUFFER_CONFIG *get_ref_frame_buffer(
+ const VP9_COMP *const cpi, MV_REFERENCE_FRAME ref_frame) {
+ const VP9_COMMON *const cm = &cpi->common;
+ const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
+ return buf_idx != INVALID_IDX ? &cm->buffer_pool->frame_bufs[buf_idx].buf
+ : NULL;
+}
+
+static INLINE int get_token_alloc(int mb_rows, int mb_cols) {
+ // TODO(JBB): double check we can't exceed this token count if we have a
+ // 32x32 transform crossing a boundary at a multiple of 16.
+ // mb_rows, cols are in units of 16 pixels. We assume 3 planes all at full
+ // resolution. We assume up to 1 token per pixel, and then allow
+ // a head room of 4.
+ return mb_rows * mb_cols * (16 * 16 * 3 + 4);
+}
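+
+// Worked example (editor's note): a 1920x1080 frame has mb_rows = 68 and
+// mb_cols = 120, so get_token_alloc(68, 120) reserves
+// 68 * 120 * (16 * 16 * 3 + 4) = 6,299,520 tokens.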
+
+// Get the allocated token size for a tile. It does the same calculation as in
+// the frame token allocation.
+static INLINE int allocated_tokens(TileInfo tile) {
+ int tile_mb_rows = (tile.mi_row_end - tile.mi_row_start + 1) >> 1;
+ int tile_mb_cols = (tile.mi_col_end - tile.mi_col_start + 1) >> 1;
+
+ return get_token_alloc(tile_mb_rows, tile_mb_cols);
+}
+
+static INLINE void get_start_tok(VP9_COMP *cpi, int tile_row, int tile_col,
+ int mi_row, TOKENEXTRA **tok) {
+ VP9_COMMON *const cm = &cpi->common;
+ const int tile_cols = 1 << cm->log2_tile_cols;
+ TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col];
+ const TileInfo *const tile_info = &this_tile->tile_info;
+
+ int tile_mb_cols = (tile_info->mi_col_end - tile_info->mi_col_start + 1) >> 1;
+ const int mb_row = (mi_row - tile_info->mi_row_start) >> 1;
+
+ *tok =
+ cpi->tile_tok[tile_row][tile_col] + get_token_alloc(mb_row, tile_mb_cols);
+}
+
+int64_t vp9_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
+#if CONFIG_VP9_HIGHBITDEPTH
+int64_t vp9_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+void vp9_scale_references(VP9_COMP *cpi);
+
+void vp9_update_reference_frames(VP9_COMP *cpi);
+
+void vp9_get_ref_frame_info(FRAME_UPDATE_TYPE update_type, int ref_frame_flags,
+ RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES],
+ int *ref_frame_coding_indexes,
+ int *ref_frame_valid_list);
+
+void vp9_set_high_precision_mv(VP9_COMP *cpi, int allow_high_precision_mv);
+
+YV12_BUFFER_CONFIG *vp9_svc_twostage_scale(
+ VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled,
+ YV12_BUFFER_CONFIG *scaled_temp, INTERP_FILTER filter_type,
+ int phase_scaler, INTERP_FILTER filter_type2, int phase_scaler2);
+
+YV12_BUFFER_CONFIG *vp9_scale_if_required(
+ VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled,
+ int use_normative_scaler, INTERP_FILTER filter_type, int phase_scaler);
+
+void vp9_apply_encoding_flags(VP9_COMP *cpi, vpx_enc_frame_flags_t flags);
+
+static INLINE int is_one_pass_svc(const struct VP9_COMP *const cpi) {
+ return (cpi->use_svc && cpi->oxcf.pass == 0);
+}
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+static INLINE int denoise_svc(const struct VP9_COMP *const cpi) {
+ return (!cpi->use_svc || (cpi->use_svc && cpi->svc.spatial_layer_id >=
+ cpi->svc.first_layer_denoise));
+}
+#endif
+
+#define MIN_LOOKAHEAD_FOR_ARFS 4
+static INLINE int is_altref_enabled(const VP9_COMP *const cpi) {
+ return !(cpi->oxcf.mode == REALTIME && cpi->oxcf.rc_mode == VPX_CBR) &&
+ cpi->oxcf.lag_in_frames >= MIN_LOOKAHEAD_FOR_ARFS &&
+ cpi->oxcf.enable_auto_arf;
+}
+
+static INLINE void set_ref_ptrs(const VP9_COMMON *const cm, MACROBLOCKD *xd,
+ MV_REFERENCE_FRAME ref0,
+ MV_REFERENCE_FRAME ref1) {
+ xd->block_refs[0] =
+ &cm->frame_refs[ref0 >= LAST_FRAME ? ref0 - LAST_FRAME : 0];
+ xd->block_refs[1] =
+ &cm->frame_refs[ref1 >= LAST_FRAME ? ref1 - LAST_FRAME : 0];
+}
+
+static INLINE int get_chessboard_index(const int frame_index) {
+ return frame_index & 0x1;
+}
+
+static INLINE int *cond_cost_list(const struct VP9_COMP *cpi, int *cost_list) {
+ return cpi->sf.mv.subpel_search_method != SUBPEL_TREE ? cost_list : NULL;
+}
+
+static INLINE int get_num_vert_units(TileInfo tile, int shift) {
+ int num_vert_units =
+ (tile.mi_row_end - tile.mi_row_start + (1 << shift) - 1) >> shift;
+ return num_vert_units;
+}
+
+static INLINE int get_num_cols(TileInfo tile, int shift) {
+ int num_cols =
+ (tile.mi_col_end - tile.mi_col_start + (1 << shift) - 1) >> shift;
+ return num_cols;
+}
+
+static INLINE int get_level_index(VP9_LEVEL level) {
+ int i;
+ for (i = 0; i < VP9_LEVELS; ++i) {
+ if (level == vp9_level_defs[i].level) return i;
+ }
+ return -1;
+}
+
+// Return the log2 value of the maximum number of column tiles allowed by the
+// level that the picture size fits into.
+static INLINE int log_tile_cols_from_picsize_level(uint32_t width,
+ uint32_t height) {
+ int i;
+ const uint32_t pic_size = width * height;
+ const uint32_t pic_breadth = VPXMAX(width, height);
+ for (i = LEVEL_1; i < LEVEL_MAX; ++i) {
+ if (vp9_level_defs[i].max_luma_picture_size >= pic_size &&
+ vp9_level_defs[i].max_luma_picture_breadth >= pic_breadth) {
+ return get_msb(vp9_level_defs[i].max_col_tiles);
+ }
+ }
+ return INT_MAX;
+}
+
+VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec);
+
+vpx_codec_err_t vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map,
+ unsigned int rows, unsigned int cols,
+ int delta_q[8], int delta_lf[8], int skip[8],
+ int ref_frame[8]);
+
+void vp9_new_framerate(VP9_COMP *cpi, double framerate);
+
+void vp9_set_row_mt(VP9_COMP *cpi);
+
+int vp9_get_psnr(const VP9_COMP *cpi, PSNR_STATS *psnr);
+
+#define LAYER_IDS_TO_IDX(sl, tl, num_tl) ((sl) * (num_tl) + (tl))
+
+static INLINE void alloc_frame_mvs(VP9_COMMON *const cm, int buffer_idx) {
+ RefCntBuffer *const new_fb_ptr = &cm->buffer_pool->frame_bufs[buffer_idx];
+ if (new_fb_ptr->mvs == NULL || new_fb_ptr->mi_rows < cm->mi_rows ||
+ new_fb_ptr->mi_cols < cm->mi_cols) {
+ vpx_free(new_fb_ptr->mvs);
+ CHECK_MEM_ERROR(&cm->error, new_fb_ptr->mvs,
+ (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols,
+ sizeof(*new_fb_ptr->mvs)));
+ new_fb_ptr->mi_rows = cm->mi_rows;
+ new_fb_ptr->mi_cols = cm->mi_cols;
+ }
+}
+
+static INLINE int mv_cost(const MV *mv, const int *joint_cost,
+ int *const comp_cost[2]) {
+ assert(mv->row >= -MV_MAX && mv->row < MV_MAX);
+ assert(mv->col >= -MV_MAX && mv->col < MV_MAX);
+ return joint_cost[vp9_get_mv_joint(mv)] + comp_cost[0][mv->row] +
+ comp_cost[1][mv->col];
+}
+
+static INLINE int mvsad_err_cost(const MACROBLOCK *x, const MV *mv,
+ const MV *ref, int sad_per_bit) {
+ MV diff;
+ diff.row = mv->row - ref->row;
+ diff.col = mv->col - ref->col;
+ return ROUND_POWER_OF_TWO(
+ (unsigned)mv_cost(&diff, x->nmvjointsadcost, x->nmvsadcost) * sad_per_bit,
+ VP9_PROB_COST_SHIFT);
+}
+
+static INLINE uint32_t get_start_mv_sad(const MACROBLOCK *x, const MV *mvp_full,
+ const MV *ref_mv_full,
+ vpx_sad_fn_t sad_fn_ptr, int sadpb) {
+ const int src_buf_stride = x->plane[0].src.stride;
+ const uint8_t *const src_buf = x->plane[0].src.buf;
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const int pred_buf_stride = xd->plane[0].pre[0].stride;
+ const uint8_t *const pred_buf =
+ xd->plane[0].pre[0].buf + mvp_full->row * pred_buf_stride + mvp_full->col;
+ uint32_t start_mv_sad =
+ sad_fn_ptr(src_buf, src_buf_stride, pred_buf, pred_buf_stride);
+ start_mv_sad += mvsad_err_cost(x, mvp_full, ref_mv_full, sadpb);
+
+ return start_mv_sad;
+}
+
+static INLINE int num_4x4_to_edge(int plane_4x4_dim, int mb_to_edge_dim,
+ int subsampling_dim, int blk_dim) {
+ return plane_4x4_dim + (mb_to_edge_dim >> (5 + subsampling_dim)) - blk_dim;
+}
+
+// Compute the sum of squares on all visible 4x4s in the transform block.
+static int64_t sum_squares_visible(const MACROBLOCKD *xd,
+ const struct macroblockd_plane *const pd,
+ const int16_t *diff, const int diff_stride,
+ int blk_row, int blk_col,
+ const BLOCK_SIZE plane_bsize,
+ const BLOCK_SIZE tx_bsize,
+ int *visible_width, int *visible_height) {
+ int64_t sse;
+ const int plane_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
+ const int plane_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
+ const int tx_4x4_w = num_4x4_blocks_wide_lookup[tx_bsize];
+ const int tx_4x4_h = num_4x4_blocks_high_lookup[tx_bsize];
+ const int b4x4s_to_right_edge = num_4x4_to_edge(
+ plane_4x4_w, xd->mb_to_right_edge, pd->subsampling_x, blk_col);
+ const int b4x4s_to_bottom_edge = num_4x4_to_edge(
+ plane_4x4_h, xd->mb_to_bottom_edge, pd->subsampling_y, blk_row);
+ if (tx_bsize == BLOCK_4X4 ||
+ (b4x4s_to_right_edge >= tx_4x4_w && b4x4s_to_bottom_edge >= tx_4x4_h)) {
+ assert(tx_4x4_w == tx_4x4_h);
+ sse = (int64_t)vpx_sum_squares_2d_i16(diff, diff_stride, tx_4x4_w << 2);
+ *visible_width = tx_4x4_w << 2;
+ *visible_height = tx_4x4_h << 2;
+ } else {
+ int r, c;
+ const int max_r = VPXMIN(b4x4s_to_bottom_edge, tx_4x4_h);
+ const int max_c = VPXMIN(b4x4s_to_right_edge, tx_4x4_w);
+ sse = 0;
+ // The block extends into the unrestricted motion border.
+ for (r = 0; r < max_r; ++r) {
+ // Skip visiting the sub blocks that are wholly within the UMV.
+ for (c = 0; c < max_c; ++c) {
+ sse += (int64_t)vpx_sum_squares_2d_i16(
+ diff + r * diff_stride * 4 + c * 4, diff_stride, 4);
+ }
+ }
+ *visible_width = max_c << 2;
+ *visible_height = max_r << 2;
+ }
+ return sse;
+}
+
+// Check if trellis coefficient optimization of the transform block is enabled.
+static INLINE int do_trellis_opt(const struct macroblockd_plane *pd,
+ const int16_t *src_diff, int diff_stride,
+ int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ void *arg) {
+ const struct encode_b_args *const args = (struct encode_b_args *)arg;
+ const MACROBLOCK *const x = args->x;
+
+ switch (args->enable_trellis_opt) {
+ case DISABLE_TRELLIS_OPT: return 0;
+ case ENABLE_TRELLIS_OPT: return 1;
+ case ENABLE_TRELLIS_OPT_TX_RD_SRC_VAR: {
+ vpx_clear_system_state();
+
+ return (args->trellis_opt_thresh > 0.0)
+ ? (x->log_block_src_var <= args->trellis_opt_thresh)
+ : 1;
+ }
+ case ENABLE_TRELLIS_OPT_TX_RD_RESIDUAL_MSE: {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
+#if CONFIG_VP9_HIGHBITDEPTH
+ const int dequant_shift =
+ (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3;
+#else
+ const int dequant_shift = 3;
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ const int qstep = pd->dequant[1] >> dequant_shift;
+ int *sse_calc_done = args->sse_calc_done;
+ int64_t *sse = args->sse;
+ int visible_width = 0, visible_height = 0;
+
+ // TODO: Enable the sf for high bit-depth case
+ if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) || !sse ||
+ !sse_calc_done)
+ return 1;
+
+ *sse = sum_squares_visible(xd, pd, src_diff, diff_stride, blk_row,
+ blk_col, plane_bsize, tx_bsize, &visible_width,
+ &visible_height);
+ *sse_calc_done = 1;
+
+ vpx_clear_system_state();
+
+ return (*(sse) <= (int64_t)visible_width * visible_height * qstep *
+ qstep * args->trellis_opt_thresh);
+ }
+ default: assert(0 && "Invalid trellis optimization method."); return 1;
+ }
+}
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+static INLINE void start_timing(VP9_COMP *cpi, int component) {
+ vpx_usec_timer_start(&cpi->component_timer[component]);
+}
+static INLINE void end_timing(VP9_COMP *cpi, int component) {
+ vpx_usec_timer_mark(&cpi->component_timer[component]);
+ cpi->frame_component_time[component] +=
+ vpx_usec_timer_elapsed(&cpi->component_timer[component]);
+}
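+
+// Illustrative usage sketch (editor's note, not part of libvpx): a component
+// is timed by bracketing the measured work with the two helpers above.
+//   start_timing(cpi, encode_frame_internal_time);
+//   ... // work being measured
+//   end_timing(cpi, encode_frame_internal_time);
+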
+static INLINE char const *get_frame_type_enum(int type) {
+ switch (type) {
+ case 0: return "KEY_FRAME";
+ case 1: return "INTER_FRAME";
+ default: assert(0);
+ }
+ return "error";
+}
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP9_ENCODER_VP9_ENCODER_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ethread.c b/media/libvpx/libvpx/vp9/encoder/vp9_ethread.c
new file mode 100644
index 0000000000..fadd233899
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_ethread.c
@@ -0,0 +1,689 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp9/common/vp9_thread_common.h"
+#include "vp9/encoder/vp9_bitstream.h"
+#include "vp9/encoder/vp9_encodeframe.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_ethread.h"
+#include "vp9/encoder/vp9_firstpass.h"
+#include "vp9/encoder/vp9_multi_thread.h"
+#include "vp9/encoder/vp9_temporal_filter.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) {
+ int i, j, k, l, m, n;
+
+ for (i = 0; i < REFERENCE_MODES; i++)
+ td->rd_counts.comp_pred_diff[i] += td_t->rd_counts.comp_pred_diff[i];
+
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
+ td->rd_counts.filter_diff[i] += td_t->rd_counts.filter_diff[i];
+
+ for (i = 0; i < TX_SIZES; i++)
+ for (j = 0; j < PLANE_TYPES; j++)
+ for (k = 0; k < REF_TYPES; k++)
+ for (l = 0; l < COEF_BANDS; l++)
+ for (m = 0; m < COEFF_CONTEXTS; m++)
+ for (n = 0; n < ENTROPY_TOKENS; n++)
+ td->rd_counts.coef_counts[i][j][k][l][m][n] +=
+ td_t->rd_counts.coef_counts[i][j][k][l][m][n];
+}
+
+static int enc_worker_hook(void *arg1, void *unused) {
+ EncWorkerData *const thread_data = (EncWorkerData *)arg1;
+ VP9_COMP *const cpi = thread_data->cpi;
+ const VP9_COMMON *const cm = &cpi->common;
+ const int tile_cols = 1 << cm->log2_tile_cols;
+ const int tile_rows = 1 << cm->log2_tile_rows;
+ int t;
+
+ (void)unused;
+
+ for (t = thread_data->start; t < tile_rows * tile_cols;
+ t += cpi->num_workers) {
+ int tile_row = t / tile_cols;
+ int tile_col = t % tile_cols;
+
+ vp9_encode_tile(cpi, thread_data->td, tile_row, tile_col);
+ }
+
+ return 0;
+}
+
+static int get_max_tile_cols(VP9_COMP *cpi) {
+ const int aligned_width = ALIGN_POWER_OF_TWO(cpi->oxcf.width, MI_SIZE_LOG2);
+ int mi_cols = aligned_width >> MI_SIZE_LOG2;
+ int min_log2_tile_cols, max_log2_tile_cols;
+ int log2_tile_cols;
+
+ vp9_get_tile_n_bits(mi_cols, &min_log2_tile_cols, &max_log2_tile_cols);
+ log2_tile_cols =
+ clamp(cpi->oxcf.tile_columns, min_log2_tile_cols, max_log2_tile_cols);
+ if (cpi->oxcf.target_level == LEVEL_AUTO) {
+ const int level_tile_cols =
+ log_tile_cols_from_picsize_level(cpi->common.width, cpi->common.height);
+ if (log2_tile_cols > level_tile_cols) {
+ log2_tile_cols = VPXMAX(level_tile_cols, min_log2_tile_cols);
+ }
+ }
+ return (1 << log2_tile_cols);
+}
+
+static void create_enc_workers(VP9_COMP *cpi, int num_workers) {
+ VP9_COMMON *const cm = &cpi->common;
+ const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
+ int i;
+ // While using SVC, we need to allocate threads according to the highest
+ // resolution. When row-based multithreading is enabled, it is OK to
+ // allocate more threads than the maximum number of tile columns.
+ if (cpi->use_svc && !cpi->row_mt) {
+ int max_tile_cols = get_max_tile_cols(cpi);
+ num_workers = VPXMIN(cpi->oxcf.max_threads, max_tile_cols);
+ }
+ assert(num_workers > 0);
+ if (num_workers == cpi->num_workers) return;
+ vp9_loop_filter_dealloc(&cpi->lf_row_sync);
+ vp9_bitstream_encode_tiles_buffer_dealloc(cpi);
+ vp9_encode_free_mt_data(cpi);
+
+ CHECK_MEM_ERROR(&cm->error, cpi->workers,
+ vpx_malloc(num_workers * sizeof(*cpi->workers)));
+
+ CHECK_MEM_ERROR(&cm->error, cpi->tile_thr_data,
+ vpx_calloc(num_workers, sizeof(*cpi->tile_thr_data)));
+
+ for (i = 0; i < num_workers; i++) {
+ VPxWorker *const worker = &cpi->workers[i];
+ EncWorkerData *thread_data = &cpi->tile_thr_data[i];
+
+ ++cpi->num_workers;
+ winterface->init(worker);
+
+ if (i < num_workers - 1) {
+ thread_data->cpi = cpi;
+
+ // Allocate thread data.
+ CHECK_MEM_ERROR(&cm->error, thread_data->td,
+ vpx_memalign(32, sizeof(*thread_data->td)));
+ vp9_zero(*thread_data->td);
+
+ // Set up pc_tree.
+ thread_data->td->leaf_tree = NULL;
+ thread_data->td->pc_tree = NULL;
+ vp9_setup_pc_tree(cm, thread_data->td);
+
+ // Allocate frame counters in thread data.
+ CHECK_MEM_ERROR(&cm->error, thread_data->td->counts,
+ vpx_calloc(1, sizeof(*thread_data->td->counts)));
+
+ // Create threads
+ if (!winterface->reset(worker))
+ vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+ "Tile encoder thread creation failed");
+ } else {
+ // Main thread acts as a worker and uses the thread data in cpi.
+ thread_data->cpi = cpi;
+ thread_data->td = &cpi->td;
+ }
+ winterface->sync(worker);
+ }
+}
+
+static void launch_enc_workers(VP9_COMP *cpi, VPxWorkerHook hook, void *data2,
+ int num_workers) {
+ const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
+ int i;
+
+ for (i = 0; i < num_workers; i++) {
+ VPxWorker *const worker = &cpi->workers[i];
+ worker->hook = hook;
+ worker->data1 = &cpi->tile_thr_data[i];
+ worker->data2 = data2;
+ }
+
+ // Encode a frame
+ for (i = 0; i < num_workers; i++) {
+ VPxWorker *const worker = &cpi->workers[i];
+ EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
+
+ // Set the starting tile for each thread.
+ thread_data->start = i;
+
+ if (i == cpi->num_workers - 1)
+ winterface->execute(worker);
+ else
+ winterface->launch(worker);
+ }
+
+ // Encoding ends.
+ for (i = 0; i < num_workers; i++) {
+ VPxWorker *const worker = &cpi->workers[i];
+ winterface->sync(worker);
+ }
+}
+
+void vp9_encode_free_mt_data(struct VP9_COMP *cpi) {
+ int t;
+ for (t = 0; t < cpi->num_workers; ++t) {
+ VPxWorker *const worker = &cpi->workers[t];
+ EncWorkerData *const thread_data = &cpi->tile_thr_data[t];
+
+ // Deallocate allocated threads.
+ vpx_get_worker_interface()->end(worker);
+
+ // Deallocate allocated thread data.
+ if (t < cpi->num_workers - 1) {
+ vpx_free(thread_data->td->counts);
+ vp9_free_pc_tree(thread_data->td);
+ vpx_free(thread_data->td);
+ }
+ }
+ vpx_free(cpi->tile_thr_data);
+ vpx_free(cpi->workers);
+ cpi->num_workers = 0;
+}
+
+void vp9_encode_tiles_mt(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ const int tile_cols = 1 << cm->log2_tile_cols;
+ const int num_workers = VPXMIN(cpi->oxcf.max_threads, tile_cols);
+ int i;
+
+ vp9_init_tile_data(cpi);
+
+ create_enc_workers(cpi, num_workers);
+
+ for (i = 0; i < num_workers; i++) {
+ EncWorkerData *thread_data;
+ thread_data = &cpi->tile_thr_data[i];
+
+ // Before encoding a frame, copy the thread data from cpi.
+ if (thread_data->td != &cpi->td) {
+ thread_data->td->mb = cpi->td.mb;
+ thread_data->td->rd_counts = cpi->td.rd_counts;
+ }
+ if (thread_data->td->counts != &cpi->common.counts) {
+ memcpy(thread_data->td->counts, &cpi->common.counts,
+ sizeof(cpi->common.counts));
+ }
+
+ // Handle use_nonrd_pick_mode case.
+ if (cpi->sf.use_nonrd_pick_mode) {
+ MACROBLOCK *const x = &thread_data->td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblock_plane *const p = x->plane;
+ struct macroblockd_plane *const pd = xd->plane;
+ PICK_MODE_CONTEXT *ctx = &thread_data->td->pc_root->none;
+ int j;
+
+ for (j = 0; j < MAX_MB_PLANE; ++j) {
+ p[j].coeff = ctx->coeff_pbuf[j][0];
+ p[j].qcoeff = ctx->qcoeff_pbuf[j][0];
+ pd[j].dqcoeff = ctx->dqcoeff_pbuf[j][0];
+ p[j].eobs = ctx->eobs_pbuf[j][0];
+ }
+ }
+ }
+
+ launch_enc_workers(cpi, enc_worker_hook, NULL, num_workers);
+
+ for (i = 0; i < num_workers; i++) {
+ VPxWorker *const worker = &cpi->workers[i];
+ EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
+
+ // Accumulate counters.
+ if (i < cpi->num_workers - 1) {
+ vp9_accumulate_frame_counts(&cm->counts, thread_data->td->counts, 0);
+ accumulate_rd_opt(&cpi->td, thread_data->td);
+ }
+ }
+}
+
+#if !CONFIG_REALTIME_ONLY
+static void accumulate_fp_tile_stat(TileDataEnc *tile_data,
+ TileDataEnc *tile_data_t) {
+ tile_data->fp_data.intra_factor += tile_data_t->fp_data.intra_factor;
+ tile_data->fp_data.brightness_factor +=
+ tile_data_t->fp_data.brightness_factor;
+ tile_data->fp_data.coded_error += tile_data_t->fp_data.coded_error;
+ tile_data->fp_data.sr_coded_error += tile_data_t->fp_data.sr_coded_error;
+ tile_data->fp_data.frame_noise_energy +=
+ tile_data_t->fp_data.frame_noise_energy;
+ tile_data->fp_data.intra_error += tile_data_t->fp_data.intra_error;
+ tile_data->fp_data.intercount += tile_data_t->fp_data.intercount;
+ tile_data->fp_data.second_ref_count += tile_data_t->fp_data.second_ref_count;
+ tile_data->fp_data.neutral_count += tile_data_t->fp_data.neutral_count;
+ tile_data->fp_data.intra_count_low += tile_data_t->fp_data.intra_count_low;
+ tile_data->fp_data.intra_count_high += tile_data_t->fp_data.intra_count_high;
+ tile_data->fp_data.intra_skip_count += tile_data_t->fp_data.intra_skip_count;
+ tile_data->fp_data.mvcount += tile_data_t->fp_data.mvcount;
+ tile_data->fp_data.sum_mvr += tile_data_t->fp_data.sum_mvr;
+ tile_data->fp_data.sum_mvr_abs += tile_data_t->fp_data.sum_mvr_abs;
+ tile_data->fp_data.sum_mvc += tile_data_t->fp_data.sum_mvc;
+ tile_data->fp_data.sum_mvc_abs += tile_data_t->fp_data.sum_mvc_abs;
+ tile_data->fp_data.sum_mvrs += tile_data_t->fp_data.sum_mvrs;
+ tile_data->fp_data.sum_mvcs += tile_data_t->fp_data.sum_mvcs;
+ tile_data->fp_data.sum_in_vectors += tile_data_t->fp_data.sum_in_vectors;
+ tile_data->fp_data.intra_smooth_count +=
+ tile_data_t->fp_data.intra_smooth_count;
+ tile_data->fp_data.image_data_start_row =
+ VPXMIN(tile_data->fp_data.image_data_start_row,
+ tile_data_t->fp_data.image_data_start_row) == INVALID_ROW
+ ? VPXMAX(tile_data->fp_data.image_data_start_row,
+ tile_data_t->fp_data.image_data_start_row)
+ : VPXMIN(tile_data->fp_data.image_data_start_row,
+ tile_data_t->fp_data.image_data_start_row);
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+// Allocate memory for row synchronization
+void vp9_row_mt_sync_mem_alloc(VP9RowMTSync *row_mt_sync, VP9_COMMON *cm,
+ int rows) {
+ row_mt_sync->rows = rows;
+#if CONFIG_MULTITHREAD
+ {
+ int i;
+
+ CHECK_MEM_ERROR(&cm->error, row_mt_sync->mutex,
+ vpx_malloc(sizeof(*row_mt_sync->mutex) * rows));
+ if (row_mt_sync->mutex) {
+ for (i = 0; i < rows; ++i) {
+ pthread_mutex_init(&row_mt_sync->mutex[i], NULL);
+ }
+ }
+
+ CHECK_MEM_ERROR(&cm->error, row_mt_sync->cond,
+ vpx_malloc(sizeof(*row_mt_sync->cond) * rows));
+ if (row_mt_sync->cond) {
+ for (i = 0; i < rows; ++i) {
+ pthread_cond_init(&row_mt_sync->cond[i], NULL);
+ }
+ }
+ }
+#endif // CONFIG_MULTITHREAD
+
+ CHECK_MEM_ERROR(&cm->error, row_mt_sync->cur_col,
+ vpx_malloc(sizeof(*row_mt_sync->cur_col) * rows));
+
+ // Set up nsync.
+ row_mt_sync->sync_range = 1;
+}
+
+// Deallocate the mutexes, condition variables and data used for row-based
+// multi-threading synchronization.
+void vp9_row_mt_sync_mem_dealloc(VP9RowMTSync *row_mt_sync) {
+ if (row_mt_sync != NULL) {
+#if CONFIG_MULTITHREAD
+ int i;
+
+ if (row_mt_sync->mutex != NULL) {
+ for (i = 0; i < row_mt_sync->rows; ++i) {
+ pthread_mutex_destroy(&row_mt_sync->mutex[i]);
+ }
+ vpx_free(row_mt_sync->mutex);
+ }
+ if (row_mt_sync->cond != NULL) {
+ for (i = 0; i < row_mt_sync->rows; ++i) {
+ pthread_cond_destroy(&row_mt_sync->cond[i]);
+ }
+ vpx_free(row_mt_sync->cond);
+ }
+#endif // CONFIG_MULTITHREAD
+ vpx_free(row_mt_sync->cur_col);
+    // Clear the structure, as the source of this call may be a dynamic
+    // change in the number of tiles, in which case this call will be
+    // followed by an _alloc() which may fail.
+ vp9_zero(*row_mt_sync);
+ }
+}
+
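+// Reader side of the row synchronization: before working on column c of row
+// r, block until row r - 1 has published progress of at least c + nsync - 1
+// columns. The (c & (nsync - 1)) test assumes sync_range is a power of two.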
+void vp9_row_mt_sync_read(VP9RowMTSync *const row_mt_sync, int r, int c) {
+#if CONFIG_MULTITHREAD
+ const int nsync = row_mt_sync->sync_range;
+
+ if (r && !(c & (nsync - 1))) {
+ pthread_mutex_t *const mutex = &row_mt_sync->mutex[r - 1];
+ pthread_mutex_lock(mutex);
+
+ while (c > row_mt_sync->cur_col[r - 1] - nsync + 1) {
+ pthread_cond_wait(&row_mt_sync->cond[r - 1], mutex);
+ }
+ pthread_mutex_unlock(mutex);
+ }
+#else
+ (void)row_mt_sync;
+ (void)r;
+ (void)c;
+#endif // CONFIG_MULTITHREAD
+}
+
+void vp9_row_mt_sync_read_dummy(VP9RowMTSync *const row_mt_sync, int r, int c) {
+ (void)row_mt_sync;
+ (void)r;
+ (void)c;
+ return;
+}
+
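+// Writer side of the row synchronization: progress for row r is published
+// once per sync_range columns and, when the row is complete, pushed past
+// the last column so that any rows still waiting on it are released.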
+void vp9_row_mt_sync_write(VP9RowMTSync *const row_mt_sync, int r, int c,
+ const int cols) {
+#if CONFIG_MULTITHREAD
+ const int nsync = row_mt_sync->sync_range;
+ int cur;
+  // Only signal when there are enough encoded blocks for the next row to run.
+ int sig = 1;
+
+ if (c < cols - 1) {
+ cur = c;
+ if (c % nsync != nsync - 1) sig = 0;
+ } else {
+ cur = cols + nsync;
+ }
+
+ if (sig) {
+ pthread_mutex_lock(&row_mt_sync->mutex[r]);
+
+ row_mt_sync->cur_col[r] = cur;
+
+ pthread_cond_signal(&row_mt_sync->cond[r]);
+ pthread_mutex_unlock(&row_mt_sync->mutex[r]);
+ }
+#else
+ (void)row_mt_sync;
+ (void)r;
+ (void)c;
+ (void)cols;
+#endif // CONFIG_MULTITHREAD
+}
+
+void vp9_row_mt_sync_write_dummy(VP9RowMTSync *const row_mt_sync, int r, int c,
+ const int cols) {
+ (void)row_mt_sync;
+ (void)r;
+ (void)c;
+ (void)cols;
+ return;
+}
+
+#if !CONFIG_REALTIME_ONLY
+static int first_pass_worker_hook(void *arg1, void *arg2) {
+ EncWorkerData *const thread_data = (EncWorkerData *)arg1;
+ MultiThreadHandle *multi_thread_ctxt = (MultiThreadHandle *)arg2;
+ VP9_COMP *const cpi = thread_data->cpi;
+ const VP9_COMMON *const cm = &cpi->common;
+ const int tile_cols = 1 << cm->log2_tile_cols;
+ int tile_row, tile_col;
+ TileDataEnc *this_tile;
+ int end_of_frame;
+ int thread_id = thread_data->thread_id;
+ int cur_tile_id = multi_thread_ctxt->thread_id_to_tile_id[thread_id];
+ JobNode *proc_job = NULL;
+ FIRSTPASS_DATA fp_acc_data;
+ MV zero_mv = { 0, 0 };
+ MV best_ref_mv;
+ int mb_row;
+
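+  // Keep pulling row jobs for the currently assigned tile; when its queue
+  // is empty, query the other tiles' completion status (which may reassign
+  // this thread to another tile) until every tile is done.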
+ end_of_frame = 0;
+ while (0 == end_of_frame) {
+ // Get the next job in the queue
+ proc_job =
+ (JobNode *)vp9_enc_grp_get_next_job(multi_thread_ctxt, cur_tile_id);
+ if (NULL == proc_job) {
+ // Query for the status of other tiles
+ end_of_frame = vp9_get_tiles_proc_status(
+ multi_thread_ctxt, thread_data->tile_completion_status, &cur_tile_id,
+ tile_cols);
+ } else {
+ tile_col = proc_job->tile_col_id;
+ tile_row = proc_job->tile_row_id;
+
+ this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col];
+ mb_row = proc_job->vert_unit_row_num;
+
+ best_ref_mv = zero_mv;
+ vp9_zero(fp_acc_data);
+ fp_acc_data.image_data_start_row = INVALID_ROW;
+ vp9_first_pass_encode_tile_mb_row(cpi, thread_data->td, &fp_acc_data,
+ this_tile, &best_ref_mv, mb_row);
+ }
+ }
+ return 0;
+}
+
+void vp9_encode_fp_row_mt(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ const int tile_cols = 1 << cm->log2_tile_cols;
+ const int tile_rows = 1 << cm->log2_tile_rows;
+ MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt;
+ TileDataEnc *first_tile_col;
+ int num_workers = VPXMAX(cpi->oxcf.max_threads, 1);
+ int i;
+
+ if (multi_thread_ctxt->allocated_tile_cols < tile_cols ||
+ multi_thread_ctxt->allocated_tile_rows < tile_rows ||
+ multi_thread_ctxt->allocated_vert_unit_rows < cm->mb_rows) {
+ vp9_row_mt_mem_dealloc(cpi);
+ vp9_init_tile_data(cpi);
+ vp9_row_mt_mem_alloc(cpi);
+ } else {
+ vp9_init_tile_data(cpi);
+ }
+
+ create_enc_workers(cpi, num_workers);
+
+ vp9_assign_tile_to_thread(multi_thread_ctxt, tile_cols, cpi->num_workers);
+
+ vp9_prepare_job_queue(cpi, FIRST_PASS_JOB);
+
+ vp9_multi_thread_tile_init(cpi);
+
+ for (i = 0; i < num_workers; i++) {
+ EncWorkerData *thread_data;
+ thread_data = &cpi->tile_thr_data[i];
+
+ // Before encoding a frame, copy the thread data from cpi.
+ if (thread_data->td != &cpi->td) {
+ thread_data->td->mb = cpi->td.mb;
+ }
+ }
+
+ launch_enc_workers(cpi, first_pass_worker_hook, multi_thread_ctxt,
+ num_workers);
+
+ first_tile_col = &cpi->tile_data[0];
+ for (i = 1; i < tile_cols; i++) {
+ TileDataEnc *this_tile = &cpi->tile_data[i];
+ accumulate_fp_tile_stat(first_tile_col, this_tile);
+ }
+}
+
+static int temporal_filter_worker_hook(void *arg1, void *arg2) {
+ EncWorkerData *const thread_data = (EncWorkerData *)arg1;
+ MultiThreadHandle *multi_thread_ctxt = (MultiThreadHandle *)arg2;
+ VP9_COMP *const cpi = thread_data->cpi;
+ const VP9_COMMON *const cm = &cpi->common;
+ const int tile_cols = 1 << cm->log2_tile_cols;
+ int tile_row, tile_col;
+ int mb_col_start, mb_col_end;
+ TileDataEnc *this_tile;
+ int end_of_frame;
+ int thread_id = thread_data->thread_id;
+ int cur_tile_id = multi_thread_ctxt->thread_id_to_tile_id[thread_id];
+ JobNode *proc_job = NULL;
+ int mb_row;
+
+ end_of_frame = 0;
+ while (0 == end_of_frame) {
+ // Get the next job in the queue
+ proc_job =
+ (JobNode *)vp9_enc_grp_get_next_job(multi_thread_ctxt, cur_tile_id);
+ if (NULL == proc_job) {
+ // Query for the status of other tiles
+ end_of_frame = vp9_get_tiles_proc_status(
+ multi_thread_ctxt, thread_data->tile_completion_status, &cur_tile_id,
+ tile_cols);
+ } else {
+ tile_col = proc_job->tile_col_id;
+ tile_row = proc_job->tile_row_id;
+ this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col];
+ mb_col_start = (this_tile->tile_info.mi_col_start) >> TF_SHIFT;
+ mb_col_end = (this_tile->tile_info.mi_col_end + TF_ROUND) >> TF_SHIFT;
+ mb_row = proc_job->vert_unit_row_num;
+
+ vp9_temporal_filter_iterate_row_c(cpi, thread_data->td, mb_row,
+ mb_col_start, mb_col_end);
+ }
+ }
+ return 0;
+}
+
+void vp9_temporal_filter_row_mt(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ const int tile_cols = 1 << cm->log2_tile_cols;
+ const int tile_rows = 1 << cm->log2_tile_rows;
+ MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt;
+ int num_workers = cpi->num_workers ? cpi->num_workers : 1;
+ int i;
+
+ if (multi_thread_ctxt->allocated_tile_cols < tile_cols ||
+ multi_thread_ctxt->allocated_tile_rows < tile_rows ||
+ multi_thread_ctxt->allocated_vert_unit_rows < cm->mb_rows) {
+ vp9_row_mt_mem_dealloc(cpi);
+ vp9_init_tile_data(cpi);
+ vp9_row_mt_mem_alloc(cpi);
+ } else {
+ vp9_init_tile_data(cpi);
+ }
+
+ create_enc_workers(cpi, num_workers);
+
+ vp9_assign_tile_to_thread(multi_thread_ctxt, tile_cols, cpi->num_workers);
+
+ vp9_prepare_job_queue(cpi, ARNR_JOB);
+
+ for (i = 0; i < num_workers; i++) {
+ EncWorkerData *thread_data;
+ thread_data = &cpi->tile_thr_data[i];
+
+ // Before encoding a frame, copy the thread data from cpi.
+ if (thread_data->td != &cpi->td) {
+ thread_data->td->mb = cpi->td.mb;
+ }
+ }
+
+ launch_enc_workers(cpi, temporal_filter_worker_hook, multi_thread_ctxt,
+ num_workers);
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+static int enc_row_mt_worker_hook(void *arg1, void *arg2) {
+ EncWorkerData *const thread_data = (EncWorkerData *)arg1;
+ MultiThreadHandle *multi_thread_ctxt = (MultiThreadHandle *)arg2;
+ VP9_COMP *const cpi = thread_data->cpi;
+ const VP9_COMMON *const cm = &cpi->common;
+ const int tile_cols = 1 << cm->log2_tile_cols;
+ int tile_row, tile_col;
+ int end_of_frame;
+ int thread_id = thread_data->thread_id;
+ int cur_tile_id = multi_thread_ctxt->thread_id_to_tile_id[thread_id];
+ JobNode *proc_job = NULL;
+ int mi_row;
+
+ end_of_frame = 0;
+ while (0 == end_of_frame) {
+ // Get the next job in the queue
+ proc_job =
+ (JobNode *)vp9_enc_grp_get_next_job(multi_thread_ctxt, cur_tile_id);
+ if (NULL == proc_job) {
+ // Query for the status of other tiles
+ end_of_frame = vp9_get_tiles_proc_status(
+ multi_thread_ctxt, thread_data->tile_completion_status, &cur_tile_id,
+ tile_cols);
+ } else {
+ tile_col = proc_job->tile_col_id;
+ tile_row = proc_job->tile_row_id;
+ mi_row = proc_job->vert_unit_row_num * MI_BLOCK_SIZE;
+
+ vp9_encode_sb_row(cpi, thread_data->td, tile_row, tile_col, mi_row);
+ }
+ }
+ return 0;
+}
+
+void vp9_encode_tiles_row_mt(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ const int tile_cols = 1 << cm->log2_tile_cols;
+ const int tile_rows = 1 << cm->log2_tile_rows;
+ MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt;
+ int num_workers = VPXMAX(cpi->oxcf.max_threads, 1);
+ int i;
+
+ if (multi_thread_ctxt->allocated_tile_cols < tile_cols ||
+ multi_thread_ctxt->allocated_tile_rows < tile_rows ||
+ multi_thread_ctxt->allocated_vert_unit_rows < cm->mb_rows) {
+ vp9_row_mt_mem_dealloc(cpi);
+ vp9_init_tile_data(cpi);
+ vp9_row_mt_mem_alloc(cpi);
+ } else {
+ vp9_init_tile_data(cpi);
+ }
+
+ create_enc_workers(cpi, num_workers);
+
+ vp9_assign_tile_to_thread(multi_thread_ctxt, tile_cols, cpi->num_workers);
+
+ vp9_prepare_job_queue(cpi, ENCODE_JOB);
+
+ vp9_multi_thread_tile_init(cpi);
+
+ for (i = 0; i < num_workers; i++) {
+ EncWorkerData *thread_data;
+ thread_data = &cpi->tile_thr_data[i];
+ // Before encoding a frame, copy the thread data from cpi.
+ if (thread_data->td != &cpi->td) {
+ thread_data->td->mb = cpi->td.mb;
+ thread_data->td->rd_counts = cpi->td.rd_counts;
+ }
+ if (thread_data->td->counts != &cpi->common.counts) {
+ memcpy(thread_data->td->counts, &cpi->common.counts,
+ sizeof(cpi->common.counts));
+ }
+
+    // Handle the use_nonrd_pick_mode case: point the per-thread coefficient,
+    // qcoeff, dqcoeff and eob buffers at the root PICK_MODE_CONTEXT buffers
+    // used by the non-RD pick path.
+ if (cpi->sf.use_nonrd_pick_mode) {
+ MACROBLOCK *const x = &thread_data->td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblock_plane *const p = x->plane;
+ struct macroblockd_plane *const pd = xd->plane;
+ PICK_MODE_CONTEXT *ctx = &thread_data->td->pc_root->none;
+ int j;
+
+ for (j = 0; j < MAX_MB_PLANE; ++j) {
+ p[j].coeff = ctx->coeff_pbuf[j][0];
+ p[j].qcoeff = ctx->qcoeff_pbuf[j][0];
+ pd[j].dqcoeff = ctx->dqcoeff_pbuf[j][0];
+ p[j].eobs = ctx->eobs_pbuf[j][0];
+ }
+ }
+ }
+
+ launch_enc_workers(cpi, enc_row_mt_worker_hook, multi_thread_ctxt,
+ num_workers);
+
+ for (i = 0; i < num_workers; i++) {
+ VPxWorker *const worker = &cpi->workers[i];
+ EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
+
+ // Accumulate counters.
+ if (i < cpi->num_workers - 1) {
+ vp9_accumulate_frame_counts(&cm->counts, thread_data->td->counts, 0);
+ accumulate_rd_opt(&cpi->td, thread_data->td);
+ }
+ }
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ethread.h b/media/libvpx/libvpx/vp9/encoder/vp9_ethread.h
new file mode 100644
index 0000000000..4c192da515
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_ethread.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_VP9_ETHREAD_H_
+#define VPX_VP9_ENCODER_VP9_ETHREAD_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_NUM_TILE_COLS (1 << 6)
+#define MAX_NUM_TILE_ROWS 4
+#define MAX_NUM_THREADS 80
+
+struct VP9_COMP;
+struct ThreadData;
+
+typedef struct EncWorkerData {
+ struct VP9_COMP *cpi;
+ struct ThreadData *td;
+ int start;
+ int thread_id;
+ int tile_completion_status[MAX_NUM_TILE_COLS];
+} EncWorkerData;
+
+// Encoder row synchronization
+typedef struct VP9RowMTSyncData {
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *mutex;
+ pthread_cond_t *cond;
+#endif
+ // Allocate memory to store the sb/mb block index in each row.
+ int *cur_col;
+ int sync_range;
+ int rows;
+} VP9RowMTSync;
+
+// Frees EncWorkerData related allocations made by vp9_encode_*_mt().
+// Row-mt specific data is freed with vp9_row_mt_mem_dealloc(), which is not
+// called by this function.
+void vp9_encode_free_mt_data(struct VP9_COMP *cpi);
+
+void vp9_encode_tiles_mt(struct VP9_COMP *cpi);
+
+void vp9_encode_tiles_row_mt(struct VP9_COMP *cpi);
+
+void vp9_encode_fp_row_mt(struct VP9_COMP *cpi);
+
+void vp9_row_mt_sync_read(VP9RowMTSync *const row_mt_sync, int r, int c);
+void vp9_row_mt_sync_write(VP9RowMTSync *const row_mt_sync, int r, int c,
+ const int cols);
+
+void vp9_row_mt_sync_read_dummy(VP9RowMTSync *const row_mt_sync, int r, int c);
+void vp9_row_mt_sync_write_dummy(VP9RowMTSync *const row_mt_sync, int r, int c,
+ const int cols);
+
+// Allocate memory for row based multi-threading synchronization.
+void vp9_row_mt_sync_mem_alloc(VP9RowMTSync *row_mt_sync, struct VP9Common *cm,
+ int rows);
+
+// Deallocate row based multi-threading synchronization related mutex and data.
+void vp9_row_mt_sync_mem_dealloc(VP9RowMTSync *row_mt_sync);
+
+void vp9_temporal_filter_row_mt(struct VP9_COMP *cpi);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP9_ENCODER_VP9_ETHREAD_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.c b/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.c
new file mode 100644
index 0000000000..1d440442b5
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.c
@@ -0,0 +1,261 @@
+/*
+ * Copyright (c) 2020 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp9/encoder/vp9_ext_ratectrl.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/common/vp9_common.h"
+#include "vpx_dsp/psnr.h"
+
+vpx_codec_err_t vp9_extrc_init(EXT_RATECTRL *ext_ratectrl) {
+ if (ext_ratectrl == NULL) {
+ return VPX_CODEC_INVALID_PARAM;
+ }
+ vp9_zero(*ext_ratectrl);
+ return VPX_CODEC_OK;
+}
+
+vpx_codec_err_t vp9_extrc_create(vpx_rc_funcs_t funcs,
+ vpx_rc_config_t ratectrl_config,
+ EXT_RATECTRL *ext_ratectrl) {
+ vpx_rc_status_t rc_status;
+ vpx_rc_firstpass_stats_t *rc_firstpass_stats;
+ if (ext_ratectrl == NULL) {
+ return VPX_CODEC_INVALID_PARAM;
+ }
+ vp9_extrc_delete(ext_ratectrl);
+ ext_ratectrl->funcs = funcs;
+ ext_ratectrl->ratectrl_config = ratectrl_config;
+ rc_status = ext_ratectrl->funcs.create_model(ext_ratectrl->funcs.priv,
+ &ext_ratectrl->ratectrl_config,
+ &ext_ratectrl->model);
+ if (rc_status == VPX_RC_ERROR) {
+ return VPX_CODEC_ERROR;
+ }
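+  // One first pass stats record is kept per shown frame; the buffer is
+  // released again in vp9_extrc_delete().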
+ rc_firstpass_stats = &ext_ratectrl->rc_firstpass_stats;
+ rc_firstpass_stats->num_frames = ratectrl_config.show_frame_count;
+ rc_firstpass_stats->frame_stats =
+ vpx_malloc(sizeof(*rc_firstpass_stats->frame_stats) *
+ rc_firstpass_stats->num_frames);
+ if (rc_firstpass_stats->frame_stats == NULL) {
+ return VPX_CODEC_MEM_ERROR;
+ }
+ ext_ratectrl->ready = 1;
+ return VPX_CODEC_OK;
+}
+
+vpx_codec_err_t vp9_extrc_delete(EXT_RATECTRL *ext_ratectrl) {
+ if (ext_ratectrl == NULL) {
+ return VPX_CODEC_INVALID_PARAM;
+ }
+ if (ext_ratectrl->ready) {
+ vpx_rc_status_t rc_status =
+ ext_ratectrl->funcs.delete_model(ext_ratectrl->model);
+ if (rc_status == VPX_RC_ERROR) {
+ return VPX_CODEC_ERROR;
+ }
+ vpx_free(ext_ratectrl->rc_firstpass_stats.frame_stats);
+ }
+ return vp9_extrc_init(ext_ratectrl);
+}
+
+static void gen_rc_firstpass_stats(const FIRSTPASS_STATS *stats,
+ vpx_rc_frame_stats_t *rc_frame_stats) {
+ rc_frame_stats->frame = stats->frame;
+ rc_frame_stats->weight = stats->weight;
+ rc_frame_stats->intra_error = stats->intra_error;
+ rc_frame_stats->coded_error = stats->coded_error;
+ rc_frame_stats->sr_coded_error = stats->sr_coded_error;
+ rc_frame_stats->frame_noise_energy = stats->frame_noise_energy;
+ rc_frame_stats->pcnt_inter = stats->pcnt_inter;
+ rc_frame_stats->pcnt_motion = stats->pcnt_motion;
+ rc_frame_stats->pcnt_second_ref = stats->pcnt_second_ref;
+ rc_frame_stats->pcnt_neutral = stats->pcnt_neutral;
+ rc_frame_stats->pcnt_intra_low = stats->pcnt_intra_low;
+ rc_frame_stats->pcnt_intra_high = stats->pcnt_intra_high;
+ rc_frame_stats->intra_skip_pct = stats->intra_skip_pct;
+ rc_frame_stats->intra_smooth_pct = stats->intra_smooth_pct;
+ rc_frame_stats->inactive_zone_rows = stats->inactive_zone_rows;
+ rc_frame_stats->inactive_zone_cols = stats->inactive_zone_cols;
+ rc_frame_stats->MVr = stats->MVr;
+ rc_frame_stats->mvr_abs = stats->mvr_abs;
+ rc_frame_stats->MVc = stats->MVc;
+ rc_frame_stats->mvc_abs = stats->mvc_abs;
+ rc_frame_stats->MVrv = stats->MVrv;
+ rc_frame_stats->MVcv = stats->MVcv;
+ rc_frame_stats->mv_in_out_count = stats->mv_in_out_count;
+ rc_frame_stats->duration = stats->duration;
+ rc_frame_stats->count = stats->count;
+}
+
+vpx_codec_err_t vp9_extrc_send_firstpass_stats(
+ EXT_RATECTRL *ext_ratectrl, const FIRST_PASS_INFO *first_pass_info) {
+ if (ext_ratectrl == NULL) {
+ return VPX_CODEC_INVALID_PARAM;
+ }
+ if (ext_ratectrl->ready) {
+ vpx_rc_status_t rc_status;
+ vpx_rc_firstpass_stats_t *rc_firstpass_stats =
+ &ext_ratectrl->rc_firstpass_stats;
+ int i;
+ assert(rc_firstpass_stats->num_frames == first_pass_info->num_frames);
+ for (i = 0; i < rc_firstpass_stats->num_frames; ++i) {
+ gen_rc_firstpass_stats(&first_pass_info->stats[i],
+ &rc_firstpass_stats->frame_stats[i]);
+ }
+ rc_status = ext_ratectrl->funcs.send_firstpass_stats(ext_ratectrl->model,
+ rc_firstpass_stats);
+ if (rc_status == VPX_RC_ERROR) {
+ return VPX_CODEC_ERROR;
+ }
+ }
+ return VPX_CODEC_OK;
+}
+
+static int extrc_get_frame_type(FRAME_UPDATE_TYPE update_type) {
+ // TODO(angiebird): Add unit test to make sure this function behaves like
+ // get_frame_type_from_update_type()
+ // TODO(angiebird): Merge this function with get_frame_type_from_update_type()
+ switch (update_type) {
+ case KF_UPDATE: return 0; // kFrameTypeKey;
+ case ARF_UPDATE: return 2; // kFrameTypeAltRef;
+ case GF_UPDATE: return 4; // kFrameTypeGolden;
+ case OVERLAY_UPDATE: return 3; // kFrameTypeOverlay;
+ case LF_UPDATE: return 1; // kFrameTypeInter;
+ default:
+ fprintf(stderr, "Unsupported update_type %d\n", update_type);
+ abort();
+ return 1;
+ }
+}
+
+vpx_codec_err_t vp9_extrc_get_encodeframe_decision(
+ EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index,
+ FRAME_UPDATE_TYPE update_type, int gop_size, int use_alt_ref,
+ RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags,
+ vpx_rc_encodeframe_decision_t *encode_frame_decision) {
+ if (ext_ratectrl == NULL) {
+ return VPX_CODEC_INVALID_PARAM;
+ }
+ if (ext_ratectrl->ready && (ext_ratectrl->funcs.rc_type & VPX_RC_QP) != 0) {
+ vpx_rc_status_t rc_status;
+ vpx_rc_encodeframe_info_t encode_frame_info;
+ encode_frame_info.show_index = show_index;
+ encode_frame_info.coding_index = coding_index;
+ encode_frame_info.gop_index = gop_index;
+ encode_frame_info.frame_type = extrc_get_frame_type(update_type);
+ encode_frame_info.gop_size = gop_size;
+ encode_frame_info.use_alt_ref = use_alt_ref;
+
+ vp9_get_ref_frame_info(update_type, ref_frame_flags, ref_frame_bufs,
+ encode_frame_info.ref_frame_coding_indexes,
+ encode_frame_info.ref_frame_valid_list);
+
+ rc_status = ext_ratectrl->funcs.get_encodeframe_decision(
+ ext_ratectrl->model, &encode_frame_info, encode_frame_decision);
+ if (rc_status == VPX_RC_ERROR) {
+ return VPX_CODEC_ERROR;
+ }
+ }
+ return VPX_CODEC_OK;
+}
+
+vpx_codec_err_t vp9_extrc_update_encodeframe_result(
+ EXT_RATECTRL *ext_ratectrl, int64_t bit_count,
+ const YV12_BUFFER_CONFIG *source_frame,
+ const YV12_BUFFER_CONFIG *coded_frame, uint32_t bit_depth,
+ uint32_t input_bit_depth, const int actual_encoding_qindex) {
+ if (ext_ratectrl == NULL) {
+ return VPX_CODEC_INVALID_PARAM;
+ }
+ if (ext_ratectrl->ready) {
+ PSNR_STATS psnr;
+ vpx_rc_status_t rc_status;
+ vpx_rc_encodeframe_result_t encode_frame_result;
+ encode_frame_result.bit_count = bit_count;
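+    // The pixel count covers the luma plane plus both chroma planes.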
+ encode_frame_result.pixel_count =
+ source_frame->y_crop_width * source_frame->y_crop_height +
+ 2 * source_frame->uv_crop_width * source_frame->uv_crop_height;
+ encode_frame_result.actual_encoding_qindex = actual_encoding_qindex;
+#if CONFIG_VP9_HIGHBITDEPTH
+ vpx_calc_highbd_psnr(source_frame, coded_frame, &psnr, bit_depth,
+ input_bit_depth);
+#else
+ (void)bit_depth;
+ (void)input_bit_depth;
+ vpx_calc_psnr(source_frame, coded_frame, &psnr);
+#endif
+ encode_frame_result.sse = psnr.sse[0];
+ rc_status = ext_ratectrl->funcs.update_encodeframe_result(
+ ext_ratectrl->model, &encode_frame_result);
+ if (rc_status == VPX_RC_ERROR) {
+ return VPX_CODEC_ERROR;
+ }
+ }
+ return VPX_CODEC_OK;
+}
+
+vpx_codec_err_t vp9_extrc_get_gop_decision(
+ EXT_RATECTRL *ext_ratectrl, const vpx_rc_gop_info_t *const gop_info,
+ vpx_rc_gop_decision_t *gop_decision) {
+ vpx_rc_status_t rc_status;
+ if (ext_ratectrl == NULL || !ext_ratectrl->ready ||
+ (ext_ratectrl->funcs.rc_type & VPX_RC_GOP) == 0) {
+ return VPX_CODEC_INVALID_PARAM;
+ }
+ rc_status = ext_ratectrl->funcs.get_gop_decision(ext_ratectrl->model,
+ gop_info, gop_decision);
+ if (gop_decision->use_alt_ref) {
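+    // Only accept an alt-ref if the GOP is at least min_gf_interval frames
+    // long, fits within the lag-in-frames window, and alt-refs are allowed.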
+ const int arf_constraint =
+ gop_decision->gop_coding_frames >= gop_info->min_gf_interval &&
+ gop_decision->gop_coding_frames < gop_info->lag_in_frames;
+ if (!arf_constraint || !gop_info->allow_alt_ref) return VPX_CODEC_ERROR;
+ }
+ // TODO(chengchen): Take min and max gf interval from the model
+ // and overwrite libvpx's decision so that we can get rid
+ // of one of the checks here.
+ if (gop_decision->gop_coding_frames > gop_info->frames_to_key ||
+ gop_decision->gop_coding_frames - gop_decision->use_alt_ref >
+ gop_info->max_gf_interval) {
+ return VPX_CODEC_ERROR;
+ }
+ if (rc_status == VPX_RC_ERROR) {
+ return VPX_CODEC_ERROR;
+ }
+ return VPX_CODEC_OK;
+}
+
+vpx_codec_err_t vp9_extrc_get_frame_rdmult(
+ EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index,
+ FRAME_UPDATE_TYPE update_type, int gop_size, int use_alt_ref,
+ RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags,
+ int *rdmult) {
+ vpx_rc_status_t rc_status;
+ vpx_rc_encodeframe_info_t encode_frame_info;
+ if (ext_ratectrl == NULL || !ext_ratectrl->ready ||
+ (ext_ratectrl->funcs.rc_type & VPX_RC_RDMULT) == 0) {
+ return VPX_CODEC_INVALID_PARAM;
+ }
+ encode_frame_info.show_index = show_index;
+ encode_frame_info.coding_index = coding_index;
+ encode_frame_info.gop_index = gop_index;
+ encode_frame_info.frame_type = extrc_get_frame_type(update_type);
+ encode_frame_info.gop_size = gop_size;
+ encode_frame_info.use_alt_ref = use_alt_ref;
+
+ vp9_get_ref_frame_info(update_type, ref_frame_flags, ref_frame_bufs,
+ encode_frame_info.ref_frame_coding_indexes,
+ encode_frame_info.ref_frame_valid_list);
+ rc_status = ext_ratectrl->funcs.get_frame_rdmult(ext_ratectrl->model,
+ &encode_frame_info, rdmult);
+ if (rc_status == VPX_RC_ERROR) {
+ return VPX_CODEC_ERROR;
+ }
+ return VPX_CODEC_OK;
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.h b/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.h
new file mode 100644
index 0000000000..7c38758833
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_ext_ratectrl.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2020 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_VP9_EXT_RATECTRL_H_
+#define VPX_VP9_ENCODER_VP9_EXT_RATECTRL_H_
+
+#include "vpx/vpx_ext_ratectrl.h"
+#include "vp9/encoder/vp9_firstpass.h"
+
+typedef struct EXT_RATECTRL {
+ int ready;
+ int ext_rdmult;
+ vpx_rc_model_t model;
+ vpx_rc_funcs_t funcs;
+ vpx_rc_config_t ratectrl_config;
+ vpx_rc_firstpass_stats_t rc_firstpass_stats;
+} EXT_RATECTRL;
+
+vpx_codec_err_t vp9_extrc_init(EXT_RATECTRL *ext_ratectrl);
+
+vpx_codec_err_t vp9_extrc_create(vpx_rc_funcs_t funcs,
+ vpx_rc_config_t ratectrl_config,
+ EXT_RATECTRL *ext_ratectrl);
+
+vpx_codec_err_t vp9_extrc_delete(EXT_RATECTRL *ext_ratectrl);
+
+vpx_codec_err_t vp9_extrc_send_firstpass_stats(
+ EXT_RATECTRL *ext_ratectrl, const FIRST_PASS_INFO *first_pass_info);
+
+vpx_codec_err_t vp9_extrc_get_encodeframe_decision(
+ EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index,
+ FRAME_UPDATE_TYPE update_type, int gop_size, int use_alt_ref,
+ RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags,
+ vpx_rc_encodeframe_decision_t *encode_frame_decision);
+
+vpx_codec_err_t vp9_extrc_update_encodeframe_result(
+ EXT_RATECTRL *ext_ratectrl, int64_t bit_count,
+ const YV12_BUFFER_CONFIG *source_frame,
+ const YV12_BUFFER_CONFIG *coded_frame, uint32_t bit_depth,
+ uint32_t input_bit_depth, const int actual_encoding_qindex);
+
+vpx_codec_err_t vp9_extrc_get_gop_decision(
+ EXT_RATECTRL *ext_ratectrl, const vpx_rc_gop_info_t *const gop_info,
+ vpx_rc_gop_decision_t *gop_decision);
+
+vpx_codec_err_t vp9_extrc_get_frame_rdmult(
+ EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index,
+ FRAME_UPDATE_TYPE update_type, int gop_size, int use_alt_ref,
+ RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags,
+ int *rdmult);
+
+#endif // VPX_VP9_ENCODER_VP9_EXT_RATECTRL_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_extend.c b/media/libvpx/libvpx/vp9/encoder/vp9_extend.c
new file mode 100644
index 0000000000..dcb62e8768
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_extend.c
@@ -0,0 +1,203 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+
+#include "vp9/common/vp9_common.h"
+#include "vp9/encoder/vp9_extend.h"
+
+static void copy_and_extend_plane(const uint8_t *src, int src_pitch,
+ uint8_t *dst, int dst_pitch, int w, int h,
+ int extend_top, int extend_left,
+ int extend_bottom, int extend_right,
+ int interleave_step) {
+ int i, j, linesize;
+ const int step = interleave_step < 1 ? 1 : interleave_step;
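+  // A step greater than 1 indicates an interleaved source plane (e.g. the
+  // chroma of an NV12 buffer), so source samples are read step pixels apart
+  // while the destination remains planar.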
+
+ // copy the left and right most columns out
+ const uint8_t *src_ptr1 = src;
+ const uint8_t *src_ptr2 = src + (w - 1) * step;
+ uint8_t *dst_ptr1 = dst - extend_left;
+ uint8_t *dst_ptr2 = dst + w;
+
+ for (i = 0; i < h; i++) {
+ memset(dst_ptr1, src_ptr1[0], extend_left);
+ if (step == 1) {
+ memcpy(dst_ptr1 + extend_left, src_ptr1, w);
+ } else {
+ for (j = 0; j < w; j++) {
+ dst_ptr1[extend_left + j] = src_ptr1[step * j];
+ }
+ }
+ memset(dst_ptr2, src_ptr2[0], extend_right);
+ src_ptr1 += src_pitch;
+ src_ptr2 += src_pitch;
+ dst_ptr1 += dst_pitch;
+ dst_ptr2 += dst_pitch;
+ }
+
+ // Now copy the top and bottom lines into each line of the respective
+ // borders
+ src_ptr1 = dst - extend_left;
+ src_ptr2 = dst + dst_pitch * (h - 1) - extend_left;
+ dst_ptr1 = dst + dst_pitch * (-extend_top) - extend_left;
+  dst_ptr2 = dst + dst_pitch * h - extend_left;
+ linesize = extend_left + extend_right + w;
+
+ for (i = 0; i < extend_top; i++) {
+ memcpy(dst_ptr1, src_ptr1, linesize);
+ dst_ptr1 += dst_pitch;
+ }
+
+ for (i = 0; i < extend_bottom; i++) {
+ memcpy(dst_ptr2, src_ptr2, linesize);
+ dst_ptr2 += dst_pitch;
+ }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static void highbd_copy_and_extend_plane(const uint8_t *src8, int src_pitch,
+ uint8_t *dst8, int dst_pitch, int w,
+ int h, int extend_top, int extend_left,
+ int extend_bottom, int extend_right) {
+ int i, linesize;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+
+ // copy the left and right most columns out
+ const uint16_t *src_ptr1 = src;
+ const uint16_t *src_ptr2 = src + w - 1;
+ uint16_t *dst_ptr1 = dst - extend_left;
+ uint16_t *dst_ptr2 = dst + w;
+
+ for (i = 0; i < h; i++) {
+ vpx_memset16(dst_ptr1, src_ptr1[0], extend_left);
+ memcpy(dst_ptr1 + extend_left, src_ptr1, w * sizeof(src_ptr1[0]));
+ vpx_memset16(dst_ptr2, src_ptr2[0], extend_right);
+ src_ptr1 += src_pitch;
+ src_ptr2 += src_pitch;
+ dst_ptr1 += dst_pitch;
+ dst_ptr2 += dst_pitch;
+ }
+
+ // Now copy the top and bottom lines into each line of the respective
+ // borders
+ src_ptr1 = dst - extend_left;
+ src_ptr2 = dst + dst_pitch * (h - 1) - extend_left;
+ dst_ptr1 = dst + dst_pitch * (-extend_top) - extend_left;
+  dst_ptr2 = dst + dst_pitch * h - extend_left;
+ linesize = extend_left + extend_right + w;
+
+ for (i = 0; i < extend_top; i++) {
+ memcpy(dst_ptr1, src_ptr1, linesize * sizeof(src_ptr1[0]));
+ dst_ptr1 += dst_pitch;
+ }
+
+ for (i = 0; i < extend_bottom; i++) {
+ memcpy(dst_ptr2, src_ptr2, linesize * sizeof(src_ptr2[0]));
+ dst_ptr2 += dst_pitch;
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst) {
+ // Extend src frame in buffer
+ // Altref filtering assumes 16 pixel extension
+ const int et_y = 16;
+ const int el_y = 16;
+  // Motion estimation may use src block variance with the block size up
+  // to 64x64, so the right and bottom need to be extended to a multiple of
+  // 64, or by at least 16 pixels, whichever gives the larger extension.
+ const int er_y =
+ VPXMAX(src->y_width + 16, ALIGN_POWER_OF_TWO(src->y_width, 6)) -
+ src->y_crop_width;
+ const int eb_y =
+ VPXMAX(src->y_height + 16, ALIGN_POWER_OF_TWO(src->y_height, 6)) -
+ src->y_crop_height;
+ const int uv_width_subsampling = (src->uv_width != src->y_width);
+ const int uv_height_subsampling = (src->uv_height != src->y_height);
+ const int et_uv = et_y >> uv_height_subsampling;
+ const int el_uv = el_y >> uv_width_subsampling;
+ const int eb_uv = eb_y >> uv_height_subsampling;
+ const int er_uv = er_y >> uv_width_subsampling;
+ // detect nv12 colorspace
+ const int chroma_step = src->v_buffer - src->u_buffer == 1 ? 2 : 1;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
+ highbd_copy_and_extend_plane(src->y_buffer, src->y_stride, dst->y_buffer,
+ dst->y_stride, src->y_crop_width,
+ src->y_crop_height, et_y, el_y, eb_y, er_y);
+
+ highbd_copy_and_extend_plane(
+ src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride,
+ src->uv_crop_width, src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv);
+
+ highbd_copy_and_extend_plane(
+ src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride,
+ src->uv_crop_width, src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv);
+ return;
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ copy_and_extend_plane(src->y_buffer, src->y_stride, dst->y_buffer,
+ dst->y_stride, src->y_crop_width, src->y_crop_height,
+ et_y, el_y, eb_y, er_y, 1);
+
+ copy_and_extend_plane(src->u_buffer, src->uv_stride, dst->u_buffer,
+ dst->uv_stride, src->uv_crop_width, src->uv_crop_height,
+ et_uv, el_uv, eb_uv, er_uv, chroma_step);
+
+ copy_and_extend_plane(src->v_buffer, src->uv_stride, dst->v_buffer,
+ dst->uv_stride, src->uv_crop_width, src->uv_crop_height,
+ et_uv, el_uv, eb_uv, er_uv, chroma_step);
+}
+
+void vp9_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst, int srcy,
+ int srcx, int srch, int srcw) {
+  // If the side is not touching the boundary then don't extend.
+ const int et_y = srcy ? 0 : dst->border;
+ const int el_y = srcx ? 0 : dst->border;
+ const int eb_y = srcy + srch != src->y_height
+ ? 0
+ : dst->border + dst->y_height - src->y_height;
+ const int er_y = srcx + srcw != src->y_width
+ ? 0
+ : dst->border + dst->y_width - src->y_width;
+ const int src_y_offset = srcy * src->y_stride + srcx;
+ const int dst_y_offset = srcy * dst->y_stride + srcx;
+
+ const int et_uv = ROUND_POWER_OF_TWO(et_y, 1);
+ const int el_uv = ROUND_POWER_OF_TWO(el_y, 1);
+ const int eb_uv = ROUND_POWER_OF_TWO(eb_y, 1);
+ const int er_uv = ROUND_POWER_OF_TWO(er_y, 1);
+ const int src_uv_offset = ((srcy * src->uv_stride) >> 1) + (srcx >> 1);
+ const int dst_uv_offset = ((srcy * dst->uv_stride) >> 1) + (srcx >> 1);
+ const int srch_uv = ROUND_POWER_OF_TWO(srch, 1);
+ const int srcw_uv = ROUND_POWER_OF_TWO(srcw, 1);
+ // detect nv12 colorspace
+ const int chroma_step = src->v_buffer - src->u_buffer == 1 ? 2 : 1;
+
+ copy_and_extend_plane(src->y_buffer + src_y_offset, src->y_stride,
+ dst->y_buffer + dst_y_offset, dst->y_stride, srcw, srch,
+ et_y, el_y, eb_y, er_y, 1);
+
+ copy_and_extend_plane(src->u_buffer + src_uv_offset, src->uv_stride,
+ dst->u_buffer + dst_uv_offset, dst->uv_stride, srcw_uv,
+ srch_uv, et_uv, el_uv, eb_uv, er_uv, chroma_step);
+
+ copy_and_extend_plane(src->v_buffer + src_uv_offset, src->uv_stride,
+ dst->v_buffer + dst_uv_offset, dst->uv_stride, srcw_uv,
+ srch_uv, et_uv, el_uv, eb_uv, er_uv, chroma_step);
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_extend.h b/media/libvpx/libvpx/vp9/encoder/vp9_extend.h
new file mode 100644
index 0000000000..4ba7fc95e3
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_extend.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_VP9_EXTEND_H_
+#define VPX_VP9_ENCODER_VP9_EXTEND_H_
+
+#include "vpx_scale/yv12config.h"
+#include "vpx/vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst);
+
+void vp9_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst, int srcy,
+ int srcx, int srch, int srcw);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP9_ENCODER_VP9_EXTEND_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_firstpass.c b/media/libvpx/libvpx/vp9/encoder/vp9_firstpass.c
new file mode 100644
index 0000000000..8fdd976816
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_firstpass.c
@@ -0,0 +1,3898 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_scale_rtcd.h"
+
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/system_state.h"
+#include "vpx_scale/vpx_scale.h"
+#include "vpx_scale/yv12config.h"
+
+#include "vp9/common/vp9_entropymv.h"
+#include "vp9/common/vp9_quant_common.h"
+#include "vp9/common/vp9_reconinter.h" // vp9_setup_dst_planes()
+#include "vp9/encoder/vp9_aq_variance.h"
+#include "vp9/encoder/vp9_block.h"
+#include "vp9/encoder/vp9_encodeframe.h"
+#include "vp9/encoder/vp9_encodemb.h"
+#include "vp9/encoder/vp9_encodemv.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_ethread.h"
+#include "vp9/encoder/vp9_extend.h"
+#include "vp9/encoder/vp9_firstpass.h"
+#include "vp9/encoder/vp9_mcomp.h"
+#include "vp9/encoder/vp9_quantize.h"
+#include "vp9/encoder/vp9_rd.h"
+#include "vpx_dsp/variance.h"
+
+#define OUTPUT_FPF 0
+#define ARF_STATS_OUTPUT 0
+#define COMPLEXITY_STATS_OUTPUT 0
+
+#define FIRST_PASS_Q 10.0
+#define NORMAL_BOOST 100
+#define MIN_ARF_GF_BOOST 250
+#define MIN_DECAY_FACTOR 0.01
+#define NEW_MV_MODE_PENALTY 32
+#define DARK_THRESH 64
+#define LOW_I_THRESH 24000
+
+#define NCOUNT_INTRA_THRESH 8192
+#define NCOUNT_INTRA_FACTOR 3
+
+#define INTRA_PART 0.005
+#define DEFAULT_DECAY_LIMIT 0.75
+#define LOW_SR_DIFF_TRHESH 0.1
+#define LOW_CODED_ERR_PER_MB 10.0
+#define NCOUNT_FRAME_II_THRESH 6.0
+#define BASELINE_ERR_PER_MB 12500.0
+#define GF_MAX_FRAME_BOOST 96.0
+
+#ifdef AGGRESSIVE_VBR
+#define KF_MIN_FRAME_BOOST 40.0
+#define KF_MAX_FRAME_BOOST 80.0
+#define MAX_KF_TOT_BOOST 4800
+#else
+#define KF_MIN_FRAME_BOOST 40.0
+#define KF_MAX_FRAME_BOOST 96.0
+#define MAX_KF_TOT_BOOST 5400
+#endif
+
+#define DEFAULT_ZM_FACTOR 0.5
+#define MINQ_ADJ_LIMIT 48
+#define MINQ_ADJ_LIMIT_CQ 20
+#define HIGH_UNDERSHOOT_RATIO 2
+#define AV_WQ_FACTOR 4.0
+
+#define DOUBLE_DIVIDE_CHECK(x) ((x) < 0 ? (x)-0.000001 : (x) + 0.000001)
+
+#if ARF_STATS_OUTPUT
+unsigned int arf_count = 0;
+#endif
+
+// Resets the first pass stats read pointer to the given position in the
+// in-memory stats buffer.
+static void reset_fpf_position(TWO_PASS *p, const FIRSTPASS_STATS *position) {
+ p->stats_in = position;
+}
+
+// Read frame stats at an offset from the current position.
+static const FIRSTPASS_STATS *read_frame_stats(const TWO_PASS *p, int offset) {
+ if ((offset >= 0 && p->stats_in + offset >= p->stats_in_end) ||
+ (offset < 0 && p->stats_in + offset < p->stats_in_start)) {
+ return NULL;
+ }
+
+ return &p->stats_in[offset];
+}
+
+static int input_stats(TWO_PASS *p, FIRSTPASS_STATS *fps) {
+ if (p->stats_in >= p->stats_in_end) return EOF;
+
+ *fps = *p->stats_in;
+ ++p->stats_in;
+ return 1;
+}
+
+static void output_stats(FIRSTPASS_STATS *stats) {
+ (void)stats;
+// TEMP debug code
+#if OUTPUT_FPF
+ {
+ FILE *fpfile;
+ fpfile = fopen("firstpass.stt", "a");
+
+ fprintf(fpfile,
+ "%12.0lf %12.4lf %12.2lf %12.2lf %12.2lf %12.0lf %12.4lf %12.4lf"
+ "%12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf"
+ "%12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.0lf %12.4lf %12.0lf"
+ "%12.4lf"
+ "\n",
+ stats->frame, stats->weight, stats->intra_error, stats->coded_error,
+ stats->sr_coded_error, stats->frame_noise_energy, stats->pcnt_inter,
+ stats->pcnt_motion, stats->pcnt_second_ref, stats->pcnt_neutral,
+ stats->pcnt_intra_low, stats->pcnt_intra_high,
+ stats->intra_skip_pct, stats->intra_smooth_pct,
+ stats->inactive_zone_rows, stats->inactive_zone_cols, stats->MVr,
+ stats->mvr_abs, stats->MVc, stats->mvc_abs, stats->MVrv,
+ stats->MVcv, stats->mv_in_out_count, stats->count, stats->duration);
+ fclose(fpfile);
+ }
+#endif
+}
+
+static void zero_stats(FIRSTPASS_STATS *section) {
+ section->frame = 0.0;
+ section->weight = 0.0;
+ section->intra_error = 0.0;
+ section->coded_error = 0.0;
+ section->sr_coded_error = 0.0;
+ section->frame_noise_energy = 0.0;
+ section->pcnt_inter = 0.0;
+ section->pcnt_motion = 0.0;
+ section->pcnt_second_ref = 0.0;
+ section->pcnt_neutral = 0.0;
+ section->intra_skip_pct = 0.0;
+ section->intra_smooth_pct = 0.0;
+ section->pcnt_intra_low = 0.0;
+ section->pcnt_intra_high = 0.0;
+ section->inactive_zone_rows = 0.0;
+ section->inactive_zone_cols = 0.0;
+ section->MVr = 0.0;
+ section->mvr_abs = 0.0;
+ section->MVc = 0.0;
+ section->mvc_abs = 0.0;
+ section->MVrv = 0.0;
+ section->MVcv = 0.0;
+ section->mv_in_out_count = 0.0;
+ section->count = 0.0;
+ section->duration = 1.0;
+ section->spatial_layer_id = 0;
+}
+
+static void accumulate_stats(FIRSTPASS_STATS *section,
+ const FIRSTPASS_STATS *frame) {
+ section->frame += frame->frame;
+ section->weight += frame->weight;
+ section->spatial_layer_id = frame->spatial_layer_id;
+ section->intra_error += frame->intra_error;
+ section->coded_error += frame->coded_error;
+ section->sr_coded_error += frame->sr_coded_error;
+ section->frame_noise_energy += frame->frame_noise_energy;
+ section->pcnt_inter += frame->pcnt_inter;
+ section->pcnt_motion += frame->pcnt_motion;
+ section->pcnt_second_ref += frame->pcnt_second_ref;
+ section->pcnt_neutral += frame->pcnt_neutral;
+ section->intra_skip_pct += frame->intra_skip_pct;
+ section->intra_smooth_pct += frame->intra_smooth_pct;
+ section->pcnt_intra_low += frame->pcnt_intra_low;
+ section->pcnt_intra_high += frame->pcnt_intra_high;
+ section->inactive_zone_rows += frame->inactive_zone_rows;
+ section->inactive_zone_cols += frame->inactive_zone_cols;
+ section->MVr += frame->MVr;
+ section->mvr_abs += frame->mvr_abs;
+ section->MVc += frame->MVc;
+ section->mvc_abs += frame->mvc_abs;
+ section->MVrv += frame->MVrv;
+ section->MVcv += frame->MVcv;
+ section->mv_in_out_count += frame->mv_in_out_count;
+ section->count += frame->count;
+ section->duration += frame->duration;
+}
+
+static void subtract_stats(FIRSTPASS_STATS *section,
+ const FIRSTPASS_STATS *frame) {
+ section->frame -= frame->frame;
+ section->weight -= frame->weight;
+ section->intra_error -= frame->intra_error;
+ section->coded_error -= frame->coded_error;
+ section->sr_coded_error -= frame->sr_coded_error;
+ section->frame_noise_energy -= frame->frame_noise_energy;
+ section->pcnt_inter -= frame->pcnt_inter;
+ section->pcnt_motion -= frame->pcnt_motion;
+ section->pcnt_second_ref -= frame->pcnt_second_ref;
+ section->pcnt_neutral -= frame->pcnt_neutral;
+ section->intra_skip_pct -= frame->intra_skip_pct;
+ section->intra_smooth_pct -= frame->intra_smooth_pct;
+ section->pcnt_intra_low -= frame->pcnt_intra_low;
+ section->pcnt_intra_high -= frame->pcnt_intra_high;
+ section->inactive_zone_rows -= frame->inactive_zone_rows;
+ section->inactive_zone_cols -= frame->inactive_zone_cols;
+ section->MVr -= frame->MVr;
+ section->mvr_abs -= frame->mvr_abs;
+ section->MVc -= frame->MVc;
+ section->mvc_abs -= frame->mvc_abs;
+ section->MVrv -= frame->MVrv;
+ section->MVcv -= frame->MVcv;
+ section->mv_in_out_count -= frame->mv_in_out_count;
+ section->count -= frame->count;
+ section->duration -= frame->duration;
+}
+
+// Calculate an active area of the image that discounts formatting
+// bars and partially discounts other 0 energy areas.
+#define MIN_ACTIVE_AREA 0.5
+#define MAX_ACTIVE_AREA 1.0
+static double calculate_active_area(const FRAME_INFO *frame_info,
+ const FIRSTPASS_STATS *this_frame) {
+ double active_pct;
+
+ active_pct =
+ 1.0 -
+ ((this_frame->intra_skip_pct / 2) +
+ ((this_frame->inactive_zone_rows * 2) / (double)frame_info->mb_rows));
+ return fclamp(active_pct, MIN_ACTIVE_AREA, MAX_ACTIVE_AREA);
+}
+
+// Get the average weighted error for the clip (or corpus)
+static double get_distribution_av_err(VP9_COMP *cpi, TWO_PASS *const twopass) {
+ const double av_weight =
+ twopass->total_stats.weight / twopass->total_stats.count;
+
+ if (cpi->oxcf.vbr_corpus_complexity)
+ return av_weight * twopass->mean_mod_score;
+ else
+ return (twopass->total_stats.coded_error * av_weight) /
+ twopass->total_stats.count;
+}
+
+#define ACT_AREA_CORRECTION 0.5
+// Calculate a modified Error used in distributing bits between easier and
+// harder frames.
+static double calculate_mod_frame_score(const VP9_COMP *cpi,
+ const VP9EncoderConfig *oxcf,
+ const FIRSTPASS_STATS *this_frame,
+ const double av_err) {
+ double modified_score =
+ av_err * pow(this_frame->coded_error * this_frame->weight /
+ DOUBLE_DIVIDE_CHECK(av_err),
+ oxcf->two_pass_vbrbias / 100.0);
+
+ // Correction for active area. Frames with a reduced active area
+ // (eg due to formatting bars) have a higher error per mb for the
+ // remaining active MBs. The correction here assumes that coding
+ // 0.5N blocks of complexity 2X is a little easier than coding N
+ // blocks of complexity X.
+ modified_score *= pow(calculate_active_area(&cpi->frame_info, this_frame),
+ ACT_AREA_CORRECTION);
+
+ return modified_score;
+}
+
+static double calc_norm_frame_score(const VP9EncoderConfig *oxcf,
+ const FRAME_INFO *frame_info,
+ const FIRSTPASS_STATS *this_frame,
+ double mean_mod_score, double av_err) {
+ double modified_score =
+ av_err * pow(this_frame->coded_error * this_frame->weight /
+ DOUBLE_DIVIDE_CHECK(av_err),
+ oxcf->two_pass_vbrbias / 100.0);
+
+ const double min_score = (double)(oxcf->two_pass_vbrmin_section) / 100.0;
+ const double max_score = (double)(oxcf->two_pass_vbrmax_section) / 100.0;
+
+ // Correction for active area. Frames with a reduced active area
+ // (eg due to formatting bars) have a higher error per mb for the
+ // remaining active MBs. The correction here assumes that coding
+ // 0.5N blocks of complexity 2X is a little easier than coding N
+ // blocks of complexity X.
+ modified_score *=
+ pow(calculate_active_area(frame_info, this_frame), ACT_AREA_CORRECTION);
+
+ // Normalize to a midpoint score.
+ modified_score /= DOUBLE_DIVIDE_CHECK(mean_mod_score);
+ return fclamp(modified_score, min_score, max_score);
+}
+
+static double calculate_norm_frame_score(const VP9_COMP *cpi,
+ const TWO_PASS *twopass,
+ const VP9EncoderConfig *oxcf,
+ const FIRSTPASS_STATS *this_frame,
+ const double av_err) {
+ return calc_norm_frame_score(oxcf, &cpi->frame_info, this_frame,
+ twopass->mean_mod_score, av_err);
+}
+
+// This function returns the maximum target rate per frame.
+static int frame_max_bits(const RATE_CONTROL *rc,
+ const VP9EncoderConfig *oxcf) {
+ int64_t max_bits = ((int64_t)rc->avg_frame_bandwidth *
+ (int64_t)oxcf->two_pass_vbrmax_section) /
+ 100;
+ if (max_bits < 0)
+ max_bits = 0;
+ else if (max_bits > rc->max_frame_bandwidth)
+ max_bits = rc->max_frame_bandwidth;
+
+ return (int)max_bits;
+}
+
+void vp9_init_first_pass(VP9_COMP *cpi) {
+ zero_stats(&cpi->twopass.total_stats);
+}
+
+void vp9_end_first_pass(VP9_COMP *cpi) {
+ output_stats(&cpi->twopass.total_stats);
+ cpi->twopass.first_pass_done = 1;
+ vpx_free(cpi->twopass.fp_mb_float_stats);
+ cpi->twopass.fp_mb_float_stats = NULL;
+}
+
+static vpx_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) {
+ switch (bsize) {
+ case BLOCK_8X8: return vpx_mse8x8;
+ case BLOCK_16X8: return vpx_mse16x8;
+ case BLOCK_8X16: return vpx_mse8x16;
+ default: return vpx_mse16x16;
+ }
+}
+
+static unsigned int get_prediction_error(BLOCK_SIZE bsize,
+ const struct buf_2d *src,
+ const struct buf_2d *ref) {
+ unsigned int sse;
+ const vpx_variance_fn_t fn = get_block_variance_fn(bsize);
+ fn(src->buf, src->stride, ref->buf, ref->stride, &sse);
+ return sse;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static vpx_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize,
+ int bd) {
+ switch (bd) {
+ default:
+ switch (bsize) {
+ case BLOCK_8X8: return vpx_highbd_8_mse8x8;
+ case BLOCK_16X8: return vpx_highbd_8_mse16x8;
+ case BLOCK_8X16: return vpx_highbd_8_mse8x16;
+ default: return vpx_highbd_8_mse16x16;
+ }
+ break;
+ case 10:
+ switch (bsize) {
+ case BLOCK_8X8: return vpx_highbd_10_mse8x8;
+ case BLOCK_16X8: return vpx_highbd_10_mse16x8;
+ case BLOCK_8X16: return vpx_highbd_10_mse8x16;
+ default: return vpx_highbd_10_mse16x16;
+ }
+ break;
+ case 12:
+ switch (bsize) {
+ case BLOCK_8X8: return vpx_highbd_12_mse8x8;
+ case BLOCK_16X8: return vpx_highbd_12_mse16x8;
+ case BLOCK_8X16: return vpx_highbd_12_mse8x16;
+ default: return vpx_highbd_12_mse16x16;
+ }
+ break;
+ }
+}
+
+static unsigned int highbd_get_prediction_error(BLOCK_SIZE bsize,
+ const struct buf_2d *src,
+ const struct buf_2d *ref,
+ int bd) {
+ unsigned int sse;
+ const vpx_variance_fn_t fn = highbd_get_block_variance_fn(bsize, bd);
+ fn(src->buf, src->stride, ref->buf, ref->stride, &sse);
+ return sse;
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+// Refine the motion search range according to the frame dimension
+// for first pass test.
+static int get_search_range(const VP9_COMP *cpi) {
+ int sr = 0;
+ const int dim = VPXMIN(cpi->initial_width, cpi->initial_height);
+
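+  // Widen the range until the smaller frame dimension, scaled by 2^sr,
+  // reaches MAX_FULL_PEL_VAL; the caller adds sr to the diamond search
+  // step_param.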
+ while ((dim << sr) < MAX_FULL_PEL_VAL) ++sr;
+ return sr;
+}
+
+// Reduce limits to keep the motion search within MV_MAX of ref_mv. Not doing
+// this can be problematic for big videos (8K) and may cause assert failure
+// (or memory violation) in mv_cost. Limits are only modified if they would
+// be non-empty. Returns 1 if limits are non-empty.
+static int intersect_limits_with_mv_max(MvLimits *mv_limits, const MV *ref_mv) {
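+  // ref_mv and MV_MAX are in 1/8-pel units while the limits are in full
+  // pels; the +7 / -1 terms round the converted bounds inwards so that
+  // every full-pel candidate stays within MV_MAX of ref_mv.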
+ const int row_min =
+ VPXMAX(mv_limits->row_min, (ref_mv->row + 7 - MV_MAX) >> 3);
+ const int row_max =
+ VPXMIN(mv_limits->row_max, (ref_mv->row - 1 + MV_MAX) >> 3);
+ const int col_min =
+ VPXMAX(mv_limits->col_min, (ref_mv->col + 7 - MV_MAX) >> 3);
+ const int col_max =
+ VPXMIN(mv_limits->col_max, (ref_mv->col - 1 + MV_MAX) >> 3);
+ if (row_min > row_max || col_min > col_max) {
+ return 0;
+ }
+ mv_limits->row_min = row_min;
+ mv_limits->row_max = row_max;
+ mv_limits->col_min = col_min;
+ mv_limits->col_max = col_max;
+ return 1;
+}
+
+static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
+ const MV *ref_mv, MV *best_mv,
+ int *best_motion_err) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MV tmp_mv = { 0, 0 };
+ MV ref_mv_full = { ref_mv->row >> 3, ref_mv->col >> 3 };
+ int num00, tmp_err, n;
+ const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+ vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[bsize];
+ const int new_mv_mode_penalty = NEW_MV_MODE_PENALTY;
+ MV center_mv_full = ref_mv_full;
+ unsigned int start_mv_sad;
+ vp9_sad_fn_ptr_t sad_fn_ptr;
+
+ int step_param = 3;
+ int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
+ const int sr = get_search_range(cpi);
+ const MvLimits tmp_mv_limits = x->mv_limits;
+ step_param += sr;
+ further_steps -= sr;
+
+ if (!intersect_limits_with_mv_max(&x->mv_limits, ref_mv)) {
+ return;
+ }
+
+ // Override the default variance function to use MSE.
+ v_fn_ptr.vf = get_block_variance_fn(bsize);
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ v_fn_ptr.vf = highbd_get_block_variance_fn(bsize, xd->bd);
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ // Calculate SAD of the start mv
+ clamp_mv(&ref_mv_full, x->mv_limits.col_min, x->mv_limits.col_max,
+ x->mv_limits.row_min, x->mv_limits.row_max);
+ start_mv_sad = get_start_mv_sad(x, &ref_mv_full, &center_mv_full,
+ cpi->fn_ptr[bsize].sdf, x->sadperbit16);
+ sad_fn_ptr.sdf = cpi->fn_ptr[bsize].sdf;
+ sad_fn_ptr.sdx4df = cpi->fn_ptr[bsize].sdx4df;
+
+ // Center the initial step/diamond search on best mv.
+ tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, start_mv_sad,
+ &tmp_mv, step_param, x->sadperbit16, &num00,
+ &sad_fn_ptr, ref_mv);
+ if (tmp_err < INT_MAX)
+ tmp_err = vp9_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1);
+ if (tmp_err < INT_MAX - new_mv_mode_penalty) tmp_err += new_mv_mode_penalty;
+
+ if (tmp_err < *best_motion_err) {
+ *best_motion_err = tmp_err;
+ *best_mv = tmp_mv;
+ }
+
+ // Carry out further step/diamond searches as necessary.
+ n = num00;
+ num00 = 0;
+
+ while (n < further_steps) {
+ ++n;
+
+ if (num00) {
+ --num00;
+ } else {
+ tmp_err = cpi->diamond_search_sad(
+ x, &cpi->ss_cfg, &ref_mv_full, start_mv_sad, &tmp_mv, step_param + n,
+ x->sadperbit16, &num00, &sad_fn_ptr, ref_mv);
+ if (tmp_err < INT_MAX)
+ tmp_err = vp9_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1);
+ if (tmp_err < INT_MAX - new_mv_mode_penalty)
+ tmp_err += new_mv_mode_penalty;
+
+ if (tmp_err < *best_motion_err) {
+ *best_motion_err = tmp_err;
+ *best_mv = tmp_mv;
+ }
+ }
+ }
+ x->mv_limits = tmp_mv_limits;
+}
+
+static BLOCK_SIZE get_bsize(const VP9_COMMON *cm, int mb_row, int mb_col) {
+ if (2 * mb_col + 1 < cm->mi_cols) {
+ return 2 * mb_row + 1 < cm->mi_rows ? BLOCK_16X16 : BLOCK_16X8;
+ } else {
+ return 2 * mb_row + 1 < cm->mi_rows ? BLOCK_8X16 : BLOCK_8X8;
+ }
+}
+
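+// Returns the smallest qindex whose real Q value is at least FIRST_PASS_Q,
+// falling back to the largest qindex if none qualifies.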
+static int find_fp_qindex(vpx_bit_depth_t bit_depth) {
+ int i;
+
+ for (i = 0; i < QINDEX_RANGE; ++i)
+ if (vp9_convert_qindex_to_q(i, bit_depth) >= FIRST_PASS_Q) break;
+
+ if (i == QINDEX_RANGE) i--;
+
+ return i;
+}
+
+static void set_first_pass_params(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ if (!cpi->refresh_alt_ref_frame &&
+ (cm->current_video_frame == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY))) {
+ cm->frame_type = KEY_FRAME;
+ } else {
+ cm->frame_type = INTER_FRAME;
+ }
+ // Do not use periodic key frames.
+ cpi->rc.frames_to_key = INT_MAX;
+}
+
+// Scale an sse threshold to account for 8/10/12 bit.
+static int scale_sse_threshold(VP9_COMMON *cm, int thresh) {
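+  // SSE scales with the square of the per-sample range, hence the factors
+  // of 16 (<< 4) for 10 bit and 256 (<< 8) for 12 bit input.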
+ int ret_val = thresh;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cm->use_highbitdepth) {
+ switch (cm->bit_depth) {
+ case VPX_BITS_8: ret_val = thresh; break;
+ case VPX_BITS_10: ret_val = thresh << 4; break;
+ default:
+ assert(cm->bit_depth == VPX_BITS_12);
+ ret_val = thresh << 8;
+ break;
+ }
+ }
+#else
+ (void)cm;
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ return ret_val;
+}
+
+// This threshold is used to track blocks where, to all intents and purposes,
+// the intra prediction error is 0. Though the metric we test against is
+// technically an SSE, we are mainly interested in blocks where all the pixels
+// in the 8 bit domain have an error of <= 1 (where error = sse), so a
+// linear scaling for 10 and 12 bit gives similar results.
+#define UL_INTRA_THRESH 50
+static int get_ul_intra_threshold(VP9_COMMON *cm) {
+ int ret_val = UL_INTRA_THRESH;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cm->use_highbitdepth) {
+ switch (cm->bit_depth) {
+ case VPX_BITS_8: ret_val = UL_INTRA_THRESH; break;
+ case VPX_BITS_10: ret_val = UL_INTRA_THRESH << 2; break;
+ default:
+ assert(cm->bit_depth == VPX_BITS_12);
+ ret_val = UL_INTRA_THRESH << 4;
+ break;
+ }
+ }
+#else
+ (void)cm;
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ return ret_val;
+}
+
+#define SMOOTH_INTRA_THRESH 4000
+static int get_smooth_intra_threshold(VP9_COMMON *cm) {
+ int ret_val = SMOOTH_INTRA_THRESH;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cm->use_highbitdepth) {
+ switch (cm->bit_depth) {
+ case VPX_BITS_8: ret_val = SMOOTH_INTRA_THRESH; break;
+ case VPX_BITS_10: ret_val = SMOOTH_INTRA_THRESH << 4; break;
+ default:
+ assert(cm->bit_depth == VPX_BITS_12);
+ ret_val = SMOOTH_INTRA_THRESH << 8;
+ break;
+ }
+ }
+#else
+ (void)cm;
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ return ret_val;
+}
+
+#define FP_DN_THRESH 8
+#define FP_MAX_DN_THRESH 24
+#define KERNEL_SIZE 3
+
+// Baseline kernel weights for the first pass noise metric: a 3x3 filter
+// with corner weight 1, edge weight 2 and centre weight 4.
+static uint8_t fp_dn_kernal_3[KERNEL_SIZE * KERNEL_SIZE] = { 1, 2, 1, 2, 4,
+ 2, 1, 2, 1 };
+
+// Estimate noise at a single point based on the impact of a spatial kernel
+// on the point value.
+static int fp_estimate_point_noise(uint8_t *src_ptr, const int stride) {
+ int sum_weight = 0;
+ int sum_val = 0;
+ int i, j;
+ int max_diff = 0;
+ int diff;
+ int dn_diff;
+ uint8_t *tmp_ptr;
+ uint8_t *kernal_ptr;
+ uint8_t dn_val;
+ uint8_t centre_val = *src_ptr;
+
+ kernal_ptr = fp_dn_kernal_3;
+
+  // Apply the kernel
+ tmp_ptr = src_ptr - stride - 1;
+ for (i = 0; i < KERNEL_SIZE; ++i) {
+ for (j = 0; j < KERNEL_SIZE; ++j) {
+ diff = abs((int)centre_val - (int)tmp_ptr[j]);
+ max_diff = VPXMAX(max_diff, diff);
+ if (diff <= FP_DN_THRESH) {
+ sum_weight += *kernal_ptr;
+ sum_val += (int)tmp_ptr[j] * (int)*kernal_ptr;
+ }
+ ++kernal_ptr;
+ }
+ tmp_ptr += stride;
+ }
+
+ if (max_diff < FP_MAX_DN_THRESH)
+ // Update the source value with the new filtered value
+ dn_val = (sum_val + (sum_weight >> 1)) / sum_weight;
+ else
+ dn_val = *src_ptr;
+
+ // return the noise energy as the square of the difference between the
+ // denoised and raw value.
+ dn_diff = (int)*src_ptr - (int)dn_val;
+ return dn_diff * dn_diff;
+}
+#if CONFIG_VP9_HIGHBITDEPTH
+static int fp_highbd_estimate_point_noise(uint8_t *src_ptr, const int stride) {
+ int sum_weight = 0;
+ int sum_val = 0;
+ int i, j;
+ int max_diff = 0;
+ int diff;
+ int dn_diff;
+ uint8_t *tmp_ptr;
+ uint16_t *tmp_ptr16;
+ uint8_t *kernal_ptr;
+ uint16_t dn_val;
+ uint16_t centre_val = *CONVERT_TO_SHORTPTR(src_ptr);
+
+ kernal_ptr = fp_dn_kernal_3;
+
+  // Apply the kernel
+ tmp_ptr = src_ptr - stride - 1;
+ for (i = 0; i < KERNEL_SIZE; ++i) {
+ tmp_ptr16 = CONVERT_TO_SHORTPTR(tmp_ptr);
+ for (j = 0; j < KERNEL_SIZE; ++j) {
+ diff = abs((int)centre_val - (int)tmp_ptr16[j]);
+ max_diff = VPXMAX(max_diff, diff);
+ if (diff <= FP_DN_THRESH) {
+ sum_weight += *kernal_ptr;
+ sum_val += (int)tmp_ptr16[j] * (int)*kernal_ptr;
+ }
+ ++kernal_ptr;
+ }
+ tmp_ptr += stride;
+ }
+
+ if (max_diff < FP_MAX_DN_THRESH)
+ // Update the source value with the new filtered value
+ dn_val = (sum_val + (sum_weight >> 1)) / sum_weight;
+ else
+ dn_val = *CONVERT_TO_SHORTPTR(src_ptr);
+
+ // return the noise energy as the square of the difference between the
+ // denoised and raw value.
+ dn_diff = (int)(*CONVERT_TO_SHORTPTR(src_ptr)) - (int)dn_val;
+ return dn_diff * dn_diff;
+}
+#endif
+
+// Estimate noise for a block.
+static int fp_estimate_block_noise(MACROBLOCK *x, BLOCK_SIZE bsize) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ MACROBLOCKD *xd = &x->e_mbd;
+#endif
+ uint8_t *src_ptr = &x->plane[0].src.buf[0];
+ const int width = num_4x4_blocks_wide_lookup[bsize] * 4;
+ const int height = num_4x4_blocks_high_lookup[bsize] * 4;
+ int w, h;
+ int stride = x->plane[0].src.stride;
+ int block_noise = 0;
+
+ // Sampled points to reduce cost overhead.
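+  // The loop counters step by 2, so only a quarter of the block's points are
+  // visited; the accumulated energy is scaled back up by << 2 on return.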
+ for (h = 0; h < height; h += 2) {
+ for (w = 0; w < width; w += 2) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ block_noise += fp_highbd_estimate_point_noise(src_ptr, stride);
+ else
+ block_noise += fp_estimate_point_noise(src_ptr, stride);
+#else
+ block_noise += fp_estimate_point_noise(src_ptr, stride);
+#endif
+ ++src_ptr;
+ }
+ src_ptr += (stride - width);
+ }
+ return block_noise << 2; // Scale << 2 to account for sampling.
+}
+
+// This function is called to test the functionality of row-based
+// multi-threading in unit tests for bit-exactness.
+static void accumulate_floating_point_stats(VP9_COMP *cpi,
+ TileDataEnc *first_tile_col) {
+ VP9_COMMON *const cm = &cpi->common;
+ int mb_row, mb_col;
+ first_tile_col->fp_data.intra_factor = 0;
+ first_tile_col->fp_data.brightness_factor = 0;
+ first_tile_col->fp_data.neutral_count = 0;
+ for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) {
+ for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) {
+ const int mb_index = mb_row * cm->mb_cols + mb_col;
+ first_tile_col->fp_data.intra_factor +=
+ cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_intra_factor;
+ first_tile_col->fp_data.brightness_factor +=
+ cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_brightness_factor;
+ first_tile_col->fp_data.neutral_count +=
+ cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_neutral_count;
+ }
+ }
+}
+
+static void first_pass_stat_calc(VP9_COMP *cpi, FIRSTPASS_STATS *fps,
+ FIRSTPASS_DATA *fp_acc_data) {
+ VP9_COMMON *const cm = &cpi->common;
+  // The minimum error here ensures some bit allocation to frames even
+ // in static regions. The allocation per MB declines for larger formats
+ // where the typical "real" energy per MB also falls.
+ // Initial estimate here uses sqrt(mbs) to define the min_err, where the
+ // number of mbs is proportional to the image area.
+ const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs
+ : cpi->common.MBs;
+ const double min_err = 200 * sqrt(num_mbs);
+
+ // Clamp the image start to rows/2. This number of rows is discarded top
+ // and bottom as dead data so rows / 2 means the frame is blank.
+ if ((fp_acc_data->image_data_start_row > cm->mb_rows / 2) ||
+ (fp_acc_data->image_data_start_row == INVALID_ROW)) {
+ fp_acc_data->image_data_start_row = cm->mb_rows / 2;
+ }
+ // Exclude any image dead zone
+ if (fp_acc_data->image_data_start_row > 0) {
+ fp_acc_data->intra_skip_count =
+ VPXMAX(0, fp_acc_data->intra_skip_count -
+ (fp_acc_data->image_data_start_row * cm->mb_cols * 2));
+ }
+
+ fp_acc_data->intra_factor = fp_acc_data->intra_factor / (double)num_mbs;
+ fp_acc_data->brightness_factor =
+ fp_acc_data->brightness_factor / (double)num_mbs;
+ fps->weight = fp_acc_data->intra_factor * fp_acc_data->brightness_factor;
+
+ fps->frame = cm->current_video_frame;
+ fps->spatial_layer_id = cpi->svc.spatial_layer_id;
+
+ fps->coded_error =
+ ((double)(fp_acc_data->coded_error >> 8) + min_err) / num_mbs;
+ fps->sr_coded_error =
+ ((double)(fp_acc_data->sr_coded_error >> 8) + min_err) / num_mbs;
+ fps->intra_error =
+ ((double)(fp_acc_data->intra_error >> 8) + min_err) / num_mbs;
+
+ fps->frame_noise_energy =
+ (double)(fp_acc_data->frame_noise_energy) / (double)num_mbs;
+ fps->count = 1.0;
+ fps->pcnt_inter = (double)(fp_acc_data->intercount) / num_mbs;
+ fps->pcnt_second_ref = (double)(fp_acc_data->second_ref_count) / num_mbs;
+ fps->pcnt_neutral = (double)(fp_acc_data->neutral_count) / num_mbs;
+ fps->pcnt_intra_low = (double)(fp_acc_data->intra_count_low) / num_mbs;
+ fps->pcnt_intra_high = (double)(fp_acc_data->intra_count_high) / num_mbs;
+ fps->intra_skip_pct = (double)(fp_acc_data->intra_skip_count) / num_mbs;
+ fps->intra_smooth_pct = (double)(fp_acc_data->intra_smooth_count) / num_mbs;
+ fps->inactive_zone_rows = (double)(fp_acc_data->image_data_start_row);
+ // Currently set to 0 as most issues relate to letter boxing.
+ fps->inactive_zone_cols = (double)0;
+
+ if (fp_acc_data->mvcount > 0) {
+ fps->MVr = (double)(fp_acc_data->sum_mvr) / fp_acc_data->mvcount;
+ fps->mvr_abs = (double)(fp_acc_data->sum_mvr_abs) / fp_acc_data->mvcount;
+ fps->MVc = (double)(fp_acc_data->sum_mvc) / fp_acc_data->mvcount;
+ fps->mvc_abs = (double)(fp_acc_data->sum_mvc_abs) / fp_acc_data->mvcount;
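+    // MV row/column variance computed as (sum_sq - sum^2 / n) / n, i.e.
+    // E[mv^2] - E[mv]^2 over the counted motion vectors.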
+ fps->MVrv = ((double)(fp_acc_data->sum_mvrs) -
+ ((double)(fp_acc_data->sum_mvr) * (fp_acc_data->sum_mvr) /
+ fp_acc_data->mvcount)) /
+ fp_acc_data->mvcount;
+ fps->MVcv = ((double)(fp_acc_data->sum_mvcs) -
+ ((double)(fp_acc_data->sum_mvc) * (fp_acc_data->sum_mvc) /
+ fp_acc_data->mvcount)) /
+ fp_acc_data->mvcount;
+ fps->mv_in_out_count =
+ (double)(fp_acc_data->sum_in_vectors) / (fp_acc_data->mvcount * 2);
+ fps->pcnt_motion = (double)(fp_acc_data->mvcount) / num_mbs;
+ } else {
+ fps->MVr = 0.0;
+ fps->mvr_abs = 0.0;
+ fps->MVc = 0.0;
+ fps->mvc_abs = 0.0;
+ fps->MVrv = 0.0;
+ fps->MVcv = 0.0;
+ fps->mv_in_out_count = 0.0;
+ fps->pcnt_motion = 0.0;
+ }
+}
+
+static void accumulate_fp_mb_row_stat(TileDataEnc *this_tile,
+ FIRSTPASS_DATA *fp_acc_data) {
+ this_tile->fp_data.intra_factor += fp_acc_data->intra_factor;
+ this_tile->fp_data.brightness_factor += fp_acc_data->brightness_factor;
+ this_tile->fp_data.coded_error += fp_acc_data->coded_error;
+ this_tile->fp_data.sr_coded_error += fp_acc_data->sr_coded_error;
+ this_tile->fp_data.frame_noise_energy += fp_acc_data->frame_noise_energy;
+ this_tile->fp_data.intra_error += fp_acc_data->intra_error;
+ this_tile->fp_data.intercount += fp_acc_data->intercount;
+ this_tile->fp_data.second_ref_count += fp_acc_data->second_ref_count;
+ this_tile->fp_data.neutral_count += fp_acc_data->neutral_count;
+ this_tile->fp_data.intra_count_low += fp_acc_data->intra_count_low;
+ this_tile->fp_data.intra_count_high += fp_acc_data->intra_count_high;
+ this_tile->fp_data.intra_skip_count += fp_acc_data->intra_skip_count;
+ this_tile->fp_data.mvcount += fp_acc_data->mvcount;
+ this_tile->fp_data.sum_mvr += fp_acc_data->sum_mvr;
+ this_tile->fp_data.sum_mvr_abs += fp_acc_data->sum_mvr_abs;
+ this_tile->fp_data.sum_mvc += fp_acc_data->sum_mvc;
+ this_tile->fp_data.sum_mvc_abs += fp_acc_data->sum_mvc_abs;
+ this_tile->fp_data.sum_mvrs += fp_acc_data->sum_mvrs;
+ this_tile->fp_data.sum_mvcs += fp_acc_data->sum_mvcs;
+ this_tile->fp_data.sum_in_vectors += fp_acc_data->sum_in_vectors;
+ this_tile->fp_data.intra_smooth_count += fp_acc_data->intra_smooth_count;
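+  // Merge the image data start rows, preferring any valid row over
+  // INVALID_ROW and otherwise keeping the earlier (smaller) row.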
+ this_tile->fp_data.image_data_start_row =
+ VPXMIN(this_tile->fp_data.image_data_start_row,
+ fp_acc_data->image_data_start_row) == INVALID_ROW
+ ? VPXMAX(this_tile->fp_data.image_data_start_row,
+ fp_acc_data->image_data_start_row)
+ : VPXMIN(this_tile->fp_data.image_data_start_row,
+ fp_acc_data->image_data_start_row);
+}
+
+#if CONFIG_RATE_CTRL
+static void store_fp_motion_vector(VP9_COMP *cpi, const MV *mv,
+ const int mb_row, const int mb_col,
+ MV_REFERENCE_FRAME frame_type,
+ const int mv_idx) {
+ VP9_COMMON *const cm = &cpi->common;
+ const int mb_index = mb_row * cm->mb_cols + mb_col;
+ MOTION_VECTOR_INFO *this_motion_vector_info =
+ &cpi->fp_motion_vector_info[mb_index];
+ this_motion_vector_info->ref_frame[mv_idx] = frame_type;
+ if (frame_type != INTRA_FRAME) {
+ this_motion_vector_info->mv[mv_idx].as_mv = *mv;
+ }
+}
+#endif // CONFIG_RATE_CTRL
+
+#define NZ_MOTION_PENALTY 128
+#define INTRA_MODE_PENALTY 1024
+void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td,
+ FIRSTPASS_DATA *fp_acc_data,
+ TileDataEnc *tile_data, MV *best_ref_mv,
+ int mb_row) {
+ int mb_col;
+ MACROBLOCK *const x = &td->mb;
+ VP9_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ TileInfo tile = tile_data->tile_info;
+ const int mb_col_start = ROUND_POWER_OF_TWO(tile.mi_col_start, 1);
+ const int mb_col_end = ROUND_POWER_OF_TWO(tile.mi_col_end, 1);
+ struct macroblock_plane *const p = x->plane;
+ struct macroblockd_plane *const pd = xd->plane;
+ const PICK_MODE_CONTEXT *ctx = &td->pc_root->none;
+ int i, c;
+ int num_mb_cols = get_num_cols(tile_data->tile_info, 1);
+
+ int recon_yoffset, recon_uvoffset;
+ const int intrapenalty = INTRA_MODE_PENALTY;
+ const MV zero_mv = { 0, 0 };
+ int recon_y_stride, recon_uv_stride, uv_mb_height;
+
+ YV12_BUFFER_CONFIG *const lst_yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
+ YV12_BUFFER_CONFIG *gld_yv12 = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+ YV12_BUFFER_CONFIG *const new_yv12 = get_frame_new_buffer(cm);
+ const YV12_BUFFER_CONFIG *first_ref_buf = lst_yv12;
+
+ MODE_INFO mi_above, mi_left;
+
+ double mb_intra_factor;
+ double mb_brightness_factor;
+ double mb_neutral_count;
+ int scaled_low_intra_thresh = scale_sse_threshold(cm, LOW_I_THRESH);
+
+ // First pass code requires valid last and new frame buffers.
+ assert(new_yv12 != NULL);
+ assert(frame_is_intra_only(cm) || (lst_yv12 != NULL));
+
+ xd->mi = cm->mi_grid_visible + xd->mi_stride * (mb_row << 1) + mb_col_start;
+ xd->mi[0] = cm->mi + xd->mi_stride * (mb_row << 1) + mb_col_start;
+
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ p[i].coeff = ctx->coeff_pbuf[i][1];
+ p[i].qcoeff = ctx->qcoeff_pbuf[i][1];
+ pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1];
+ p[i].eobs = ctx->eobs_pbuf[i][1];
+ }
+
+ recon_y_stride = new_yv12->y_stride;
+ recon_uv_stride = new_yv12->uv_stride;
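+  // 16 luma rows per MB; 8 chroma rows when the chroma plane is vertically
+  // subsampled (uv_height < y_height).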
+ uv_mb_height = 16 >> (new_yv12->y_height > new_yv12->uv_height);
+
+ // Reset above block coeffs.
+ recon_yoffset = (mb_row * recon_y_stride * 16) + mb_col_start * 16;
+ recon_uvoffset =
+ (mb_row * recon_uv_stride * uv_mb_height) + mb_col_start * uv_mb_height;
+
+ // Set up limit values for motion vectors to prevent them extending
+ // outside the UMV borders.
+ x->mv_limits.row_min = -((mb_row * 16) + BORDER_MV_PIXELS_B16);
+ x->mv_limits.row_max =
+ ((cm->mb_rows - 1 - mb_row) * 16) + BORDER_MV_PIXELS_B16;
+
+ for (mb_col = mb_col_start, c = 0; mb_col < mb_col_end; ++mb_col, c++) {
+ int this_error;
+ int this_intra_error;
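+    // use_dc_pred is true only for MBs on the first row or first column,
+    // excluding the top-left MB; it selects a larger transform size below.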
+ const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
+ const BLOCK_SIZE bsize = get_bsize(cm, mb_row, mb_col);
+ double log_intra;
+ int level_sample;
+ const int mb_index = mb_row * cm->mb_cols + mb_col;
+
+ (*(cpi->row_mt_sync_read_ptr))(&tile_data->row_mt_sync, mb_row, c);
+
+ // Adjust to the next column of MBs.
+ x->plane[0].src.buf = cpi->Source->y_buffer +
+ mb_row * 16 * x->plane[0].src.stride + mb_col * 16;
+ x->plane[1].src.buf = cpi->Source->u_buffer +
+ mb_row * uv_mb_height * x->plane[1].src.stride +
+ mb_col * uv_mb_height;
+ x->plane[2].src.buf = cpi->Source->v_buffer +
+ mb_row * uv_mb_height * x->plane[1].src.stride +
+ mb_col * uv_mb_height;
+
+ vpx_clear_system_state();
+
+ xd->plane[0].dst.buf = new_yv12->y_buffer + recon_yoffset;
+ xd->plane[1].dst.buf = new_yv12->u_buffer + recon_uvoffset;
+ xd->plane[2].dst.buf = new_yv12->v_buffer + recon_uvoffset;
+ xd->mi[0]->sb_type = bsize;
+ xd->mi[0]->ref_frame[0] = INTRA_FRAME;
+ set_mi_row_col(xd, &tile, mb_row << 1, num_8x8_blocks_high_lookup[bsize],
+ mb_col << 1, num_8x8_blocks_wide_lookup[bsize], cm->mi_rows,
+ cm->mi_cols);
+ // Are edges available for intra prediction?
+ // Since the firstpass does not populate the mi_grid_visible,
+ // above_mi/left_mi must be overwritten with a nonzero value when edges
+ // are available. Required by vp9_predict_intra_block().
+ xd->above_mi = (mb_row != 0) ? &mi_above : NULL;
+ xd->left_mi = ((mb_col << 1) > tile.mi_col_start) ? &mi_left : NULL;
+
+ // Do intra 16x16 prediction.
+ x->skip_encode = 0;
+ x->fp_src_pred = 0;
+ // Do intra prediction based on source pixels for tile boundaries
+ if (mb_col == mb_col_start && mb_col != 0) {
+ xd->left_mi = &mi_left;
+ x->fp_src_pred = 1;
+ }
+ xd->mi[0]->mode = DC_PRED;
+ xd->mi[0]->tx_size =
+ use_dc_pred ? (bsize >= BLOCK_16X16 ? TX_16X16 : TX_8X8) : TX_4X4;
+ // Fix - zero the 16x16 block first. This ensures correct this_error for
+ // block sizes smaller than 16x16.
+ vp9_zero_array(x->plane[0].src_diff, 256);
+ vp9_encode_intra_block_plane(x, bsize, 0, 0);
+ this_error = vpx_get_mb_ss(x->plane[0].src_diff);
+ this_intra_error = this_error;
+
+ // Keep a record of blocks that have very low intra error residual
+ // (i.e. are in effect completely flat and untextured in the intra
+ // domain). In natural videos this is uncommon, but it is much more
+ // common in animations, graphics and screen content, so may be used
+ // as a signal to detect these types of content.
+ if (this_error < get_ul_intra_threshold(cm)) {
+ ++(fp_acc_data->intra_skip_count);
+ } else if ((mb_col > 0) &&
+ (fp_acc_data->image_data_start_row == INVALID_ROW)) {
+ fp_acc_data->image_data_start_row = mb_row;
+ }
+
+ // Blocks that are mainly smooth in the intra domain.
+ // Some special accounting for CQ but also these are better for testing
+ // noise levels.
+ if (this_error < get_smooth_intra_threshold(cm)) {
+ ++(fp_acc_data->intra_smooth_count);
+ }
+
+ // Special case noise measurement for first frame.
+ if (cm->current_video_frame == 0) {
+ if (this_intra_error < scale_sse_threshold(cm, LOW_I_THRESH)) {
+ fp_acc_data->frame_noise_energy += fp_estimate_block_noise(x, bsize);
+ } else {
+ fp_acc_data->frame_noise_energy += (int64_t)SECTION_NOISE_DEF;
+ }
+ }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cm->use_highbitdepth) {
+ switch (cm->bit_depth) {
+ case VPX_BITS_8: break;
+ case VPX_BITS_10: this_error >>= 4; break;
+ default:
+ assert(cm->bit_depth == VPX_BITS_12);
+ this_error >>= 8;
+ break;
+ }
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ vpx_clear_system_state();
+ log_intra = log(this_error + 1.0);
+ if (log_intra < 10.0) {
+ mb_intra_factor = 1.0 + ((10.0 - log_intra) * 0.05);
+ fp_acc_data->intra_factor += mb_intra_factor;
+ if (cpi->row_mt_bit_exact)
+ cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_intra_factor =
+ mb_intra_factor;
+ } else {
+ fp_acc_data->intra_factor += 1.0;
+ if (cpi->row_mt_bit_exact)
+ cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_intra_factor = 1.0;
+ }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cm->use_highbitdepth)
+ level_sample = CONVERT_TO_SHORTPTR(x->plane[0].src.buf)[0];
+ else
+ level_sample = x->plane[0].src.buf[0];
+#else
+ level_sample = x->plane[0].src.buf[0];
+#endif
+ if ((level_sample < DARK_THRESH) && (log_intra < 9.0)) {
+ mb_brightness_factor = 1.0 + (0.01 * (DARK_THRESH - level_sample));
+ fp_acc_data->brightness_factor += mb_brightness_factor;
+ if (cpi->row_mt_bit_exact)
+ cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_brightness_factor =
+ mb_brightness_factor;
+ } else {
+ fp_acc_data->brightness_factor += 1.0;
+ if (cpi->row_mt_bit_exact)
+ cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_brightness_factor =
+ 1.0;
+ }
+
+ // Intrapenalty below deals with situations where the intra and inter
+ // error scores are very low (e.g. a plain black frame).
+    // We do not have special cases in first pass for 0,0 and nearest etc., so
+ // all inter modes carry an overhead cost estimate for the mv.
+ // When the error score is very low this causes us to pick all or lots of
+ // INTRA modes and throw lots of key frames.
+ // This penalty adds a cost matching that of a 0,0 mv to the intra case.
+ this_error += intrapenalty;
+
+ // Accumulate the intra error.
+ fp_acc_data->intra_error += (int64_t)this_error;
+
+ // Set up limit values for motion vectors to prevent them extending
+ // outside the UMV borders.
+ x->mv_limits.col_min = -((mb_col * 16) + BORDER_MV_PIXELS_B16);
+ x->mv_limits.col_max =
+ ((cm->mb_cols - 1 - mb_col) * 16) + BORDER_MV_PIXELS_B16;
+
+ // Other than for intra-only frame do a motion search.
+ if (!frame_is_intra_only(cm)) {
+ int tmp_err, motion_error, this_motion_error, raw_motion_error;
+ // Assume 0,0 motion with no mv overhead.
+ MV mv = { 0, 0 }, tmp_mv = { 0, 0 };
+ struct buf_2d unscaled_last_source_buf_2d;
+ vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[bsize];
+
+#if CONFIG_RATE_CTRL
+ if (cpi->oxcf.use_simple_encode_api) {
+ // Store zero mv as default
+ store_fp_motion_vector(cpi, &mv, mb_row, mb_col, LAST_FRAME, 0);
+ }
+#endif  // CONFIG_RATE_CTRL
+
+ xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ motion_error = highbd_get_prediction_error(
+ bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd);
+ this_motion_error = highbd_get_prediction_error(
+ bsize, &x->plane[0].src, &xd->plane[0].pre[0], 8);
+ } else {
+ motion_error =
+ get_prediction_error(bsize, &x->plane[0].src, &xd->plane[0].pre[0]);
+ this_motion_error = motion_error;
+ }
+#else
+ motion_error =
+ get_prediction_error(bsize, &x->plane[0].src, &xd->plane[0].pre[0]);
+ this_motion_error = motion_error;
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ // Compute the motion error of the 0,0 motion using the last source
+ // frame as the reference. Skip the further motion search on
+ // reconstructed frame if this error is very small.
+ unscaled_last_source_buf_2d.buf =
+ cpi->unscaled_last_source->y_buffer + recon_yoffset;
+ unscaled_last_source_buf_2d.stride = cpi->unscaled_last_source->y_stride;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ raw_motion_error = highbd_get_prediction_error(
+ bsize, &x->plane[0].src, &unscaled_last_source_buf_2d, xd->bd);
+ } else {
+ raw_motion_error = get_prediction_error(bsize, &x->plane[0].src,
+ &unscaled_last_source_buf_2d);
+ }
+#else
+ raw_motion_error = get_prediction_error(bsize, &x->plane[0].src,
+ &unscaled_last_source_buf_2d);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ if (raw_motion_error > NZ_MOTION_PENALTY) {
+ // Test last reference frame using the previous best mv as the
+ // starting point (best reference) for the search.
+ first_pass_motion_search(cpi, x, best_ref_mv, &mv, &motion_error);
+
+ v_fn_ptr.vf = get_block_variance_fn(bsize);
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ v_fn_ptr.vf = highbd_get_block_variance_fn(bsize, 8);
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ this_motion_error =
+ vp9_get_mvpred_var(x, &mv, best_ref_mv, &v_fn_ptr, 0);
+
+ // If the current best reference mv is not centered on 0,0 then do a
+ // 0,0 based search as well.
+ if (!is_zero_mv(best_ref_mv)) {
+ tmp_err = INT_MAX;
+ first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, &tmp_err);
+
+ if (tmp_err < motion_error) {
+ motion_error = tmp_err;
+ mv = tmp_mv;
+ this_motion_error =
+ vp9_get_mvpred_var(x, &tmp_mv, &zero_mv, &v_fn_ptr, 0);
+ }
+ }
+#if CONFIG_RATE_CTRL
+ if (cpi->oxcf.use_simple_encode_api) {
+ store_fp_motion_vector(cpi, &mv, mb_row, mb_col, LAST_FRAME, 0);
+ }
+#endif  // CONFIG_RATE_CTRL
+
+ // Search in an older reference frame.
+ if ((cm->current_video_frame > 1) && gld_yv12 != NULL) {
+ // Assume 0,0 motion with no mv overhead.
+ int gf_motion_error;
+
+ xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ gf_motion_error = highbd_get_prediction_error(
+ bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd);
+ } else {
+ gf_motion_error = get_prediction_error(bsize, &x->plane[0].src,
+ &xd->plane[0].pre[0]);
+ }
+#else
+ gf_motion_error = get_prediction_error(bsize, &x->plane[0].src,
+ &xd->plane[0].pre[0]);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, &gf_motion_error);
+#if CONFIG_RATE_CTRL
+ if (cpi->oxcf.use_simple_encode_api) {
+ store_fp_motion_vector(cpi, &tmp_mv, mb_row, mb_col, GOLDEN_FRAME,
+ 1);
+ }
+#endif  // CONFIG_RATE_CTRL
+
+ if (gf_motion_error < motion_error && gf_motion_error < this_error)
+ ++(fp_acc_data->second_ref_count);
+
+ // Reset to last frame as reference buffer.
+ xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
+ xd->plane[1].pre[0].buf = first_ref_buf->u_buffer + recon_uvoffset;
+ xd->plane[2].pre[0].buf = first_ref_buf->v_buffer + recon_uvoffset;
+
+          // In accumulating a score for the older reference frame, take the
+          // better of the motion predicted score and the intra coded error
+          // (just as is done when accumulating "coded_error" for the last
+          // frame).
+ if (gf_motion_error < this_error)
+ fp_acc_data->sr_coded_error += gf_motion_error;
+ else
+ fp_acc_data->sr_coded_error += this_error;
+ } else {
+ fp_acc_data->sr_coded_error += motion_error;
+ }
+ } else {
+ fp_acc_data->sr_coded_error += motion_error;
+ }
+
+ // Start by assuming that intra mode is best.
+ best_ref_mv->row = 0;
+ best_ref_mv->col = 0;
+
+ if (motion_error <= this_error) {
+ vpx_clear_system_state();
+
+ // Keep a count of cases where the inter and intra were very close
+ // and very low. This helps with scene cut detection for example in
+ // cropped clips with black bars at the sides or top and bottom.
+ if (((this_error - intrapenalty) * 9 <= motion_error * 10) &&
+ (this_error < (2 * intrapenalty))) {
+ fp_acc_data->neutral_count += 1.0;
+ if (cpi->row_mt_bit_exact)
+ cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_neutral_count =
+ 1.0;
+ // Also track cases where the intra is not much worse than the inter
+ // and use this in limiting the GF/arf group length.
+ } else if ((this_error > NCOUNT_INTRA_THRESH) &&
+ (this_error < (NCOUNT_INTRA_FACTOR * motion_error))) {
+ mb_neutral_count =
+ (double)motion_error / DOUBLE_DIVIDE_CHECK((double)this_error);
+ fp_acc_data->neutral_count += mb_neutral_count;
+ if (cpi->row_mt_bit_exact)
+ cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_neutral_count =
+ mb_neutral_count;
+ }
+
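+        // Convert the search result from full-pel to the 1/8-pel units used
+        // by the mv fields below.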
+ mv.row *= 8;
+ mv.col *= 8;
+ this_error = motion_error;
+ xd->mi[0]->mode = NEWMV;
+ xd->mi[0]->mv[0].as_mv = mv;
+ xd->mi[0]->tx_size = TX_4X4;
+ xd->mi[0]->ref_frame[0] = LAST_FRAME;
+ xd->mi[0]->ref_frame[1] = NONE;
+ vp9_build_inter_predictors_sby(xd, mb_row << 1, mb_col << 1, bsize);
+ vp9_encode_sby_pass1(x, bsize);
+ fp_acc_data->sum_mvr += mv.row;
+ fp_acc_data->sum_mvr_abs += abs(mv.row);
+ fp_acc_data->sum_mvc += mv.col;
+ fp_acc_data->sum_mvc_abs += abs(mv.col);
+ fp_acc_data->sum_mvrs += mv.row * mv.row;
+ fp_acc_data->sum_mvcs += mv.col * mv.col;
+ ++(fp_acc_data->intercount);
+
+ *best_ref_mv = mv;
+
+ if (!is_zero_mv(&mv)) {
+ ++(fp_acc_data->mvcount);
+
+ // Does the row vector point inwards or outwards?
+ if (mb_row < cm->mb_rows / 2) {
+ if (mv.row > 0)
+ --(fp_acc_data->sum_in_vectors);
+ else if (mv.row < 0)
+ ++(fp_acc_data->sum_in_vectors);
+ } else if (mb_row > cm->mb_rows / 2) {
+ if (mv.row > 0)
+ ++(fp_acc_data->sum_in_vectors);
+ else if (mv.row < 0)
+ --(fp_acc_data->sum_in_vectors);
+ }
+
+ // Does the col vector point inwards or outwards?
+ if (mb_col < cm->mb_cols / 2) {
+ if (mv.col > 0)
+ --(fp_acc_data->sum_in_vectors);
+ else if (mv.col < 0)
+ ++(fp_acc_data->sum_in_vectors);
+ } else if (mb_col > cm->mb_cols / 2) {
+ if (mv.col > 0)
+ ++(fp_acc_data->sum_in_vectors);
+ else if (mv.col < 0)
+ --(fp_acc_data->sum_in_vectors);
+ }
+ }
+ if (this_intra_error < scaled_low_intra_thresh) {
+ fp_acc_data->frame_noise_energy += fp_estimate_block_noise(x, bsize);
+ } else {
+ fp_acc_data->frame_noise_energy += (int64_t)SECTION_NOISE_DEF;
+ }
+ } else { // Intra < inter error
+ if (this_intra_error < scaled_low_intra_thresh) {
+ fp_acc_data->frame_noise_energy += fp_estimate_block_noise(x, bsize);
+ if (this_motion_error < scaled_low_intra_thresh) {
+ fp_acc_data->intra_count_low += 1.0;
+ } else {
+ fp_acc_data->intra_count_high += 1.0;
+ }
+ } else {
+ fp_acc_data->frame_noise_energy += (int64_t)SECTION_NOISE_DEF;
+ fp_acc_data->intra_count_high += 1.0;
+ }
+ }
+ } else {
+ fp_acc_data->sr_coded_error += (int64_t)this_error;
+#if CONFIG_RATE_CTRL
+ if (cpi->oxcf.use_simple_encode_api) {
+ store_fp_motion_vector(cpi, NULL, mb_row, mb_col, INTRA_FRAME, 0);
+ }
+#endif  // CONFIG_RATE_CTRL
+ }
+ fp_acc_data->coded_error += (int64_t)this_error;
+
+ recon_yoffset += 16;
+ recon_uvoffset += uv_mb_height;
+
+ // Accumulate row level stats to the corresponding tile stats
+ if (cpi->row_mt && mb_col == mb_col_end - 1)
+ accumulate_fp_mb_row_stat(tile_data, fp_acc_data);
+
+ (*(cpi->row_mt_sync_write_ptr))(&tile_data->row_mt_sync, mb_row, c,
+ num_mb_cols);
+ }
+ vpx_clear_system_state();
+}
+
+static void first_pass_encode(VP9_COMP *cpi, FIRSTPASS_DATA *fp_acc_data) {
+ VP9_COMMON *const cm = &cpi->common;
+ int mb_row;
+ TileDataEnc tile_data;
+ TileInfo *tile = &tile_data.tile_info;
+ MV zero_mv = { 0, 0 };
+ MV best_ref_mv;
+ // Tiling is ignored in the first pass.
+ vp9_tile_init(tile, cm, 0, 0);
+
+#if CONFIG_RATE_CTRL
+ if (cpi->oxcf.use_simple_encode_api) {
+ fp_motion_vector_info_reset(cpi->frame_info.frame_width,
+ cpi->frame_info.frame_height,
+ cpi->fp_motion_vector_info);
+ }
+#endif
+
+ for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) {
+ best_ref_mv = zero_mv;
+ vp9_first_pass_encode_tile_mb_row(cpi, &cpi->td, fp_acc_data, &tile_data,
+ &best_ref_mv, mb_row);
+ }
+}
+
+void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
+ MACROBLOCK *const x = &cpi->td.mb;
+ VP9_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ TWO_PASS *twopass = &cpi->twopass;
+
+ YV12_BUFFER_CONFIG *const lst_yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
+ YV12_BUFFER_CONFIG *gld_yv12 = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+ YV12_BUFFER_CONFIG *const new_yv12 = get_frame_new_buffer(cm);
+ const YV12_BUFFER_CONFIG *first_ref_buf = lst_yv12;
+
+ BufferPool *const pool = cm->buffer_pool;
+
+ FIRSTPASS_DATA fp_temp_data;
+ FIRSTPASS_DATA *fp_acc_data = &fp_temp_data;
+
+ vpx_clear_system_state();
+ vp9_zero(fp_temp_data);
+ fp_acc_data->image_data_start_row = INVALID_ROW;
+
+ // First pass code requires valid last and new frame buffers.
+ assert(new_yv12 != NULL);
+ assert(frame_is_intra_only(cm) || (lst_yv12 != NULL));
+
+ set_first_pass_params(cpi);
+ vp9_set_quantizer(cpi, find_fp_qindex(cm->bit_depth));
+
+ vp9_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
+
+ vp9_setup_src_planes(x, cpi->Source, 0, 0);
+ vp9_setup_dst_planes(xd->plane, new_yv12, 0, 0);
+
+ if (!frame_is_intra_only(cm)) {
+ vp9_setup_pre_planes(xd, 0, first_ref_buf, 0, 0, NULL);
+ }
+
+ xd->mi = cm->mi_grid_visible;
+ xd->mi[0] = cm->mi;
+
+ vp9_frame_init_quantizer(cpi);
+
+ x->skip_recode = 0;
+
+ vp9_init_mv_probs(cm);
+ vp9_initialize_rd_consts(cpi);
+
+ cm->log2_tile_rows = 0;
+
+ if (cpi->row_mt_bit_exact && cpi->twopass.fp_mb_float_stats == NULL)
+ CHECK_MEM_ERROR(
+ &cm->error, cpi->twopass.fp_mb_float_stats,
+ vpx_calloc(cm->MBs * sizeof(*cpi->twopass.fp_mb_float_stats), 1));
+
+ {
+ FIRSTPASS_STATS fps;
+ TileDataEnc *first_tile_col;
+ if (!cpi->row_mt) {
+ cm->log2_tile_cols = 0;
+ cpi->row_mt_sync_read_ptr = vp9_row_mt_sync_read_dummy;
+ cpi->row_mt_sync_write_ptr = vp9_row_mt_sync_write_dummy;
+ first_pass_encode(cpi, fp_acc_data);
+ first_pass_stat_calc(cpi, &fps, fp_acc_data);
+ } else {
+ cpi->row_mt_sync_read_ptr = vp9_row_mt_sync_read;
+ cpi->row_mt_sync_write_ptr = vp9_row_mt_sync_write;
+ if (cpi->row_mt_bit_exact) {
+ cm->log2_tile_cols = 0;
+ vp9_zero_array(cpi->twopass.fp_mb_float_stats, cm->MBs);
+ }
+ vp9_encode_fp_row_mt(cpi);
+ first_tile_col = &cpi->tile_data[0];
+ if (cpi->row_mt_bit_exact)
+ accumulate_floating_point_stats(cpi, first_tile_col);
+ first_pass_stat_calc(cpi, &fps, &(first_tile_col->fp_data));
+ }
+
+    // Don't allow a value of 0 for duration.
+ // (Section duration is also defaulted to minimum of 1.0).
+ fps.duration = VPXMAX(1.0, (double)(source->ts_end - source->ts_start));
+
+ // Don't want to do output stats with a stack variable!
+ twopass->this_frame_stats = fps;
+ output_stats(&twopass->this_frame_stats);
+ accumulate_stats(&twopass->total_stats, &fps);
+ }
+
+  // Copy the previous Last Frame back into gf and arf buffers if
+ // the prediction is good enough... but also don't allow it to lag too far.
+ if ((twopass->sr_update_lag > 3) ||
+ ((cm->current_video_frame > 0) &&
+ (twopass->this_frame_stats.pcnt_inter > 0.20) &&
+ ((twopass->this_frame_stats.intra_error /
+ DOUBLE_DIVIDE_CHECK(twopass->this_frame_stats.coded_error)) > 2.0))) {
+ if (gld_yv12 != NULL) {
+ ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx],
+ cm->ref_frame_map[cpi->lst_fb_idx]);
+ }
+ twopass->sr_update_lag = 1;
+ } else {
+ ++twopass->sr_update_lag;
+ }
+
+ vpx_extend_frame_borders(new_yv12);
+
+ // The frame we just compressed now becomes the last frame.
+ ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->lst_fb_idx],
+ cm->new_fb_idx);
+
+ // Special case for the first frame. Copy into the GF buffer as a second
+ // reference.
+ if (cm->current_video_frame == 0 && cpi->gld_fb_idx != INVALID_IDX) {
+ ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx],
+ cm->ref_frame_map[cpi->lst_fb_idx]);
+ }
+
+ // Use this to see what the first pass reconstruction looks like.
+ if (0) {
+ char filename[512];
+ FILE *recon_file;
+ snprintf(filename, sizeof(filename), "enc%04d.yuv",
+ (int)cm->current_video_frame);
+
+ if (cm->current_video_frame == 0)
+ recon_file = fopen(filename, "wb");
+ else
+ recon_file = fopen(filename, "ab");
+
+ (void)fwrite(lst_yv12->buffer_alloc, lst_yv12->frame_size, 1, recon_file);
+ fclose(recon_file);
+ }
+
+ // In the first pass, every frame is considered as a show frame.
+ update_frame_indexes(cm, /*show_frame=*/1);
+ if (cpi->use_svc) vp9_inc_frame_in_layer(cpi);
+}
+
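+// Exponents applied to the normalized error term in calc_correction_factor(),
+// tabulated every 32 quantizer steps, with one extra entry for interpolation
+// at the top of the range.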
+static const double q_pow_term[(QINDEX_RANGE >> 5) + 1] = { 0.65, 0.70, 0.75,
+ 0.85, 0.90, 0.90,
+ 0.90, 1.00, 1.25 };
+
+static double calc_correction_factor(double err_per_mb, double err_divisor,
+ int q) {
+ const double error_term = err_per_mb / DOUBLE_DIVIDE_CHECK(err_divisor);
+ const int index = q >> 5;
+ double power_term;
+
+ assert((index >= 0) && (index < (QINDEX_RANGE >> 5)));
+
+ // Adjustment based on quantizer to the power term.
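+  // power_term is interpolated linearly between the two nearest table
+  // entries (one entry per 32 quantizer steps).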
+ power_term =
+ q_pow_term[index] +
+ (((q_pow_term[index + 1] - q_pow_term[index]) * (q % 32)) / 32.0);
+
+ // Calculate correction factor.
+ if (power_term < 1.0) assert(error_term >= 0.0);
+
+ return fclamp(pow(error_term, power_term), 0.05, 5.0);
+}
+
+static double wq_err_divisor(VP9_COMP *cpi) {
+ const VP9_COMMON *const cm = &cpi->common;
+ unsigned int screen_area = (cm->width * cm->height);
+
+ // Use a different error per mb factor for calculating boost for
+ // different formats.
+ if (screen_area <= 640 * 360) {
+ return 115.0;
+ } else if (screen_area < 1280 * 720) {
+ return 125.0;
+ } else if (screen_area <= 1920 * 1080) {
+ return 130.0;
+ } else if (screen_area < 3840 * 2160) {
+ return 150.0;
+ }
+
+ // Fall through to here only for 4K and above.
+ return 200.0;
+}
+
+#define NOISE_FACTOR_MIN 0.9
+#define NOISE_FACTOR_MAX 1.1
+static int get_twopass_worst_quality(VP9_COMP *cpi, const double section_err,
+ double inactive_zone, double section_noise,
+ int section_target_bandwidth) {
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+ TWO_PASS *const twopass = &cpi->twopass;
+ double last_group_rate_err;
+
+  // Clamp the target rate to VBR min / max limits.
+ const int target_rate =
+ vp9_rc_clamp_pframe_target_size(cpi, section_target_bandwidth);
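+  // The noise factor scales the bits-per-MB estimate by
+  // sqrt(section_noise / SECTION_NOISE_DEF), clamped to
+  // [NOISE_FACTOR_MIN, NOISE_FACTOR_MAX].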
+ double noise_factor = pow((section_noise / SECTION_NOISE_DEF), 0.5);
+ noise_factor = fclamp(noise_factor, NOISE_FACTOR_MIN, NOISE_FACTOR_MAX);
+ inactive_zone = fclamp(inactive_zone, 0.0, 1.0);
+
+// TODO(jimbankoski): remove #if here or below when this has been
+// well tested.
+#if CONFIG_ALWAYS_ADJUST_BPM
+  // Based on recent history, adjust expectations of bits per macroblock.
+ last_group_rate_err =
+ (double)twopass->rolling_arf_group_actual_bits /
+ DOUBLE_DIVIDE_CHECK((double)twopass->rolling_arf_group_target_bits);
+ last_group_rate_err = VPXMAX(0.25, VPXMIN(4.0, last_group_rate_err));
+ twopass->bpm_factor *= (3.0 + last_group_rate_err) / 4.0;
+ twopass->bpm_factor = VPXMAX(0.25, VPXMIN(4.0, twopass->bpm_factor));
+#endif
+
+ if (target_rate <= 0) {
+ return rc->worst_quality; // Highest value allowed
+ } else {
+ const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
+ ? cpi->initial_mbs
+ : cpi->common.MBs;
+ const double active_pct = VPXMAX(0.01, 1.0 - inactive_zone);
+ const int active_mbs = (int)VPXMAX(1, (double)num_mbs * active_pct);
+ const double av_err_per_mb = section_err / active_pct;
+ const double speed_term = 1.0 + 0.04 * oxcf->speed;
+ const int target_norm_bits_per_mb =
+ (int)(((uint64_t)target_rate << BPER_MB_NORMBITS) / active_mbs);
+ int q;
+
+// TODO(jimbankoski): remove #if here or above when this has been
+// well tested.
+#if !CONFIG_ALWAYS_ADJUST_BPM
+    // Based on recent history, adjust expectations of bits per macroblock.
+ last_group_rate_err =
+ (double)twopass->rolling_arf_group_actual_bits /
+ DOUBLE_DIVIDE_CHECK((double)twopass->rolling_arf_group_target_bits);
+ last_group_rate_err = VPXMAX(0.25, VPXMIN(4.0, last_group_rate_err));
+ twopass->bpm_factor *= (3.0 + last_group_rate_err) / 4.0;
+ twopass->bpm_factor = VPXMAX(0.25, VPXMIN(4.0, twopass->bpm_factor));
+#endif
+
+ // Try and pick a max Q that will be high enough to encode the
+ // content at the given rate.
+ for (q = rc->best_quality; q < rc->worst_quality; ++q) {
+ const double factor =
+ calc_correction_factor(av_err_per_mb, wq_err_divisor(cpi), q);
+ const int bits_per_mb = vp9_rc_bits_per_mb(
+ INTER_FRAME, q,
+ factor * speed_term * cpi->twopass.bpm_factor * noise_factor,
+ cpi->common.bit_depth);
+ if (bits_per_mb <= target_norm_bits_per_mb) break;
+ }
+
+ // Restriction on active max q for constrained quality mode.
+ if (cpi->oxcf.rc_mode == VPX_CQ) q = VPXMAX(q, oxcf->cq_level);
+ return q;
+ }
+}
+
+static void setup_rf_level_maxq(VP9_COMP *cpi) {
+ int i;
+ RATE_CONTROL *const rc = &cpi->rc;
+ for (i = INTER_NORMAL; i < RATE_FACTOR_LEVELS; ++i) {
+ int qdelta = vp9_frame_type_qdelta(cpi, i, rc->worst_quality);
+ rc->rf_level_maxq[i] = VPXMAX(rc->worst_quality + qdelta, rc->best_quality);
+ }
+}
+
+static void init_subsampling(VP9_COMP *cpi) {
+ const VP9_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ const int w = cm->width;
+ const int h = cm->height;
+ int i;
+
+ for (i = 0; i < FRAME_SCALE_STEPS; ++i) {
+ // Note: Frames with odd-sized dimensions may result from this scaling.
+ rc->frame_width[i] = (w * 16) / frame_scale_factor[i];
+ rc->frame_height[i] = (h * 16) / frame_scale_factor[i];
+ }
+
+ setup_rf_level_maxq(cpi);
+}
+
+void calculate_coded_size(VP9_COMP *cpi, int *scaled_frame_width,
+ int *scaled_frame_height) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ *scaled_frame_width = rc->frame_width[rc->frame_size_selector];
+ *scaled_frame_height = rc->frame_height[rc->frame_size_selector];
+}
+
+void vp9_init_second_pass(VP9_COMP *cpi) {
+ VP9EncoderConfig *const oxcf = &cpi->oxcf;
+ RATE_CONTROL *const rc = &cpi->rc;
+ TWO_PASS *const twopass = &cpi->twopass;
+ double frame_rate;
+ FIRSTPASS_STATS *stats;
+
+ zero_stats(&twopass->total_stats);
+ zero_stats(&twopass->total_left_stats);
+
+ if (!twopass->stats_in_end) return;
+
+ stats = &twopass->total_stats;
+
+ *stats = *twopass->stats_in_end;
+ twopass->total_left_stats = *stats;
+
+ // Scan the first pass file and calculate a modified score for each
+ // frame that is used to distribute bits. The modified score is assumed
+  // to provide a linear basis for bit allocation. I.e. a frame A with a score
+ // that is double that of frame B will be allocated 2x as many bits.
+ {
+ double modified_score_total = 0.0;
+ const FIRSTPASS_STATS *s = twopass->stats_in;
+ double av_err;
+
+ if (oxcf->vbr_corpus_complexity) {
+ twopass->mean_mod_score = (double)oxcf->vbr_corpus_complexity / 10.0;
+ av_err = get_distribution_av_err(cpi, twopass);
+ } else {
+ av_err = get_distribution_av_err(cpi, twopass);
+ // The first scan is unclamped and gives a raw average.
+ while (s < twopass->stats_in_end) {
+ modified_score_total += calculate_mod_frame_score(cpi, oxcf, s, av_err);
+ ++s;
+ }
+
+ // The average error from this first scan is used to define the midpoint
+ // error for the rate distribution function.
+ twopass->mean_mod_score =
+ modified_score_total / DOUBLE_DIVIDE_CHECK(stats->count);
+ }
+
+ // Second scan using clamps based on the previous cycle average.
+    // This may modify the total and average somewhat but we don't bother
+    // with further iterations.
+ modified_score_total = 0.0;
+ s = twopass->stats_in;
+ while (s < twopass->stats_in_end) {
+ modified_score_total +=
+ calculate_norm_frame_score(cpi, twopass, oxcf, s, av_err);
+ ++s;
+ }
+ twopass->normalized_score_left = modified_score_total;
+
+ // If using Corpus wide VBR mode then update the clip target bandwidth to
+ // reflect how the clip compares to the rest of the corpus.
+ if (oxcf->vbr_corpus_complexity) {
+ oxcf->target_bandwidth =
+ (int64_t)((double)oxcf->target_bandwidth *
+ (twopass->normalized_score_left / stats->count));
+ }
+
+#if COMPLEXITY_STATS_OUTPUT
+ {
+ FILE *compstats;
+ compstats = fopen("complexity_stats.stt", "a");
+ fprintf(compstats, "%10.3lf\n",
+ twopass->normalized_score_left / stats->count);
+ fclose(compstats);
+ }
+#endif
+ }
+
+ frame_rate = 10000000.0 * stats->count / stats->duration;
+ // Each frame can have a different duration, as the frame rate in the source
+ // isn't guaranteed to be constant. The frame rate prior to the first frame
+ // encoded in the second pass is a guess. However, the sum duration is not.
+ // It is calculated based on the actual durations of all frames from the
+ // first pass.
+ vp9_new_framerate(cpi, frame_rate);
+ twopass->bits_left =
+ (int64_t)(stats->duration * oxcf->target_bandwidth / 10000000.0);
+
+ // This variable monitors how far behind the second ref update is lagging.
+ twopass->sr_update_lag = 1;
+
+ // Reset the vbr bits off target counters
+ rc->vbr_bits_off_target = 0;
+ rc->vbr_bits_off_target_fast = 0;
+ rc->rate_error_estimate = 0;
+
+ // Static sequence monitor variables.
+ twopass->kf_zeromotion_pct = 100;
+ twopass->last_kfgroup_zeromotion_pct = 100;
+
+ // Initialize bits per macro_block estimate correction factor.
+ twopass->bpm_factor = 1.0;
+ // Initialize actual and target bits counters for ARF groups so that
+ // at the start we have a neutral bpm adjustment.
+ twopass->rolling_arf_group_target_bits = 1;
+ twopass->rolling_arf_group_actual_bits = 1;
+
+ if (oxcf->resize_mode != RESIZE_NONE) {
+ init_subsampling(cpi);
+ }
+
+  // Initialize the arnr strength adjustment to 0
+ twopass->arnr_strength_adjustment = 0;
+}
+
+/* This function considers how the quality of prediction may be deteriorating
+ * with distance. It compares the coded error for the last frame and the
+ * second reference frame (usually two frames old) and also applies a factor
+ * based on the extent of INTRA coding.
+ *
+ * The decay factor is then used to reduce the contribution of frames further
+ * from the alt-ref or golden frame, to the bitrate boost calculation for that
+ * alt-ref or golden frame.
+ */
+static double get_sr_decay_rate(const TWO_PASS *const twopass,
+ const FIRSTPASS_STATS *frame) {
+ double sr_diff = (frame->sr_coded_error - frame->coded_error);
+ double sr_decay = 1.0;
+
+ // Do nothing if the second ref to last frame error difference is
+ // very small or even negative.
+ if ((sr_diff > LOW_SR_DIFF_TRHESH)) {
+ const double sr_diff_part =
+ twopass->sr_diff_factor * ((sr_diff * 0.25) / frame->intra_error);
+ double modified_pct_inter = frame->pcnt_inter;
+ double modified_pcnt_intra;
+
+ if ((frame->coded_error > LOW_CODED_ERR_PER_MB) &&
+ ((frame->intra_error / DOUBLE_DIVIDE_CHECK(frame->coded_error)) <
+ (double)NCOUNT_FRAME_II_THRESH)) {
+ modified_pct_inter =
+ frame->pcnt_inter + frame->pcnt_intra_low - frame->pcnt_neutral;
+ }
+ modified_pcnt_intra = 100 * (1.0 - modified_pct_inter);
+
+ sr_decay = 1.0 - sr_diff_part - (INTRA_PART * modified_pcnt_intra);
+ }
+ return VPXMAX(sr_decay, twopass->sr_default_decay_limit);
+}
+
+// This function gives an estimate of how badly we believe the prediction
+// quality is decaying from frame to frame.
+static double get_zero_motion_factor(const TWO_PASS *const twopass,
+ const FIRSTPASS_STATS *frame_stats) {
+ const double zero_motion_pct =
+ frame_stats->pcnt_inter - frame_stats->pcnt_motion;
+ double sr_decay = get_sr_decay_rate(twopass, frame_stats);
+ return VPXMIN(sr_decay, zero_motion_pct);
+}
+
+static double get_prediction_decay_rate(const TWO_PASS *const twopass,
+ const FIRSTPASS_STATS *frame_stats) {
+ const double sr_decay_rate = get_sr_decay_rate(twopass, frame_stats);
+ double zero_motion_factor =
+ twopass->zm_factor * (frame_stats->pcnt_inter - frame_stats->pcnt_motion);
+
+ // Check that the zero motion factor is valid
+ assert(zero_motion_factor >= 0.0 && zero_motion_factor <= 1.0);
+
+ return VPXMAX(zero_motion_factor,
+ (sr_decay_rate + ((1.0 - sr_decay_rate) * zero_motion_factor)));
+}
+
+static int get_show_idx(const TWO_PASS *twopass) {
+ return (int)(twopass->stats_in - twopass->stats_in_start);
+}
+// Function to test for a condition where a complex transition is followed
+// by a static section. For example in slide shows where there is a fade
+// between slides. This is to help with more optimal kf and gf positioning.
+static int check_transition_to_still(const FIRST_PASS_INFO *first_pass_info,
+ int show_idx, int still_interval) {
+ int j;
+ int num_frames = fps_get_num_frames(first_pass_info);
+ if (show_idx + still_interval > num_frames) {
+ return 0;
+ }
+
+ // Look ahead a few frames to see if static condition persists...
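+  // A frame counts as still only if at least 99.9% of its blocks are inter
+  // coded with zero motion (pcnt_inter - pcnt_motion >= 0.999).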
+ for (j = 0; j < still_interval; ++j) {
+ const FIRSTPASS_STATS *stats =
+ fps_get_frame_stats(first_pass_info, show_idx + j);
+ if (stats->pcnt_inter - stats->pcnt_motion < 0.999) break;
+ }
+
+ // Only if it does do we signal a transition to still.
+ return j == still_interval;
+}
+
+// This function detects a flash through the high relative pcnt_second_ref
+// score in the frame following a flash frame. The offset passed in should
+// reflect this.
+static int detect_flash_from_frame_stats(const FIRSTPASS_STATS *frame_stats) {
+ // What we are looking for here is a situation where there is a
+ // brief break in prediction (such as a flash) but subsequent frames
+ // are reasonably well predicted by an earlier (pre flash) frame.
+ // The recovery after a flash is indicated by a high pcnt_second_ref
+  // usage or a second ref coded error notably lower than the last
+ // frame coded error.
+ if (frame_stats == NULL) {
+ return 0;
+ }
+ return (frame_stats->sr_coded_error < frame_stats->coded_error) ||
+ ((frame_stats->pcnt_second_ref > frame_stats->pcnt_inter) &&
+ (frame_stats->pcnt_second_ref >= 0.5));
+}
+
+static int detect_flash(const TWO_PASS *twopass, int offset) {
+ const FIRSTPASS_STATS *const next_frame = read_frame_stats(twopass, offset);
+ return detect_flash_from_frame_stats(next_frame);
+}
+
+// Update the motion-related elements of the GF/ARF boost calculation.
+static void accumulate_frame_motion_stats(const FIRSTPASS_STATS *stats,
+ double *mv_in_out,
+ double *mv_in_out_accumulator,
+ double *abs_mv_in_out_accumulator,
+ double *mv_ratio_accumulator) {
+ const double pct = stats->pcnt_motion;
+
+ // Accumulate Motion In/Out of frame stats.
+ *mv_in_out = stats->mv_in_out_count * pct;
+ *mv_in_out_accumulator += *mv_in_out;
+ *abs_mv_in_out_accumulator += fabs(*mv_in_out);
+
+ // Accumulate a measure of how uniform (or conversely how random) the motion
+ // field is (a ratio of abs(mv) / mv).
+ if (pct > 0.05) {
+ const double mvr_ratio =
+ fabs(stats->mvr_abs) / DOUBLE_DIVIDE_CHECK(fabs(stats->MVr));
+ const double mvc_ratio =
+ fabs(stats->mvc_abs) / DOUBLE_DIVIDE_CHECK(fabs(stats->MVc));
+
+ *mv_ratio_accumulator +=
+ pct * (mvr_ratio < stats->mvr_abs ? mvr_ratio : stats->mvr_abs);
+ *mv_ratio_accumulator +=
+ pct * (mvc_ratio < stats->mvc_abs ? mvc_ratio : stats->mvc_abs);
+ }
+}
+
+static double calc_frame_boost(const FRAME_INFO *frame_info,
+ const FIRSTPASS_STATS *this_frame,
+ const TWO_PASS *const twopass,
+ int avg_frame_qindex,
+ double this_frame_mv_in_out) {
+ double frame_boost;
+ const double lq =
+ vp9_convert_qindex_to_q(avg_frame_qindex, frame_info->bit_depth);
+ const double boost_q_correction = VPXMIN((0.5 + (lq * 0.015)), 1.5);
+ const double active_area = calculate_active_area(frame_info, this_frame);
+
+  // Frame boost is based on inter error.
+ frame_boost = (twopass->err_per_mb * active_area) /
+ DOUBLE_DIVIDE_CHECK(this_frame->coded_error);
+
+ // Small adjustment for cases where there is a zoom out
+ if (this_frame_mv_in_out > 0.0)
+ frame_boost += frame_boost * (this_frame_mv_in_out * 2.0);
+
+  // Q correction and scaling
+ frame_boost = frame_boost * boost_q_correction;
+
+ return VPXMIN(frame_boost, twopass->gf_frame_max_boost * boost_q_correction);
+}
+
+static double calc_kf_frame_boost(VP9_COMP *cpi,
+ const FIRSTPASS_STATS *this_frame,
+ double *sr_accumulator,
+ double this_frame_mv_in_out,
+ double zm_factor) {
+ TWO_PASS *const twopass = &cpi->twopass;
+ double frame_boost;
+ const double lq = vp9_convert_qindex_to_q(
+ cpi->rc.avg_frame_qindex[INTER_FRAME], cpi->common.bit_depth);
+ const double boost_q_correction = VPXMIN((0.50 + (lq * 0.015)), 2.00);
+ const double active_area =
+ calculate_active_area(&cpi->frame_info, this_frame);
+ double max_boost;
+
+  // Frame boost is based on inter error.
+ frame_boost = (twopass->kf_err_per_mb * active_area) /
+ DOUBLE_DIVIDE_CHECK(this_frame->coded_error + *sr_accumulator);
+
+ // Update the accumulator for second ref error difference.
+ // This is intended to give an indication of how much the coded error is
+ // increasing over time.
+ *sr_accumulator += (this_frame->sr_coded_error - this_frame->coded_error);
+ *sr_accumulator = VPXMAX(0.0, *sr_accumulator);
+
+ // Small adjustment for cases where there is a zoom out
+ if (this_frame_mv_in_out > 0.0)
+ frame_boost += frame_boost * (this_frame_mv_in_out * 2.0);
+
+ // Q correction and scaling
+ // The 40.0 value here is an experimentally derived baseline minimum.
+ // This value is in line with the minimum per frame boost in the alt_ref
+ // boost calculation.
+ frame_boost =
+ (frame_boost + twopass->kf_frame_min_boost) * boost_q_correction;
+
+ // Maximum allowed boost this frame. May be different for first vs subsequent
+ // key frames.
+ max_boost = (cpi->common.current_video_frame == 0)
+ ? twopass->kf_frame_max_boost_first
+ : twopass->kf_frame_max_boost_subs;
+ max_boost *= zm_factor * boost_q_correction;
+
+ return VPXMIN(frame_boost, max_boost);
+}
+
+static int compute_arf_boost(const FRAME_INFO *frame_info,
+ TWO_PASS *const twopass, int arf_show_idx,
+ int f_frames, int b_frames, int avg_frame_qindex) {
+ const FIRST_PASS_INFO *first_pass_info = &twopass->first_pass_info;
+ int i;
+ double boost_score = 0.0;
+ double mv_ratio_accumulator = 0.0;
+ double decay_accumulator = 1.0;
+ double this_frame_mv_in_out = 0.0;
+ double mv_in_out_accumulator = 0.0;
+ double abs_mv_in_out_accumulator = 0.0;
+ int arf_boost;
+ int flash_detected = 0;
+
+ // Search forward from the proposed arf/next gf position.
+ for (i = 0; i < f_frames; ++i) {
+ const FIRSTPASS_STATS *this_frame =
+ fps_get_frame_stats(first_pass_info, arf_show_idx + i);
+ const FIRSTPASS_STATS *next_frame =
+ fps_get_frame_stats(first_pass_info, arf_show_idx + i + 1);
+ if (this_frame == NULL) break;
+
+    // Update the motion-related elements of the boost calculation.
+ accumulate_frame_motion_stats(
+ this_frame, &this_frame_mv_in_out, &mv_in_out_accumulator,
+ &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
+
+ // We want to discount the flash frame itself and the recovery
+ // frame that follows as both will have poor scores.
+ flash_detected = detect_flash_from_frame_stats(this_frame) ||
+ detect_flash_from_frame_stats(next_frame);
+
+ // Accumulate the effect of prediction quality decay.
+ if (!flash_detected) {
+ decay_accumulator *= get_prediction_decay_rate(twopass, this_frame);
+ decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
+ ? MIN_DECAY_FACTOR
+ : decay_accumulator;
+ }
+ boost_score += decay_accumulator *
+ calc_frame_boost(frame_info, this_frame, twopass,
+ avg_frame_qindex, this_frame_mv_in_out);
+ }
+
+ arf_boost = (int)boost_score;
+
+ // Reset for backward looking loop.
+ boost_score = 0.0;
+ mv_ratio_accumulator = 0.0;
+ decay_accumulator = 1.0;
+ this_frame_mv_in_out = 0.0;
+ mv_in_out_accumulator = 0.0;
+ abs_mv_in_out_accumulator = 0.0;
+
+ // Search backward towards last gf position.
+ for (i = -1; i >= -b_frames; --i) {
+ const FIRSTPASS_STATS *this_frame =
+ fps_get_frame_stats(first_pass_info, arf_show_idx + i);
+ const FIRSTPASS_STATS *next_frame =
+ fps_get_frame_stats(first_pass_info, arf_show_idx + i + 1);
+ if (this_frame == NULL) break;
+
+    // Update the motion-related elements of the boost calculation.
+ accumulate_frame_motion_stats(
+ this_frame, &this_frame_mv_in_out, &mv_in_out_accumulator,
+ &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
+
+    // We want to discount the flash frame itself and the recovery
+ // frame that follows as both will have poor scores.
+ flash_detected = detect_flash_from_frame_stats(this_frame) ||
+ detect_flash_from_frame_stats(next_frame);
+
+ // Cumulative effect of prediction quality decay.
+ if (!flash_detected) {
+ decay_accumulator *= get_prediction_decay_rate(twopass, this_frame);
+ decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
+ ? MIN_DECAY_FACTOR
+ : decay_accumulator;
+ }
+ boost_score += decay_accumulator *
+ calc_frame_boost(frame_info, this_frame, twopass,
+ avg_frame_qindex, this_frame_mv_in_out);
+ }
+ arf_boost += (int)boost_score;
+
+ if (arf_boost < ((b_frames + f_frames) * 40))
+ arf_boost = ((b_frames + f_frames) * 40);
+ arf_boost = VPXMAX(arf_boost, MIN_ARF_GF_BOOST);
+
+ return arf_boost;
+}
+
+static int calc_arf_boost(VP9_COMP *cpi, int f_frames, int b_frames) {
+ const FRAME_INFO *frame_info = &cpi->frame_info;
+ TWO_PASS *const twopass = &cpi->twopass;
+ const int avg_inter_frame_qindex = cpi->rc.avg_frame_qindex[INTER_FRAME];
+ int arf_show_idx = get_show_idx(twopass);
+ return compute_arf_boost(frame_info, twopass, arf_show_idx, f_frames,
+ b_frames, avg_inter_frame_qindex);
+}
+
+// Calculate a section intra ratio used in setting max loop filter.
+static int calculate_section_intra_ratio(const FIRSTPASS_STATS *begin,
+ const FIRSTPASS_STATS *end,
+ int section_length) {
+ const FIRSTPASS_STATS *s = begin;
+ double intra_error = 0.0;
+ double coded_error = 0.0;
+ int i = 0;
+
+ while (s < end && i < section_length) {
+ intra_error += s->intra_error;
+ coded_error += s->coded_error;
+ ++s;
+ ++i;
+ }
+
+ return (int)(intra_error / DOUBLE_DIVIDE_CHECK(coded_error));
+}
+
+// Calculate the total bits to allocate in this GF/ARF group.
+static int64_t calculate_total_gf_group_bits(VP9_COMP *cpi,
+ double gf_group_err) {
+ VP9_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const TWO_PASS *const twopass = &cpi->twopass;
+ const int max_bits = frame_max_bits(rc, &cpi->oxcf);
+ int64_t total_group_bits;
+ const int is_key_frame = frame_is_intra_only(cm);
+ const int arf_active_or_kf = is_key_frame || rc->source_alt_ref_active;
+ int gop_frames =
+ rc->baseline_gf_interval + rc->source_alt_ref_pending - arf_active_or_kf;
+
+ // Calculate the bits to be allocated to the group as a whole.
+ if ((twopass->kf_group_bits > 0) && (twopass->kf_group_error_left > 0.0)) {
+ int key_frame_interval = rc->frames_since_key + rc->frames_to_key;
+ int distance_from_next_key_frame =
+ rc->frames_to_key -
+ (rc->baseline_gf_interval + rc->source_alt_ref_pending);
+ int max_gf_bits_bias = rc->avg_frame_bandwidth;
+ double gf_interval_bias_bits_normalize_factor =
+ (double)rc->baseline_gf_interval / 16;
+ total_group_bits = (int64_t)(twopass->kf_group_bits *
+ (gf_group_err / twopass->kf_group_error_left));
+ // TODO(ravi): Experiment with different values of max_gf_bits_bias
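+    // Bias the group allocation up for GF groups further from the next key
+    // frame, scaled by the GF interval relative to a nominal 16 frames.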
+ total_group_bits +=
+ (int64_t)((double)distance_from_next_key_frame / key_frame_interval *
+ max_gf_bits_bias * gf_interval_bias_bits_normalize_factor);
+ } else {
+ total_group_bits = 0;
+ }
+
+ // Clamp odd edge cases.
+ total_group_bits = (total_group_bits < 0) ? 0
+ : (total_group_bits > twopass->kf_group_bits)
+ ? twopass->kf_group_bits
+ : total_group_bits;
+
+ // Clip based on user supplied data rate variability limit.
+ if (total_group_bits > (int64_t)max_bits * gop_frames)
+ total_group_bits = (int64_t)max_bits * gop_frames;
+
+ return total_group_bits;
+}
+
+// Calculate the number of extra bits to assign to boosted frames in a group.
+static int calculate_boost_bits(int frame_count, int boost,
+ int64_t total_group_bits) {
+ int allocation_chunks;
+
+  // Return 0 for invalid inputs (could arise e.g. through rounding errors).
+ if (!boost || (total_group_bits <= 0) || (frame_count < 0)) return 0;
+
+ allocation_chunks = (frame_count * NORMAL_BOOST) + boost;
+
+ // Prevent overflow.
+ if (boost > 1023) {
+ int divisor = boost >> 10;
+ boost /= divisor;
+ allocation_chunks /= divisor;
+ }
+
+ // Calculate the number of extra bits for use in the boosted frame or frames.
+ return VPXMAX((int)(((int64_t)boost * total_group_bits) / allocation_chunks),
+ 0);
+}
+
+// Used in corpus vbr: Calculates the total normalized group complexity score
+// for a given number of frames starting at the current position in the stats
+// file.
+static double calculate_group_score(VP9_COMP *cpi, double av_score,
+ int frame_count) {
+ VP9EncoderConfig *const oxcf = &cpi->oxcf;
+ TWO_PASS *const twopass = &cpi->twopass;
+ const FIRSTPASS_STATS *s = twopass->stats_in;
+ double score_total = 0.0;
+ int i = 0;
+
+  // We don't ever want to return a 0 score here.
+ if (frame_count == 0) return 1.0;
+
+ while ((i < frame_count) && (s < twopass->stats_in_end)) {
+ score_total += calculate_norm_frame_score(cpi, twopass, oxcf, s, av_score);
+ ++s;
+ ++i;
+ }
+
+ return score_total;
+}
+
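+// Recursively split the interval [start, end] around its midpoint to build
+// the multi-layer ARF ordering: each level places an ARF at the midpoint and
+// descends into the two halves at depth + 1, until the interval is shorter
+// than the minimum or the allowed layer depth is exceeded.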
+static void find_arf_order(VP9_COMP *cpi, GF_GROUP *gf_group,
+ int *index_counter, int depth, int start, int end) {
+ TWO_PASS *twopass = &cpi->twopass;
+ const FIRSTPASS_STATS *const start_pos = twopass->stats_in;
+ FIRSTPASS_STATS fpf_frame;
+ const int mid = (start + end + 1) >> 1;
+ const int min_frame_interval = 2;
+ int idx;
+
+ // Process regular P frames
+ if ((end - start < min_frame_interval) ||
+ (depth > gf_group->allowed_max_layer_depth)) {
+ for (idx = start; idx <= end; ++idx) {
+ gf_group->update_type[*index_counter] = LF_UPDATE;
+ gf_group->arf_src_offset[*index_counter] = 0;
+ gf_group->frame_gop_index[*index_counter] = idx;
+ gf_group->rf_level[*index_counter] = INTER_NORMAL;
+ gf_group->layer_depth[*index_counter] = depth;
+ gf_group->gfu_boost[*index_counter] = NORMAL_BOOST;
+ ++(*index_counter);
+ }
+ gf_group->max_layer_depth = VPXMAX(gf_group->max_layer_depth, depth);
+ return;
+ }
+
+ assert(abs(mid - start) >= 1 && abs(mid - end) >= 1);
+
+ // Process ARF frame
+ gf_group->layer_depth[*index_counter] = depth;
+ gf_group->update_type[*index_counter] = ARF_UPDATE;
+ gf_group->arf_src_offset[*index_counter] = mid - start;
+ gf_group->frame_gop_index[*index_counter] = mid;
+ gf_group->rf_level[*index_counter] = GF_ARF_LOW;
+
+ for (idx = 0; idx <= mid; ++idx)
+ if (EOF == input_stats(twopass, &fpf_frame)) break;
+
+ gf_group->gfu_boost[*index_counter] =
+ VPXMAX(MIN_ARF_GF_BOOST,
+ calc_arf_boost(cpi, end - mid + 1, mid - start) >> depth);
+
+ reset_fpf_position(twopass, start_pos);
+
+ ++(*index_counter);
+
+ find_arf_order(cpi, gf_group, index_counter, depth + 1, start, mid - 1);
+
+ gf_group->update_type[*index_counter] = USE_BUF_FRAME;
+ gf_group->arf_src_offset[*index_counter] = 0;
+ gf_group->frame_gop_index[*index_counter] = mid;
+ gf_group->rf_level[*index_counter] = INTER_NORMAL;
+ gf_group->layer_depth[*index_counter] = depth;
+ ++(*index_counter);
+
+ find_arf_order(cpi, gf_group, index_counter, depth + 1, mid + 1, end);
+}
+
+static INLINE void set_gf_overlay_frame_type(GF_GROUP *gf_group,
+ int frame_index,
+ int source_alt_ref_active) {
+ if (source_alt_ref_active) {
+ gf_group->update_type[frame_index] = OVERLAY_UPDATE;
+ gf_group->rf_level[frame_index] = INTER_NORMAL;
+ gf_group->layer_depth[frame_index] = MAX_ARF_LAYERS - 1;
+ gf_group->gfu_boost[frame_index] = NORMAL_BOOST;
+ } else {
+ gf_group->update_type[frame_index] = GF_UPDATE;
+ gf_group->rf_level[frame_index] = GF_ARF_STD;
+ gf_group->layer_depth[frame_index] = 0;
+ }
+}
+
+static void define_gf_group_structure(VP9_COMP *cpi) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ TWO_PASS *const twopass = &cpi->twopass;
+ GF_GROUP *const gf_group = &twopass->gf_group;
+ int frame_index = 0;
+ int key_frame = cpi->common.frame_type == KEY_FRAME;
+ int layer_depth = 1;
+ int gop_frames =
+ rc->baseline_gf_interval - (key_frame || rc->source_alt_ref_pending);
+
+ gf_group->frame_start = cpi->common.current_video_frame;
+ gf_group->frame_end = gf_group->frame_start + rc->baseline_gf_interval;
+ gf_group->max_layer_depth = 0;
+ gf_group->allowed_max_layer_depth = 0;
+
+ // For key frames the frame target rate is already set and it
+ // is also the golden frame.
+ // === [frame_index == 0] ===
+ if (!key_frame)
+ set_gf_overlay_frame_type(gf_group, frame_index, rc->source_alt_ref_active);
+
+ ++frame_index;
+
+ // === [frame_index == 1] ===
+ if (rc->source_alt_ref_pending) {
+ gf_group->update_type[frame_index] = ARF_UPDATE;
+ gf_group->rf_level[frame_index] = GF_ARF_STD;
+ gf_group->layer_depth[frame_index] = layer_depth;
+ gf_group->arf_src_offset[frame_index] =
+ (unsigned char)(rc->baseline_gf_interval - 1);
+ gf_group->frame_gop_index[frame_index] = rc->baseline_gf_interval;
+ gf_group->max_layer_depth = 1;
+ ++frame_index;
+ ++layer_depth;
+ gf_group->allowed_max_layer_depth = cpi->oxcf.enable_auto_arf;
+ }
+
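+ // Recursively assign ARF and leaf (LF_UPDATE) roles to the remaining frames
+ // in the group, building the multi-layer ARF pyramid.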
+ find_arf_order(cpi, gf_group, &frame_index, layer_depth, 1, gop_frames);
+
+ set_gf_overlay_frame_type(gf_group, frame_index, rc->source_alt_ref_pending);
+ gf_group->arf_src_offset[frame_index] = 0;
+ gf_group->frame_gop_index[frame_index] = rc->baseline_gf_interval;
+
+ // Set the number of frame slots (ops) used by the group.
+ gf_group->gf_group_size = frame_index;
+}
+
+static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits,
+ int gf_arf_bits) {
+ VP9EncoderConfig *const oxcf = &cpi->oxcf;
+ RATE_CONTROL *const rc = &cpi->rc;
+ TWO_PASS *const twopass = &cpi->twopass;
+ GF_GROUP *const gf_group = &twopass->gf_group;
+ FIRSTPASS_STATS frame_stats;
+ int i;
+ int frame_index = 0;
+ int target_frame_size;
+ int key_frame;
+ const int max_bits = frame_max_bits(&cpi->rc, oxcf);
+ int64_t total_group_bits = gf_group_bits;
+ int mid_frame_idx;
+ int normal_frames;
+ int normal_frame_bits;
+ int last_frame_reduction = 0;
+ double av_score = 1.0;
+ double tot_norm_frame_score = 1.0;
+ double this_frame_score = 1.0;
+
+ // The GF group structure has already been defined; note its size.
+ int gop_frames = gf_group->gf_group_size;
+
+ key_frame = cpi->common.frame_type == KEY_FRAME;
+
+ // For key frames the frame target rate is already set and it
+ // is also the golden frame.
+ // === [frame_index == 0] ===
+ if (!key_frame) {
+ gf_group->bit_allocation[frame_index] =
+ rc->source_alt_ref_active ? 0 : gf_arf_bits;
+ }
+
+ // Deduct the boost bits for arf (or gf if it is not a key frame)
+ // from the group total.
+ if (rc->source_alt_ref_pending || !key_frame) total_group_bits -= gf_arf_bits;
+
+ ++frame_index;
+
+ // === [frame_index == 1] ===
+ // Store the bits to spend on the ARF if there is one.
+ if (rc->source_alt_ref_pending) {
+ gf_group->bit_allocation[frame_index] = gf_arf_bits;
+
+ ++frame_index;
+ }
+
+ // Define the index of the middle frame in the group; it later receives any
+ // bits trimmed from the last frame.
+ mid_frame_idx = frame_index + (rc->baseline_gf_interval >> 1) - 1;
+
+ normal_frames = (rc->baseline_gf_interval - 1);
+ if (normal_frames > 1)
+ normal_frame_bits = (int)(total_group_bits / normal_frames);
+ else
+ normal_frame_bits = (int)total_group_bits;
+
+ gf_group->gfu_boost[1] = rc->gfu_boost;
+
+ if (cpi->multi_layer_arf) {
+ int idx;
+ int arf_depth_bits[MAX_ARF_LAYERS] = { 0 };
+ int arf_depth_count[MAX_ARF_LAYERS] = { 0 };
+ int arf_depth_boost[MAX_ARF_LAYERS] = { 0 };
+ int total_arfs = 1; // Account for the base layer ARF.
+
+ for (idx = 0; idx < gop_frames; ++idx) {
+ if (gf_group->update_type[idx] == ARF_UPDATE) {
+ arf_depth_boost[gf_group->layer_depth[idx]] += gf_group->gfu_boost[idx];
+ ++arf_depth_count[gf_group->layer_depth[idx]];
+ }
+ }
+
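+ // For ARF layers 2 and above, carve out a share of the group bits in
+ // proportion to each layer's accumulated boost. The base layer ARF bits
+ // were already deducted via gf_arf_bits above.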
+ for (idx = 2; idx < MAX_ARF_LAYERS; ++idx) {
+ if (arf_depth_boost[idx] == 0) break;
+ arf_depth_bits[idx] = calculate_boost_bits(
+ rc->baseline_gf_interval - total_arfs - arf_depth_count[idx],
+ arf_depth_boost[idx], total_group_bits);
+
+ total_group_bits -= arf_depth_bits[idx];
+ total_arfs += arf_depth_count[idx];
+ }
+
+ // Exclude the extra ARF frames (beyond the base layer ARF) from the
+ // normal frame count.
+ normal_frames -= (total_arfs - 1);
+ if (normal_frames > 1)
+ normal_frame_bits = (int)(total_group_bits / normal_frames);
+ else
+ normal_frame_bits = (int)total_group_bits;
+
+ target_frame_size = normal_frame_bits;
+ target_frame_size =
+ clamp(target_frame_size, 0, VPXMIN(max_bits, (int)total_group_bits));
+
+ // The base layer ARF already has its bits assigned; allocate the rest here.
+ for (idx = frame_index; idx < gop_frames; ++idx) {
+ switch (gf_group->update_type[idx]) {
+ case ARF_UPDATE:
+ gf_group->bit_allocation[idx] =
+ (int)(((int64_t)arf_depth_bits[gf_group->layer_depth[idx]] *
+ gf_group->gfu_boost[idx]) /
+ arf_depth_boost[gf_group->layer_depth[idx]]);
+ break;
+ case USE_BUF_FRAME: gf_group->bit_allocation[idx] = 0; break;
+ default: gf_group->bit_allocation[idx] = target_frame_size; break;
+ }
+ }
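+ // Zero the allocation for the slot just past the last group frame (the
+ // overlay position) so it does not carry a stale value.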
+ gf_group->bit_allocation[idx] = 0;
+
+ return;
+ }
+
+ if (oxcf->vbr_corpus_complexity) {
+ av_score = get_distribution_av_err(cpi, twopass);
+ tot_norm_frame_score = calculate_group_score(cpi, av_score, normal_frames);
+ }
+
+ // Allocate bits to the other frames in the group.
+ for (i = 0; i < normal_frames; ++i) {
+ if (EOF == input_stats(twopass, &frame_stats)) break;
+ if (oxcf->vbr_corpus_complexity) {
+ this_frame_score = calculate_norm_frame_score(cpi, twopass, oxcf,
+ &frame_stats, av_score);
+ normal_frame_bits = (int)((double)total_group_bits *
+ (this_frame_score / tot_norm_frame_score));
+ }
+
+ target_frame_size = normal_frame_bits;
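+ // Trim the last normal frame in the group slightly; the saved bits are
+ // added back to the middle frame after this loop.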
+ if ((i == (normal_frames - 1)) && (i >= 1)) {
+ last_frame_reduction = normal_frame_bits / 16;
+ target_frame_size -= last_frame_reduction;
+ }
+
+ target_frame_size =
+ clamp(target_frame_size, 0, VPXMIN(max_bits, (int)total_group_bits));
+
+ gf_group->bit_allocation[frame_index] = target_frame_size;
+ ++frame_index;
+ }
+
+ // Add in some extra bits for the middle frame in the group.
+ gf_group->bit_allocation[mid_frame_idx] += last_frame_reduction;
+
+ // Note:
+ // We need to configure the frame at the end of the sequence + 1 that will be
+ // the start frame for the next group. Otherwise prior to the call to
+ // vp9_rc_get_second_pass_params() the data will be undefined.
+}
+
+// Adjusts the ARNR filter strength for a GF group.
+static void adjust_group_arnr_filter(VP9_COMP *cpi, double section_noise,
+ double section_inter,
+ double section_motion) {
+ TWO_PASS *const twopass = &cpi->twopass;
+ double section_zeromv = section_inter - section_motion;
+
+ twopass->arnr_strength_adjustment = 0;
+
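+ // Weaken the temporal filter for low noise sections and strengthen it for
+ // noisy or largely static (high zero motion) sections.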
+ if (section_noise < 150) {
+ twopass->arnr_strength_adjustment -= 1;
+ if (section_noise < 75) twopass->arnr_strength_adjustment -= 1;
+ } else if (section_noise > 250)
+ twopass->arnr_strength_adjustment += 1;
+
+ if (section_zeromv > 0.50) twopass->arnr_strength_adjustment += 1;
+}
+
+// Analyse and define a gf/arf group.
+#define ARF_ABS_ZOOM_THRESH 4.0
+
+#define MAX_GF_BOOST 5400
+
+typedef struct RANGE {
+ int min;
+ int max;
+} RANGE;
+
+/* get_gop_coding_frame_num() depends on several fields in RATE_CONTROL *rc as
+ * follows.
+ * Static fields:
+ * (The following fields will remain unchanged after initialization of the encoder.)
+ * rc->static_scene_max_gf_interval
+ * rc->min_gf_interval
+ * twopass->sr_diff_factor
+ * twopass->sr_default_decay_limit
+ * twopass->zm_factor
+ *
+ * Dynamic fields:
+ * (The following fields will be updated before or after coding each frame.)
+ * rc->frames_to_key
+ * rc->frames_since_key
+ * rc->source_alt_ref_active
+ *
+ * Special case: if CONFIG_RATE_CTRL is true, the external arf indexes will
+ * determine the arf position.
+ *
+ * TODO(angiebird): Separate the dynamic fields and static fields into two
+ * structs.
+ */
+static int get_gop_coding_frame_num(
+ int *use_alt_ref, const FRAME_INFO *frame_info,
+ const TWO_PASS *const twopass, const RATE_CONTROL *rc,
+ int gf_start_show_idx, const RANGE *active_gf_interval,
+ double gop_intra_factor, int lag_in_frames) {
+ const FIRST_PASS_INFO *first_pass_info = &twopass->first_pass_info;
+ double loop_decay_rate = 1.00;
+ double mv_ratio_accumulator = 0.0;
+ double this_frame_mv_in_out = 0.0;
+ double mv_in_out_accumulator = 0.0;
+ double abs_mv_in_out_accumulator = 0.0;
+ double sr_accumulator = 0.0;
+ // Motion breakout threshold for loop below depends on image size.
+ double mv_ratio_accumulator_thresh =
+ (frame_info->frame_height + frame_info->frame_width) / 4.0;
+ double zero_motion_accumulator = 1.0;
+ int gop_coding_frames;
+
+ *use_alt_ref = 1;
+ gop_coding_frames = 0;
+ while (gop_coding_frames < rc->static_scene_max_gf_interval &&
+ gop_coding_frames < rc->frames_to_key) {
+ const FIRSTPASS_STATS *next_next_frame;
+ const FIRSTPASS_STATS *next_frame;
+ int flash_detected;
+ ++gop_coding_frames;
+
+ next_frame = fps_get_frame_stats(first_pass_info,
+ gf_start_show_idx + gop_coding_frames);
+ if (next_frame == NULL) {
+ break;
+ }
+
+ // Test for the case where there is a brief flash but the prediction
+ // quality back to an earlier frame is then restored.
+ next_next_frame = fps_get_frame_stats(
+ first_pass_info, gf_start_show_idx + gop_coding_frames + 1);
+ flash_detected = detect_flash_from_frame_stats(next_next_frame);
+
+ // Update the motion related elements to the boost calculation.
+ accumulate_frame_motion_stats(
+ next_frame, &this_frame_mv_in_out, &mv_in_out_accumulator,
+ &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
+
+ // Monitor for static sections.
+ if ((rc->frames_since_key + gop_coding_frames - 1) > 1) {
+ zero_motion_accumulator = VPXMIN(
+ zero_motion_accumulator, get_zero_motion_factor(twopass, next_frame));
+ }
+
+ // Accumulate the effect of prediction quality decay.
+ if (!flash_detected) {
+ double last_loop_decay_rate = loop_decay_rate;
+ loop_decay_rate = get_prediction_decay_rate(twopass, next_frame);
+
+ // Break clause to detect very still sections after motion. For example,
+ // a static image after a fade or other transition.
+ if (gop_coding_frames > rc->min_gf_interval && loop_decay_rate >= 0.999 &&
+ last_loop_decay_rate < 0.9) {
+ int still_interval = 5;
+ if (check_transition_to_still(first_pass_info,
+ gf_start_show_idx + gop_coding_frames,
+ still_interval)) {
+ *use_alt_ref = 0;
+ break;
+ }
+ }
+
+ // Update the accumulator for second ref error difference.
+ // This is intended to give an indication of how much the coded error is
+ // increasing over time.
+ if (gop_coding_frames == 1) {
+ sr_accumulator += next_frame->coded_error;
+ } else {
+ sr_accumulator +=
+ (next_frame->sr_coded_error - next_frame->coded_error);
+ }
+ }
+
+ // Break out conditions.
+ // Break at maximum of active_gf_interval->max unless almost totally
+ // static.
+ //
+ // Note that the addition of a test of rc->source_alt_ref_active is
+ // deliberate. The effect of this is that after a normal altref group even
+ // if the material is static there will be one normal length GF group
+ // before allowing longer GF groups. The reason for this is that in cases
+ // such as slide shows where slides are separated by a complex transition
+ // such as a fade, the arf group spanning the transition may not be coded
+ // at a very high quality and hence this frame (with its overlay) is a
+ // poor golden frame to use for an extended group.
+ if ((gop_coding_frames >= active_gf_interval->max) &&
+ ((zero_motion_accumulator < 0.995) || (rc->source_alt_ref_active))) {
+ break;
+ }
+ if (
+ // Don't break out with a very short interval.
+ (gop_coding_frames >= active_gf_interval->min) &&
+ // If possible don't break very close to a kf
+ ((rc->frames_to_key - gop_coding_frames) >= rc->min_gf_interval) &&
+ (gop_coding_frames & 0x01) && (!flash_detected) &&
+ ((mv_ratio_accumulator > mv_ratio_accumulator_thresh) ||
+ (abs_mv_in_out_accumulator > ARF_ABS_ZOOM_THRESH) ||
+ (sr_accumulator > gop_intra_factor * next_frame->intra_error))) {
+ break;
+ }
+ }
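+ // Only allow an alt ref if the section is not almost entirely static, the
+ // group fits within the encoder's lag window, and it meets the minimum GF
+ // interval.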
+ *use_alt_ref &= zero_motion_accumulator < 0.995;
+ *use_alt_ref &= gop_coding_frames < lag_in_frames;
+ *use_alt_ref &= gop_coding_frames >= rc->min_gf_interval;
+ return gop_coding_frames;
+}
+
+static RANGE get_active_gf_inverval_range_simple(int min_gf_interval,
+ int arf_active_or_kf,
+ int frames_to_key) {
+ RANGE active_gf_interval;
+ active_gf_interval.min = min_gf_interval + arf_active_or_kf + 2;
+ active_gf_interval.max = 16 + arf_active_or_kf;
+
+ if ((active_gf_interval.max <= frames_to_key) &&
+ (active_gf_interval.max >= (frames_to_key - min_gf_interval))) {
+ active_gf_interval.min = frames_to_key / 2;
+ active_gf_interval.max = frames_to_key / 2;
+ }
+ return active_gf_interval;
+}
+
+static RANGE get_active_gf_inverval_range(
+ const FRAME_INFO *frame_info, const RATE_CONTROL *rc, int arf_active_or_kf,
+ int gf_start_show_idx, int active_worst_quality, int last_boosted_qindex) {
+ RANGE active_gf_interval;
+ int int_max_q = (int)(vp9_convert_qindex_to_q(active_worst_quality,
+ frame_info->bit_depth));
+ int q_term = (gf_start_show_idx == 0)
+ ? int_max_q / 32
+ : (int)(vp9_convert_qindex_to_q(last_boosted_qindex,
+ frame_info->bit_depth) /
+ 6);
+ active_gf_interval.min =
+ rc->min_gf_interval + arf_active_or_kf + VPXMIN(2, int_max_q / 200);
+ active_gf_interval.min =
+ VPXMIN(active_gf_interval.min, rc->max_gf_interval + arf_active_or_kf);
+
+ // The value chosen depends on the active Q range. At low Q we have
+ // bits to spare and are better with a smaller interval and smaller boost.
+ // At high Q when there are few bits to spare we are better with a longer
+ // interval to spread the cost of the GF.
+ active_gf_interval.max = 11 + arf_active_or_kf + VPXMIN(5, q_term);
+
+ // Force max GF interval to be odd.
+ active_gf_interval.max = active_gf_interval.max | 0x01;
+
+ // We have: active_gf_interval.min <=
+ // rc->max_gf_interval + arf_active_or_kf.
+ if (active_gf_interval.max < active_gf_interval.min) {
+ active_gf_interval.max = active_gf_interval.min;
+ } else {
+ active_gf_interval.max =
+ VPXMIN(active_gf_interval.max, rc->max_gf_interval + arf_active_or_kf);
+ }
+
+ // Would the active max drop us out just ahead of the next kf?
+ if ((active_gf_interval.max <= rc->frames_to_key) &&
+ (active_gf_interval.max >= (rc->frames_to_key - rc->min_gf_interval))) {
+ active_gf_interval.max = rc->frames_to_key / 2;
+ }
+ active_gf_interval.max =
+ VPXMAX(active_gf_interval.max, active_gf_interval.min);
+ return active_gf_interval;
+}
+
+static int get_arf_layers(int multi_layer_arf, int max_layers,
+ int coding_frame_num) {
+ assert(max_layers <= MAX_ARF_LAYERS);
+ if (multi_layer_arf) {
+ int layers = 0;
+ int i;
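+ // The number of layers is roughly log2(coding_frame_num) + 1, capped at
+ // max_layers.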
+ for (i = coding_frame_num; i > 0; i >>= 1) {
+ ++layers;
+ }
+ layers = VPXMIN(max_layers, layers);
+ return layers;
+ } else {
+ return 1;
+ }
+}
+
+static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) {
+ VP9_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ VP9EncoderConfig *const oxcf = &cpi->oxcf;
+ TWO_PASS *const twopass = &cpi->twopass;
+ const FRAME_INFO *frame_info = &cpi->frame_info;
+ const FIRST_PASS_INFO *first_pass_info = &twopass->first_pass_info;
+ const FIRSTPASS_STATS *const start_pos = twopass->stats_in;
+ int gop_coding_frames;
+
+ double gf_group_err = 0.0;
+ double gf_group_raw_error = 0.0;
+ double gf_group_noise = 0.0;
+ double gf_group_skip_pct = 0.0;
+ double gf_group_inactive_zone_rows = 0.0;
+ double gf_group_inter = 0.0;
+ double gf_group_motion = 0.0;
+
+ int allow_alt_ref = is_altref_enabled(cpi);
+ int use_alt_ref;
+
+ int64_t gf_group_bits;
+ int gf_arf_bits;
+ const int is_key_frame = frame_is_intra_only(cm);
+ // If this is a key frame or the overlay from a previous arf then
+ // the error score / cost of this frame has already been accounted for.
+ const int arf_active_or_kf = is_key_frame || rc->source_alt_ref_active;
+ int is_alt_ref_flash = 0;
+
+ double gop_intra_factor;
+ int gop_frames;
+ RANGE active_gf_interval;
+
+ // Reset the GF group data structures unless this is a key
+ // frame in which case it will already have been done.
+ if (is_key_frame == 0) {
+ vp9_zero(twopass->gf_group);
+ ++rc->gop_global_index;
+ } else {
+ rc->gop_global_index = 0;
+ }
+
+ vpx_clear_system_state();
+
+ if (oxcf->use_simple_encode_api) {
+ active_gf_interval = get_active_gf_inverval_range_simple(
+ rc->min_gf_interval, arf_active_or_kf, rc->frames_to_key);
+ } else {
+ active_gf_interval = get_active_gf_inverval_range(
+ frame_info, rc, arf_active_or_kf, gf_start_show_idx,
+ twopass->active_worst_quality, rc->last_boosted_qindex);
+ }
+
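+ // With a deeper ARF pyramid, allow more accumulated second ref error before
+ // breaking out of the group (see get_gop_coding_frame_num()).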
+ if (cpi->multi_layer_arf) {
+ int arf_layers = get_arf_layers(cpi->multi_layer_arf, oxcf->enable_auto_arf,
+ active_gf_interval.max);
+ gop_intra_factor = 1.0 + 0.25 * arf_layers;
+ } else {
+ gop_intra_factor = 1.0;
+ }
+
+ gop_coding_frames = get_gop_coding_frame_num(
+ &use_alt_ref, frame_info, twopass, rc, gf_start_show_idx,
+ &active_gf_interval, gop_intra_factor, cpi->oxcf.lag_in_frames);
+ use_alt_ref &= allow_alt_ref;
+#if CONFIG_RATE_CTRL
+ // If the external gop_command is on, we will override the decisions
+ // of gop_coding_frames and use_alt_ref.
+ if (cpi->oxcf.use_simple_encode_api) {
+ const GOP_COMMAND *gop_command = &cpi->encode_command.gop_command;
+ assert(allow_alt_ref == 1);
+ if (gop_command->use) {
+ gop_coding_frames = gop_command_coding_frame_count(gop_command);
+ use_alt_ref = gop_command->use_alt_ref;
+ }
+ }
+#endif
+ // If the external rate control model for GOP is used, the gop decisions
+ // are overwritten. Specifically, |gop_coding_frames| and |use_alt_ref|
+ // will be overwritten.
+ if (cpi->ext_ratectrl.ready &&
+ (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_GOP) != 0) {
+ vpx_codec_err_t codec_status;
+ vpx_rc_gop_decision_t gop_decision;
+ vpx_rc_gop_info_t gop_info;
+ gop_info.min_gf_interval = rc->min_gf_interval;
+ gop_info.max_gf_interval = rc->max_gf_interval;
+ gop_info.active_min_gf_interval = active_gf_interval.min;
+ gop_info.active_max_gf_interval = active_gf_interval.max;
+ gop_info.allow_alt_ref = allow_alt_ref;
+ gop_info.is_key_frame = is_key_frame;
+ gop_info.last_gop_use_alt_ref = rc->source_alt_ref_active;
+ gop_info.frames_since_key = rc->frames_since_key;
+ gop_info.frames_to_key = rc->frames_to_key;
+ gop_info.lag_in_frames = cpi->oxcf.lag_in_frames;
+ gop_info.show_index = cm->current_video_frame;
+ gop_info.coding_index = cm->current_frame_coding_index;
+ gop_info.gop_global_index = rc->gop_global_index;
+
+ codec_status = vp9_extrc_get_gop_decision(&cpi->ext_ratectrl, &gop_info,
+ &gop_decision);
+ if (codec_status != VPX_CODEC_OK) {
+ vpx_internal_error(&cm->error, codec_status,
+ "vp9_extrc_get_gop_decision() failed");
+ }
+ gop_coding_frames = gop_decision.gop_coding_frames;
+ use_alt_ref = gop_decision.use_alt_ref;
+ }
+
+ // Was the group length constrained by the requirement for a new KF?
+ rc->constrained_gf_group = (gop_coding_frames >= rc->frames_to_key) ? 1 : 0;
+
+ // Should we use the alternate reference frame.
+ if (use_alt_ref) {
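+ // f_frames / b_frames are the number of frames looking forward and backward
+ // from the ARF position when computing its boost.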
+ const int f_frames =
+ (rc->frames_to_key - gop_coding_frames >= gop_coding_frames - 1)
+ ? gop_coding_frames - 1
+ : VPXMAX(0, rc->frames_to_key - gop_coding_frames);
+ const int b_frames = gop_coding_frames - 1;
+ const int avg_inter_frame_qindex = rc->avg_frame_qindex[INTER_FRAME];
+ // TODO(angiebird): figure out why arf's location is assigned this way
+ const int arf_show_idx = VPXMIN(gf_start_show_idx + gop_coding_frames + 1,
+ fps_get_num_frames(first_pass_info));
+
+ // Calculate the boost for alt ref.
+ rc->gfu_boost =
+ compute_arf_boost(frame_info, twopass, arf_show_idx, f_frames, b_frames,
+ avg_inter_frame_qindex);
+ rc->source_alt_ref_pending = 1;
+ } else {
+ const int f_frames = gop_coding_frames - 1;
+ const int b_frames = 0;
+ const int avg_inter_frame_qindex = rc->avg_frame_qindex[INTER_FRAME];
+ // TODO(angiebird): figure out why arf's location is assigned this way
+ const int gld_show_idx =
+ VPXMIN(gf_start_show_idx + 1, fps_get_num_frames(first_pass_info));
+ const int arf_boost =
+ compute_arf_boost(frame_info, twopass, gld_show_idx, f_frames, b_frames,
+ avg_inter_frame_qindex);
+ rc->gfu_boost = VPXMIN((int)twopass->gf_max_total_boost, arf_boost);
+ rc->source_alt_ref_pending = 0;
+ }
+
+#define LAST_ALR_ACTIVE_BEST_QUALITY_ADJUSTMENT_FACTOR 0.2
+ rc->arf_active_best_quality_adjustment_factor = 1.0;
+ rc->arf_increase_active_best_quality = 0;
+
+ if (!is_lossless_requested(&cpi->oxcf)) {
+ if (rc->frames_since_key >= rc->frames_to_key) {
+ // Increase the active best quality in the second half of key frame
+ // interval.
+ rc->arf_active_best_quality_adjustment_factor =
+ LAST_ALR_ACTIVE_BEST_QUALITY_ADJUSTMENT_FACTOR +
+ (1.0 - LAST_ALR_ACTIVE_BEST_QUALITY_ADJUSTMENT_FACTOR) *
+ (rc->frames_to_key - gop_coding_frames) /
+ (VPXMAX(1, ((rc->frames_to_key + rc->frames_since_key) / 2 -
+ gop_coding_frames)));
+ rc->arf_increase_active_best_quality = 1;
+ } else if ((rc->frames_to_key - gop_coding_frames) > 0) {
+ // Reduce the active best quality in the first half of key frame interval.
+ rc->arf_active_best_quality_adjustment_factor =
+ LAST_ALR_ACTIVE_BEST_QUALITY_ADJUSTMENT_FACTOR +
+ (1.0 - LAST_ALR_ACTIVE_BEST_QUALITY_ADJUSTMENT_FACTOR) *
+ (rc->frames_since_key + gop_coding_frames) /
+ (VPXMAX(1, (rc->frames_to_key + rc->frames_since_key) / 2 +
+ gop_coding_frames));
+ rc->arf_increase_active_best_quality = -1;
+ }
+ }
+
+#ifdef AGGRESSIVE_VBR
+ // Limit maximum boost based on interval length.
+ rc->gfu_boost = VPXMIN((int)rc->gfu_boost, gop_coding_frames * 140);
+#else
+ rc->gfu_boost = VPXMIN((int)rc->gfu_boost, gop_coding_frames * 200);
+#endif
+
+ // Cap the ARF boost when perceptual quality AQ mode is enabled. This is
+ // designed to improve the perceptual quality of high value content and to
+ // make consistent quality across consecutive frames. It will hurt objective
+ // quality.
+ if (oxcf->aq_mode == PERCEPTUAL_AQ)
+ rc->gfu_boost = VPXMIN(rc->gfu_boost, MIN_ARF_GF_BOOST);
+
+ rc->baseline_gf_interval = gop_coding_frames - rc->source_alt_ref_pending;
+
+ if (rc->source_alt_ref_pending)
+ is_alt_ref_flash = detect_flash(twopass, rc->baseline_gf_interval);
+
+ {
+ const double av_err = get_distribution_av_err(cpi, twopass);
+ const double mean_mod_score = twopass->mean_mod_score;
+ // If the first frame is a key frame or the overlay from a previous arf then
+ // the error score / cost of this frame has already been accounted for.
+ int start_idx = arf_active_or_kf ? 1 : 0;
+ int j;
+ for (j = start_idx; j < gop_coding_frames; ++j) {
+ int show_idx = gf_start_show_idx + j;
+ const FIRSTPASS_STATS *frame_stats =
+ fps_get_frame_stats(first_pass_info, show_idx);
+ // Accumulate error score of frames in this gf group.
+ gf_group_err += calc_norm_frame_score(oxcf, frame_info, frame_stats,
+ mean_mod_score, av_err);
+ gf_group_raw_error += frame_stats->coded_error;
+ gf_group_noise += frame_stats->frame_noise_energy;
+ gf_group_skip_pct += frame_stats->intra_skip_pct;
+ gf_group_inactive_zone_rows += frame_stats->inactive_zone_rows;
+ gf_group_inter += frame_stats->pcnt_inter;
+ gf_group_motion += frame_stats->pcnt_motion;
+ }
+ }
+
+ // Calculate the bits to be allocated to the gf/arf group as a whole
+ gf_group_bits = calculate_total_gf_group_bits(cpi, gf_group_err);
+
+ gop_frames =
+ rc->baseline_gf_interval + rc->source_alt_ref_pending - arf_active_or_kf;
+
+ // Store the average noise level measured for the group
+ // TODO(any): Experiment with removal of else condition (gop_frames = 0) so
+ // that consumption of group noise energy is based on previous gf group
+ if (gop_frames > 0)
+ twopass->gf_group.group_noise_energy = (int)(gf_group_noise / gop_frames);
+ else
+ twopass->gf_group.group_noise_energy = 0;
+
+ // Calculate an estimate of the maxq needed for the group.
+ // We are more aggressive about correcting for sections
+ // where there could be significant overshoot than for easier
+ // sections where we do not wish to risk creating an overshoot
+ // of the allocated bit budget.
+ if ((cpi->oxcf.rc_mode != VPX_Q) && (rc->baseline_gf_interval > 1)) {
+ const int vbr_group_bits_per_frame = (int)(gf_group_bits / gop_frames);
+ const double group_av_err = gf_group_raw_error / gop_frames;
+ const double group_av_noise = gf_group_noise / gop_frames;
+ const double group_av_skip_pct = gf_group_skip_pct / gop_frames;
+ const double group_av_inactive_zone = ((gf_group_inactive_zone_rows * 2) /
+ (gop_frames * (double)cm->mb_rows));
+ int tmp_q = get_twopass_worst_quality(
+ cpi, group_av_err, (group_av_skip_pct + group_av_inactive_zone),
+ group_av_noise, vbr_group_bits_per_frame);
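+ // Blend the new estimate with the previous active worst quality, weighting
+ // the previous value by (active_wq_factor - 1).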
+ twopass->active_worst_quality =
+ (int)((tmp_q + (twopass->active_worst_quality *
+ (twopass->active_wq_factor - 1))) /
+ twopass->active_wq_factor);
+
+#if CONFIG_ALWAYS_ADJUST_BPM
+ // Reset rolling actual and target bits counters for ARF groups.
+ twopass->rolling_arf_group_target_bits = 0;
+ twopass->rolling_arf_group_actual_bits = 0;
+#endif
+ }
+
+ // Context Adjustment of ARNR filter strength
+ if (rc->baseline_gf_interval > 1) {
+ adjust_group_arnr_filter(cpi, (gf_group_noise / gop_frames),
+ (gf_group_inter / gop_frames),
+ (gf_group_motion / gop_frames));
+ } else {
+ twopass->arnr_strength_adjustment = 0;
+ }
+
+ // Calculate the extra bits to be used for boosted frame(s)
+ gf_arf_bits = calculate_boost_bits((rc->baseline_gf_interval - 1),
+ rc->gfu_boost, gf_group_bits);
+
+ // Adjust KF group bits and error remaining.
+ twopass->kf_group_error_left -= gf_group_err;
+
+ // Decide GOP structure.
+ define_gf_group_structure(cpi);
+
+ // Allocate bits to each of the frames in the GF group.
+ allocate_gf_group_bits(cpi, gf_group_bits, gf_arf_bits);
+
+ // Reset the file position.
+ reset_fpf_position(twopass, start_pos);
+
+ // Calculate a section intra ratio used in setting max loop filter.
+ twopass->section_intra_rating = calculate_section_intra_ratio(
+ start_pos, twopass->stats_in_end, rc->baseline_gf_interval);
+
+ if (oxcf->resize_mode == RESIZE_DYNAMIC) {
+ // Default to starting GF groups at normal frame size.
+ cpi->rc.next_frame_size_selector = UNSCALED;
+ }
+#if !CONFIG_ALWAYS_ADJUST_BPM
+ // Reset rolling actual and target bits counters for ARF groups.
+ twopass->rolling_arf_group_target_bits = 0;
+ twopass->rolling_arf_group_actual_bits = 0;
+#endif
+ rc->preserve_arf_as_gld = rc->preserve_next_arf_as_gld;
+ rc->preserve_next_arf_as_gld = 0;
+ // If the alt ref frame is a flash do not set preserve_next_arf_as_gld
+ if (!is_lossless_requested(&cpi->oxcf) && !cpi->use_svc &&
+ cpi->oxcf.aq_mode == NO_AQ && cpi->multi_layer_arf && !is_alt_ref_flash)
+ rc->preserve_next_arf_as_gld = 1;
+}
+
+// Intra / Inter threshold very low
+#define VERY_LOW_II 1.5
+// For clean slide transitions we expect a sharp single frame spike in error.
+#define ERROR_SPIKE 5.0
+
+// Slide show transition detection.
+// Tests for the case where there is very low error on either side of the
+// current frame
+// but much higher just for this frame. This can help detect key frames in
+// slide shows even where the slides are pictures of different sizes.
+// Also requires that intra and inter errors are very similar to help eliminate
+// harmful false positives.
+// It will not help if the transition is a fade or other multi-frame effect.
+static int slide_transition(const FIRSTPASS_STATS *this_frame,
+ const FIRSTPASS_STATS *last_frame,
+ const FIRSTPASS_STATS *next_frame) {
+ return (this_frame->intra_error < (this_frame->coded_error * VERY_LOW_II)) &&
+ (this_frame->coded_error > (last_frame->coded_error * ERROR_SPIKE)) &&
+ (this_frame->coded_error > (next_frame->coded_error * ERROR_SPIKE));
+}
+
+// This test looks for anomalous changes in the nature of the intra signal
+// related to the previous and next frame as an indicator for coding a key
+// frame. This test serves to detect some additional scene cuts,
+// especially in lowish motion and low contrast sections, that are missed
+// by the other tests.
+static int intra_step_transition(const FIRSTPASS_STATS *this_frame,
+ const FIRSTPASS_STATS *last_frame,
+ const FIRSTPASS_STATS *next_frame) {
+ double last_ii_ratio;
+ double this_ii_ratio;
+ double next_ii_ratio;
+ double last_pcnt_intra = 1.0 - last_frame->pcnt_inter;
+ double this_pcnt_intra = 1.0 - this_frame->pcnt_inter;
+ double next_pcnt_intra = 1.0 - next_frame->pcnt_inter;
+ double mod_this_intra = this_pcnt_intra + this_frame->pcnt_neutral;
+
+ // Calculate ii ratio for this frame last frame and next frame.
+ last_ii_ratio =
+ last_frame->intra_error / DOUBLE_DIVIDE_CHECK(last_frame->coded_error);
+ this_ii_ratio =
+ this_frame->intra_error / DOUBLE_DIVIDE_CHECK(this_frame->coded_error);
+ next_ii_ratio =
+ next_frame->intra_error / DOUBLE_DIVIDE_CHECK(next_frame->coded_error);
+
+ // Return true if the intra/inter ratio for the current frame is low but
+ // better in the next and previous frames, and the relative usage of intra
+ // in the current frame is markedly higher than in the last and next frames.
+ if ((this_ii_ratio < 2.0) && (last_ii_ratio > 2.25) &&
+ (next_ii_ratio > 2.25) && (this_pcnt_intra > (3 * last_pcnt_intra)) &&
+ (this_pcnt_intra > (3 * next_pcnt_intra)) &&
+ ((this_pcnt_intra > 0.075) || (mod_this_intra > 0.85))) {
+ return 1;
+ // Very low inter intra ratio (i.e. not much gain from inter coding), most
+ // blocks neutral on coding method and better inter prediction on either side
+ } else if ((this_ii_ratio < 1.25) && (mod_this_intra > 0.85) &&
+ (this_ii_ratio < last_ii_ratio * 0.9) &&
+ (this_ii_ratio < next_ii_ratio * 0.9)) {
+ return 1;
+ } else {
+ return 0;
+ }
+}
+
+// Minimum % intra coding observed in first pass (1.0 = 100%)
+#define MIN_INTRA_LEVEL 0.25
+// Threshold for use of the lagging second reference frame. Scene cuts do not
+// usually have a high second ref usage.
+#define SECOND_REF_USEAGE_THRESH 0.2
+// Hard threshold where the first pass chooses intra for almost all blocks.
+// In such a case even if the frame is not a scene cut coding a key frame
+// may be a good option.
+#define VERY_LOW_INTER_THRESH 0.05
+// Maximum threshold for the relative ratio of intra error score vs best
+// inter error score.
+#define KF_II_ERR_THRESHOLD 2.5
+#define KF_II_MAX 128.0
+#define II_FACTOR 12.5
+// Test for very low intra complexity which could cause false key frames
+#define V_LOW_INTRA 0.5
+
+static int test_candidate_kf(const FIRST_PASS_INFO *first_pass_info,
+ int show_idx) {
+ const FIRSTPASS_STATS *last_frame =
+ fps_get_frame_stats(first_pass_info, show_idx - 1);
+ const FIRSTPASS_STATS *this_frame =
+ fps_get_frame_stats(first_pass_info, show_idx);
+ const FIRSTPASS_STATS *next_frame =
+ fps_get_frame_stats(first_pass_info, show_idx + 1);
+ int is_viable_kf = 0;
+ double pcnt_intra = 1.0 - this_frame->pcnt_inter;
+
+ // Does the frame satisfy the primary criteria of a key frame?
+ // See above for an explanation of the test criteria.
+ // If so, then examine how well it predicts subsequent frames.
+ detect_flash_from_frame_stats(next_frame);
+ if (!detect_flash_from_frame_stats(this_frame) &&
+ !detect_flash_from_frame_stats(next_frame) &&
+ (this_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH) &&
+ ((this_frame->pcnt_inter < VERY_LOW_INTER_THRESH) ||
+ (slide_transition(this_frame, last_frame, next_frame)) ||
+ (intra_step_transition(this_frame, last_frame, next_frame)) ||
+ (((this_frame->coded_error > (next_frame->coded_error * 1.2)) &&
+ (this_frame->coded_error > (last_frame->coded_error * 1.2))) &&
+ (pcnt_intra > MIN_INTRA_LEVEL) &&
+ ((pcnt_intra + this_frame->pcnt_neutral) > 0.5) &&
+ ((this_frame->intra_error /
+ DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) <
+ KF_II_ERR_THRESHOLD)))) {
+ int i;
+ double boost_score = 0.0;
+ double old_boost_score = 0.0;
+ double decay_accumulator = 1.0;
+
+ // Examine how well the key frame predicts subsequent frames.
+ for (i = 0; i < 16; ++i) {
+ const FIRSTPASS_STATS *frame_stats =
+ fps_get_frame_stats(first_pass_info, show_idx + 1 + i);
+ double next_iiratio = (II_FACTOR * frame_stats->intra_error /
+ DOUBLE_DIVIDE_CHECK(frame_stats->coded_error));
+
+ if (next_iiratio > KF_II_MAX) next_iiratio = KF_II_MAX;
+
+ // Cumulative effect of decay in prediction quality.
+ if (frame_stats->pcnt_inter > 0.85)
+ decay_accumulator *= frame_stats->pcnt_inter;
+ else
+ decay_accumulator *= (0.85 + frame_stats->pcnt_inter) / 2.0;
+
+ // Keep a running total.
+ boost_score += (decay_accumulator * next_iiratio);
+
+ // Test various breakout clauses.
+ if ((frame_stats->pcnt_inter < 0.05) || (next_iiratio < 1.5) ||
+ (((frame_stats->pcnt_inter - frame_stats->pcnt_neutral) < 0.20) &&
+ (next_iiratio < 3.0)) ||
+ ((boost_score - old_boost_score) < 3.0) ||
+ (frame_stats->intra_error < V_LOW_INTRA)) {
+ break;
+ }
+
+ old_boost_score = boost_score;
+
+ // Stop if we have reached the last available frame.
+ if (show_idx + 1 + i == fps_get_num_frames(first_pass_info) - 1) break;
+ }
+
+ // If there is tolerable prediction for at least the next 3 frames then
+ // accept this as a viable key frame, else discard this candidate and move on.
+ if (boost_score > 30.0 && (i > 3)) {
+ is_viable_kf = 1;
+ } else {
+ is_viable_kf = 0;
+ }
+ }
+
+ return is_viable_kf;
+}
+
+#define FRAMES_TO_CHECK_DECAY 8
+#define MIN_KF_TOT_BOOST 300
+#define DEFAULT_SCAN_FRAMES_FOR_KF_BOOST 32
+#define MAX_SCAN_FRAMES_FOR_KF_BOOST 48
+#define MIN_SCAN_FRAMES_FOR_KF_BOOST 32
+#define KF_ABS_ZOOM_THRESH 6.0
+
+int vp9_get_frames_to_next_key(const VP9EncoderConfig *oxcf,
+ const TWO_PASS *const twopass, int kf_show_idx,
+ int min_gf_interval) {
+ const FIRST_PASS_INFO *first_pass_info = &twopass->first_pass_info;
+ double recent_loop_decay[FRAMES_TO_CHECK_DECAY];
+ int j;
+ int frames_to_key;
+ int max_frames_to_key = first_pass_info->num_frames - kf_show_idx;
+ max_frames_to_key = VPXMIN(max_frames_to_key, oxcf->key_freq);
+
+ // Initialize the decay rates for the recent frames to check
+ for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j) recent_loop_decay[j] = 1.0;
+ // Find the next keyframe.
+ if (!oxcf->auto_key) {
+ frames_to_key = max_frames_to_key;
+ } else {
+ frames_to_key = 1;
+ while (frames_to_key < max_frames_to_key) {
+ // Provided that we are not at the end of the file...
+ if (kf_show_idx + frames_to_key + 1 < first_pass_info->num_frames) {
+ double loop_decay_rate;
+ double decay_accumulator;
+ const FIRSTPASS_STATS *next_frame = fps_get_frame_stats(
+ first_pass_info, kf_show_idx + frames_to_key + 1);
+
+ // Check for a scene cut.
+ if (test_candidate_kf(first_pass_info, kf_show_idx + frames_to_key))
+ break;
+
+ // How fast is the prediction quality decaying?
+ loop_decay_rate = get_prediction_decay_rate(twopass, next_frame);
+
+ // We want to know something about the recent past... rather than
+ // as used elsewhere where we are concerned with decay in prediction
+ // quality since the last GF or KF.
+ recent_loop_decay[(frames_to_key - 1) % FRAMES_TO_CHECK_DECAY] =
+ loop_decay_rate;
+ decay_accumulator = 1.0;
+ for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j)
+ decay_accumulator *= recent_loop_decay[j];
+
+ // Special check for transition or high motion followed by a
+ // static scene.
+ if ((frames_to_key - 1) > min_gf_interval && loop_decay_rate >= 0.999 &&
+ decay_accumulator < 0.9) {
+ int still_interval = oxcf->key_freq - (frames_to_key - 1);
+ // TODO(angiebird): Figure out why we use "+1" here
+ int show_idx = kf_show_idx + frames_to_key;
+ if (check_transition_to_still(first_pass_info, show_idx,
+ still_interval)) {
+ break;
+ }
+ }
+ }
+ ++frames_to_key;
+ }
+ }
+ return frames_to_key;
+}
+
+static void find_next_key_frame(VP9_COMP *cpi, int kf_show_idx) {
+ int i;
+ RATE_CONTROL *const rc = &cpi->rc;
+ TWO_PASS *const twopass = &cpi->twopass;
+ GF_GROUP *const gf_group = &twopass->gf_group;
+ const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+ const FIRST_PASS_INFO *first_pass_info = &twopass->first_pass_info;
+ const FRAME_INFO *frame_info = &cpi->frame_info;
+ const FIRSTPASS_STATS *const start_position = twopass->stats_in;
+ const FIRSTPASS_STATS *keyframe_stats =
+ fps_get_frame_stats(first_pass_info, kf_show_idx);
+ FIRSTPASS_STATS next_frame;
+ int kf_bits = 0;
+ int64_t max_kf_bits;
+ double zero_motion_accumulator = 1.0;
+ double zero_motion_sum = 0.0;
+ double zero_motion_avg;
+ double motion_compensable_sum = 0.0;
+ double motion_compensable_avg;
+ int num_frames = 0;
+ int kf_boost_scan_frames = DEFAULT_SCAN_FRAMES_FOR_KF_BOOST;
+ double boost_score = 0.0;
+ double kf_mod_err = 0.0;
+ double kf_raw_err = 0.0;
+ double kf_group_err = 0.0;
+ double sr_accumulator = 0.0;
+ double abs_mv_in_out_accumulator = 0.0;
+ const double av_err = get_distribution_av_err(cpi, twopass);
+ const double mean_mod_score = twopass->mean_mod_score;
+ vp9_zero(next_frame);
+
+ cpi->common.frame_type = KEY_FRAME;
+ rc->frames_since_key = 0;
+
+ // Reset the GF group data structures.
+ vp9_zero(*gf_group);
+
+ // Is this a forced key frame by interval.
+ rc->this_key_frame_forced = rc->next_key_frame_forced;
+
+ // Clear the alt ref active flag and last group multi arf flags as they
+ // can never be set for a key frame.
+ rc->source_alt_ref_active = 0;
+
+ // KF is always a GF so clear frames till next gf counter.
+ rc->frames_till_gf_update_due = 0;
+
+ rc->frames_to_key = 1;
+
+ twopass->kf_group_bits = 0; // Total bits available to kf group
+ twopass->kf_group_error_left = 0.0; // Group modified error score.
+
+ kf_raw_err = keyframe_stats->intra_error;
+ kf_mod_err = calc_norm_frame_score(oxcf, frame_info, keyframe_stats,
+ mean_mod_score, av_err);
+
+ rc->frames_to_key = vp9_get_frames_to_next_key(oxcf, twopass, kf_show_idx,
+ rc->min_gf_interval);
+
+ // If there is a max kf interval set by the user we must obey it.
+ // We already break out of the loop above at 2x max.
+ // This code centers the extra kf if the actual natural interval
+ // is between 1x and 2x.
+ if (rc->frames_to_key >= cpi->oxcf.key_freq) {
+ rc->next_key_frame_forced = 1;
+ } else {
+ rc->next_key_frame_forced = 0;
+ }
+
+ for (i = 0; i < rc->frames_to_key; ++i) {
+ const FIRSTPASS_STATS *frame_stats =
+ fps_get_frame_stats(first_pass_info, kf_show_idx + i);
+ // Accumulate kf group error.
+ kf_group_err += calc_norm_frame_score(oxcf, frame_info, frame_stats,
+ mean_mod_score, av_err);
+ }
+
+ // Calculate the number of bits that should be assigned to the kf group.
+ if (twopass->bits_left > 0 && twopass->normalized_score_left > 0.0) {
+ // Maximum number of bits for a single normal frame (not key frame).
+ const int max_bits = frame_max_bits(rc, &cpi->oxcf);
+
+ // Maximum number of bits allocated to the key frame group.
+ int64_t max_grp_bits;
+
+ // Default allocation based on bits left and relative
+ // complexity of the section.
+ twopass->kf_group_bits = (int64_t)(
+ twopass->bits_left * (kf_group_err / twopass->normalized_score_left));
+
+ // Clip based on maximum per frame rate defined by the user.
+ max_grp_bits = (int64_t)max_bits * (int64_t)rc->frames_to_key;
+ if (twopass->kf_group_bits > max_grp_bits)
+ twopass->kf_group_bits = max_grp_bits;
+ } else {
+ twopass->kf_group_bits = 0;
+ }
+ twopass->kf_group_bits = VPXMAX(0, twopass->kf_group_bits);
+
+ // Scan through the kf group collating various stats used to determine
+ // how many bits to spend on it.
+ boost_score = 0.0;
+
+ for (i = 0; i < VPXMIN(MAX_SCAN_FRAMES_FOR_KF_BOOST, (rc->frames_to_key - 1));
+ ++i) {
+ if (EOF == input_stats(twopass, &next_frame)) break;
+
+ zero_motion_sum += next_frame.pcnt_inter - next_frame.pcnt_motion;
+ motion_compensable_sum +=
+ 1 - (double)next_frame.coded_error / next_frame.intra_error;
+ num_frames++;
+ }
+
+ if (num_frames >= MIN_SCAN_FRAMES_FOR_KF_BOOST) {
+ zero_motion_avg = zero_motion_sum / num_frames;
+ motion_compensable_avg = motion_compensable_sum / num_frames;
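+ // Scale the number of frames scanned for the kf boost with how static and
+ // how motion compensable the section is, then clamp the result to
+ // [MIN_SCAN_FRAMES_FOR_KF_BOOST, MAX_SCAN_FRAMES_FOR_KF_BOOST].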
+ kf_boost_scan_frames = (int)(VPXMAX(64 * zero_motion_avg - 16,
+ 160 * motion_compensable_avg - 112));
+ kf_boost_scan_frames =
+ VPXMAX(VPXMIN(kf_boost_scan_frames, MAX_SCAN_FRAMES_FOR_KF_BOOST),
+ MIN_SCAN_FRAMES_FOR_KF_BOOST);
+ }
+ reset_fpf_position(twopass, start_position);
+
+ for (i = 0; i < (rc->frames_to_key - 1); ++i) {
+ if (EOF == input_stats(twopass, &next_frame)) break;
+
+ // The zero motion test here ensures that if we mark a kf group as static
+ // it is static throughout, not just for the first kf_boost_scan_frames.
+ // It also allows for a larger boost on long static groups.
+ if ((i <= kf_boost_scan_frames) || (zero_motion_accumulator >= 0.99)) {
+ double frame_boost;
+ double zm_factor;
+
+ // Monitor for static sections.
+ // For the first frame in the kf group the second ref indicator is invalid.
+ if (i > 0) {
+ zero_motion_accumulator =
+ VPXMIN(zero_motion_accumulator,
+ get_zero_motion_factor(twopass, &next_frame));
+ } else {
+ zero_motion_accumulator =
+ next_frame.pcnt_inter - next_frame.pcnt_motion;
+ }
+
+ // Factor 0.75-1.25 based on how much of frame is static.
+ zm_factor = (0.75 + (zero_motion_accumulator / 2.0));
+
+ // The second (lagging) ref error is not valid immediately after
+ // a key frame because either the lag has not built up (in the case of
+ // the first key frame) or it points to a reference before the new key
+ // frame.
+ if (i < 2) sr_accumulator = 0.0;
+ frame_boost =
+ calc_kf_frame_boost(cpi, &next_frame, &sr_accumulator, 0, zm_factor);
+
+ boost_score += frame_boost;
+
+ // Measure of zoom. Large zoom tends to indicate reduced boost.
+ abs_mv_in_out_accumulator +=
+ fabs(next_frame.mv_in_out_count * next_frame.pcnt_motion);
+
+ if ((frame_boost < 25.00) ||
+ (abs_mv_in_out_accumulator > KF_ABS_ZOOM_THRESH) ||
+ (sr_accumulator > (kf_raw_err * 1.50)))
+ break;
+ } else {
+ break;
+ }
+ }
+
+ reset_fpf_position(twopass, start_position);
+
+ // Store the zero motion percentage
+ twopass->kf_zeromotion_pct = (int)(zero_motion_accumulator * 100.0);
+
+ // Calculate a section intra ratio used in setting max loop filter.
+ twopass->key_frame_section_intra_rating = calculate_section_intra_ratio(
+ start_position, twopass->stats_in_end, rc->frames_to_key);
+
+ // Special case for static / slide show content but don't apply
+ // if the kf group is very short.
+ if ((zero_motion_accumulator > 0.99) && (rc->frames_to_key > 8)) {
+ rc->kf_boost = (int)(twopass->kf_max_total_boost);
+ } else {
+ // Apply various clamps for min and max boost
+ rc->kf_boost = VPXMAX((int)boost_score, (rc->frames_to_key * 3));
+ rc->kf_boost = VPXMAX(rc->kf_boost, MIN_KF_TOT_BOOST);
+ rc->kf_boost = VPXMIN(rc->kf_boost, (int)(twopass->kf_max_total_boost));
+ }
+
+ // Work out how many bits to allocate for the key frame itself.
+ kf_bits = calculate_boost_bits((rc->frames_to_key - 1), rc->kf_boost,
+ twopass->kf_group_bits);
+ // Based on the spatial complexity, increase the bits allocated to the key frame.
+ kf_bits +=
+ (int)((twopass->kf_group_bits - kf_bits) * (kf_mod_err / kf_group_err));
+ max_kf_bits =
+ twopass->kf_group_bits - (rc->frames_to_key - 1) * FRAME_OVERHEAD_BITS;
+ max_kf_bits = lclamp(max_kf_bits, 0, INT_MAX);
+ kf_bits = VPXMIN(kf_bits, (int)max_kf_bits);
+
+ twopass->kf_group_bits -= kf_bits;
+
+ // Save the bits to spend on the key frame.
+ gf_group->bit_allocation[0] = kf_bits;
+ gf_group->update_type[0] = KF_UPDATE;
+ gf_group->rf_level[0] = KF_STD;
+ gf_group->layer_depth[0] = 0;
+
+ // Note the total error score of the kf group minus the key frame itself.
+ twopass->kf_group_error_left = (kf_group_err - kf_mod_err);
+
+ // Adjust the count of total modified error left.
+ // The count of bits left is adjusted elsewhere based on real coded frame
+ // sizes.
+ twopass->normalized_score_left -= kf_group_err;
+
+ if (oxcf->resize_mode == RESIZE_DYNAMIC) {
+ // Default to normal-sized frame on keyframes.
+ cpi->rc.next_frame_size_selector = UNSCALED;
+ }
+}
+
+// Configure image size specific vizier parameters.
+// Later these will be set via additional command line options
+void vp9_init_vizier_params(TWO_PASS *const twopass, int screen_area) {
+ // When |use_vizier_rc_params| is 1, we expect the rc parameters below to
+ // have been initialised on the command line as adjustment factors such
+ // that a factor of 1.0 will match the default behavior when
+ // |use_vizier_rc_params| is 0
+ if (twopass->use_vizier_rc_params) {
+ twopass->active_wq_factor *= AV_WQ_FACTOR;
+ twopass->err_per_mb *= BASELINE_ERR_PER_MB;
+ twopass->sr_default_decay_limit *= DEFAULT_DECAY_LIMIT;
+ if (twopass->sr_default_decay_limit > 1.0) // > 1.0 here makes no sense
+ twopass->sr_default_decay_limit = 1.0;
+ twopass->sr_diff_factor *= 1.0;
+ twopass->gf_frame_max_boost *= GF_MAX_FRAME_BOOST;
+ twopass->gf_max_total_boost *= MAX_GF_BOOST;
+ // NOTE: In use max boost has precedence over min boost. So even if min is
+ // somehow set higher than max the final boost value will be clamped to the
+ // appropriate maximum.
+ twopass->kf_frame_min_boost *= KF_MIN_FRAME_BOOST;
+ twopass->kf_frame_max_boost_first *= KF_MAX_FRAME_BOOST;
+ twopass->kf_frame_max_boost_subs *= KF_MAX_FRAME_BOOST;
+ twopass->kf_max_total_boost *= MAX_KF_TOT_BOOST;
+ twopass->zm_factor *= DEFAULT_ZM_FACTOR;
+ if (twopass->zm_factor > 1.0) // > 1.0 here makes no sense
+ twopass->zm_factor = 1.0;
+
+ // Correction for the fact that the kf_err_per_mb_factor default is
+ // already different for different video formats and ensures that a passed
+ // in value of 1.0 on the vizier command line will still match the current
+ // default.
+ if (screen_area < 1280 * 720) {
+ twopass->kf_err_per_mb *= 2000.0;
+ } else if (screen_area < 1920 * 1080) {
+ twopass->kf_err_per_mb *= 500.0;
+ } else {
+ twopass->kf_err_per_mb *= 250.0;
+ }
+ } else {
+ // When |use_vizier_rc_params| is 0, use defaults.
+ twopass->active_wq_factor = AV_WQ_FACTOR;
+ twopass->err_per_mb = BASELINE_ERR_PER_MB;
+ twopass->sr_default_decay_limit = DEFAULT_DECAY_LIMIT;
+ twopass->sr_diff_factor = 1.0;
+ twopass->gf_frame_max_boost = GF_MAX_FRAME_BOOST;
+ twopass->gf_max_total_boost = MAX_GF_BOOST;
+ twopass->kf_frame_min_boost = KF_MIN_FRAME_BOOST;
+ twopass->kf_frame_max_boost_first = KF_MAX_FRAME_BOOST;
+ twopass->kf_frame_max_boost_subs = KF_MAX_FRAME_BOOST;
+ twopass->kf_max_total_boost = MAX_KF_TOT_BOOST;
+ twopass->zm_factor = DEFAULT_ZM_FACTOR;
+
+ if (screen_area < 1280 * 720) {
+ twopass->kf_err_per_mb = 2000.0;
+ } else if (screen_area < 1920 * 1080) {
+ twopass->kf_err_per_mb = 500.0;
+ } else {
+ twopass->kf_err_per_mb = 250.0;
+ }
+ }
+}
+
+void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ TWO_PASS *const twopass = &cpi->twopass;
+ GF_GROUP *const gf_group = &twopass->gf_group;
+ FIRSTPASS_STATS this_frame;
+ const int show_idx = cm->current_video_frame;
+
+ if (cpi->common.current_frame_coding_index == 0) {
+ const vpx_codec_err_t codec_status = vp9_extrc_send_firstpass_stats(
+ &cpi->ext_ratectrl, &cpi->twopass.first_pass_info);
+ if (codec_status != VPX_CODEC_OK) {
+ vpx_internal_error(&cm->error, codec_status,
+ "vp9_extrc_send_firstpass_stats() failed");
+ }
+ }
+
+ if (!twopass->stats_in) return;
+
+ // Configure image size specific vizier parameters
+ if (cm->current_video_frame == 0) {
+ unsigned int screen_area = (cm->width * cm->height);
+
+ vp9_init_vizier_params(twopass, screen_area);
+ }
+
+ // If this is an arf frame then we don't want to read the stats file or
+ // advance the input pointer as we already have what we need.
+ if (gf_group->update_type[gf_group->index] == ARF_UPDATE) {
+ int target_rate;
+
+ vp9_zero(this_frame);
+ this_frame =
+ cpi->twopass.stats_in_start[cm->current_video_frame +
+ gf_group->arf_src_offset[gf_group->index]];
+
+ vp9_configure_buffer_updates(cpi, gf_group->index);
+
+ target_rate = gf_group->bit_allocation[gf_group->index];
+ target_rate = vp9_rc_clamp_pframe_target_size(cpi, target_rate);
+ rc->base_frame_target = target_rate;
+
+ cm->frame_type = INTER_FRAME;
+
+ // The multiplication by 256 reverses a scaling factor of (>> 8)
+ // applied when combining MB error values for the frame.
+ twopass->mb_av_energy = log((this_frame.intra_error * 256.0) + 1.0);
+ twopass->mb_smooth_pct = this_frame.intra_smooth_pct;
+
+ return;
+ }
+
+ vpx_clear_system_state();
+
+ if (cpi->oxcf.rc_mode == VPX_Q) {
+ twopass->active_worst_quality = cpi->oxcf.cq_level;
+ } else if (cm->current_video_frame == 0) {
+ const int frames_left =
+ (int)(twopass->total_stats.count - cm->current_video_frame);
+ // Special case code for first frame.
+ const int section_target_bandwidth =
+ (int)(twopass->bits_left / frames_left);
+ const double section_length = twopass->total_left_stats.count;
+ const double section_error =
+ twopass->total_left_stats.coded_error / section_length;
+ const double section_intra_skip =
+ twopass->total_left_stats.intra_skip_pct / section_length;
+ const double section_inactive_zone =
+ (twopass->total_left_stats.inactive_zone_rows * 2) /
+ ((double)cm->mb_rows * section_length);
+ const double section_noise =
+ twopass->total_left_stats.frame_noise_energy / section_length;
+ int tmp_q;
+
+ tmp_q = get_twopass_worst_quality(
+ cpi, section_error, section_intra_skip + section_inactive_zone,
+ section_noise, section_target_bandwidth);
+
+ twopass->active_worst_quality = tmp_q;
+ twopass->baseline_active_worst_quality = tmp_q;
+ rc->ni_av_qi = tmp_q;
+ rc->last_q[INTER_FRAME] = tmp_q;
+ rc->avg_q = vp9_convert_qindex_to_q(tmp_q, cm->bit_depth);
+ rc->avg_frame_qindex[INTER_FRAME] = tmp_q;
+ rc->last_q[KEY_FRAME] = (tmp_q + cpi->oxcf.best_allowed_q) / 2;
+ rc->avg_frame_qindex[KEY_FRAME] = rc->last_q[KEY_FRAME];
+ }
+ vp9_zero(this_frame);
+ if (EOF == input_stats(twopass, &this_frame)) return;
+
+ // Set the frame content type flag.
+ if (this_frame.intra_skip_pct >= FC_ANIMATION_THRESH)
+ twopass->fr_content_type = FC_GRAPHICS_ANIMATION;
+ else
+ twopass->fr_content_type = FC_NORMAL;
+
+ // Keyframe and section processing.
+ if (rc->frames_to_key == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY)) {
+ // Define next KF group and assign bits to it.
+ find_next_key_frame(cpi, show_idx);
+ } else {
+ cm->frame_type = INTER_FRAME;
+ }
+
+ // Define a new GF/ARF group. (Should always enter here for key frames).
+ if (rc->frames_till_gf_update_due == 0) {
+ define_gf_group(cpi, show_idx);
+
+ rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+
+#if ARF_STATS_OUTPUT
+ {
+ FILE *fpfile;
+ fpfile = fopen("arf.stt", "a");
+ ++arf_count;
+ fprintf(fpfile, "%10d %10ld %10d %10d %10ld %10ld\n",
+ cm->current_video_frame, rc->frames_till_gf_update_due,
+ rc->kf_boost, arf_count, rc->gfu_boost, cm->frame_type);
+
+ fclose(fpfile);
+ }
+#endif
+ }
+
+ vp9_configure_buffer_updates(cpi, gf_group->index);
+
+ rc->base_frame_target = gf_group->bit_allocation[gf_group->index];
+
+ // The multiplication by 256 reverses a scaling factor of (>> 8)
+ // applied when combining MB error values for the frame.
+ twopass->mb_av_energy = log((this_frame.intra_error * 256.0) + 1.0);
+ twopass->mb_smooth_pct = this_frame.intra_smooth_pct;
+
+ // Update the total stats remaining structure.
+ subtract_stats(&twopass->total_left_stats, &this_frame);
+}
+
+void vp9_twopass_postencode_update(VP9_COMP *cpi) {
+ TWO_PASS *const twopass = &cpi->twopass;
+ RATE_CONTROL *const rc = &cpi->rc;
+ VP9_COMMON *const cm = &cpi->common;
+ const int bits_used = rc->base_frame_target;
+
+ // VBR correction is done through rc->vbr_bits_off_target. Based on the
+ // sign of this value, a limited % adjustment is made to the target rate
+ // of subsequent frames, to try and push it back towards 0. This method
+ // is designed to prevent extreme behaviour at the end of a clip
+ // or group of frames.
+ rc->vbr_bits_off_target += rc->base_frame_target - rc->projected_frame_size;
+ twopass->bits_left = VPXMAX(twopass->bits_left - bits_used, 0);
+
+ // Target vs actual bits for this arf group.
+ twopass->rolling_arf_group_target_bits += rc->this_frame_target;
+ twopass->rolling_arf_group_actual_bits += rc->projected_frame_size;
+
+ // Calculate the pct rc error.
+ if (rc->total_actual_bits) {
+ rc->rate_error_estimate =
+ (int)((rc->vbr_bits_off_target * 100) / rc->total_actual_bits);
+ rc->rate_error_estimate = clamp(rc->rate_error_estimate, -100, 100);
+ } else {
+ rc->rate_error_estimate = 0;
+ }
+
+ if (cpi->common.frame_type != KEY_FRAME) {
+ twopass->kf_group_bits -= bits_used;
+ twopass->last_kfgroup_zeromotion_pct = twopass->kf_zeromotion_pct;
+ }
+ twopass->kf_group_bits = VPXMAX(twopass->kf_group_bits, 0);
+
+ // Increment the gf group index ready for the next frame.
+ ++twopass->gf_group.index;
+
+ // If the rate control is drifting consider adjustment to min or maxq.
+ if ((cpi->oxcf.rc_mode != VPX_Q) && !cpi->rc.is_src_frame_alt_ref) {
+ const int maxq_adj_limit =
+ rc->worst_quality - twopass->active_worst_quality;
+ const int minq_adj_limit =
+ (cpi->oxcf.rc_mode == VPX_CQ ? MINQ_ADJ_LIMIT_CQ : MINQ_ADJ_LIMIT);
+ int aq_extend_min = 0;
+ int aq_extend_max = 0;
+
+ // Extend min or max Q range to account for imbalance from the base
+ // value when using AQ.
+ if (cpi->oxcf.aq_mode != NO_AQ && cpi->oxcf.aq_mode != PSNR_AQ &&
+ cpi->oxcf.aq_mode != PERCEPTUAL_AQ) {
+ if (cm->seg.aq_av_offset < 0) {
+ // The balance of the AQ map tends towards lowering the average Q.
+ aq_extend_min = 0;
+ aq_extend_max = VPXMIN(maxq_adj_limit, -cm->seg.aq_av_offset);
+ } else {
+ // The balance of the AQ map tends towards raising the average Q.
+ aq_extend_min = VPXMIN(minq_adj_limit, cm->seg.aq_av_offset);
+ aq_extend_max = 0;
+ }
+ }
+
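+ // Nudge the min / max Q extension depending on whether the rate control is
+ // persistently undershooting or overshooting its target.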
+ // Undershoot.
+ if (rc->rate_error_estimate > cpi->oxcf.under_shoot_pct) {
+ --twopass->extend_maxq;
+ if (rc->rolling_target_bits >= rc->rolling_actual_bits)
+ ++twopass->extend_minq;
+ // Overshoot.
+ } else if (rc->rate_error_estimate < -cpi->oxcf.over_shoot_pct) {
+ --twopass->extend_minq;
+ if (rc->rolling_target_bits < rc->rolling_actual_bits)
+ ++twopass->extend_maxq;
+ } else {
+ // Adjustment for extreme local overshoot.
+ if (rc->projected_frame_size > (2 * rc->base_frame_target) &&
+ rc->projected_frame_size > (2 * rc->avg_frame_bandwidth))
+ ++twopass->extend_maxq;
+
+ // Unwind undershoot or overshoot adjustment.
+ if (rc->rolling_target_bits < rc->rolling_actual_bits)
+ --twopass->extend_minq;
+ else if (rc->rolling_target_bits > rc->rolling_actual_bits)
+ --twopass->extend_maxq;
+ }
+
+ twopass->extend_minq =
+ clamp(twopass->extend_minq, aq_extend_min, minq_adj_limit);
+ twopass->extend_maxq =
+ clamp(twopass->extend_maxq, aq_extend_max, maxq_adj_limit);
+
+ // If there is a big and unexpected undershoot then feed the extra
+ // bits back in quickly. One situation where this may happen is if a
+ // frame is unexpectedly almost perfectly predicted by the ARF or GF
+ // but not very well predicted by the previous frame.
+ if (!frame_is_kf_gf_arf(cpi) && !cpi->rc.is_src_frame_alt_ref) {
+ int fast_extra_thresh = rc->base_frame_target / HIGH_UNDERSHOOT_RATIO;
+ if (rc->projected_frame_size < fast_extra_thresh) {
+ rc->vbr_bits_off_target_fast +=
+ fast_extra_thresh - rc->projected_frame_size;
+ rc->vbr_bits_off_target_fast =
+ VPXMIN(rc->vbr_bits_off_target_fast, (4 * rc->avg_frame_bandwidth));
+
+ // Fast adaptation of minQ if necessary to use up the extra bits.
+ if (rc->avg_frame_bandwidth) {
+ twopass->extend_minq_fast =
+ (int)(rc->vbr_bits_off_target_fast * 8 / rc->avg_frame_bandwidth);
+ }
+ twopass->extend_minq_fast = VPXMIN(
+ twopass->extend_minq_fast, minq_adj_limit - twopass->extend_minq);
+ } else if (rc->vbr_bits_off_target_fast) {
+ twopass->extend_minq_fast = VPXMIN(
+ twopass->extend_minq_fast, minq_adj_limit - twopass->extend_minq);
+ } else {
+ twopass->extend_minq_fast = 0;
+ }
+ }
+ }
+}
+
+#if CONFIG_RATE_CTRL
+void vp9_get_next_group_of_picture(const VP9_COMP *cpi, int *first_is_key_frame,
+ int *use_alt_ref, int *coding_frame_count,
+ int *first_show_idx,
+ int *last_gop_use_alt_ref) {
+ const GOP_COMMAND *gop_command = &cpi->encode_command.gop_command;
+ // We make a copy of rc here because we want to get information from the
+ // encoder without changing its state.
+ // TODO(angiebird): Avoid copying rc here.
+ RATE_CONTROL rc = cpi->rc;
+ const int multi_layer_arf = 0;
+ const int allow_alt_ref = 1;
+ // We assume that current_video_frame is updated to the show index of the
+ // frame we are about to encode. Note that current_video_frame is updated at
+ // the end of encode_frame_to_data_rate().
+ // TODO(angiebird): Avoid this kind of fragile style.
+ *first_show_idx = cpi->common.current_video_frame;
+ *last_gop_use_alt_ref = rc.source_alt_ref_active;
+
+ *first_is_key_frame = 0;
+ if (rc.frames_to_key == 0) {
+ rc.frames_to_key = vp9_get_frames_to_next_key(
+ &cpi->oxcf, &cpi->twopass, *first_show_idx, rc.min_gf_interval);
+ rc.frames_since_key = 0;
+ *first_is_key_frame = 1;
+ }
+
+ if (gop_command->use) {
+ *coding_frame_count = gop_command_coding_frame_count(gop_command);
+ *use_alt_ref = gop_command->use_alt_ref;
+ assert(gop_command->show_frame_count <= rc.frames_to_key);
+ } else {
+ *coding_frame_count = vp9_get_gop_coding_frame_count(
+ &cpi->oxcf, &cpi->twopass, &cpi->frame_info, &rc, *first_show_idx,
+ multi_layer_arf, allow_alt_ref, *first_is_key_frame,
+ *last_gop_use_alt_ref, use_alt_ref);
+ }
+}
+
+int vp9_get_gop_coding_frame_count(const VP9EncoderConfig *oxcf,
+ const TWO_PASS *const twopass,
+ const FRAME_INFO *frame_info,
+ const RATE_CONTROL *rc, int show_idx,
+ int multi_layer_arf, int allow_alt_ref,
+ int first_is_key_frame,
+ int last_gop_use_alt_ref, int *use_alt_ref) {
+ int frame_count;
+ double gop_intra_factor;
+ const int arf_active_or_kf = last_gop_use_alt_ref || first_is_key_frame;
+ RANGE active_gf_interval;
+ int arf_layers;
+ if (oxcf->use_simple_encode_api) {
+ active_gf_interval = get_active_gf_inverval_range_simple(
+ rc->min_gf_interval, arf_active_or_kf, rc->frames_to_key);
+ } else {
+ active_gf_interval = get_active_gf_inverval_range(
+ frame_info, rc, arf_active_or_kf, show_idx, /*active_worst_quality=*/0,
+ /*last_boosted_qindex=*/0);
+ }
+
+ arf_layers = get_arf_layers(multi_layer_arf, oxcf->enable_auto_arf,
+ active_gf_interval.max);
+ if (multi_layer_arf) {
+ gop_intra_factor = 1.0 + 0.25 * arf_layers;
+ } else {
+ gop_intra_factor = 1.0;
+ }
+
+ frame_count = get_gop_coding_frame_num(use_alt_ref, frame_info, twopass, rc,
+ show_idx, &active_gf_interval,
+ gop_intra_factor, oxcf->lag_in_frames);
+ *use_alt_ref &= allow_alt_ref;
+ return frame_count;
+}
+
+// Under CONFIG_RATE_CTRL, once the first_pass_info is ready, the number of
+// coding frames (including show frames and alt ref frames) can be determined.
+int vp9_get_coding_frame_num(const VP9EncoderConfig *oxcf,
+ const TWO_PASS *const twopass,
+ const FRAME_INFO *frame_info, int multi_layer_arf,
+ int allow_alt_ref) {
+ const FIRST_PASS_INFO *first_pass_info = &twopass->first_pass_info;
+ int coding_frame_num = 0;
+ RATE_CONTROL rc;
+ int gop_coding_frame_count;
+ int gop_show_frames;
+ int show_idx = 0;
+ int last_gop_use_alt_ref = 0;
+ vp9_rc_init(oxcf, 1, &rc);
+
+ while (show_idx < first_pass_info->num_frames) {
+ int use_alt_ref;
+ int first_is_key_frame = 0;
+ if (rc.frames_to_key == 0) {
+ rc.frames_to_key = vp9_get_frames_to_next_key(oxcf, twopass, show_idx,
+ rc.min_gf_interval);
+ rc.frames_since_key = 0;
+ first_is_key_frame = 1;
+ }
+
+ gop_coding_frame_count = vp9_get_gop_coding_frame_count(
+ oxcf, twopass, frame_info, &rc, show_idx, multi_layer_arf,
+ allow_alt_ref, first_is_key_frame, last_gop_use_alt_ref, &use_alt_ref);
+
+ rc.source_alt_ref_active = use_alt_ref;
+ last_gop_use_alt_ref = use_alt_ref;
+ gop_show_frames = gop_coding_frame_count - use_alt_ref;
+ rc.frames_to_key -= gop_show_frames;
+ rc.frames_since_key += gop_show_frames;
+ show_idx += gop_show_frames;
+ coding_frame_num += gop_show_frames + use_alt_ref;
+ }
+ return coding_frame_num;
+}
+
+void vp9_get_key_frame_map(const VP9EncoderConfig *oxcf,
+ const TWO_PASS *const twopass, int *key_frame_map) {
+ const FIRST_PASS_INFO *first_pass_info = &twopass->first_pass_info;
+ int show_idx = 0;
+ RATE_CONTROL rc;
+ vp9_rc_init(oxcf, 1, &rc);
+
+ // key_frame_map points to an int array with size equal to
+ // first_pass_info->num_frames, which is also the number of show frames in the
+ // video.
+ memset(key_frame_map, 0,
+ sizeof(*key_frame_map) * first_pass_info->num_frames);
+ while (show_idx < first_pass_info->num_frames) {
+ int key_frame_group_size;
+ key_frame_map[show_idx] = 1;
+ key_frame_group_size =
+ vp9_get_frames_to_next_key(oxcf, twopass, show_idx, rc.min_gf_interval);
+ assert(key_frame_group_size > 0);
+ show_idx += key_frame_group_size;
+ }
+ assert(show_idx == first_pass_info->num_frames);
+}
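+
+// Illustrative usage sketch (not part of upstream libvpx): a caller that has
+// first pass stats available could build the key frame map roughly like this,
+// where `oxcf` and `twopass` are assumed to be already populated:
+//
+//   const int num_show_frames = twopass->first_pass_info.num_frames;
+//   int *key_frame_map =
+//       (int *)vpx_calloc(num_show_frames, sizeof(*key_frame_map));
+//   if (key_frame_map != NULL) {
+//     vp9_get_key_frame_map(oxcf, twopass, key_frame_map);
+//     // key_frame_map[i] is 1 iff show frame i starts a key frame group.
+//     vpx_free(key_frame_map);
+//   }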
+#endif // CONFIG_RATE_CTRL
+
+FIRSTPASS_STATS vp9_get_frame_stats(const TWO_PASS *twopass) {
+ return twopass->this_frame_stats;
+}
+FIRSTPASS_STATS vp9_get_total_stats(const TWO_PASS *twopass) {
+ return twopass->total_stats;
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_firstpass.h b/media/libvpx/libvpx/vp9/encoder/vp9_firstpass.h
new file mode 100644
index 0000000000..cdcf568723
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_firstpass.h
@@ -0,0 +1,301 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_VP9_FIRSTPASS_H_
+#define VPX_VP9_ENCODER_VP9_FIRSTPASS_H_
+
+#include <assert.h>
+
+#include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/encoder/vp9_lookahead.h"
+#include "vp9/encoder/vp9_ratectrl.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define INVALID_ROW (-1)
+
+#define MAX_ARF_LAYERS 6
+#define SECTION_NOISE_DEF 250.0
+
+typedef struct {
+ double frame_mb_intra_factor;
+ double frame_mb_brightness_factor;
+ double frame_mb_neutral_count;
+} FP_MB_FLOAT_STATS;
+
+typedef struct {
+ double intra_factor;
+ double brightness_factor;
+ int64_t coded_error;
+ int64_t sr_coded_error;
+ int64_t frame_noise_energy;
+ int64_t intra_error;
+ int intercount;
+ int second_ref_count;
+ double neutral_count;
+ double intra_count_low; // Coded intra but low variance
+ double intra_count_high; // Coded intra high variance
+ int intra_skip_count;
+ int image_data_start_row;
+ int mvcount;
+ int sum_mvr;
+ int sum_mvr_abs;
+ int sum_mvc;
+ int sum_mvc_abs;
+ int64_t sum_mvrs;
+ int64_t sum_mvcs;
+ int sum_in_vectors;
+ int intra_smooth_count;
+} FIRSTPASS_DATA;
+
+typedef struct {
+ double frame;
+ double weight;
+ double intra_error;
+ double coded_error;
+ double sr_coded_error;
+ double frame_noise_energy;
+ double pcnt_inter;
+ double pcnt_motion;
+ double pcnt_second_ref;
+ double pcnt_neutral;
+ double pcnt_intra_low; // Coded intra but low variance
+ double pcnt_intra_high; // Coded intra high variance
+ double intra_skip_pct;
+ double intra_smooth_pct; // % of blocks that are smooth
+ double inactive_zone_rows; // Image mask rows top and bottom.
+ double inactive_zone_cols; // Image mask columns at left and right edges.
+ double MVr;
+ double mvr_abs;
+ double MVc;
+ double mvc_abs;
+ double MVrv;
+ double MVcv;
+ double mv_in_out_count;
+ double duration;
+ double count;
+ int64_t spatial_layer_id;
+} FIRSTPASS_STATS;
+
+typedef enum {
+ KF_UPDATE = 0,
+ LF_UPDATE = 1,
+ GF_UPDATE = 2,
+ ARF_UPDATE = 3,
+ OVERLAY_UPDATE = 4,
+ MID_OVERLAY_UPDATE = 5,
+ USE_BUF_FRAME = 6, // Use show existing frame, no ref buffer update
+ FRAME_UPDATE_TYPES = 7
+} FRAME_UPDATE_TYPE;
+
+#define FC_ANIMATION_THRESH 0.15
+typedef enum {
+ FC_NORMAL = 0,
+ FC_GRAPHICS_ANIMATION = 1,
+ FRAME_CONTENT_TYPES = 2
+} FRAME_CONTENT_TYPE;
+
+typedef struct {
+ unsigned char index;
+ RATE_FACTOR_LEVEL rf_level[MAX_STATIC_GF_GROUP_LENGTH + 2];
+ FRAME_UPDATE_TYPE update_type[MAX_STATIC_GF_GROUP_LENGTH + 2];
+ unsigned char arf_src_offset[MAX_STATIC_GF_GROUP_LENGTH + 2];
+ unsigned char layer_depth[MAX_STATIC_GF_GROUP_LENGTH + 2];
+ unsigned char frame_gop_index[MAX_STATIC_GF_GROUP_LENGTH + 2];
+ int bit_allocation[MAX_STATIC_GF_GROUP_LENGTH + 2];
+ int gfu_boost[MAX_STATIC_GF_GROUP_LENGTH + 2];
+
+ int frame_start;
+ int frame_end;
+ // TODO(jingning): The array size of arf_stack could be reduced.
+ int arf_index_stack[MAX_LAG_BUFFERS * 2];
+ int top_arf_idx;
+ int stack_size;
+ int gf_group_size;
+ int max_layer_depth;
+ int allowed_max_layer_depth;
+ int group_noise_energy;
+} GF_GROUP;
+
+typedef struct {
+ const FIRSTPASS_STATS *stats;
+ int num_frames;
+} FIRST_PASS_INFO;
+
+static INLINE void fps_init_first_pass_info(FIRST_PASS_INFO *first_pass_info,
+ const FIRSTPASS_STATS *stats,
+ int num_frames) {
+ first_pass_info->stats = stats;
+ first_pass_info->num_frames = num_frames;
+}
+
+static INLINE int fps_get_num_frames(const FIRST_PASS_INFO *first_pass_info) {
+ return first_pass_info->num_frames;
+}
+
+static INLINE const FIRSTPASS_STATS *fps_get_frame_stats(
+ const FIRST_PASS_INFO *first_pass_info, int show_idx) {
+ if (show_idx < 0 || show_idx >= first_pass_info->num_frames) {
+ return NULL;
+ }
+ return &first_pass_info->stats[show_idx];
+}
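+
+/* Illustrative sketch (not part of upstream libvpx): once a FIRST_PASS_INFO
+ * has been initialized with fps_init_first_pass_info(), the per show frame
+ * stats can be walked through the accessors above, e.g. to total the coded
+ * error over the clip (`first_pass_info` is an assumed local variable):
+ *
+ *   double total_coded_error = 0.0;
+ *   int i;
+ *   for (i = 0; i < fps_get_num_frames(&first_pass_info); ++i)
+ *     total_coded_error +=
+ *         fps_get_frame_stats(&first_pass_info, i)->coded_error;
+ */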
+
+typedef struct {
+ unsigned int section_intra_rating;
+ unsigned int key_frame_section_intra_rating;
+ FIRSTPASS_STATS total_stats;
+ FIRSTPASS_STATS this_frame_stats;
+ const FIRSTPASS_STATS *stats_in;
+ const FIRSTPASS_STATS *stats_in_start;
+ const FIRSTPASS_STATS *stats_in_end;
+ FIRST_PASS_INFO first_pass_info;
+ FIRSTPASS_STATS total_left_stats;
+ int first_pass_done;
+ int64_t bits_left;
+ double mean_mod_score;
+ double normalized_score_left;
+ double mb_av_energy;
+ double mb_smooth_pct;
+
+ FP_MB_FLOAT_STATS *fp_mb_float_stats;
+
+ // An indication of the content type of the current frame
+ FRAME_CONTENT_TYPE fr_content_type;
+
+ // Projected total bits available for a key frame group of frames
+ int64_t kf_group_bits;
+
+ // Error score of frames still to be coded in kf group
+ double kf_group_error_left;
+
+ double bpm_factor;
+ int rolling_arf_group_target_bits;
+ int rolling_arf_group_actual_bits;
+
+ int sr_update_lag;
+ int kf_zeromotion_pct;
+ int last_kfgroup_zeromotion_pct;
+ int active_worst_quality;
+ int baseline_active_worst_quality;
+ int extend_minq;
+ int extend_maxq;
+ int extend_minq_fast;
+ int arnr_strength_adjustment;
+ int last_qindex_of_arf_layer[MAX_ARF_LAYERS];
+
+ GF_GROUP gf_group;
+
+  // Vizier project experimental two pass rate control parameters.
+  // When |use_vizier_rc_params| is 1, the following parameters will
+  // be overwritten by passed-in values. Otherwise, they are initialized
+  // to default values.
+ int use_vizier_rc_params;
+ double active_wq_factor;
+ double err_per_mb;
+ double sr_default_decay_limit;
+ double sr_diff_factor;
+ double kf_err_per_mb;
+ double kf_frame_min_boost;
+ double kf_frame_max_boost_first; // Max for first kf in a chunk.
+ double kf_frame_max_boost_subs; // Max for subsequent mid chunk kfs.
+ double kf_max_total_boost;
+ double gf_max_total_boost;
+ double gf_frame_max_boost;
+ double zm_factor;
+} TWO_PASS;
+
+struct VP9_COMP;
+struct ThreadData;
+struct TileDataEnc;
+
+void vp9_init_first_pass(struct VP9_COMP *cpi);
+void vp9_first_pass(struct VP9_COMP *cpi, const struct lookahead_entry *source);
+void vp9_end_first_pass(struct VP9_COMP *cpi);
+
+void vp9_first_pass_encode_tile_mb_row(struct VP9_COMP *cpi,
+ struct ThreadData *td,
+ FIRSTPASS_DATA *fp_acc_data,
+ struct TileDataEnc *tile_data,
+ MV *best_ref_mv, int mb_row);
+
+void vp9_init_second_pass(struct VP9_COMP *cpi);
+void vp9_rc_get_second_pass_params(struct VP9_COMP *cpi);
+void vp9_init_vizier_params(TWO_PASS *const twopass, int screen_area);
+
+// Post encode update of the rate control parameters for 2-pass
+void vp9_twopass_postencode_update(struct VP9_COMP *cpi);
+
+void calculate_coded_size(struct VP9_COMP *cpi, int *scaled_frame_width,
+ int *scaled_frame_height);
+
+struct VP9EncoderConfig;
+int vp9_get_frames_to_next_key(const struct VP9EncoderConfig *oxcf,
+ const TWO_PASS *const twopass, int kf_show_idx,
+ int min_gf_interval);
+#if CONFIG_RATE_CTRL
+/* Call this function to get info about the next group of pictures.
+ * This function should be called after vp9_create_compressor() when encoding
+ * starts or after vp9_get_compressed_data() when the encoding process of
+ * the last group of pictures is just finished.
+ */
+void vp9_get_next_group_of_picture(const struct VP9_COMP *cpi,
+ int *first_is_key_frame, int *use_alt_ref,
+ int *coding_frame_count, int *first_show_idx,
+ int *last_gop_use_alt_ref);
+
+/*!\brief Call this function before coding a new group of pictures to get
+ * information about it.
+ * \param[in] oxcf Encoder config
+ * \param[in] twopass Twopass info
+ * \param[in] frame_info Frame info
+ * \param[in] rc Rate control state
+ * \param[in] show_idx Show index of the first frame in the group
+ * \param[in] multi_layer_arf Is multi-layer alternate reference used
+ * \param[in] allow_alt_ref Is alternate reference allowed
+ * \param[in] first_is_key_frame Is the first frame in the group a key frame
+ * \param[in] last_gop_use_alt_ref Does the last group use alternate reference
+ *
+ * \param[out] use_alt_ref Does this group use alternate reference
+ *
+ * \return Returns coding frame count
+ */
+int vp9_get_gop_coding_frame_count(const struct VP9EncoderConfig *oxcf,
+ const TWO_PASS *const twopass,
+ const FRAME_INFO *frame_info,
+ const RATE_CONTROL *rc, int show_idx,
+ int multi_layer_arf, int allow_alt_ref,
+ int first_is_key_frame,
+ int last_gop_use_alt_ref, int *use_alt_ref);
+
+int vp9_get_coding_frame_num(const struct VP9EncoderConfig *oxcf,
+ const TWO_PASS *const twopass,
+ const FRAME_INFO *frame_info, int multi_layer_arf,
+ int allow_alt_ref);
+
+/*!\brief Compute a key frame binary map that indicates whether key frames
+ * appear at the corresponding positions. The passed-in key_frame_map must
+ * point to an integer array with length equal to
+ * twopass->first_pass_info.num_frames, which is the number of show frames
+ * in the video.
+ */
+void vp9_get_key_frame_map(const struct VP9EncoderConfig *oxcf,
+ const TWO_PASS *const twopass, int *key_frame_map);
+#endif // CONFIG_RATE_CTRL
+
+FIRSTPASS_STATS vp9_get_frame_stats(const TWO_PASS *twopass);
+FIRSTPASS_STATS vp9_get_total_stats(const TWO_PASS *twopass);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP9_ENCODER_VP9_FIRSTPASS_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_frame_scale.c b/media/libvpx/libvpx/vp9/encoder/vp9_frame_scale.c
new file mode 100644
index 0000000000..a410d0407f
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_frame_scale.c
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_scale_rtcd.h"
+#include "vp9/common/vp9_blockd.h"
+#include "vpx_dsp/vpx_filter.h"
+#include "vpx_scale/yv12config.h"
+
+void vp9_scale_and_extend_frame_c(const YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst,
+ INTERP_FILTER filter_type, int phase_scaler) {
+ const int src_w = src->y_crop_width;
+ const int src_h = src->y_crop_height;
+ const uint8_t *const srcs[3] = { src->y_buffer, src->u_buffer,
+ src->v_buffer };
+ const int src_strides[3] = { src->y_stride, src->uv_stride, src->uv_stride };
+ uint8_t *const dsts[3] = { dst->y_buffer, dst->u_buffer, dst->v_buffer };
+ const int dst_strides[3] = { dst->y_stride, dst->uv_stride, dst->uv_stride };
+ const InterpKernel *const kernel = vp9_filter_kernels[filter_type];
+ int x, y, i;
+
+#if HAVE_SSSE3 || HAVE_NEON
+  // TODO(linfengz): The 4:3 specialized C code is disabled by default since
+  // it's much slower than the general path that calls vpx_scaled_2d(), even
+  // when vpx_scaled_2d() is not optimized. It is only enabled as a reference
+  // for platforms that have a faster optimized version.
+ if (4 * dst->y_crop_width == 3 * src_w &&
+ 4 * dst->y_crop_height == 3 * src_h) {
+ // Specialize 4 to 3 scaling.
+ // Example pixel locations.
+ // (O: Original pixel. S: Scaled pixel. X: Overlapped pixel.)
+ // phase_scaler = 0 | phase_scaler = 8
+ // |
+ // X O S O S O X | O O O O O
+ // |
+ // |
+ // | S S S
+ // |
+ // |
+ // O O O O O | O O O O O
+ // |
+ // S S S S |
+ // |
+ // |
+ // | S S S
+ // O O O O O | O O O O O
+ // |
+ // |
+ // |
+ // S S S S |
+ // |
+ // O O O O O | O O O O O
+ // | S S S
+ // |
+ // |
+ // |
+ // |
+ // X O S O S O X | O O O O O
+
+ const int dst_ws[3] = { dst->y_crop_width, dst->uv_crop_width,
+ dst->uv_crop_width };
+ const int dst_hs[3] = { dst->y_crop_height, dst->uv_crop_height,
+ dst->uv_crop_height };
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ const int dst_w = dst_ws[i];
+ const int dst_h = dst_hs[i];
+ const int src_stride = src_strides[i];
+ const int dst_stride = dst_strides[i];
+ for (y = 0; y < dst_h; y += 3) {
+ for (x = 0; x < dst_w; x += 3) {
+ const uint8_t *src_ptr = srcs[i] + 4 * y / 3 * src_stride + 4 * x / 3;
+ uint8_t *dst_ptr = dsts[i] + y * dst_stride + x;
+
+ // Must call c function because its optimization doesn't support 3x3.
+ vpx_scaled_2d_c(src_ptr, src_stride, dst_ptr, dst_stride, kernel,
+ phase_scaler, 64 / 3, phase_scaler, 64 / 3, 3, 3);
+ }
+ }
+ }
+ } else
+#endif
+ {
+ const int dst_w = dst->y_crop_width;
+ const int dst_h = dst->y_crop_height;
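+    // The step arguments passed to vpx_scaled_2d() below are in 1/16-pel
+    // units: 16 * src / dst source pixels per output pixel. As an
+    // illustrative example (not from upstream), scaling 1920 -> 1280 gives a
+    // step of 16 * 1920 / 1280 = 24, i.e. 1.5 source pixels per output pixel,
+    // while the low 4 bits of x_q4/y_q4 supply the starting sub-pel phase of
+    // each 16x16 output tile.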
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ const int factor = (i == 0 || i == 3 ? 1 : 2);
+ const int src_stride = src_strides[i];
+ const int dst_stride = dst_strides[i];
+ for (y = 0; y < dst_h; y += 16) {
+ const int y_q4 = y * (16 / factor) * src_h / dst_h + phase_scaler;
+ for (x = 0; x < dst_w; x += 16) {
+ const int x_q4 = x * (16 / factor) * src_w / dst_w + phase_scaler;
+ const uint8_t *src_ptr = srcs[i] +
+ (y / factor) * src_h / dst_h * src_stride +
+ (x / factor) * src_w / dst_w;
+ uint8_t *dst_ptr = dsts[i] + (y / factor) * dst_stride + (x / factor);
+
+ vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride, kernel,
+ x_q4 & 0xf, 16 * src_w / dst_w, y_q4 & 0xf,
+ 16 * src_h / dst_h, 16 / factor, 16 / factor);
+ }
+ }
+ }
+ }
+
+ vpx_extend_frame_borders(dst);
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_job_queue.h b/media/libvpx/libvpx/vp9/encoder/vp9_job_queue.h
new file mode 100644
index 0000000000..ad09c11198
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_job_queue.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_VP9_JOB_QUEUE_H_
+#define VPX_VP9_ENCODER_VP9_JOB_QUEUE_H_
+
+typedef enum {
+ FIRST_PASS_JOB,
+ ENCODE_JOB,
+ ARNR_JOB,
+ NUM_JOB_TYPES,
+} JOB_TYPE;
+
+// Encode job parameters
+typedef struct {
+ int vert_unit_row_num; // Index of the vertical unit row
+ int tile_col_id; // tile col id within a tile
+  int tile_row_id;        // tile row id within a tile
+} JobNode;
+
+// Job queue element parameters
+typedef struct {
+ // Pointer to the next link in the job queue
+ void *next;
+
+ // Job information context of the module
+ JobNode job_info;
+} JobQueue;
+
+// Job queue handle
+typedef struct {
+ // Pointer to the next link in the job queue
+ void *next;
+
+ // Counter to store the number of jobs picked up for processing
+ int num_jobs_acquired;
+} JobQueueHandle;
+
+#endif // VPX_VP9_ENCODER_VP9_JOB_QUEUE_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_lookahead.c b/media/libvpx/libvpx/vp9/encoder/vp9_lookahead.c
new file mode 100644
index 0000000000..97838c38e6
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_lookahead.c
@@ -0,0 +1,235 @@
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include <assert.h>
+#include <stdlib.h>
+
+#include "./vpx_config.h"
+
+#include "vp9/common/vp9_common.h"
+
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_extend.h"
+#include "vp9/encoder/vp9_lookahead.h"
+
+/* Return the buffer at the given absolute index and increment the index */
+static struct lookahead_entry *pop(struct lookahead_ctx *ctx, int *idx) {
+ int index = *idx;
+ struct lookahead_entry *buf = ctx->buf + index;
+
+ assert(index < ctx->max_sz);
+ if (++index >= ctx->max_sz) index -= ctx->max_sz;
+ *idx = index;
+ return buf;
+}
+
+void vp9_lookahead_destroy(struct lookahead_ctx *ctx) {
+ if (ctx) {
+ if (ctx->buf) {
+ int i;
+
+ for (i = 0; i < ctx->max_sz; i++) vpx_free_frame_buffer(&ctx->buf[i].img);
+ free(ctx->buf);
+ }
+ free(ctx);
+ }
+}
+
+struct lookahead_ctx *vp9_lookahead_init(unsigned int width,
+ unsigned int height,
+ unsigned int subsampling_x,
+ unsigned int subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ int use_highbitdepth,
+#endif
+ unsigned int depth) {
+ struct lookahead_ctx *ctx = NULL;
+
+ // Clamp the lookahead queue depth
+ depth = clamp(depth, 1, MAX_LAG_BUFFERS);
+
+ // Allocate memory to keep previous source frames available.
+ depth += MAX_PRE_FRAMES;
+
+ // Allocate the lookahead structures
+ ctx = calloc(1, sizeof(*ctx));
+ if (ctx) {
+ const int legacy_byte_alignment = 0;
+ unsigned int i;
+ ctx->max_sz = depth;
+ ctx->buf = calloc(depth, sizeof(*ctx->buf));
+ ctx->next_show_idx = 0;
+ if (!ctx->buf) goto bail;
+ for (i = 0; i < depth; i++)
+ if (vpx_alloc_frame_buffer(
+ &ctx->buf[i].img, width, height, subsampling_x, subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ use_highbitdepth,
+#endif
+ VP9_ENC_BORDER_IN_PIXELS, legacy_byte_alignment))
+ goto bail;
+ }
+ return ctx;
+bail:
+ vp9_lookahead_destroy(ctx);
+ return NULL;
+}
+
+#define USE_PARTIAL_COPY 0
+int vp9_lookahead_full(const struct lookahead_ctx *ctx) {
+ return ctx->sz + 1 + MAX_PRE_FRAMES > ctx->max_sz;
+}
+
+int vp9_lookahead_next_show_idx(const struct lookahead_ctx *ctx) {
+ return ctx->next_show_idx;
+}
+
+int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src,
+ int64_t ts_start, int64_t ts_end, int use_highbitdepth,
+ vpx_enc_frame_flags_t flags) {
+ struct lookahead_entry *buf;
+#if USE_PARTIAL_COPY
+ int row, col, active_end;
+ int mb_rows = (src->y_height + 15) >> 4;
+ int mb_cols = (src->y_width + 15) >> 4;
+#endif
+ int width = src->y_crop_width;
+ int height = src->y_crop_height;
+ int uv_width = src->uv_crop_width;
+ int uv_height = src->uv_crop_height;
+ int subsampling_x = src->subsampling_x;
+ int subsampling_y = src->subsampling_y;
+ int larger_dimensions, new_dimensions;
+#if !CONFIG_VP9_HIGHBITDEPTH
+ (void)use_highbitdepth;
+ assert(use_highbitdepth == 0);
+#endif
+
+ if (vp9_lookahead_full(ctx)) return 1;
+ ctx->sz++;
+ buf = pop(ctx, &ctx->write_idx);
+
+ new_dimensions = width != buf->img.y_crop_width ||
+ height != buf->img.y_crop_height ||
+ uv_width != buf->img.uv_crop_width ||
+ uv_height != buf->img.uv_crop_height;
+ larger_dimensions = width > buf->img.y_width || height > buf->img.y_height ||
+ uv_width > buf->img.uv_width ||
+ uv_height > buf->img.uv_height;
+ assert(!larger_dimensions || new_dimensions);
+
+#if USE_PARTIAL_COPY
+ // TODO(jkoleszar): This is disabled for now, as
+ // vp9_copy_and_extend_frame_with_rect is not subsampling/alpha aware.
+
+ // Only do this partial copy if the following conditions are all met:
+  // 1. Lookahead queue has a size of 1.
+ // 2. Active map is provided.
+  // 3. This is not a key frame, golden frame, nor altref frame.
+ if (!new_dimensions && ctx->max_sz == 1 && active_map && !flags) {
+ for (row = 0; row < mb_rows; ++row) {
+ col = 0;
+
+ while (1) {
+ // Find the first active macroblock in this row.
+ for (; col < mb_cols; ++col) {
+ if (active_map[col]) break;
+ }
+
+ // No more active macroblock in this row.
+ if (col == mb_cols) break;
+
+ // Find the end of active region in this row.
+ active_end = col;
+
+ for (; active_end < mb_cols; ++active_end) {
+ if (!active_map[active_end]) break;
+ }
+
+ // Only copy this active region.
+ vp9_copy_and_extend_frame_with_rect(src, &buf->img, row << 4, col << 4,
+ 16, (active_end - col) << 4);
+
+ // Start again from the end of this active region.
+ col = active_end;
+ }
+
+ active_map += mb_cols;
+ }
+ } else {
+#endif
+ if (larger_dimensions) {
+ YV12_BUFFER_CONFIG new_img;
+ memset(&new_img, 0, sizeof(new_img));
+ if (vpx_alloc_frame_buffer(&new_img, width, height, subsampling_x,
+ subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ use_highbitdepth,
+#endif
+ VP9_ENC_BORDER_IN_PIXELS, 0))
+ return 1;
+ vpx_free_frame_buffer(&buf->img);
+ buf->img = new_img;
+ } else if (new_dimensions) {
+ buf->img.y_crop_width = src->y_crop_width;
+ buf->img.y_crop_height = src->y_crop_height;
+ buf->img.uv_crop_width = src->uv_crop_width;
+ buf->img.uv_crop_height = src->uv_crop_height;
+ buf->img.subsampling_x = src->subsampling_x;
+ buf->img.subsampling_y = src->subsampling_y;
+ }
+ // Partial copy not implemented yet
+ vp9_copy_and_extend_frame(src, &buf->img);
+#if USE_PARTIAL_COPY
+ }
+#endif
+
+ buf->ts_start = ts_start;
+ buf->ts_end = ts_end;
+ buf->flags = flags;
+ buf->show_idx = ctx->next_show_idx;
+ ++ctx->next_show_idx;
+ return 0;
+}
+
+struct lookahead_entry *vp9_lookahead_pop(struct lookahead_ctx *ctx,
+ int drain) {
+ struct lookahead_entry *buf = NULL;
+
+ if (ctx && ctx->sz && (drain || ctx->sz == ctx->max_sz - MAX_PRE_FRAMES)) {
+ buf = pop(ctx, &ctx->read_idx);
+ ctx->sz--;
+ }
+ return buf;
+}
+
+struct lookahead_entry *vp9_lookahead_peek(struct lookahead_ctx *ctx,
+ int index) {
+ struct lookahead_entry *buf = NULL;
+
+ if (index >= 0) {
+ // Forward peek
+ if (index < ctx->sz) {
+ index += ctx->read_idx;
+ if (index >= ctx->max_sz) index -= ctx->max_sz;
+ buf = ctx->buf + index;
+ }
+ } else if (index < 0) {
+ // Backward peek
+ if (-index <= MAX_PRE_FRAMES) {
+ index += ctx->read_idx;
+ if (index < 0) index += ctx->max_sz;
+ buf = ctx->buf + index;
+ }
+ }
+
+ return buf;
+}
+
+unsigned int vp9_lookahead_depth(struct lookahead_ctx *ctx) { return ctx->sz; }
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_lookahead.h b/media/libvpx/libvpx/vp9/encoder/vp9_lookahead.h
new file mode 100644
index 0000000000..6ac6736673
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_lookahead.h
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_VP9_LOOKAHEAD_H_
+#define VPX_VP9_ENCODER_VP9_LOOKAHEAD_H_
+
+#include "vpx_scale/yv12config.h"
+#include "vpx/vpx_encoder.h"
+#include "vpx/vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_LAG_BUFFERS 25
+
+struct lookahead_entry {
+ YV12_BUFFER_CONFIG img;
+ int64_t ts_start;
+ int64_t ts_end;
+ int show_idx; /*The show_idx of this frame*/
+ vpx_enc_frame_flags_t flags;
+};
+
+// The max of past frames we want to keep in the queue.
+#define MAX_PRE_FRAMES 1
+
+struct lookahead_ctx {
+ int max_sz; /* Absolute size of the queue */
+ int sz; /* Number of buffers currently in the queue */
+ int read_idx; /* Read index */
+ int write_idx; /* Write index */
+ int next_show_idx; /* The show_idx that will be assigned to the next frame
+ being pushed in the queue*/
+ struct lookahead_entry *buf; /* Buffer list */
+};
+
+/**\brief Initializes the lookahead stage
+ *
+ * The lookahead stage is a queue of frame buffers on which some analysis
+ * may be done when buffers are enqueued.
+ */
+struct lookahead_ctx *vp9_lookahead_init(unsigned int width,
+ unsigned int height,
+ unsigned int subsampling_x,
+ unsigned int subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ int use_highbitdepth,
+#endif
+ unsigned int depth);
+
+/**\brief Destroys the lookahead stage
+ */
+void vp9_lookahead_destroy(struct lookahead_ctx *ctx);
+
+/**\brief Check if lookahead is full
+ *
+ * \param[in] ctx Pointer to the lookahead context
+ *
+ * Return 1 if lookahead is full, otherwise return 0.
+ */
+int vp9_lookahead_full(const struct lookahead_ctx *ctx);
+
+/**\brief Return the next_show_idx
+ *
+ * \param[in] ctx Pointer to the lookahead context
+ *
+ * Return the show_idx that will be assigned to the next
+ * frame pushed by vp9_lookahead_push()
+ */
+int vp9_lookahead_next_show_idx(const struct lookahead_ctx *ctx);
+
+/**\brief Enqueue a source buffer
+ *
+ * This function will copy the source image into a new framebuffer with
+ * the expected stride/border.
+ *
+ * \param[in] ctx Pointer to the lookahead context
+ * \param[in] src Pointer to the image to enqueue
+ * \param[in] ts_start Timestamp for the start of this frame
+ * \param[in] ts_end Timestamp for the end of this frame
+ * \param[in] flags Flags set on this frame
+ */
+int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src,
+ int64_t ts_start, int64_t ts_end, int use_highbitdepth,
+ vpx_enc_frame_flags_t flags);
+
+/**\brief Get the next source buffer to encode
+ *
+ *
+ * \param[in] ctx Pointer to the lookahead context
+ * \param[in] drain Flag indicating the buffer should be drained
+ * (return a buffer regardless of the current queue depth)
+ *
+ * \retval NULL, if drain set and queue is empty
+ * \retval NULL, if drain not set and queue not of the configured depth
+ */
+struct lookahead_entry *vp9_lookahead_pop(struct lookahead_ctx *ctx, int drain);
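+
+/**\brief Typical flow (an illustrative sketch, not part of upstream libvpx)
+ *
+ * An encoder front end normally pushes each incoming source frame and pops
+ * one entry per coded frame once the queue has reached its configured depth,
+ * roughly (use_hbd and flush are assumed caller-side variables):
+ *
+ *   struct lookahead_entry *entry;
+ *   if (!vp9_lookahead_full(ctx))
+ *     (void)vp9_lookahead_push(ctx, src, ts_start, ts_end, use_hbd, 0);
+ *   entry = vp9_lookahead_pop(ctx, flush);  // NULL until queue is deep enough
+ *   if (entry != NULL) {
+ *     // Encode entry->img; timestamps are in entry->ts_start / ts_end.
+ *   }
+ */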
+
+/**\brief Get a future source buffer to encode
+ *
+ * \param[in] ctx Pointer to the lookahead context
+ * \param[in] index Index of the frame to be returned, 0 == next frame
+ *
+ * \retval NULL, if no buffer exists at the specified index
+ */
+struct lookahead_entry *vp9_lookahead_peek(struct lookahead_ctx *ctx,
+ int index);
+
+/**\brief Get the number of frames currently in the lookahead queue
+ *
+ * \param[in] ctx Pointer to the lookahead context
+ */
+unsigned int vp9_lookahead_depth(struct lookahead_ctx *ctx);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP9_ENCODER_VP9_LOOKAHEAD_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_mbgraph.c b/media/libvpx/libvpx/vp9/encoder/vp9_mbgraph.c
new file mode 100644
index 0000000000..fafc673aca
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_mbgraph.c
@@ -0,0 +1,395 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <limits.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/system_state.h"
+#include "vp9/encoder/vp9_segmentation.h"
+#include "vp9/encoder/vp9_mcomp.h"
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/common/vp9_reconintra.h"
+
+static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi, const MV *ref_mv,
+ MV *dst_mv, int mb_row,
+ int mb_col) {
+ MACROBLOCK *const x = &cpi->td.mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
+ const SEARCH_METHODS old_search_method = mv_sf->search_method;
+ const vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16];
+ const MvLimits tmp_mv_limits = x->mv_limits;
+ MV ref_full;
+ int cost_list[5];
+
+ // Further step/diamond searches as necessary
+ int step_param = mv_sf->reduce_first_step_size;
+ step_param = VPXMIN(step_param, MAX_MVSEARCH_STEPS - 2);
+
+ vp9_set_mv_search_range(&x->mv_limits, ref_mv);
+
+ ref_full.col = ref_mv->col >> 3;
+ ref_full.row = ref_mv->row >> 3;
+
+ mv_sf->search_method = HEX;
+ vp9_full_pixel_search(cpi, x, BLOCK_16X16, &ref_full, step_param,
+ cpi->sf.mv.search_method, x->errorperbit,
+ cond_cost_list(cpi, cost_list), ref_mv, dst_mv, 0, 0);
+ mv_sf->search_method = old_search_method;
+
+ /* restore UMV window */
+ x->mv_limits = tmp_mv_limits;
+
+ // Try sub-pixel MC
+ // if (bestsme > error_thresh && bestsme < INT_MAX)
+ {
+ uint32_t distortion;
+ uint32_t sse;
+ // TODO(yunqing): may use higher tap interp filter than 2 taps if needed.
+ cpi->find_fractional_mv_step(
+ x, dst_mv, ref_mv, cpi->common.allow_high_precision_mv, x->errorperbit,
+ &v_fn_ptr, 0, mv_sf->subpel_search_level,
+ cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0,
+ 0, USE_2_TAPS);
+ }
+
+ xd->mi[0]->mode = NEWMV;
+ xd->mi[0]->mv[0].as_mv = *dst_mv;
+
+ vp9_build_inter_predictors_sby(xd, mb_row, mb_col, BLOCK_16X16);
+
+ return vpx_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
+ xd->plane[0].dst.buf, xd->plane[0].dst.stride);
+}
+
+static int do_16x16_motion_search(VP9_COMP *cpi, const MV *ref_mv,
+ int_mv *dst_mv, int mb_row, int mb_col) {
+ MACROBLOCK *const x = &cpi->td.mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ unsigned int err, tmp_err;
+ MV tmp_mv;
+
+ // Try zero MV first
+ // FIXME should really use something like near/nearest MV and/or MV prediction
+ err = vpx_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
+ xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride);
+ dst_mv->as_int = 0;
+
+ // Test last reference frame using the previous best mv as the
+ // starting point (best reference) for the search
+ tmp_err = do_16x16_motion_iteration(cpi, ref_mv, &tmp_mv, mb_row, mb_col);
+ if (tmp_err < err) {
+ err = tmp_err;
+ dst_mv->as_mv = tmp_mv;
+ }
+
+ // If the current best reference mv is not centered on 0,0 then do a 0,0
+ // based search as well.
+ if (ref_mv->row != 0 || ref_mv->col != 0) {
+ MV zero_ref_mv = { 0, 0 };
+
+ tmp_err =
+ do_16x16_motion_iteration(cpi, &zero_ref_mv, &tmp_mv, mb_row, mb_col);
+ if (tmp_err < err) {
+ dst_mv->as_mv = tmp_mv;
+ err = tmp_err;
+ }
+ }
+
+ return err;
+}
+
+static int do_16x16_zerozero_search(VP9_COMP *cpi, int_mv *dst_mv) {
+ MACROBLOCK *const x = &cpi->td.mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ unsigned int err;
+
+ // Try zero MV first
+ // FIXME should really use something like near/nearest MV and/or MV prediction
+ err = vpx_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
+ xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride);
+
+ dst_mv->as_int = 0;
+
+ return err;
+}
+static int find_best_16x16_intra(VP9_COMP *cpi, PREDICTION_MODE *pbest_mode) {
+ MACROBLOCK *const x = &cpi->td.mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ PREDICTION_MODE best_mode = -1, mode;
+ unsigned int best_err = INT_MAX;
+
+ // calculate SATD for each intra prediction mode;
+ // we're intentionally not doing 4x4, we just want a rough estimate
+ for (mode = DC_PRED; mode <= TM_PRED; mode++) {
+ unsigned int err;
+
+ xd->mi[0]->mode = mode;
+ vp9_predict_intra_block(xd, 2, TX_16X16, mode, x->plane[0].src.buf,
+ x->plane[0].src.stride, xd->plane[0].dst.buf,
+ xd->plane[0].dst.stride, 0, 0, 0);
+ err = vpx_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
+ xd->plane[0].dst.buf, xd->plane[0].dst.stride);
+
+ // find best
+ if (err < best_err) {
+ best_err = err;
+ best_mode = mode;
+ }
+ }
+
+ if (pbest_mode) *pbest_mode = best_mode;
+
+ return best_err;
+}
+
+static void update_mbgraph_mb_stats(VP9_COMP *cpi, MBGRAPH_MB_STATS *stats,
+ YV12_BUFFER_CONFIG *buf, int mb_y_offset,
+ YV12_BUFFER_CONFIG *golden_ref,
+ const MV *prev_golden_ref_mv,
+ YV12_BUFFER_CONFIG *alt_ref, int mb_row,
+ int mb_col) {
+ MACROBLOCK *const x = &cpi->td.mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ int intra_error;
+ VP9_COMMON *cm = &cpi->common;
+
+ // FIXME in practice we're completely ignoring chroma here
+ x->plane[0].src.buf = buf->y_buffer + mb_y_offset;
+ x->plane[0].src.stride = buf->y_stride;
+
+ xd->plane[0].dst.buf = get_frame_new_buffer(cm)->y_buffer + mb_y_offset;
+ xd->plane[0].dst.stride = get_frame_new_buffer(cm)->y_stride;
+
+ // do intra 16x16 prediction
+ intra_error = find_best_16x16_intra(cpi, &stats->ref[INTRA_FRAME].m.mode);
+ if (intra_error <= 0) intra_error = 1;
+ stats->ref[INTRA_FRAME].err = intra_error;
+
+ // Golden frame MV search, if it exists and is different than last frame
+ if (golden_ref) {
+ int g_motion_error;
+ xd->plane[0].pre[0].buf = golden_ref->y_buffer + mb_y_offset;
+ xd->plane[0].pre[0].stride = golden_ref->y_stride;
+ g_motion_error =
+ do_16x16_motion_search(cpi, prev_golden_ref_mv,
+ &stats->ref[GOLDEN_FRAME].m.mv, mb_row, mb_col);
+ stats->ref[GOLDEN_FRAME].err = g_motion_error;
+ } else {
+ stats->ref[GOLDEN_FRAME].err = INT_MAX;
+ stats->ref[GOLDEN_FRAME].m.mv.as_int = 0;
+ }
+
+ // Do an Alt-ref frame MV search, if it exists and is different than
+ // last/golden frame.
+ if (alt_ref) {
+ int a_motion_error;
+ xd->plane[0].pre[0].buf = alt_ref->y_buffer + mb_y_offset;
+ xd->plane[0].pre[0].stride = alt_ref->y_stride;
+ a_motion_error =
+ do_16x16_zerozero_search(cpi, &stats->ref[ALTREF_FRAME].m.mv);
+
+ stats->ref[ALTREF_FRAME].err = a_motion_error;
+ } else {
+ stats->ref[ALTREF_FRAME].err = INT_MAX;
+ stats->ref[ALTREF_FRAME].m.mv.as_int = 0;
+ }
+}
+
+static void update_mbgraph_frame_stats(VP9_COMP *cpi,
+ MBGRAPH_FRAME_STATS *stats,
+ YV12_BUFFER_CONFIG *buf,
+ YV12_BUFFER_CONFIG *golden_ref,
+ YV12_BUFFER_CONFIG *alt_ref) {
+ MACROBLOCK *const x = &cpi->td.mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ VP9_COMMON *const cm = &cpi->common;
+
+ int mb_col, mb_row, offset = 0;
+ int mb_y_offset = 0;
+ MV gld_top_mv = { 0, 0 };
+ MODE_INFO mi_local;
+ MODE_INFO mi_above, mi_left;
+
+ vp9_zero(mi_local);
+ // Set up limit values for motion vectors to prevent them extending outside
+ // the UMV borders.
+ x->mv_limits.row_min = -BORDER_MV_PIXELS_B16;
+ x->mv_limits.row_max = (cm->mb_rows - 1) * 8 + BORDER_MV_PIXELS_B16;
+ // Signal to vp9_predict_intra_block() that above is not available
+ xd->above_mi = NULL;
+
+ xd->plane[0].dst.stride = buf->y_stride;
+ xd->plane[0].pre[0].stride = buf->y_stride;
+ xd->plane[1].dst.stride = buf->uv_stride;
+ xd->mi[0] = &mi_local;
+ mi_local.sb_type = BLOCK_16X16;
+ mi_local.ref_frame[0] = LAST_FRAME;
+ mi_local.ref_frame[1] = NONE;
+
+ for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
+ MV gld_left_mv = gld_top_mv;
+ int mb_y_in_offset = mb_y_offset;
+
+ // Set up limit values for motion vectors to prevent them extending outside
+ // the UMV borders.
+ x->mv_limits.col_min = -BORDER_MV_PIXELS_B16;
+ x->mv_limits.col_max = (cm->mb_cols - 1) * 8 + BORDER_MV_PIXELS_B16;
+ // Signal to vp9_predict_intra_block() that left is not available
+ xd->left_mi = NULL;
+
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
+ MBGRAPH_MB_STATS *mb_stats = &stats->mb_stats[offset + mb_col];
+
+ update_mbgraph_mb_stats(cpi, mb_stats, buf, mb_y_in_offset, golden_ref,
+ &gld_left_mv, alt_ref, mb_row, mb_col);
+ gld_left_mv = mb_stats->ref[GOLDEN_FRAME].m.mv.as_mv;
+ if (mb_col == 0) {
+ gld_top_mv = gld_left_mv;
+ }
+ // Signal to vp9_predict_intra_block() that left is available
+ xd->left_mi = &mi_left;
+
+ mb_y_in_offset += 16;
+ x->mv_limits.col_min -= 16;
+ x->mv_limits.col_max -= 16;
+ }
+
+ // Signal to vp9_predict_intra_block() that above is available
+ xd->above_mi = &mi_above;
+
+ mb_y_offset += buf->y_stride * 16;
+ x->mv_limits.row_min -= 16;
+ x->mv_limits.row_max -= 16;
+ offset += cm->mb_cols;
+ }
+}
+
+// void separate_arf_mbs_byzz
+static void separate_arf_mbs(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ int mb_col, mb_row, offset, i;
+ int mi_row, mi_col;
+ int ncnt[4] = { 0 };
+ int n_frames = cpi->mbgraph_n_frames;
+
+ int *arf_not_zz;
+
+ CHECK_MEM_ERROR(
+ &cm->error, arf_not_zz,
+ vpx_calloc(cm->mb_rows * cm->mb_cols * sizeof(*arf_not_zz), 1));
+
+ // We are not interested in results beyond the alt ref itself.
+ if (n_frames > cpi->rc.frames_till_gf_update_due)
+ n_frames = cpi->rc.frames_till_gf_update_due;
+
+ // defer cost to reference frames
+ for (i = n_frames - 1; i >= 0; i--) {
+ MBGRAPH_FRAME_STATS *frame_stats = &cpi->mbgraph_stats[i];
+
+ for (offset = 0, mb_row = 0; mb_row < cm->mb_rows;
+ offset += cm->mb_cols, mb_row++) {
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
+ MBGRAPH_MB_STATS *mb_stats = &frame_stats->mb_stats[offset + mb_col];
+
+ int altref_err = mb_stats->ref[ALTREF_FRAME].err;
+ int intra_err = mb_stats->ref[INTRA_FRAME].err;
+ int golden_err = mb_stats->ref[GOLDEN_FRAME].err;
+
+ // Test for altref vs intra and gf and that its mv was 0,0.
+ if (altref_err > 1000 || altref_err > intra_err ||
+ altref_err > golden_err) {
+ arf_not_zz[offset + mb_col]++;
+ }
+ }
+ }
+ }
+
+ // arf_not_zz is indexed by MB, but this loop is indexed by MI to avoid out
+ // of bound access in segmentation_map
+ for (mi_row = 0; mi_row < cm->mi_rows; mi_row++) {
+ for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) {
+ // If any of the blocks in the sequence failed then the MB
+ // goes in segment 0
+ if (arf_not_zz[mi_row / 2 * cm->mb_cols + mi_col / 2]) {
+ ncnt[0]++;
+ cpi->segmentation_map[mi_row * cm->mi_cols + mi_col] = 0;
+ } else {
+ cpi->segmentation_map[mi_row * cm->mi_cols + mi_col] = 1;
+ ncnt[1]++;
+ }
+ }
+ }
+
+ // Only bother with segmentation if over 10% of the MBs in static segment
+ // if ( ncnt[1] && (ncnt[0] / ncnt[1] < 10) )
+ if (1) {
+ // Note % of blocks that are marked as static
+ if (cm->MBs)
+ cpi->static_mb_pct = (ncnt[1] * 100) / (cm->mi_rows * cm->mi_cols);
+
+ // This error case should not be reachable as this function should
+ // never be called with the common data structure uninitialized.
+ else
+ cpi->static_mb_pct = 0;
+
+ vp9_enable_segmentation(&cm->seg);
+ } else {
+ cpi->static_mb_pct = 0;
+ vp9_disable_segmentation(&cm->seg);
+ }
+
+  // Free locally allocated storage
+ vpx_free(arf_not_zz);
+}
+
+void vp9_update_mbgraph_stats(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ int i, n_frames = vp9_lookahead_depth(cpi->lookahead);
+ YV12_BUFFER_CONFIG *golden_ref = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+
+ assert(golden_ref != NULL);
+
+ // we need to look ahead beyond where the ARF transitions into
+ // being a GF - so exit if we don't look ahead beyond that
+ if (n_frames <= cpi->rc.frames_till_gf_update_due) return;
+
+ if (n_frames > MAX_LAG_BUFFERS) n_frames = MAX_LAG_BUFFERS;
+
+ cpi->mbgraph_n_frames = n_frames;
+ for (i = 0; i < n_frames; i++) {
+ MBGRAPH_FRAME_STATS *frame_stats = &cpi->mbgraph_stats[i];
+ memset(frame_stats->mb_stats, 0,
+ cm->mb_rows * cm->mb_cols * sizeof(*cpi->mbgraph_stats[i].mb_stats));
+ }
+
+ // do motion search to find contribution of each reference to data
+ // later on in this GF group
+ // FIXME really, the GF/last MC search should be done forward, and
+ // the ARF MC search backwards, to get optimal results for MV caching
+ for (i = 0; i < n_frames; i++) {
+ MBGRAPH_FRAME_STATS *frame_stats = &cpi->mbgraph_stats[i];
+ struct lookahead_entry *q_cur = vp9_lookahead_peek(cpi->lookahead, i);
+
+ assert(q_cur != NULL);
+
+ update_mbgraph_frame_stats(cpi, frame_stats, &q_cur->img, golden_ref,
+ cpi->Source);
+ }
+
+ vpx_clear_system_state();
+
+ separate_arf_mbs(cpi);
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_mbgraph.h b/media/libvpx/libvpx/vp9/encoder/vp9_mbgraph.h
new file mode 100644
index 0000000000..7b629861d5
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_mbgraph.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_VP9_MBGRAPH_H_
+#define VPX_VP9_ENCODER_VP9_MBGRAPH_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+ struct {
+ int err;
+ union {
+ int_mv mv;
+ PREDICTION_MODE mode;
+ } m;
+ } ref[MAX_REF_FRAMES];
+} MBGRAPH_MB_STATS;
+
+typedef struct {
+ MBGRAPH_MB_STATS *mb_stats;
+} MBGRAPH_FRAME_STATS;
+
+struct VP9_COMP;
+
+void vp9_update_mbgraph_stats(struct VP9_COMP *cpi);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP9_ENCODER_VP9_MBGRAPH_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_mcomp.c b/media/libvpx/libvpx/vp9/encoder/vp9_mcomp.c
new file mode 100644
index 0000000000..64e9ef0f91
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_mcomp.c
@@ -0,0 +1,3035 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_mvref_common.h"
+#include "vp9/common/vp9_reconinter.h"
+
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_mcomp.h"
+
+// #define NEW_DIAMOND_SEARCH
+
+void vp9_set_mv_search_range(MvLimits *mv_limits, const MV *mv) {
+ int col_min = (mv->col >> 3) - MAX_FULL_PEL_VAL + (mv->col & 7 ? 1 : 0);
+ int row_min = (mv->row >> 3) - MAX_FULL_PEL_VAL + (mv->row & 7 ? 1 : 0);
+ int col_max = (mv->col >> 3) + MAX_FULL_PEL_VAL;
+ int row_max = (mv->row >> 3) + MAX_FULL_PEL_VAL;
+
+ col_min = VPXMAX(col_min, (MV_LOW >> 3) + 1);
+ row_min = VPXMAX(row_min, (MV_LOW >> 3) + 1);
+ col_max = VPXMIN(col_max, (MV_UPP >> 3) - 1);
+ row_max = VPXMIN(row_max, (MV_UPP >> 3) - 1);
+
+ // Get intersection of UMV window and valid MV window to reduce # of checks
+ // in diamond search.
+ if (mv_limits->col_min < col_min) mv_limits->col_min = col_min;
+ if (mv_limits->col_max > col_max) mv_limits->col_max = col_max;
+ if (mv_limits->row_min < row_min) mv_limits->row_min = row_min;
+ if (mv_limits->row_max > row_max) mv_limits->row_max = row_max;
+}
+
+void vp9_set_subpel_mv_search_range(MvLimits *subpel_mv_limits,
+ const MvLimits *umv_window_limits,
+ const MV *ref_mv) {
+ subpel_mv_limits->col_min = VPXMAX(umv_window_limits->col_min * 8,
+ ref_mv->col - MAX_FULL_PEL_VAL * 8);
+ subpel_mv_limits->col_max = VPXMIN(umv_window_limits->col_max * 8,
+ ref_mv->col + MAX_FULL_PEL_VAL * 8);
+ subpel_mv_limits->row_min = VPXMAX(umv_window_limits->row_min * 8,
+ ref_mv->row - MAX_FULL_PEL_VAL * 8);
+ subpel_mv_limits->row_max = VPXMIN(umv_window_limits->row_max * 8,
+ ref_mv->row + MAX_FULL_PEL_VAL * 8);
+
+ subpel_mv_limits->col_min = VPXMAX(MV_LOW + 1, subpel_mv_limits->col_min);
+ subpel_mv_limits->col_max = VPXMIN(MV_UPP - 1, subpel_mv_limits->col_max);
+ subpel_mv_limits->row_min = VPXMAX(MV_LOW + 1, subpel_mv_limits->row_min);
+ subpel_mv_limits->row_max = VPXMIN(MV_UPP - 1, subpel_mv_limits->row_max);
+}
+
+int vp9_init_search_range(int size) {
+ int sr = 0;
+ // Minimum search size no matter what the passed in value.
+ size = VPXMAX(16, size);
+
+ while ((size << sr) < MAX_FULL_PEL_VAL) sr++;
+
+ sr = VPXMIN(sr, MAX_MVSEARCH_STEPS - 2);
+ return sr;
+}
+
+int vp9_mv_bit_cost(const MV *mv, const MV *ref, const int *mvjcost,
+ int *mvcost[2], int weight) {
+ const MV diff = { mv->row - ref->row, mv->col - ref->col };
+ return ROUND_POWER_OF_TWO(mv_cost(&diff, mvjcost, mvcost) * weight, 7);
+}
+
+#define PIXEL_TRANSFORM_ERROR_SCALE 4
+static int mv_err_cost(const MV *mv, const MV *ref, const int *mvjcost,
+ int *mvcost[2], int error_per_bit) {
+ if (mvcost) {
+ const MV diff = { mv->row - ref->row, mv->col - ref->col };
+ return (int)ROUND64_POWER_OF_TWO(
+ (int64_t)mv_cost(&diff, mvjcost, mvcost) * error_per_bit,
+ RDDIV_BITS + VP9_PROB_COST_SHIFT - RD_EPB_SHIFT +
+ PIXEL_TRANSFORM_ERROR_SCALE);
+ }
+ return 0;
+}
+void vp9_init_dsmotion_compensation(search_site_config *cfg, int stride) {
+ int len;
+ int ss_count = 0;
+
+ for (len = MAX_FIRST_STEP; len > 0; len /= 2) {
+ // Generate offsets for 4 search sites per step.
+ const MV ss_mvs[] = { { -len, 0 }, { len, 0 }, { 0, -len }, { 0, len } };
+ int i;
+ for (i = 0; i < 4; ++i, ++ss_count) {
+ cfg->ss_mv[ss_count] = ss_mvs[i];
+ cfg->ss_os[ss_count] = ss_mvs[i].row * stride + ss_mvs[i].col;
+ }
+ }
+
+ cfg->searches_per_step = 4;
+ cfg->total_steps = ss_count / cfg->searches_per_step;
+}
+
+void vp9_init3smotion_compensation(search_site_config *cfg, int stride) {
+ int len;
+ int ss_count = 0;
+
+ for (len = MAX_FIRST_STEP; len > 0; len /= 2) {
+ // Generate offsets for 8 search sites per step.
+ const MV ss_mvs[8] = { { -len, 0 }, { len, 0 }, { 0, -len },
+ { 0, len }, { -len, -len }, { -len, len },
+ { len, -len }, { len, len } };
+ int i;
+ for (i = 0; i < 8; ++i, ++ss_count) {
+ cfg->ss_mv[ss_count] = ss_mvs[i];
+ cfg->ss_os[ss_count] = ss_mvs[i].row * stride + ss_mvs[i].col;
+ }
+ }
+
+ cfg->searches_per_step = 8;
+ cfg->total_steps = ss_count / cfg->searches_per_step;
+}
+
+// convert motion vector component to offset for sv[a]f calc
+static INLINE int sp(int x) { return x & 7; }
+
+static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) {
+ return &buf[(r >> 3) * stride + (c >> 3)];
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+/* checks if (r, c) has better score than previous best */
+#define CHECK_BETTER(v, r, c) \
+ do { \
+ if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \
+ int64_t tmpmse; \
+ const MV cb_mv = { r, c }; \
+ const MV cb_ref_mv = { rr, rc }; \
+ if (second_pred == NULL) { \
+ thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \
+ src_stride, &sse); \
+ } else { \
+ thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \
+ src_stride, &sse, second_pred); \
+ } \
+ tmpmse = thismse; \
+ tmpmse += \
+ mv_err_cost(&cb_mv, &cb_ref_mv, mvjcost, mvcost, error_per_bit); \
+ if (tmpmse >= INT_MAX) { \
+ v = INT_MAX; \
+ } else if ((v = (uint32_t)tmpmse) < besterr) { \
+ besterr = v; \
+ br = r; \
+ bc = c; \
+ *distortion = thismse; \
+ *sse1 = sse; \
+ } \
+ } else { \
+ v = INT_MAX; \
+ } \
+ } while (0)
+#else
+/* checks if (r, c) has better score than previous best */
+#define CHECK_BETTER(v, r, c) \
+ do { \
+ if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \
+ const MV cb_mv = { r, c }; \
+ const MV cb_ref_mv = { rr, rc }; \
+ if (second_pred == NULL) \
+ thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \
+ src_stride, &sse); \
+ else \
+ thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \
+ src_stride, &sse, second_pred); \
+ if ((v = mv_err_cost(&cb_mv, &cb_ref_mv, mvjcost, mvcost, \
+ error_per_bit) + \
+ thismse) < besterr) { \
+ besterr = v; \
+ br = r; \
+ bc = c; \
+ *distortion = thismse; \
+ *sse1 = sse; \
+ } \
+ } else { \
+ v = INT_MAX; \
+ } \
+ } while (0)
+
+#endif
+#define FIRST_LEVEL_CHECKS \
+ do { \
+ unsigned int left, right, up, down, diag; \
+ CHECK_BETTER(left, tr, tc - hstep); \
+ CHECK_BETTER(right, tr, tc + hstep); \
+ CHECK_BETTER(up, tr - hstep, tc); \
+ CHECK_BETTER(down, tr + hstep, tc); \
+ whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2); \
+ switch (whichdir) { \
+ case 0: CHECK_BETTER(diag, tr - hstep, tc - hstep); break; \
+ case 1: CHECK_BETTER(diag, tr - hstep, tc + hstep); break; \
+ case 2: CHECK_BETTER(diag, tr + hstep, tc - hstep); break; \
+ case 3: CHECK_BETTER(diag, tr + hstep, tc + hstep); break; \
+ } \
+ } while (0)
+
+#define SECOND_LEVEL_CHECKS \
+ do { \
+ int kr, kc; \
+ unsigned int second; \
+ if (tr != br && tc != bc) { \
+ kr = br - tr; \
+ kc = bc - tc; \
+ CHECK_BETTER(second, tr + kr, tc + 2 * kc); \
+ CHECK_BETTER(second, tr + 2 * kr, tc + kc); \
+ } else if (tr == br && tc != bc) { \
+ kc = bc - tc; \
+ CHECK_BETTER(second, tr + hstep, tc + 2 * kc); \
+ CHECK_BETTER(second, tr - hstep, tc + 2 * kc); \
+ switch (whichdir) { \
+ case 0: \
+ case 1: CHECK_BETTER(second, tr + hstep, tc + kc); break; \
+ case 2: \
+ case 3: CHECK_BETTER(second, tr - hstep, tc + kc); break; \
+ } \
+ } else if (tr != br && tc == bc) { \
+ kr = br - tr; \
+ CHECK_BETTER(second, tr + 2 * kr, tc + hstep); \
+ CHECK_BETTER(second, tr + 2 * kr, tc - hstep); \
+ switch (whichdir) { \
+ case 0: \
+ case 2: CHECK_BETTER(second, tr + kr, tc + hstep); break; \
+ case 1: \
+ case 3: CHECK_BETTER(second, tr + kr, tc - hstep); break; \
+ } \
+ } \
+ } while (0)
+
+#define SETUP_SUBPEL_SEARCH \
+ const uint8_t *const z = x->plane[0].src.buf; \
+ const int src_stride = x->plane[0].src.stride; \
+ const MACROBLOCKD *xd = &x->e_mbd; \
+ unsigned int besterr = UINT_MAX; \
+ unsigned int sse; \
+ unsigned int whichdir; \
+ int thismse; \
+ const unsigned int halfiters = iters_per_step; \
+ const unsigned int quarteriters = iters_per_step; \
+ const unsigned int eighthiters = iters_per_step; \
+ const int y_stride = xd->plane[0].pre[0].stride; \
+ const int offset = bestmv->row * y_stride + bestmv->col; \
+ const uint8_t *const y = xd->plane[0].pre[0].buf; \
+ \
+ int rr = ref_mv->row; \
+ int rc = ref_mv->col; \
+ int br = bestmv->row * 8; \
+ int bc = bestmv->col * 8; \
+ int hstep = 4; \
+ int minc, maxc, minr, maxr; \
+ int tr = br; \
+ int tc = bc; \
+ MvLimits subpel_mv_limits; \
+ \
+ vp9_set_subpel_mv_search_range(&subpel_mv_limits, &x->mv_limits, ref_mv); \
+ minc = subpel_mv_limits.col_min; \
+ maxc = subpel_mv_limits.col_max; \
+ minr = subpel_mv_limits.row_min; \
+ maxr = subpel_mv_limits.row_max; \
+ \
+ bestmv->row *= 8; \
+ bestmv->col *= 8
+
+static unsigned int setup_center_error(
+ const MACROBLOCKD *xd, const MV *bestmv, const MV *ref_mv,
+ int error_per_bit, const vp9_variance_fn_ptr_t *vfp,
+ const uint8_t *const src, const int src_stride, const uint8_t *const y,
+ int y_stride, const uint8_t *second_pred, int w, int h, int offset,
+ int *mvjcost, int *mvcost[2], uint32_t *sse1, uint32_t *distortion) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ uint64_t besterr;
+ if (second_pred != NULL) {
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ DECLARE_ALIGNED(16, uint16_t, comp_pred16[64 * 64]);
+ vpx_highbd_comp_avg_pred(comp_pred16, CONVERT_TO_SHORTPTR(second_pred), w,
+ h, CONVERT_TO_SHORTPTR(y + offset), y_stride);
+ besterr =
+ vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src, src_stride, sse1);
+ } else {
+ DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
+ vpx_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
+ besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
+ }
+ } else {
+ besterr = vfp->vf(y + offset, y_stride, src, src_stride, sse1);
+ }
+ *distortion = (uint32_t)besterr;
+ besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+ if (besterr >= UINT_MAX) return UINT_MAX;
+ return (uint32_t)besterr;
+#else
+ uint32_t besterr;
+ (void)xd;
+ if (second_pred != NULL) {
+ DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
+ vpx_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
+ besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
+ } else {
+ besterr = vfp->vf(y + offset, y_stride, src, src_stride, sse1);
+ }
+ *distortion = besterr;
+ besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+ return besterr;
+#endif // CONFIG_VP9_HIGHBITDEPTH
+}
+
+static INLINE int64_t divide_and_round(const int64_t n, const int64_t d) {
+ return ((n < 0) ^ (d < 0)) ? ((n - d / 2) / d) : ((n + d / 2) / d);
+}
+
+static INLINE int is_cost_list_wellbehaved(int *cost_list) {
+ return cost_list[0] < cost_list[1] && cost_list[0] < cost_list[2] &&
+ cost_list[0] < cost_list[3] && cost_list[0] < cost_list[4];
+}
+
+// Returns surface minima estimate at given precision in 1/2^n bits.
+// Assume a model for the cost surface: S = A(x - x0)^2 + B(y - y0)^2 + C
+// For a given set of costs S0, S1, S2, S3, S4 at points
+// (y, x) = (0, 0), (0, -1), (1, 0), (0, 1) and (-1, 0) respectively,
+// the solution for the location of the minima (x0, y0) is given by:
+// x0 = 1/2 (S1 - S3)/(S1 + S3 - 2*S0),
+// y0 = 1/2 (S4 - S2)/(S4 + S2 - 2*S0).
+// The code below is an integerized version of that.
+static void get_cost_surf_min(int *cost_list, int *ir, int *ic, int bits) {
+ const int64_t x0 = (int64_t)cost_list[1] - cost_list[3];
+ const int64_t y0 = cost_list[1] - 2 * (int64_t)cost_list[0] + cost_list[3];
+ const int64_t x1 = (int64_t)cost_list[4] - cost_list[2];
+ const int64_t y1 = cost_list[4] - 2 * (int64_t)cost_list[0] + cost_list[2];
+ const int b = 1 << (bits - 1);
+ *ic = (int)divide_and_round(x0 * b, y0);
+ *ir = (int)divide_and_round(x1 * b, y1);
+}
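+
+// Illustrative worked example (not part of upstream libvpx): with
+// cost_list = { 100, 130, 120, 110, 140 } and bits = 2, the column estimate is
+//   x0 = S1 - S3 = 20, y0 = S1 + S3 - 2 * S0 = 40, b = 1 << (2 - 1) = 2,
+//   ic = divide_and_round(20 * 2, 40) = 1,
+// i.e. the fitted minimum lies one 1/4-pel step towards +col, consistent with
+// the cost at (0, 1) being lower than at (0, -1). The row offset ir is derived
+// the same way from S2 and S4.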
+
+uint32_t vp9_skip_sub_pixel_tree(
+ const MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp,
+ int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop,
+ int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2],
+ uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w,
+ int h, int use_accurate_subpel_search) {
+ SETUP_SUBPEL_SEARCH;
+ besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, z,
+ src_stride, y, y_stride, second_pred, w, h,
+ offset, mvjcost, mvcost, sse1, distortion);
+ (void)halfiters;
+ (void)quarteriters;
+ (void)eighthiters;
+ (void)whichdir;
+ (void)allow_hp;
+ (void)forced_stop;
+ (void)hstep;
+ (void)rr;
+ (void)rc;
+ (void)minr;
+ (void)minc;
+ (void)maxr;
+ (void)maxc;
+ (void)tr;
+ (void)tc;
+ (void)sse;
+ (void)thismse;
+ (void)cost_list;
+ (void)use_accurate_subpel_search;
+
+ return besterr;
+}
+
+uint32_t vp9_find_best_sub_pixel_tree_pruned_evenmore(
+ const MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp,
+ int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop,
+ int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2],
+ uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w,
+ int h, int use_accurate_subpel_search) {
+ SETUP_SUBPEL_SEARCH;
+ besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, z,
+ src_stride, y, y_stride, second_pred, w, h,
+ offset, mvjcost, mvcost, sse1, distortion);
+ (void)halfiters;
+ (void)quarteriters;
+ (void)eighthiters;
+ (void)whichdir;
+ (void)allow_hp;
+ (void)forced_stop;
+ (void)hstep;
+ (void)use_accurate_subpel_search;
+
+ if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
+ cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
+ cost_list[4] != INT_MAX && is_cost_list_wellbehaved(cost_list)) {
+ int ir, ic;
+ unsigned int minpt = INT_MAX;
+ get_cost_surf_min(cost_list, &ir, &ic, 2);
+ if (ir != 0 || ic != 0) {
+ CHECK_BETTER(minpt, tr + 2 * ir, tc + 2 * ic);
+ }
+ } else {
+ FIRST_LEVEL_CHECKS;
+ if (halfiters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+
+ tr = br;
+ tc = bc;
+
+    // Each subsequent iteration checks at least one point in common with the
+    // last iteration (two points if the diagonal was selected).
+ // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
+ if (forced_stop != 2) {
+ hstep >>= 1;
+ FIRST_LEVEL_CHECKS;
+ if (quarteriters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ }
+ }
+
+ tr = br;
+ tc = bc;
+
+ if (allow_hp && use_mv_hp(ref_mv) && forced_stop == 0) {
+ hstep >>= 1;
+ FIRST_LEVEL_CHECKS;
+ if (eighthiters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ }
+
+ bestmv->row = br;
+ bestmv->col = bc;
+
+ return besterr;
+}
+
+uint32_t vp9_find_best_sub_pixel_tree_pruned_more(
+ const MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp,
+ int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop,
+ int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2],
+ uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w,
+ int h, int use_accurate_subpel_search) {
+ SETUP_SUBPEL_SEARCH;
+ (void)use_accurate_subpel_search;
+
+ besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, z,
+ src_stride, y, y_stride, second_pred, w, h,
+ offset, mvjcost, mvcost, sse1, distortion);
+ if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
+ cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
+ cost_list[4] != INT_MAX && is_cost_list_wellbehaved(cost_list)) {
+ unsigned int minpt;
+ int ir, ic;
+ get_cost_surf_min(cost_list, &ir, &ic, 1);
+ if (ir != 0 || ic != 0) {
+ CHECK_BETTER(minpt, tr + ir * hstep, tc + ic * hstep);
+ }
+ } else {
+ FIRST_LEVEL_CHECKS;
+ if (halfiters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ }
+
+  // Each subsequent iteration checks at least one point in common with the
+  // last iteration (two points if the diagonal was selected).
+
+ // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
+ if (forced_stop != 2) {
+ tr = br;
+ tc = bc;
+ hstep >>= 1;
+ FIRST_LEVEL_CHECKS;
+ if (quarteriters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ }
+
+ if (allow_hp && use_mv_hp(ref_mv) && forced_stop == 0) {
+ tr = br;
+ tc = bc;
+ hstep >>= 1;
+ FIRST_LEVEL_CHECKS;
+ if (eighthiters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ }
+  // These lines ensure static analysis doesn't warn that
+ // tr and tc aren't used after the above point.
+ (void)tr;
+ (void)tc;
+
+ bestmv->row = br;
+ bestmv->col = bc;
+
+ return besterr;
+}
+
+uint32_t vp9_find_best_sub_pixel_tree_pruned(
+ const MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp,
+ int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop,
+ int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2],
+ uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w,
+ int h, int use_accurate_subpel_search) {
+ SETUP_SUBPEL_SEARCH;
+ (void)use_accurate_subpel_search;
+
+ besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, z,
+ src_stride, y, y_stride, second_pred, w, h,
+ offset, mvjcost, mvcost, sse1, distortion);
+ if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
+ cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
+ cost_list[4] != INT_MAX) {
+ unsigned int left, right, up, down, diag;
+ whichdir = (cost_list[1] < cost_list[3] ? 0 : 1) +
+ (cost_list[2] < cost_list[4] ? 0 : 2);
+ switch (whichdir) {
+ case 0:
+ CHECK_BETTER(left, tr, tc - hstep);
+ CHECK_BETTER(down, tr + hstep, tc);
+ CHECK_BETTER(diag, tr + hstep, tc - hstep);
+ break;
+ case 1:
+ CHECK_BETTER(right, tr, tc + hstep);
+ CHECK_BETTER(down, tr + hstep, tc);
+ CHECK_BETTER(diag, tr + hstep, tc + hstep);
+ break;
+ case 2:
+ CHECK_BETTER(left, tr, tc - hstep);
+ CHECK_BETTER(up, tr - hstep, tc);
+ CHECK_BETTER(diag, tr - hstep, tc - hstep);
+ break;
+ case 3:
+ CHECK_BETTER(right, tr, tc + hstep);
+ CHECK_BETTER(up, tr - hstep, tc);
+ CHECK_BETTER(diag, tr - hstep, tc + hstep);
+ break;
+ }
+ } else {
+ FIRST_LEVEL_CHECKS;
+ if (halfiters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ }
+
+ tr = br;
+ tc = bc;
+
+  // Each subsequent iteration checks at least one point in common with the
+  // last iteration (two points if the diagonal was selected).
+
+ // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
+ if (forced_stop != 2) {
+ hstep >>= 1;
+ FIRST_LEVEL_CHECKS;
+ if (quarteriters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ tr = br;
+ tc = bc;
+ }
+
+ if (allow_hp && use_mv_hp(ref_mv) && forced_stop == 0) {
+ hstep >>= 1;
+ FIRST_LEVEL_CHECKS;
+ if (eighthiters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ tr = br;
+ tc = bc;
+ }
+  // These lines ensure static analysis doesn't warn that
+ // tr and tc aren't used after the above point.
+ (void)tr;
+ (void)tc;
+
+ bestmv->row = br;
+ bestmv->col = bc;
+
+ return besterr;
+}
+
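+// Candidate offsets, in 1/8-pel units, used by vp9_find_best_sub_pixel_tree:
+// one row of left/right/up/down steps per precision round (half, quarter and
+// eighth pel).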
+/* clang-format off */
+static const MV search_step_table[12] = {
+ // left, right, up, down
+ { 0, -4 }, { 0, 4 }, { -4, 0 }, { 4, 0 },
+ { 0, -2 }, { 0, 2 }, { -2, 0 }, { 2, 0 },
+ { 0, -1 }, { 0, 1 }, { -1, 0 }, { 1, 0 }
+};
+/* clang-format on */
+
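+// Measures the error of a candidate sub-pel MV by building a full inter
+// predictor with the requested interpolation kernel and taking its variance
+// against the source block; callers fall back to the variance table's
+// svf/svaf sub-pixel functions when the accurate path is disabled.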
+static int accurate_sub_pel_search(
+ const MACROBLOCKD *xd, const MV *this_mv, const struct scale_factors *sf,
+ const InterpKernel *kernel, const vp9_variance_fn_ptr_t *vfp,
+ const uint8_t *const src_address, const int src_stride,
+ const uint8_t *const pre_address, int y_stride, const uint8_t *second_pred,
+ int w, int h, uint32_t *sse) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ uint64_t besterr;
+ assert(sf->x_step_q4 == 16 && sf->y_step_q4 == 16);
+ assert(w != 0 && h != 0);
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ DECLARE_ALIGNED(16, uint16_t, pred16[64 * 64]);
+ vp9_highbd_build_inter_predictor(CONVERT_TO_SHORTPTR(pre_address), y_stride,
+ pred16, w, this_mv, sf, w, h, 0, kernel,
+ MV_PRECISION_Q3, 0, 0, xd->bd);
+ if (second_pred != NULL) {
+ DECLARE_ALIGNED(16, uint16_t, comp_pred16[64 * 64]);
+ vpx_highbd_comp_avg_pred(comp_pred16, CONVERT_TO_SHORTPTR(second_pred), w,
+ h, pred16, w);
+ besterr = vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src_address,
+ src_stride, sse);
+ } else {
+ besterr =
+ vfp->vf(CONVERT_TO_BYTEPTR(pred16), w, src_address, src_stride, sse);
+ }
+ } else {
+ DECLARE_ALIGNED(16, uint8_t, pred[64 * 64]);
+ vp9_build_inter_predictor(pre_address, y_stride, pred, w, this_mv, sf, w, h,
+ 0, kernel, MV_PRECISION_Q3, 0, 0);
+ if (second_pred != NULL) {
+ DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
+ vpx_comp_avg_pred(comp_pred, second_pred, w, h, pred, w);
+ besterr = vfp->vf(comp_pred, w, src_address, src_stride, sse);
+ } else {
+ besterr = vfp->vf(pred, w, src_address, src_stride, sse);
+ }
+ }
+ if (besterr >= UINT_MAX) return UINT_MAX;
+ return (int)besterr;
+#else
+ int besterr;
+ DECLARE_ALIGNED(16, uint8_t, pred[64 * 64]);
+ assert(sf->x_step_q4 == 16 && sf->y_step_q4 == 16);
+ assert(w != 0 && h != 0);
+ (void)xd;
+
+ vp9_build_inter_predictor(pre_address, y_stride, pred, w, this_mv, sf, w, h,
+ 0, kernel, MV_PRECISION_Q3, 0, 0);
+ if (second_pred != NULL) {
+ DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
+ vpx_comp_avg_pred(comp_pred, second_pred, w, h, pred, w);
+ besterr = vfp->vf(comp_pred, w, src_address, src_stride, sse);
+ } else {
+ besterr = vfp->vf(pred, w, src_address, src_stride, sse);
+ }
+ return besterr;
+#endif // CONFIG_VP9_HIGHBITDEPTH
+}
+
+// TODO(yunqing): this part can be further refactored.
+#if CONFIG_VP9_HIGHBITDEPTH
+/* checks if (r, c) has better score than previous best */
+#define CHECK_BETTER1(v, r, c) \
+ do { \
+ if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \
+ int64_t tmpmse; \
+ const MV cb_mv = { r, c }; \
+ const MV cb_ref_mv = { rr, rc }; \
+ thismse = accurate_sub_pel_search(xd, &cb_mv, x->me_sf, kernel, vfp, z, \
+ src_stride, y, y_stride, second_pred, \
+ w, h, &sse); \
+ tmpmse = thismse; \
+ tmpmse += \
+ mv_err_cost(&cb_mv, &cb_ref_mv, mvjcost, mvcost, error_per_bit); \
+ if (tmpmse >= INT_MAX) { \
+ v = INT_MAX; \
+ } else if ((v = (uint32_t)tmpmse) < besterr) { \
+ besterr = v; \
+ br = r; \
+ bc = c; \
+ *distortion = thismse; \
+ *sse1 = sse; \
+ } \
+ } else { \
+ v = INT_MAX; \
+ } \
+ } while (0)
+#else
+/* checks if (r, c) has better score than previous best */
+#define CHECK_BETTER1(v, r, c) \
+ do { \
+ if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \
+ const MV cb_mv = { r, c }; \
+ const MV cb_ref_mv = { rr, rc }; \
+ thismse = accurate_sub_pel_search(xd, &cb_mv, x->me_sf, kernel, vfp, z, \
+ src_stride, y, y_stride, second_pred, \
+ w, h, &sse); \
+ if ((v = mv_err_cost(&cb_mv, &cb_ref_mv, mvjcost, mvcost, \
+ error_per_bit) + \
+ thismse) < besterr) { \
+ besterr = v; \
+ br = r; \
+ bc = c; \
+ *distortion = thismse; \
+ *sse1 = sse; \
+ } \
+ } else { \
+ v = INT_MAX; \
+ } \
+ } while (0)
+
+#endif
+
+uint32_t vp9_find_best_sub_pixel_tree(
+ const MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp,
+ int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop,
+ int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2],
+ uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w,
+ int h, int use_accurate_subpel_search) {
+ const uint8_t *const z = x->plane[0].src.buf;
+ const uint8_t *const src_address = z;
+ const int src_stride = x->plane[0].src.stride;
+ const MACROBLOCKD *xd = &x->e_mbd;
+ unsigned int besterr = UINT_MAX;
+ unsigned int sse;
+ int thismse;
+ const int y_stride = xd->plane[0].pre[0].stride;
+ const int offset = bestmv->row * y_stride + bestmv->col;
+ const uint8_t *const y = xd->plane[0].pre[0].buf;
+
+ int rr = ref_mv->row;
+ int rc = ref_mv->col;
+ int br = bestmv->row * 8;
+ int bc = bestmv->col * 8;
+ int hstep = 4;
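+  // Up to three precision rounds (half, quarter, eighth pel); forced_stop
+  // trims rounds from the finest end, and the high-precision check below
+  // drops the eighth-pel round when it is not allowed.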
+ int iter, round = 3 - forced_stop;
+
+ int minc, maxc, minr, maxr;
+ int tr = br;
+ int tc = bc;
+ const MV *search_step = search_step_table;
+ int idx, best_idx = -1;
+ unsigned int cost_array[5];
+ int kr, kc;
+ MvLimits subpel_mv_limits;
+
+ // TODO(yunqing): need to add 4-tap filter optimization to speed up the
+ // encoder.
+ const InterpKernel *kernel =
+ (use_accurate_subpel_search > 0)
+ ? ((use_accurate_subpel_search == USE_4_TAPS)
+ ? vp9_filter_kernels[FOURTAP]
+ : ((use_accurate_subpel_search == USE_8_TAPS)
+ ? vp9_filter_kernels[EIGHTTAP]
+ : vp9_filter_kernels[EIGHTTAP_SHARP]))
+ : vp9_filter_kernels[BILINEAR];
+
+ vp9_set_subpel_mv_search_range(&subpel_mv_limits, &x->mv_limits, ref_mv);
+ minc = subpel_mv_limits.col_min;
+ maxc = subpel_mv_limits.col_max;
+ minr = subpel_mv_limits.row_min;
+ maxr = subpel_mv_limits.row_max;
+
+ if (!(allow_hp && use_mv_hp(ref_mv)))
+ if (round == 3) round = 2;
+
+ bestmv->row *= 8;
+ bestmv->col *= 8;
+
+ besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, z,
+ src_stride, y, y_stride, second_pred, w, h,
+ offset, mvjcost, mvcost, sse1, distortion);
+
+ (void)cost_list; // to silence compiler warning
+
+ for (iter = 0; iter < round; ++iter) {
+ // Check vertical and horizontal sub-pixel positions.
+ for (idx = 0; idx < 4; ++idx) {
+ tr = br + search_step[idx].row;
+ tc = bc + search_step[idx].col;
+ if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
+ MV this_mv;
+ this_mv.row = tr;
+ this_mv.col = tc;
+
+ if (use_accurate_subpel_search) {
+ thismse = accurate_sub_pel_search(xd, &this_mv, x->me_sf, kernel, vfp,
+ src_address, src_stride, y,
+ y_stride, second_pred, w, h, &sse);
+ } else {
+ const uint8_t *const pre_address =
+ y + (tr >> 3) * y_stride + (tc >> 3);
+ if (second_pred == NULL)
+ thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr),
+ src_address, src_stride, &sse);
+ else
+ thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr),
+ src_address, src_stride, &sse, second_pred);
+ }
+
+ cost_array[idx] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost,
+ mvcost, error_per_bit);
+
+ if (cost_array[idx] < besterr) {
+ best_idx = idx;
+ besterr = cost_array[idx];
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+ } else {
+ cost_array[idx] = UINT_MAX;
+ }
+ }
+
+ // Check diagonal sub-pixel position
+ kc = (cost_array[0] <= cost_array[1] ? -hstep : hstep);
+ kr = (cost_array[2] <= cost_array[3] ? -hstep : hstep);
+
+ tc = bc + kc;
+ tr = br + kr;
+ if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
+ MV this_mv = { tr, tc };
+ if (use_accurate_subpel_search) {
+ thismse = accurate_sub_pel_search(xd, &this_mv, x->me_sf, kernel, vfp,
+ src_address, src_stride, y, y_stride,
+ second_pred, w, h, &sse);
+ } else {
+ const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3);
+ if (second_pred == NULL)
+ thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr), src_address,
+ src_stride, &sse);
+ else
+ thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr),
+ src_address, src_stride, &sse, second_pred);
+ }
+
+ cost_array[4] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost,
+ error_per_bit);
+
+ if (cost_array[4] < besterr) {
+ best_idx = 4;
+ besterr = cost_array[4];
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+ } else {
+ cost_array[idx] = UINT_MAX;
+ }
+
+ if (best_idx < 4 && best_idx >= 0) {
+ br += search_step[best_idx].row;
+ bc += search_step[best_idx].col;
+ } else if (best_idx == 4) {
+ br = tr;
+ bc = tc;
+ }
+
+ if (iters_per_step > 0 && best_idx != -1) {
+ unsigned int second;
+ const int br0 = br;
+ const int bc0 = bc;
+ assert(tr == br || tc == bc);
+
+ if (tr == br && tc != bc) {
+ kc = bc - tc;
+ if (iters_per_step == 1) {
+ if (use_accurate_subpel_search) {
+ CHECK_BETTER1(second, br0, bc0 + kc);
+ } else {
+ CHECK_BETTER(second, br0, bc0 + kc);
+ }
+ }
+ } else if (tr != br && tc == bc) {
+ kr = br - tr;
+ if (iters_per_step == 1) {
+ if (use_accurate_subpel_search) {
+ CHECK_BETTER1(second, br0 + kr, bc0);
+ } else {
+ CHECK_BETTER(second, br0 + kr, bc0);
+ }
+ }
+ }
+
+ if (iters_per_step > 1) {
+ if (use_accurate_subpel_search) {
+ CHECK_BETTER1(second, br0 + kr, bc0);
+ CHECK_BETTER1(second, br0, bc0 + kc);
+ if (br0 != br || bc0 != bc) {
+ CHECK_BETTER1(second, br0 + kr, bc0 + kc);
+ }
+ } else {
+ CHECK_BETTER(second, br0 + kr, bc0);
+ CHECK_BETTER(second, br0, bc0 + kc);
+ if (br0 != br || bc0 != bc) {
+ CHECK_BETTER(second, br0 + kr, bc0 + kc);
+ }
+ }
+ }
+ }
+
+ search_step += 4;
+ hstep >>= 1;
+ best_idx = -1;
+ }
+
+  // Each subsequent iteration checks at least one point in common with the
+  // last iteration (two points if the diagonal was selected).
+
+  // These lines ensure static analysis doesn't warn that
+ // tr and tc aren't used after the above point.
+ (void)tr;
+ (void)tc;
+
+ bestmv->row = br;
+ bestmv->col = bc;
+
+ return besterr;
+}
+
+#undef CHECK_BETTER
+#undef CHECK_BETTER1
+
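+// Returns 1 if every position within +/-range of (row, col) lies inside the
+// MV limits; bitwise & is used so all four comparisons are evaluated without
+// branching.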
+static INLINE int check_bounds(const MvLimits *mv_limits, int row, int col,
+ int range) {
+ return ((row - range) >= mv_limits->row_min) &
+ ((row + range) <= mv_limits->row_max) &
+ ((col - range) >= mv_limits->col_min) &
+ ((col + range) <= mv_limits->col_max);
+}
+
+static INLINE int is_mv_in(const MvLimits *mv_limits, const MV *mv) {
+ return (mv->col >= mv_limits->col_min) && (mv->col <= mv_limits->col_max) &&
+ (mv->row >= mv_limits->row_min) && (mv->row <= mv_limits->row_max);
+}
+
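+// Pattern-search helper: accepts candidate i when its SAD (plus, optionally,
+// the MV cost) beats the current best. The MV cost is only added once the
+// raw SAD alone is already better, skipping the cost computation for clear
+// losers.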
+#define CHECK_BETTER \
+ { \
+ if (thissad < bestsad) { \
+ if (use_mvcost) \
+ thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit); \
+ if (thissad < bestsad) { \
+ bestsad = thissad; \
+ best_site = i; \
+ } \
+ } \
+ }
+
+#define MAX_PATTERN_SCALES 11
+#define MAX_PATTERN_CANDIDATES 8  // max number of candidates per scale
+#define PATTERN_CANDIDATES_REF 3 // number of refinement candidates
+
+// Calculate a cost list (variance + mv cost) around an integer best pel.
+static INLINE void calc_int_cost_list(const MACROBLOCK *x, const MV *ref_mv,
+ int sadpb,
+ const vp9_variance_fn_ptr_t *fn_ptr,
+ const MV *best_mv, int *cost_list) {
+ static const MV neighbors[4] = { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 } };
+ const struct buf_2d *const what = &x->plane[0].src;
+ const struct buf_2d *const in_what = &x->e_mbd.plane[0].pre[0];
+ const MV fcenter_mv = { ref_mv->row >> 3, ref_mv->col >> 3 };
+ int br = best_mv->row;
+ int bc = best_mv->col;
+ const MV mv = { br, bc };
+ int i;
+ unsigned int sse;
+
+ cost_list[0] =
+ fn_ptr->vf(what->buf, what->stride, get_buf_from_mv(in_what, &mv),
+ in_what->stride, &sse) +
+ mvsad_err_cost(x, &mv, &fcenter_mv, sadpb);
+ if (check_bounds(&x->mv_limits, br, bc, 1)) {
+ for (i = 0; i < 4; i++) {
+ const MV this_mv = { br + neighbors[i].row, bc + neighbors[i].col };
+ cost_list[i + 1] = fn_ptr->vf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv),
+ in_what->stride, &sse) +
+ mv_err_cost(&this_mv, &fcenter_mv, x->nmvjointcost,
+ x->mvcost, x->errorperbit);
+ }
+ } else {
+ for (i = 0; i < 4; i++) {
+ const MV this_mv = { br + neighbors[i].row, bc + neighbors[i].col };
+ if (!is_mv_in(&x->mv_limits, &this_mv))
+ cost_list[i + 1] = INT_MAX;
+ else
+ cost_list[i + 1] = fn_ptr->vf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv),
+ in_what->stride, &sse) +
+ mv_err_cost(&this_mv, &fcenter_mv, x->nmvjointcost,
+ x->mvcost, x->errorperbit);
+ }
+ }
+}
+
+// Generic pattern search function that searches over multiple scales.
+// Each scale can have a different number of candidates and shape of
+// candidates as indicated in the num_candidates and candidates arrays
+// passed into this function
+//
+static int vp9_pattern_search(
+ const MACROBLOCK *x, MV *ref_mv, int search_param, int sad_per_bit,
+ int do_init_search, int *cost_list, const vp9_variance_fn_ptr_t *vfp,
+ int use_mvcost, const MV *center_mv, MV *best_mv,
+ const int num_candidates[MAX_PATTERN_SCALES],
+ const MV candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES]) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ static const int search_param_to_steps[MAX_MVSEARCH_STEPS] = {
+ 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ };
+ int i, s, t;
+ const struct buf_2d *const what = &x->plane[0].src;
+ const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+ int br, bc;
+ int bestsad = INT_MAX;
+ int thissad;
+ int k = -1;
+ const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };
+ int best_init_s = search_param_to_steps[search_param];
+ // adjust ref_mv to make sure it is within MV range
+ clamp_mv(ref_mv, x->mv_limits.col_min, x->mv_limits.col_max,
+ x->mv_limits.row_min, x->mv_limits.row_max);
+ br = ref_mv->row;
+ bc = ref_mv->col;
+
+ // Work out the start point for the search
+ bestsad = vfp->sdf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv),
+ in_what->stride) +
+ mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
+
+  // Search all possible scales up to the search param around the center point
+  // and pick the scale of the best point as the starting scale for further
+  // steps around it.
+ if (do_init_search) {
+ s = best_init_s;
+ best_init_s = -1;
+ for (t = 0; t <= s; ++t) {
+ int best_site = -1;
+ if (check_bounds(&x->mv_limits, br, bc, 1 << t)) {
+ for (i = 0; i < num_candidates[t]; i++) {
+ const MV this_mv = { br + candidates[t][i].row,
+ bc + candidates[t][i].col };
+ thissad =
+ vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ CHECK_BETTER
+ }
+ } else {
+ for (i = 0; i < num_candidates[t]; i++) {
+ const MV this_mv = { br + candidates[t][i].row,
+ bc + candidates[t][i].col };
+ if (!is_mv_in(&x->mv_limits, &this_mv)) continue;
+ thissad =
+ vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ CHECK_BETTER
+ }
+ }
+ if (best_site == -1) {
+ continue;
+ } else {
+ best_init_s = t;
+ k = best_site;
+ }
+ }
+ if (best_init_s != -1) {
+ br += candidates[best_init_s][k].row;
+ bc += candidates[best_init_s][k].col;
+ }
+ }
+
+ // If the center point is still the best, just skip this and move to
+ // the refinement step.
+ if (best_init_s != -1) {
+ int best_site = -1;
+ s = best_init_s;
+
+ do {
+      // No need to search all points the 1st time if initial search was used
+ if (!do_init_search || s != best_init_s) {
+ if (check_bounds(&x->mv_limits, br, bc, 1 << s)) {
+ for (i = 0; i < num_candidates[s]; i++) {
+ const MV this_mv = { br + candidates[s][i].row,
+ bc + candidates[s][i].col };
+ thissad =
+ vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ CHECK_BETTER
+ }
+ } else {
+ for (i = 0; i < num_candidates[s]; i++) {
+ const MV this_mv = { br + candidates[s][i].row,
+ bc + candidates[s][i].col };
+ if (!is_mv_in(&x->mv_limits, &this_mv)) continue;
+ thissad =
+ vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ CHECK_BETTER
+ }
+ }
+
+ if (best_site == -1) {
+ continue;
+ } else {
+ br += candidates[s][best_site].row;
+ bc += candidates[s][best_site].col;
+ k = best_site;
+ }
+ }
+
+ do {
+ int next_chkpts_indices[PATTERN_CANDIDATES_REF];
+ best_site = -1;
+ next_chkpts_indices[0] = (k == 0) ? num_candidates[s] - 1 : k - 1;
+ next_chkpts_indices[1] = k;
+ next_chkpts_indices[2] = (k == num_candidates[s] - 1) ? 0 : k + 1;
+
+ if (check_bounds(&x->mv_limits, br, bc, 1 << s)) {
+ for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
+ const MV this_mv = {
+ br + candidates[s][next_chkpts_indices[i]].row,
+ bc + candidates[s][next_chkpts_indices[i]].col
+ };
+ thissad =
+ vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ CHECK_BETTER
+ }
+ } else {
+ for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
+ const MV this_mv = {
+ br + candidates[s][next_chkpts_indices[i]].row,
+ bc + candidates[s][next_chkpts_indices[i]].col
+ };
+ if (!is_mv_in(&x->mv_limits, &this_mv)) continue;
+ thissad =
+ vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ CHECK_BETTER
+ }
+ }
+
+ if (best_site != -1) {
+ k = next_chkpts_indices[best_site];
+ br += candidates[s][k].row;
+ bc += candidates[s][k].col;
+ }
+ } while (best_site != -1);
+ } while (s--);
+ }
+
+ best_mv->row = br;
+ best_mv->col = bc;
+
+ // Returns the one-away integer pel sad values around the best as follows:
+ // cost_list[0]: cost at the best integer pel
+ // cost_list[1]: cost at delta {0, -1} (left) from the best integer pel
+ // cost_list[2]: cost at delta { 1, 0} (bottom) from the best integer pel
+ // cost_list[3]: cost at delta { 0, 1} (right) from the best integer pel
+ // cost_list[4]: cost at delta {-1, 0} (top) from the best integer pel
+ if (cost_list) {
+ calc_int_cost_list(x, &fcenter_mv, sad_per_bit, vfp, best_mv, cost_list);
+ }
+ return bestsad;
+}
+
+// A specialized function where the smallest scale search candidates
+// are 4 1-away neighbors, and cost_list is non-null
+// TODO(debargha): Merge this function with the one above. Also remove
+// use_mvcost option since it is always 1, to save unnecessary branches.
+static int vp9_pattern_search_sad(
+ const MACROBLOCK *x, MV *ref_mv, int search_param, int sad_per_bit,
+ int do_init_search, int *cost_list, const vp9_variance_fn_ptr_t *vfp,
+ int use_mvcost, const MV *center_mv, MV *best_mv,
+ const int num_candidates[MAX_PATTERN_SCALES],
+ const MV candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES]) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ static const int search_param_to_steps[MAX_MVSEARCH_STEPS] = {
+ 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ };
+ int i, s, t;
+ const struct buf_2d *const what = &x->plane[0].src;
+ const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+ int br, bc;
+ int bestsad = INT_MAX;
+ int thissad;
+ int k = -1;
+ const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };
+ int best_init_s = search_param_to_steps[search_param];
+ // adjust ref_mv to make sure it is within MV range
+ clamp_mv(ref_mv, x->mv_limits.col_min, x->mv_limits.col_max,
+ x->mv_limits.row_min, x->mv_limits.row_max);
+ br = ref_mv->row;
+ bc = ref_mv->col;
+ if (cost_list != NULL) {
+ cost_list[0] = cost_list[1] = cost_list[2] = cost_list[3] = cost_list[4] =
+ INT_MAX;
+ }
+
+ // Work out the start point for the search
+ bestsad = vfp->sdf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv),
+ in_what->stride) +
+ mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
+
+  // Search all possible scales up to the search param around the center point
+  // and pick the scale of the best point as the starting scale for further
+  // steps around it.
+ if (do_init_search) {
+ s = best_init_s;
+ best_init_s = -1;
+ for (t = 0; t <= s; ++t) {
+ int best_site = -1;
+ if (check_bounds(&x->mv_limits, br, bc, 1 << t)) {
+ for (i = 0; i < num_candidates[t]; i++) {
+ const MV this_mv = { br + candidates[t][i].row,
+ bc + candidates[t][i].col };
+ thissad =
+ vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ CHECK_BETTER
+ }
+ } else {
+ for (i = 0; i < num_candidates[t]; i++) {
+ const MV this_mv = { br + candidates[t][i].row,
+ bc + candidates[t][i].col };
+ if (!is_mv_in(&x->mv_limits, &this_mv)) continue;
+ thissad =
+ vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ CHECK_BETTER
+ }
+ }
+ if (best_site == -1) {
+ continue;
+ } else {
+ best_init_s = t;
+ k = best_site;
+ }
+ }
+ if (best_init_s != -1) {
+ br += candidates[best_init_s][k].row;
+ bc += candidates[best_init_s][k].col;
+ }
+ }
+
+ // If the center point is still the best, just skip this and move to
+ // the refinement step.
+ if (best_init_s != -1) {
+ int do_sad = (num_candidates[0] == 4 && cost_list != NULL);
+ int best_site = -1;
+ s = best_init_s;
+
+ for (; s >= do_sad; s--) {
+ if (!do_init_search || s != best_init_s) {
+ if (check_bounds(&x->mv_limits, br, bc, 1 << s)) {
+ for (i = 0; i < num_candidates[s]; i++) {
+ const MV this_mv = { br + candidates[s][i].row,
+ bc + candidates[s][i].col };
+ thissad =
+ vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ CHECK_BETTER
+ }
+ } else {
+ for (i = 0; i < num_candidates[s]; i++) {
+ const MV this_mv = { br + candidates[s][i].row,
+ bc + candidates[s][i].col };
+ if (!is_mv_in(&x->mv_limits, &this_mv)) continue;
+ thissad =
+ vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ CHECK_BETTER
+ }
+ }
+
+ if (best_site == -1) {
+ continue;
+ } else {
+ br += candidates[s][best_site].row;
+ bc += candidates[s][best_site].col;
+ k = best_site;
+ }
+ }
+
+ do {
+ int next_chkpts_indices[PATTERN_CANDIDATES_REF];
+ best_site = -1;
+ next_chkpts_indices[0] = (k == 0) ? num_candidates[s] - 1 : k - 1;
+ next_chkpts_indices[1] = k;
+ next_chkpts_indices[2] = (k == num_candidates[s] - 1) ? 0 : k + 1;
+
+ if (check_bounds(&x->mv_limits, br, bc, 1 << s)) {
+ for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
+ const MV this_mv = {
+ br + candidates[s][next_chkpts_indices[i]].row,
+ bc + candidates[s][next_chkpts_indices[i]].col
+ };
+ thissad =
+ vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ CHECK_BETTER
+ }
+ } else {
+ for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
+ const MV this_mv = {
+ br + candidates[s][next_chkpts_indices[i]].row,
+ bc + candidates[s][next_chkpts_indices[i]].col
+ };
+ if (!is_mv_in(&x->mv_limits, &this_mv)) continue;
+ thissad =
+ vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ CHECK_BETTER
+ }
+ }
+
+ if (best_site != -1) {
+ k = next_chkpts_indices[best_site];
+ br += candidates[s][k].row;
+ bc += candidates[s][k].col;
+ }
+ } while (best_site != -1);
+ }
+
+ // Note: If we enter the if below, then cost_list must be non-NULL.
+ if (s == 0) {
+ cost_list[0] = bestsad;
+ if (!do_init_search || s != best_init_s) {
+ if (check_bounds(&x->mv_limits, br, bc, 1 << s)) {
+ for (i = 0; i < num_candidates[s]; i++) {
+ const MV this_mv = { br + candidates[s][i].row,
+ bc + candidates[s][i].col };
+ cost_list[i + 1] = thissad =
+ vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ CHECK_BETTER
+ }
+ } else {
+ for (i = 0; i < num_candidates[s]; i++) {
+ const MV this_mv = { br + candidates[s][i].row,
+ bc + candidates[s][i].col };
+ if (!is_mv_in(&x->mv_limits, &this_mv)) continue;
+ cost_list[i + 1] = thissad =
+ vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ CHECK_BETTER
+ }
+ }
+
+ if (best_site != -1) {
+ br += candidates[s][best_site].row;
+ bc += candidates[s][best_site].col;
+ k = best_site;
+ }
+ }
+ while (best_site != -1) {
+ int next_chkpts_indices[PATTERN_CANDIDATES_REF];
+ best_site = -1;
+ next_chkpts_indices[0] = (k == 0) ? num_candidates[s] - 1 : k - 1;
+ next_chkpts_indices[1] = k;
+ next_chkpts_indices[2] = (k == num_candidates[s] - 1) ? 0 : k + 1;
+ cost_list[1] = cost_list[2] = cost_list[3] = cost_list[4] = INT_MAX;
+ cost_list[((k + 2) % 4) + 1] = cost_list[0];
+ cost_list[0] = bestsad;
+
+ if (check_bounds(&x->mv_limits, br, bc, 1 << s)) {
+ for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
+ const MV this_mv = {
+ br + candidates[s][next_chkpts_indices[i]].row,
+ bc + candidates[s][next_chkpts_indices[i]].col
+ };
+ cost_list[next_chkpts_indices[i] + 1] = thissad =
+ vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ CHECK_BETTER
+ }
+ } else {
+ for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
+ const MV this_mv = {
+ br + candidates[s][next_chkpts_indices[i]].row,
+ bc + candidates[s][next_chkpts_indices[i]].col
+ };
+ if (!is_mv_in(&x->mv_limits, &this_mv)) {
+ cost_list[next_chkpts_indices[i] + 1] = INT_MAX;
+ continue;
+ }
+ cost_list[next_chkpts_indices[i] + 1] = thissad =
+ vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ CHECK_BETTER
+ }
+ }
+
+ if (best_site != -1) {
+ k = next_chkpts_indices[best_site];
+ br += candidates[s][k].row;
+ bc += candidates[s][k].col;
+ }
+ }
+ }
+ }
+
+ // Returns the one-away integer pel sad values around the best as follows:
+ // cost_list[0]: sad at the best integer pel
+ // cost_list[1]: sad at delta {0, -1} (left) from the best integer pel
+ // cost_list[2]: sad at delta { 1, 0} (bottom) from the best integer pel
+ // cost_list[3]: sad at delta { 0, 1} (right) from the best integer pel
+ // cost_list[4]: sad at delta {-1, 0} (top) from the best integer pel
+ if (cost_list) {
+ static const MV neighbors[4] = { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 } };
+ if (cost_list[0] == INT_MAX) {
+ cost_list[0] = bestsad;
+ if (check_bounds(&x->mv_limits, br, bc, 1)) {
+ for (i = 0; i < 4; i++) {
+ const MV this_mv = { br + neighbors[i].row, bc + neighbors[i].col };
+ cost_list[i + 1] =
+ vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ }
+ } else {
+ for (i = 0; i < 4; i++) {
+ const MV this_mv = { br + neighbors[i].row, bc + neighbors[i].col };
+ if (!is_mv_in(&x->mv_limits, &this_mv))
+ cost_list[i + 1] = INT_MAX;
+ else
+ cost_list[i + 1] =
+ vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ }
+ }
+ } else {
+ if (use_mvcost) {
+ for (i = 0; i < 4; i++) {
+ const MV this_mv = { br + neighbors[i].row, bc + neighbors[i].col };
+ if (cost_list[i + 1] != INT_MAX) {
+ cost_list[i + 1] +=
+ mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
+ }
+ }
+ }
+ }
+ }
+ best_mv->row = br;
+ best_mv->col = bc;
+ return bestsad;
+}
+
+int vp9_get_mvpred_var(const MACROBLOCK *x, const MV *best_mv,
+ const MV *center_mv, const vp9_variance_fn_ptr_t *vfp,
+ int use_mvcost) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct buf_2d *const what = &x->plane[0].src;
+ const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+ const MV mv = { best_mv->row * 8, best_mv->col * 8 };
+ uint32_t unused;
+#if CONFIG_VP9_HIGHBITDEPTH
+ uint64_t err =
+ vfp->vf(what->buf, what->stride, get_buf_from_mv(in_what, best_mv),
+ in_what->stride, &unused);
+ err += (use_mvcost ? mv_err_cost(&mv, center_mv, x->nmvjointcost, x->mvcost,
+ x->errorperbit)
+ : 0);
+ if (err >= INT_MAX) return INT_MAX;
+ return (int)err;
+#else
+ return vfp->vf(what->buf, what->stride, get_buf_from_mv(in_what, best_mv),
+ in_what->stride, &unused) +
+ (use_mvcost ? mv_err_cost(&mv, center_mv, x->nmvjointcost, x->mvcost,
+ x->errorperbit)
+ : 0);
+#endif
+}
+
+int vp9_get_mvpred_av_var(const MACROBLOCK *x, const MV *best_mv,
+ const MV *center_mv, const uint8_t *second_pred,
+ const vp9_variance_fn_ptr_t *vfp, int use_mvcost) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct buf_2d *const what = &x->plane[0].src;
+ const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+ const MV mv = { best_mv->row * 8, best_mv->col * 8 };
+ unsigned int unused;
+
+ return vfp->svaf(get_buf_from_mv(in_what, best_mv), in_what->stride, 0, 0,
+ what->buf, what->stride, &unused, second_pred) +
+ (use_mvcost ? mv_err_cost(&mv, center_mv, x->nmvjointcost, x->mvcost,
+ x->errorperbit)
+ : 0);
+}
+
+static int hex_search(const MACROBLOCK *x, MV *ref_mv, int search_param,
+ int sad_per_bit, int do_init_search, int *cost_list,
+ const vp9_variance_fn_ptr_t *vfp, int use_mvcost,
+ const MV *center_mv, MV *best_mv) {
+  // The first scale has the 8 closest points, the rest have 6 points in a
+  // hex shape at increasing scales
+ static const int hex_num_candidates[MAX_PATTERN_SCALES] = { 8, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6 };
+ // Note that the largest candidate step at each scale is 2^scale
+ /* clang-format off */
+ static const MV hex_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = {
+ { { -1, -1 }, { 0, -1 }, { 1, -1 }, { 1, 0 }, { 1, 1 }, { 0, 1 }, { -1, 1 },
+ { -1, 0 } },
+ { { -1, -2 }, { 1, -2 }, { 2, 0 }, { 1, 2 }, { -1, 2 }, { -2, 0 } },
+ { { -2, -4 }, { 2, -4 }, { 4, 0 }, { 2, 4 }, { -2, 4 }, { -4, 0 } },
+ { { -4, -8 }, { 4, -8 }, { 8, 0 }, { 4, 8 }, { -4, 8 }, { -8, 0 } },
+ { { -8, -16 }, { 8, -16 }, { 16, 0 }, { 8, 16 }, { -8, 16 }, { -16, 0 } },
+ { { -16, -32 }, { 16, -32 }, { 32, 0 }, { 16, 32 }, { -16, 32 },
+ { -32, 0 } },
+ { { -32, -64 }, { 32, -64 }, { 64, 0 }, { 32, 64 }, { -32, 64 },
+ { -64, 0 } },
+ { { -64, -128 }, { 64, -128 }, { 128, 0 }, { 64, 128 }, { -64, 128 },
+ { -128, 0 } },
+ { { -128, -256 }, { 128, -256 }, { 256, 0 }, { 128, 256 }, { -128, 256 },
+ { -256, 0 } },
+ { { -256, -512 }, { 256, -512 }, { 512, 0 }, { 256, 512 }, { -256, 512 },
+ { -512, 0 } },
+ { { -512, -1024 }, { 512, -1024 }, { 1024, 0 }, { 512, 1024 },
+ { -512, 1024 }, { -1024, 0 } }
+ };
+ /* clang-format on */
+ return vp9_pattern_search(
+ x, ref_mv, search_param, sad_per_bit, do_init_search, cost_list, vfp,
+ use_mvcost, center_mv, best_mv, hex_num_candidates, hex_candidates);
+}
+
+static int bigdia_search(const MACROBLOCK *x, MV *ref_mv, int search_param,
+ int sad_per_bit, int do_init_search, int *cost_list,
+ const vp9_variance_fn_ptr_t *vfp, int use_mvcost,
+ const MV *center_mv, MV *best_mv) {
+  // The first scale has the 4 closest points, the rest have 8 points in a
+  // diamond shape at increasing scales
+ static const int bigdia_num_candidates[MAX_PATTERN_SCALES] = {
+ 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ };
+ // Note that the largest candidate step at each scale is 2^scale
+ /* clang-format off */
+ static const MV
+ bigdia_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = {
+ { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 } },
+ { { -1, -1 }, { 0, -2 }, { 1, -1 }, { 2, 0 }, { 1, 1 }, { 0, 2 },
+ { -1, 1 }, { -2, 0 } },
+ { { -2, -2 }, { 0, -4 }, { 2, -2 }, { 4, 0 }, { 2, 2 }, { 0, 4 },
+ { -2, 2 }, { -4, 0 } },
+ { { -4, -4 }, { 0, -8 }, { 4, -4 }, { 8, 0 }, { 4, 4 }, { 0, 8 },
+ { -4, 4 }, { -8, 0 } },
+ { { -8, -8 }, { 0, -16 }, { 8, -8 }, { 16, 0 }, { 8, 8 }, { 0, 16 },
+ { -8, 8 }, { -16, 0 } },
+ { { -16, -16 }, { 0, -32 }, { 16, -16 }, { 32, 0 }, { 16, 16 },
+ { 0, 32 }, { -16, 16 }, { -32, 0 } },
+ { { -32, -32 }, { 0, -64 }, { 32, -32 }, { 64, 0 }, { 32, 32 },
+ { 0, 64 }, { -32, 32 }, { -64, 0 } },
+ { { -64, -64 }, { 0, -128 }, { 64, -64 }, { 128, 0 }, { 64, 64 },
+ { 0, 128 }, { -64, 64 }, { -128, 0 } },
+ { { -128, -128 }, { 0, -256 }, { 128, -128 }, { 256, 0 }, { 128, 128 },
+ { 0, 256 }, { -128, 128 }, { -256, 0 } },
+ { { -256, -256 }, { 0, -512 }, { 256, -256 }, { 512, 0 }, { 256, 256 },
+ { 0, 512 }, { -256, 256 }, { -512, 0 } },
+ { { -512, -512 }, { 0, -1024 }, { 512, -512 }, { 1024, 0 },
+ { 512, 512 }, { 0, 1024 }, { -512, 512 }, { -1024, 0 } }
+ };
+ /* clang-format on */
+ return vp9_pattern_search_sad(
+ x, ref_mv, search_param, sad_per_bit, do_init_search, cost_list, vfp,
+ use_mvcost, center_mv, best_mv, bigdia_num_candidates, bigdia_candidates);
+}
+
+static int square_search(const MACROBLOCK *x, MV *ref_mv, int search_param,
+ int sad_per_bit, int do_init_search, int *cost_list,
+ const vp9_variance_fn_ptr_t *vfp, int use_mvcost,
+ const MV *center_mv, MV *best_mv) {
+ // All scales have 8 closest points in square shape
+ static const int square_num_candidates[MAX_PATTERN_SCALES] = {
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ };
+ // Note that the largest candidate step at each scale is 2^scale
+ /* clang-format off */
+ static const MV
+ square_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = {
+ { { -1, -1 }, { 0, -1 }, { 1, -1 }, { 1, 0 }, { 1, 1 }, { 0, 1 },
+ { -1, 1 }, { -1, 0 } },
+ { { -2, -2 }, { 0, -2 }, { 2, -2 }, { 2, 0 }, { 2, 2 }, { 0, 2 },
+ { -2, 2 }, { -2, 0 } },
+ { { -4, -4 }, { 0, -4 }, { 4, -4 }, { 4, 0 }, { 4, 4 }, { 0, 4 },
+ { -4, 4 }, { -4, 0 } },
+ { { -8, -8 }, { 0, -8 }, { 8, -8 }, { 8, 0 }, { 8, 8 }, { 0, 8 },
+ { -8, 8 }, { -8, 0 } },
+ { { -16, -16 }, { 0, -16 }, { 16, -16 }, { 16, 0 }, { 16, 16 },
+ { 0, 16 }, { -16, 16 }, { -16, 0 } },
+ { { -32, -32 }, { 0, -32 }, { 32, -32 }, { 32, 0 }, { 32, 32 },
+ { 0, 32 }, { -32, 32 }, { -32, 0 } },
+ { { -64, -64 }, { 0, -64 }, { 64, -64 }, { 64, 0 }, { 64, 64 },
+ { 0, 64 }, { -64, 64 }, { -64, 0 } },
+ { { -128, -128 }, { 0, -128 }, { 128, -128 }, { 128, 0 }, { 128, 128 },
+ { 0, 128 }, { -128, 128 }, { -128, 0 } },
+ { { -256, -256 }, { 0, -256 }, { 256, -256 }, { 256, 0 }, { 256, 256 },
+ { 0, 256 }, { -256, 256 }, { -256, 0 } },
+ { { -512, -512 }, { 0, -512 }, { 512, -512 }, { 512, 0 }, { 512, 512 },
+ { 0, 512 }, { -512, 512 }, { -512, 0 } },
+ { { -1024, -1024 }, { 0, -1024 }, { 1024, -1024 }, { 1024, 0 },
+ { 1024, 1024 }, { 0, 1024 }, { -1024, 1024 }, { -1024, 0 } }
+ };
+ /* clang-format on */
+ return vp9_pattern_search(
+ x, ref_mv, search_param, sad_per_bit, do_init_search, cost_list, vfp,
+ use_mvcost, center_mv, best_mv, square_num_candidates, square_candidates);
+}
+
+static int fast_hex_search(const MACROBLOCK *x, MV *ref_mv, int search_param,
+ int sad_per_bit,
+ int do_init_search, // must be zero for fast_hex
+ int *cost_list, const vp9_variance_fn_ptr_t *vfp,
+ int use_mvcost, const MV *center_mv, MV *best_mv) {
+ return hex_search(x, ref_mv, VPXMAX(MAX_MVSEARCH_STEPS - 2, search_param),
+ sad_per_bit, do_init_search, cost_list, vfp, use_mvcost,
+ center_mv, best_mv);
+}
+
+static int fast_dia_search(const MACROBLOCK *x, MV *ref_mv, int search_param,
+ int sad_per_bit, int do_init_search, int *cost_list,
+ const vp9_variance_fn_ptr_t *vfp, int use_mvcost,
+ const MV *center_mv, MV *best_mv) {
+ return bigdia_search(x, ref_mv, VPXMAX(MAX_MVSEARCH_STEPS - 2, search_param),
+ sad_per_bit, do_init_search, cost_list, vfp, use_mvcost,
+ center_mv, best_mv);
+}
+
+#undef CHECK_BETTER
+
+// Exhaustive motion search around a given centre position with a given
+// step size.
+static int exhaustive_mesh_search(const MACROBLOCK *x, MV *ref_mv, MV *best_mv,
+ int range, int step, int sad_per_bit,
+ const vp9_variance_fn_ptr_t *fn_ptr,
+ const MV *center_mv) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct buf_2d *const what = &x->plane[0].src;
+ const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+ MV fcenter_mv = { center_mv->row, center_mv->col };
+ unsigned int best_sad = INT_MAX;
+ int r, c, i;
+ int start_col, end_col, start_row, end_row;
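+  // When step == 1 every column is visited and the loop body batches 4 SADs
+  // per sdx4df() call, so the column index advances by 4; otherwise it
+  // advances by the mesh step.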
+ int col_step = (step > 1) ? step : 4;
+
+ assert(step >= 1);
+
+ clamp_mv(&fcenter_mv, x->mv_limits.col_min, x->mv_limits.col_max,
+ x->mv_limits.row_min, x->mv_limits.row_max);
+ *best_mv = fcenter_mv;
+ best_sad =
+ fn_ptr->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &fcenter_mv), in_what->stride) +
+ mvsad_err_cost(x, &fcenter_mv, ref_mv, sad_per_bit);
+ start_row = VPXMAX(-range, x->mv_limits.row_min - fcenter_mv.row);
+ start_col = VPXMAX(-range, x->mv_limits.col_min - fcenter_mv.col);
+ end_row = VPXMIN(range, x->mv_limits.row_max - fcenter_mv.row);
+ end_col = VPXMIN(range, x->mv_limits.col_max - fcenter_mv.col);
+
+ for (r = start_row; r <= end_row; r += step) {
+ for (c = start_col; c <= end_col; c += col_step) {
+ // Step > 1 means we are not checking every location in this pass.
+ if (step > 1) {
+ const MV mv = { fcenter_mv.row + r, fcenter_mv.col + c };
+ unsigned int sad =
+ fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, &mv),
+ in_what->stride);
+ if (sad < best_sad) {
+ sad += mvsad_err_cost(x, &mv, ref_mv, sad_per_bit);
+ if (sad < best_sad) {
+ best_sad = sad;
+ *best_mv = mv;
+ }
+ }
+ } else {
+ // 4 sads in a single call if we are checking every location
+ if (c + 3 <= end_col) {
+ unsigned int sads[4];
+ const uint8_t *addrs[4];
+ for (i = 0; i < 4; ++i) {
+ const MV mv = { fcenter_mv.row + r, fcenter_mv.col + c + i };
+ addrs[i] = get_buf_from_mv(in_what, &mv);
+ }
+ fn_ptr->sdx4df(what->buf, what->stride, addrs, in_what->stride, sads);
+
+ for (i = 0; i < 4; ++i) {
+ if (sads[i] < best_sad) {
+ const MV mv = { fcenter_mv.row + r, fcenter_mv.col + c + i };
+ const unsigned int sad =
+ sads[i] + mvsad_err_cost(x, &mv, ref_mv, sad_per_bit);
+ if (sad < best_sad) {
+ best_sad = sad;
+ *best_mv = mv;
+ }
+ }
+ }
+ } else {
+ for (i = 0; i < end_col - c; ++i) {
+ const MV mv = { fcenter_mv.row + r, fcenter_mv.col + c + i };
+ unsigned int sad =
+ fn_ptr->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &mv), in_what->stride);
+ if (sad < best_sad) {
+ sad += mvsad_err_cost(x, &mv, ref_mv, sad_per_bit);
+ if (sad < best_sad) {
+ best_sad = sad;
+ *best_mv = mv;
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ return best_sad;
+}
+
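+// Limits used below to sanity-check the mesh search range/interval settings
+// taken from the speed features.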
+#define MIN_RANGE 7
+#define MAX_RANGE 256
+#define MIN_INTERVAL 1
+#if CONFIG_NON_GREEDY_MV
+static int64_t exhaustive_mesh_search_multi_step(
+ MV *best_mv, const MV *center_mv, int range, int step,
+ const struct buf_2d *src, const struct buf_2d *pre, int lambda,
+ const int_mv *nb_full_mvs, int full_mv_num, const MvLimits *mv_limits,
+ const vp9_variance_fn_ptr_t *fn_ptr) {
+ int64_t best_sad;
+ int r, c;
+ int start_col, end_col, start_row, end_row;
+ *best_mv = *center_mv;
+ best_sad =
+ ((int64_t)fn_ptr->sdf(src->buf, src->stride,
+ get_buf_from_mv(pre, center_mv), pre->stride)
+ << LOG2_PRECISION) +
+ lambda * vp9_nb_mvs_inconsistency(best_mv, nb_full_mvs, full_mv_num);
+ start_row = VPXMAX(center_mv->row - range, mv_limits->row_min);
+ start_col = VPXMAX(center_mv->col - range, mv_limits->col_min);
+ end_row = VPXMIN(center_mv->row + range, mv_limits->row_max);
+ end_col = VPXMIN(center_mv->col + range, mv_limits->col_max);
+ for (r = start_row; r <= end_row; r += step) {
+ for (c = start_col; c <= end_col; c += step) {
+ const MV mv = { r, c };
+ int64_t sad = (int64_t)fn_ptr->sdf(src->buf, src->stride,
+ get_buf_from_mv(pre, &mv), pre->stride)
+ << LOG2_PRECISION;
+ if (sad < best_sad) {
+ sad += lambda * vp9_nb_mvs_inconsistency(&mv, nb_full_mvs, full_mv_num);
+ if (sad < best_sad) {
+ best_sad = sad;
+ *best_mv = mv;
+ }
+ }
+ }
+ }
+ return best_sad;
+}
+
+static int64_t exhaustive_mesh_search_single_step(
+ MV *best_mv, const MV *center_mv, int range, const struct buf_2d *src,
+ const struct buf_2d *pre, int lambda, const int_mv *nb_full_mvs,
+ int full_mv_num, const MvLimits *mv_limits,
+ const vp9_variance_fn_ptr_t *fn_ptr) {
+ int64_t best_sad;
+ int r, c, i;
+ int start_col, end_col, start_row, end_row;
+
+ *best_mv = *center_mv;
+ best_sad =
+ ((int64_t)fn_ptr->sdf(src->buf, src->stride,
+ get_buf_from_mv(pre, center_mv), pre->stride)
+ << LOG2_PRECISION) +
+ lambda * vp9_nb_mvs_inconsistency(best_mv, nb_full_mvs, full_mv_num);
+ start_row = VPXMAX(center_mv->row - range, mv_limits->row_min);
+ start_col = VPXMAX(center_mv->col - range, mv_limits->col_min);
+ end_row = VPXMIN(center_mv->row + range, mv_limits->row_max);
+ end_col = VPXMIN(center_mv->col + range, mv_limits->col_max);
+ for (r = start_row; r <= end_row; r += 1) {
+ c = start_col;
+ while (c + 3 <= end_col) {
+ unsigned int sads[4];
+ const uint8_t *addrs[4];
+ for (i = 0; i < 4; ++i) {
+ const MV mv = { r, c + i };
+ addrs[i] = get_buf_from_mv(pre, &mv);
+ }
+ fn_ptr->sdx4df(src->buf, src->stride, addrs, pre->stride, sads);
+
+ for (i = 0; i < 4; ++i) {
+ int64_t sad = (int64_t)sads[i] << LOG2_PRECISION;
+ if (sad < best_sad) {
+ const MV mv = { r, c + i };
+ sad +=
+ lambda * vp9_nb_mvs_inconsistency(&mv, nb_full_mvs, full_mv_num);
+ if (sad < best_sad) {
+ best_sad = sad;
+ *best_mv = mv;
+ }
+ }
+ }
+ c += 4;
+ }
+ while (c <= end_col) {
+ const MV mv = { r, c };
+ int64_t sad = (int64_t)fn_ptr->sdf(src->buf, src->stride,
+ get_buf_from_mv(pre, &mv), pre->stride)
+ << LOG2_PRECISION;
+ if (sad < best_sad) {
+ sad += lambda * vp9_nb_mvs_inconsistency(&mv, nb_full_mvs, full_mv_num);
+ if (sad < best_sad) {
+ best_sad = sad;
+ *best_mv = mv;
+ }
+ }
+ c += 1;
+ }
+ }
+ return best_sad;
+}
+
+static int64_t exhaustive_mesh_search_new(const MACROBLOCK *x, MV *best_mv,
+ int range, int step,
+ const vp9_variance_fn_ptr_t *fn_ptr,
+ const MV *center_mv, int lambda,
+ const int_mv *nb_full_mvs,
+ int full_mv_num) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct buf_2d *src = &x->plane[0].src;
+ const struct buf_2d *pre = &xd->plane[0].pre[0];
+ assert(step >= 1);
+ assert(is_mv_in(&x->mv_limits, center_mv));
+ if (step == 1) {
+ return exhaustive_mesh_search_single_step(
+ best_mv, center_mv, range, src, pre, lambda, nb_full_mvs, full_mv_num,
+ &x->mv_limits, fn_ptr);
+ }
+ return exhaustive_mesh_search_multi_step(best_mv, center_mv, range, step, src,
+ pre, lambda, nb_full_mvs,
+ full_mv_num, &x->mv_limits, fn_ptr);
+}
+
+static int64_t full_pixel_exhaustive_new(const VP9_COMP *cpi, MACROBLOCK *x,
+ MV *centre_mv_full,
+ const vp9_variance_fn_ptr_t *fn_ptr,
+ MV *dst_mv, int lambda,
+ const int_mv *nb_full_mvs,
+ int full_mv_num) {
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ MV temp_mv = { centre_mv_full->row, centre_mv_full->col };
+ int64_t bestsme;
+ int i;
+ int interval = sf->mesh_patterns[0].interval;
+ int range = sf->mesh_patterns[0].range;
+ int baseline_interval_divisor;
+
+ // Trap illegal values for interval and range for this function.
+ if ((range < MIN_RANGE) || (range > MAX_RANGE) || (interval < MIN_INTERVAL) ||
+ (interval > range)) {
+ printf("ERROR: invalid range\n");
+ assert(0);
+ }
+
+ baseline_interval_divisor = range / interval;
+
+ // Check size of proposed first range against magnitude of the centre
+ // value used as a starting point.
+ range = VPXMAX(range, (5 * VPXMAX(abs(temp_mv.row), abs(temp_mv.col))) / 4);
+ range = VPXMIN(range, MAX_RANGE);
+ interval = VPXMAX(interval, range / baseline_interval_divisor);
+
+ // initial search
+ bestsme =
+ exhaustive_mesh_search_new(x, &temp_mv, range, interval, fn_ptr, &temp_mv,
+ lambda, nb_full_mvs, full_mv_num);
+
+ if ((interval > MIN_INTERVAL) && (range > MIN_RANGE)) {
+ // Progressive searches with range and step size decreasing each time
+ // till we reach a step size of 1. Then break out.
+ for (i = 1; i < MAX_MESH_STEP; ++i) {
+ // First pass with coarser step and longer range
+ bestsme = exhaustive_mesh_search_new(
+ x, &temp_mv, sf->mesh_patterns[i].range,
+ sf->mesh_patterns[i].interval, fn_ptr, &temp_mv, lambda, nb_full_mvs,
+ full_mv_num);
+
+ if (sf->mesh_patterns[i].interval == 1) break;
+ }
+ }
+
+ *dst_mv = temp_mv;
+
+ return bestsme;
+}
+
+static int64_t diamond_search_sad_new(const MACROBLOCK *x,
+ const search_site_config *cfg,
+ const MV *init_full_mv, MV *best_full_mv,
+ int search_param, int lambda, int *num00,
+ const vp9_variance_fn_ptr_t *fn_ptr,
+ const int_mv *nb_full_mvs,
+ int full_mv_num) {
+ int i, j, step;
+
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ uint8_t *what = x->plane[0].src.buf;
+ const int what_stride = x->plane[0].src.stride;
+ const uint8_t *in_what;
+ const int in_what_stride = xd->plane[0].pre[0].stride;
+ const uint8_t *best_address;
+
+ int64_t bestsad;
+ int best_site = -1;
+ int last_site = -1;
+
+ // search_param determines the length of the initial step and hence the number
+ // of iterations.
+ // 0 = initial step (MAX_FIRST_STEP) pel
+ // 1 = (MAX_FIRST_STEP/2) pel,
+ // 2 = (MAX_FIRST_STEP/4) pel...
+ // const search_site *ss = &cfg->ss[search_param * cfg->searches_per_step];
+ const MV *ss_mv = &cfg->ss_mv[search_param * cfg->searches_per_step];
+ const intptr_t *ss_os = &cfg->ss_os[search_param * cfg->searches_per_step];
+ const int tot_steps = cfg->total_steps - search_param;
+ vpx_clear_system_state();
+
+ *best_full_mv = *init_full_mv;
+ clamp_mv(best_full_mv, x->mv_limits.col_min, x->mv_limits.col_max,
+ x->mv_limits.row_min, x->mv_limits.row_max);
+ *num00 = 0;
+
+ // Work out the start point for the search
+ in_what = xd->plane[0].pre[0].buf + best_full_mv->row * in_what_stride +
+ best_full_mv->col;
+ best_address = in_what;
+
+ // Check the starting position
+ {
+ const int64_t mv_dist =
+ (int64_t)fn_ptr->sdf(what, what_stride, in_what, in_what_stride)
+ << LOG2_PRECISION;
+ const int64_t mv_cost =
+ vp9_nb_mvs_inconsistency(best_full_mv, nb_full_mvs, full_mv_num);
+ bestsad = mv_dist + lambda * mv_cost;
+ }
+
+ i = 0;
+
+ for (step = 0; step < tot_steps; step++) {
+ int all_in = 1, t;
+
+    // All_in is true if every one of the points we are checking is within
+ // the bounds of the image.
+ all_in &= ((best_full_mv->row + ss_mv[i].row) > x->mv_limits.row_min);
+ all_in &= ((best_full_mv->row + ss_mv[i + 1].row) < x->mv_limits.row_max);
+ all_in &= ((best_full_mv->col + ss_mv[i + 2].col) > x->mv_limits.col_min);
+ all_in &= ((best_full_mv->col + ss_mv[i + 3].col) < x->mv_limits.col_max);
+
+ // If all the pixels are within the bounds we don't check whether the
+ // search point is valid in this loop, otherwise we check each point
+    // for validity.
+ if (all_in) {
+ unsigned int sad_array[4];
+
+ for (j = 0; j < cfg->searches_per_step; j += 4) {
+ unsigned char const *block_offset[4];
+
+ for (t = 0; t < 4; t++) block_offset[t] = ss_os[i + t] + best_address;
+
+ fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride,
+ sad_array);
+
+ for (t = 0; t < 4; t++, i++) {
+ const int64_t mv_dist = (int64_t)sad_array[t] << LOG2_PRECISION;
+ if (mv_dist < bestsad) {
+ const MV this_mv = { best_full_mv->row + ss_mv[i].row,
+ best_full_mv->col + ss_mv[i].col };
+ const int64_t mv_cost =
+ vp9_nb_mvs_inconsistency(&this_mv, nb_full_mvs, full_mv_num);
+ const int64_t thissad = mv_dist + lambda * mv_cost;
+ if (thissad < bestsad) {
+ bestsad = thissad;
+ best_site = i;
+ }
+ }
+ }
+ }
+ } else {
+ for (j = 0; j < cfg->searches_per_step; j++) {
+ // Trap illegal vectors
+ const MV this_mv = { best_full_mv->row + ss_mv[i].row,
+ best_full_mv->col + ss_mv[i].col };
+
+ if (is_mv_in(&x->mv_limits, &this_mv)) {
+ const uint8_t *const check_here = ss_os[i] + best_address;
+ const int64_t mv_dist =
+ (int64_t)fn_ptr->sdf(what, what_stride, check_here,
+ in_what_stride)
+ << LOG2_PRECISION;
+ if (mv_dist < bestsad) {
+ const int64_t mv_cost =
+ vp9_nb_mvs_inconsistency(&this_mv, nb_full_mvs, full_mv_num);
+ const int64_t thissad = mv_dist + lambda * mv_cost;
+ if (thissad < bestsad) {
+ bestsad = thissad;
+ best_site = i;
+ }
+ }
+ }
+ i++;
+ }
+ }
+ if (best_site != last_site) {
+ best_full_mv->row += ss_mv[best_site].row;
+ best_full_mv->col += ss_mv[best_site].col;
+ best_address += ss_os[best_site];
+ last_site = best_site;
+ } else if (best_address == in_what) {
+ (*num00)++;
+ }
+ }
+ return bestsad;
+}
+
+int vp9_prepare_nb_full_mvs(const MotionField *motion_field, int mi_row,
+ int mi_col, int_mv *nb_full_mvs) {
+ const int mi_width = num_8x8_blocks_wide_lookup[motion_field->bsize];
+ const int mi_height = num_8x8_blocks_high_lookup[motion_field->bsize];
+ const int dirs[NB_MVS_NUM][2] = { { -1, 0 }, { 0, -1 }, { 1, 0 }, { 0, 1 } };
+ int nb_full_mv_num = 0;
+ int i;
+ assert(mi_row % mi_height == 0);
+ assert(mi_col % mi_width == 0);
+ for (i = 0; i < NB_MVS_NUM; ++i) {
+ int r = dirs[i][0];
+ int c = dirs[i][1];
+ int brow = mi_row / mi_height + r;
+ int bcol = mi_col / mi_width + c;
+ if (brow >= 0 && brow < motion_field->block_rows && bcol >= 0 &&
+ bcol < motion_field->block_cols) {
+ if (vp9_motion_field_is_mv_set(motion_field, brow, bcol)) {
+ int_mv mv = vp9_motion_field_get_mv(motion_field, brow, bcol);
+ nb_full_mvs[nb_full_mv_num].as_mv = get_full_mv(&mv.as_mv);
+ ++nb_full_mv_num;
+ }
+ }
+ }
+ return nb_full_mv_num;
+}
+#endif // CONFIG_NON_GREEDY_MV
+
+int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg,
+ MV *ref_mv, uint32_t start_mv_sad, MV *best_mv,
+ int search_param, int sad_per_bit, int *num00,
+ const vp9_sad_fn_ptr_t *sad_fn_ptr,
+ const MV *center_mv) {
+ int i, j, step;
+
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ uint8_t *what = x->plane[0].src.buf;
+ const int what_stride = x->plane[0].src.stride;
+ const uint8_t *in_what;
+ const int in_what_stride = xd->plane[0].pre[0].stride;
+ const uint8_t *best_address;
+
+ unsigned int bestsad = start_mv_sad;
+ int best_site = -1;
+ int last_site = -1;
+
+ int ref_row;
+ int ref_col;
+
+ // search_param determines the length of the initial step and hence the number
+ // of iterations.
+ // 0 = initial step (MAX_FIRST_STEP) pel
+ // 1 = (MAX_FIRST_STEP/2) pel,
+ // 2 = (MAX_FIRST_STEP/4) pel...
+ // const search_site *ss = &cfg->ss[search_param * cfg->searches_per_step];
+ const MV *ss_mv = &cfg->ss_mv[search_param * cfg->searches_per_step];
+ const intptr_t *ss_os = &cfg->ss_os[search_param * cfg->searches_per_step];
+ const int tot_steps = cfg->total_steps - search_param;
+
+ const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };
+ ref_row = ref_mv->row;
+ ref_col = ref_mv->col;
+ *num00 = 0;
+ best_mv->row = ref_row;
+ best_mv->col = ref_col;
+
+ // Work out the start point for the search
+ in_what = xd->plane[0].pre[0].buf + ref_row * in_what_stride + ref_col;
+ best_address = in_what;
+
+ i = 0;
+
+ for (step = 0; step < tot_steps; step++) {
+ int all_in = 1, t;
+
+    // All_in is true if every one of the points we are checking is within
+ // the bounds of the image.
+ all_in &= ((best_mv->row + ss_mv[i].row) > x->mv_limits.row_min);
+ all_in &= ((best_mv->row + ss_mv[i + 1].row) < x->mv_limits.row_max);
+ all_in &= ((best_mv->col + ss_mv[i + 2].col) > x->mv_limits.col_min);
+ all_in &= ((best_mv->col + ss_mv[i + 3].col) < x->mv_limits.col_max);
+
+ // If all the pixels are within the bounds we don't check whether the
+ // search point is valid in this loop, otherwise we check each point
+    // for validity.
+ if (all_in) {
+ unsigned int sad_array[4];
+
+ for (j = 0; j < cfg->searches_per_step; j += 4) {
+ unsigned char const *block_offset[4];
+
+ for (t = 0; t < 4; t++) block_offset[t] = ss_os[i + t] + best_address;
+
+ sad_fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride,
+ sad_array);
+
+ for (t = 0; t < 4; t++, i++) {
+ if (sad_array[t] < bestsad) {
+ const MV this_mv = { best_mv->row + ss_mv[i].row,
+ best_mv->col + ss_mv[i].col };
+ sad_array[t] +=
+ mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
+ if (sad_array[t] < bestsad) {
+ bestsad = sad_array[t];
+ best_site = i;
+ }
+ }
+ }
+ }
+ } else {
+ for (j = 0; j < cfg->searches_per_step; j++) {
+ // Trap illegal vectors
+ const MV this_mv = { best_mv->row + ss_mv[i].row,
+ best_mv->col + ss_mv[i].col };
+
+ if (is_mv_in(&x->mv_limits, &this_mv)) {
+ const uint8_t *const check_here = ss_os[i] + best_address;
+ unsigned int thissad =
+ sad_fn_ptr->sdf(what, what_stride, check_here, in_what_stride);
+
+ if (thissad < bestsad) {
+ thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
+ if (thissad < bestsad) {
+ bestsad = thissad;
+ best_site = i;
+ }
+ }
+ }
+ i++;
+ }
+ }
+ if (best_site != last_site) {
+ best_mv->row += ss_mv[best_site].row;
+ best_mv->col += ss_mv[best_site].col;
+ best_address += ss_os[best_site];
+ last_site = best_site;
+#if defined(NEW_DIAMOND_SEARCH)
+ while (1) {
+ const MV this_mv = { best_mv->row + ss_mv[best_site].row,
+ best_mv->col + ss_mv[best_site].col };
+ if (is_mv_in(&x->mv_limits, &this_mv)) {
+ const uint8_t *const check_here = ss_os[best_site] + best_address;
+ unsigned int thissad =
+ sad_fn_ptr->sdf(what, what_stride, check_here, in_what_stride);
+ if (thissad < bestsad) {
+ thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
+ if (thissad < bestsad) {
+ bestsad = thissad;
+ best_mv->row += ss_mv[best_site].row;
+ best_mv->col += ss_mv[best_site].col;
+ best_address += ss_os[best_site];
+ continue;
+ }
+ }
+ }
+ break;
+ }
+#endif
+ } else if (best_address == in_what) {
+ (*num00)++;
+ }
+ }
+ return bestsad;
+}
+
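+// Coarse-to-fine 1-D match between the reference and source projection
+// vectors, using vpx_vector_var() as the matching cost: offsets are first
+// scanned in steps of 16, then the best offset is refined with +/-8, +/-4,
+// +/-2 and +/-1 probes. Returns the displacement of the best position
+// relative to the centre (bw >> 1).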
+static int vector_match(int16_t *ref, int16_t *src, int bwl) {
+ int best_sad = INT_MAX;
+ int this_sad;
+ int d;
+ int center, offset = 0;
+ int bw = 4 << bwl; // Redundant with bwl; kept as a separate variable for experimentation.
+ for (d = 0; d <= bw; d += 16) {
+ this_sad = vpx_vector_var(&ref[d], src, bwl);
+ if (this_sad < best_sad) {
+ best_sad = this_sad;
+ offset = d;
+ }
+ }
+ center = offset;
+
+ for (d = -8; d <= 8; d += 16) {
+ int this_pos = offset + d;
+ // check limit
+ if (this_pos < 0 || this_pos > bw) continue;
+ this_sad = vpx_vector_var(&ref[this_pos], src, bwl);
+ if (this_sad < best_sad) {
+ best_sad = this_sad;
+ center = this_pos;
+ }
+ }
+ offset = center;
+
+ for (d = -4; d <= 4; d += 8) {
+ int this_pos = offset + d;
+ // check limit
+ if (this_pos < 0 || this_pos > bw) continue;
+ this_sad = vpx_vector_var(&ref[this_pos], src, bwl);
+ if (this_sad < best_sad) {
+ best_sad = this_sad;
+ center = this_pos;
+ }
+ }
+ offset = center;
+
+ for (d = -2; d <= 2; d += 4) {
+ int this_pos = offset + d;
+ // check limit
+ if (this_pos < 0 || this_pos > bw) continue;
+ this_sad = vpx_vector_var(&ref[this_pos], src, bwl);
+ if (this_sad < best_sad) {
+ best_sad = this_sad;
+ center = this_pos;
+ }
+ }
+ offset = center;
+
+ for (d = -1; d <= 1; d += 2) {
+ int this_pos = offset + d;
+ // check limit
+ if (this_pos < 0 || this_pos > bw) continue;
+ this_sad = vpx_vector_var(&ref[this_pos], src, bwl);
+ if (this_sad < best_sad) {
+ best_sad = this_sad;
+ center = this_pos;
+ }
+ }
+
+ return (center - (bw >> 1));
+}
+
+static const MV search_pos[4] = {
+ { -1, 0 },
+ { 0, -1 },
+ { 0, 1 },
+ { 1, 0 },
+};
+
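+// Estimate a full-pel motion vector by reducing the 2-D search to two 1-D
+// searches: the source and reference blocks are projected onto row and
+// column sums (vpx_int_pro_row/vpx_int_pro_col), each projection is matched
+// with vector_match(), and the resulting MV is refined by checking the four
+// cross neighbours before being converted to 1/8-pel units and clamped to
+// the sub-pel MV limits.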
+unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int mi_row,
+ int mi_col, const MV *ref_mv) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ MODE_INFO *mi = xd->mi[0];
+ struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0 } };
+ DECLARE_ALIGNED(16, int16_t, hbuf[128]);
+ DECLARE_ALIGNED(16, int16_t, vbuf[128]);
+ DECLARE_ALIGNED(16, int16_t, src_hbuf[64]);
+ DECLARE_ALIGNED(16, int16_t, src_vbuf[64]);
+ int idx;
+ const int bw = 4 << b_width_log2_lookup[bsize];
+ const int bh = 4 << b_height_log2_lookup[bsize];
+ const int search_width = bw << 1;
+ const int search_height = bh << 1;
+ const int src_stride = x->plane[0].src.stride;
+ const int ref_stride = xd->plane[0].pre[0].stride;
+ uint8_t const *ref_buf, *src_buf;
+ MV *tmp_mv = &xd->mi[0]->mv[0].as_mv;
+ unsigned int best_sad, tmp_sad, this_sad[4];
+ MV this_mv;
+ const int norm_factor = 3 + (bw >> 5);
+ const YV12_BUFFER_CONFIG *scaled_ref_frame =
+ vp9_get_scaled_ref_frame(cpi, mi->ref_frame[0]);
+ MvLimits subpel_mv_limits;
+
+ if (scaled_ref_frame) {
+ int i;
+ // Swap out the reference frame for a version that's been scaled to
+ // match the resolution of the current frame, allowing the existing
+ // motion search code to be used without additional modifications.
+ for (i = 0; i < MAX_MB_PLANE; i++) backup_yv12[i] = xd->plane[i].pre[0];
+ vp9_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL);
+ }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ // TODO(jingning): Implement integral projection functions for high bit-depth
+ // setting and remove this code.
+ if (xd->bd != 8) {
+ const unsigned int sad = cpi->fn_ptr[bsize].sdf(
+ x->plane[0].src.buf, src_stride, xd->plane[0].pre[0].buf, ref_stride);
+ tmp_mv->row = 0;
+ tmp_mv->col = 0;
+
+ if (scaled_ref_frame) {
+ int i;
+ for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i];
+ }
+ return sad;
+ }
+#endif
+
+ // Set up prediction 1-D reference set
+ ref_buf = xd->plane[0].pre[0].buf - (bw >> 1);
+ for (idx = 0; idx < search_width; idx += 16) {
+ vpx_int_pro_row(&hbuf[idx], ref_buf, ref_stride, bh);
+ ref_buf += 16;
+ }
+
+ ref_buf = xd->plane[0].pre[0].buf - (bh >> 1) * ref_stride;
+ for (idx = 0; idx < search_height; ++idx) {
+ vbuf[idx] = vpx_int_pro_col(ref_buf, bw) >> norm_factor;
+ ref_buf += ref_stride;
+ }
+
+ // Set up src 1-D reference set
+ for (idx = 0; idx < bw; idx += 16) {
+ src_buf = x->plane[0].src.buf + idx;
+ vpx_int_pro_row(&src_hbuf[idx], src_buf, src_stride, bh);
+ }
+
+ src_buf = x->plane[0].src.buf;
+ for (idx = 0; idx < bh; ++idx) {
+ src_vbuf[idx] = vpx_int_pro_col(src_buf, bw) >> norm_factor;
+ src_buf += src_stride;
+ }
+
+ // Find the best match per 1-D search
+ tmp_mv->col = vector_match(hbuf, src_hbuf, b_width_log2_lookup[bsize]);
+ tmp_mv->row = vector_match(vbuf, src_vbuf, b_height_log2_lookup[bsize]);
+
+ this_mv = *tmp_mv;
+ src_buf = x->plane[0].src.buf;
+ ref_buf = xd->plane[0].pre[0].buf + this_mv.row * ref_stride + this_mv.col;
+ best_sad = cpi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride);
+
+ {
+ const uint8_t *const pos[4] = {
+ ref_buf - ref_stride,
+ ref_buf - 1,
+ ref_buf + 1,
+ ref_buf + ref_stride,
+ };
+
+ cpi->fn_ptr[bsize].sdx4df(src_buf, src_stride, pos, ref_stride, this_sad);
+ }
+
+ for (idx = 0; idx < 4; ++idx) {
+ if (this_sad[idx] < best_sad) {
+ best_sad = this_sad[idx];
+ tmp_mv->row = search_pos[idx].row + this_mv.row;
+ tmp_mv->col = search_pos[idx].col + this_mv.col;
+ }
+ }
+
+ if (this_sad[0] < this_sad[3])
+ this_mv.row -= 1;
+ else
+ this_mv.row += 1;
+
+ if (this_sad[1] < this_sad[2])
+ this_mv.col -= 1;
+ else
+ this_mv.col += 1;
+
+ ref_buf = xd->plane[0].pre[0].buf + this_mv.row * ref_stride + this_mv.col;
+
+ tmp_sad = cpi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride);
+ if (best_sad > tmp_sad) {
+ *tmp_mv = this_mv;
+ best_sad = tmp_sad;
+ }
+
+ tmp_mv->row *= 8;
+ tmp_mv->col *= 8;
+
+ vp9_set_subpel_mv_search_range(&subpel_mv_limits, &x->mv_limits, ref_mv);
+ clamp_mv(tmp_mv, subpel_mv_limits.col_min, subpel_mv_limits.col_max,
+ subpel_mv_limits.row_min, subpel_mv_limits.row_max);
+
+ if (scaled_ref_frame) {
+ int i;
+ for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i];
+ }
+
+ return best_sad;
+}
+
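+// Scale the exhaustive-search threshold with block area: the configured
+// value applies to 64x64 blocks and is halved for each halving of the pixel
+// count, so smaller blocks trigger the mesh search at proportionally smaller
+// errors.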
+static int get_exhaustive_threshold(int exhaustive_searches_thresh,
+ BLOCK_SIZE bsize) {
+ return exhaustive_searches_thresh >>
+ (8 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]));
+}
+
+#if CONFIG_NON_GREEDY_MV
+// Runs a sequence of diamond searches in smaller steps for RD.
+/* do_refine: If last step (1-away) of n-step search doesn't pick the center
+ point as the best match, we will do a final 1-away diamond
+ refining search */
+int vp9_full_pixel_diamond_new(const VP9_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, MV *mvp_full, int step_param,
+ int lambda, int do_refine,
+ const int_mv *nb_full_mvs, int full_mv_num,
+ MV *best_mv) {
+ const vp9_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize];
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ int n, num00 = 0;
+ int thissme;
+ int bestsme;
+ const int further_steps = MAX_MVSEARCH_STEPS - 1 - step_param;
+ const MV center_mv = { 0, 0 };
+ vpx_clear_system_state();
+ diamond_search_sad_new(x, &cpi->ss_cfg, mvp_full, best_mv, step_param, lambda,
+ &n, fn_ptr, nb_full_mvs, full_mv_num);
+
+ bestsme = vp9_get_mvpred_var(x, best_mv, &center_mv, fn_ptr, 0);
+
+ // If there won't be more n-step search, check to see if refining search is
+ // needed.
+ if (n > further_steps) do_refine = 0;
+
+ while (n < further_steps) {
+ ++n;
+ if (num00) {
+ num00--;
+ } else {
+ MV temp_mv;
+ diamond_search_sad_new(x, &cpi->ss_cfg, mvp_full, &temp_mv,
+ step_param + n, lambda, &num00, fn_ptr,
+ nb_full_mvs, full_mv_num);
+ thissme = vp9_get_mvpred_var(x, &temp_mv, &center_mv, fn_ptr, 0);
+ // check to see if refining search is needed.
+ if (num00 > further_steps - n) do_refine = 0;
+
+ if (thissme < bestsme) {
+ bestsme = thissme;
+ *best_mv = temp_mv;
+ }
+ }
+ }
+
+ // final 1-away diamond refining search
+ if (do_refine) {
+ const int search_range = 8;
+ MV temp_mv = *best_mv;
+ vp9_refining_search_sad_new(x, &temp_mv, lambda, search_range, fn_ptr,
+ nb_full_mvs, full_mv_num);
+ thissme = vp9_get_mvpred_var(x, &temp_mv, &center_mv, fn_ptr, 0);
+ if (thissme < bestsme) {
+ bestsme = thissme;
+ *best_mv = temp_mv;
+ }
+ }
+
+ if (sf->exhaustive_searches_thresh < INT_MAX &&
+ !cpi->rc.is_src_frame_alt_ref) {
+ const int64_t exhaustive_thr =
+ get_exhaustive_threshold(sf->exhaustive_searches_thresh, bsize);
+ if (bestsme > exhaustive_thr) {
+ full_pixel_exhaustive_new(cpi, x, best_mv, fn_ptr, best_mv, lambda,
+ nb_full_mvs, full_mv_num);
+ bestsme = vp9_get_mvpred_var(x, best_mv, &center_mv, fn_ptr, 0);
+ }
+ }
+ return bestsme;
+}
+#endif // CONFIG_NON_GREEDY_MV
+
+// Runs a sequence of diamond searches in smaller steps for RD.
+/* do_refine: If last step (1-away) of n-step search doesn't pick the center
+ point as the best match, we will do a final 1-away diamond
+ refining search */
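+// When use_downsampled_sad is set and the even-row and odd-row SADs at the
+// starting MV are close, the diamond search runs with the row-skipping SAD
+// functions; the final MV is then re-checked against the full SAD and the
+// whole search is redone without row skipping if the two disagree too much.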
+static int full_pixel_diamond(const VP9_COMP *const cpi,
+ const MACROBLOCK *const x, BLOCK_SIZE bsize,
+ MV *mvp_full, int step_param, int sadpb,
+ int further_steps, int do_refine,
+ int use_downsampled_sad, int *cost_list,
+ const vp9_variance_fn_ptr_t *fn_ptr,
+ const MV *ref_mv, MV *dst_mv) {
+ MV temp_mv;
+ int thissme, n, num00 = 0;
+ int bestsme;
+ const int src_buf_stride = x->plane[0].src.stride;
+ const uint8_t *const src_buf = x->plane[0].src.buf;
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const int pred_buf_stride = xd->plane[0].pre[0].stride;
+ uint8_t *pred_buf;
+ vp9_sad_fn_ptr_t sad_fn_ptr;
+ unsigned int start_mv_sad, start_mv_sad_even_rows, start_mv_sad_odd_rows;
+ const MV ref_mv_full = { ref_mv->row >> 3, ref_mv->col >> 3 };
+ clamp_mv(mvp_full, x->mv_limits.col_min, x->mv_limits.col_max,
+ x->mv_limits.row_min, x->mv_limits.row_max);
+
+ pred_buf =
+ xd->plane[0].pre[0].buf + mvp_full->row * pred_buf_stride + mvp_full->col;
+ start_mv_sad_even_rows =
+ fn_ptr->sdsf(src_buf, src_buf_stride, pred_buf, pred_buf_stride);
+ start_mv_sad_odd_rows =
+ fn_ptr->sdsf(src_buf + src_buf_stride, src_buf_stride,
+ pred_buf + pred_buf_stride, pred_buf_stride);
+ start_mv_sad = (start_mv_sad_even_rows + start_mv_sad_odd_rows) >> 1;
+ start_mv_sad += mvsad_err_cost(x, mvp_full, &ref_mv_full, sadpb);
+
+ sad_fn_ptr.sdf = fn_ptr->sdf;
+ sad_fn_ptr.sdx4df = fn_ptr->sdx4df;
+ if (use_downsampled_sad && num_4x4_blocks_high_lookup[bsize] >= 2) {
+ // If the absolute difference between the pred-to-src SAD of even rows and
+ // the pred-to-src SAD of odd rows is small, skip every other row in sad
+ // computation.
+ const int odd_to_even_diff_sad =
+ abs((int)start_mv_sad_even_rows - (int)start_mv_sad_odd_rows);
+ const int mult_thresh = 10;
+ if (odd_to_even_diff_sad * mult_thresh < (int)start_mv_sad_even_rows) {
+ sad_fn_ptr.sdf = fn_ptr->sdsf;
+ sad_fn_ptr.sdx4df = fn_ptr->sdsx4df;
+ }
+ }
+
+ bestsme =
+ cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, start_mv_sad, &temp_mv,
+ step_param, sadpb, &n, &sad_fn_ptr, ref_mv);
+ if (bestsme < INT_MAX)
+ bestsme = vp9_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1);
+ *dst_mv = temp_mv;
+
+ // If there won't be more n-step search, check to see if refining search is
+ // needed.
+ if (n > further_steps) do_refine = 0;
+
+ while (n < further_steps) {
+ ++n;
+
+ if (num00) {
+ num00--;
+ } else {
+ thissme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, start_mv_sad,
+ &temp_mv, step_param + n, sadpb, &num00,
+ &sad_fn_ptr, ref_mv);
+ if (thissme < INT_MAX)
+ thissme = vp9_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1);
+
+ // check to see if refining search is needed.
+ if (num00 > further_steps - n) do_refine = 0;
+
+ if (thissme < bestsme) {
+ bestsme = thissme;
+ *dst_mv = temp_mv;
+ }
+ }
+ }
+
+ // final 1-away diamond refining search
+ if (do_refine) {
+ const int search_range = 8;
+ MV best_mv = *dst_mv;
+ thissme = vp9_refining_search_sad(x, &best_mv, sadpb, search_range,
+ &sad_fn_ptr, ref_mv);
+ if (thissme < INT_MAX)
+ thissme = vp9_get_mvpred_var(x, &best_mv, ref_mv, fn_ptr, 1);
+ if (thissme < bestsme) {
+ bestsme = thissme;
+ *dst_mv = best_mv;
+ }
+ }
+
+ if (sad_fn_ptr.sdf != fn_ptr->sdf) {
+ // If we are skipping rows when we perform the motion search, we need to
+ // check the quality of skipping. If it's bad, then we run search with
+ // skip row features off.
+ const uint8_t *best_address = get_buf_from_mv(&xd->plane[0].pre[0], dst_mv);
+ const int sad =
+ fn_ptr->sdf(src_buf, src_buf_stride, best_address, pred_buf_stride);
+ const int skip_sad =
+ fn_ptr->sdsf(src_buf, src_buf_stride, best_address, pred_buf_stride);
+ // We will keep the result of skipping rows if it's good enough.
+ const int kSADThresh =
+ 1 << (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
+ if (sad > kSADThresh && abs(skip_sad - sad) * 10 >= VPXMAX(sad, 1) * 9) {
+ // There is a large discrepancy between skipping and not skipping, so we
+ // need to redo the motion search.
+ return full_pixel_diamond(cpi, x, bsize, mvp_full, step_param, sadpb,
+ further_steps, do_refine, 0, cost_list, fn_ptr,
+ ref_mv, dst_mv);
+ }
+ }
+
+ // Return cost list.
+ if (cost_list) {
+ calc_int_cost_list(x, ref_mv, sadpb, fn_ptr, dst_mv, cost_list);
+ }
+ return bestsme;
+}
+
+// Runs a limited-range exhaustive mesh search using a pattern set
+// according to the encode speed profile.
+static int full_pixel_exhaustive(const VP9_COMP *const cpi,
+ const MACROBLOCK *const x, MV *centre_mv_full,
+ int sadpb, int *cost_list,
+ const vp9_variance_fn_ptr_t *fn_ptr,
+ const MV *ref_mv, MV *dst_mv) {
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ MV temp_mv = { centre_mv_full->row, centre_mv_full->col };
+ MV f_ref_mv = { ref_mv->row >> 3, ref_mv->col >> 3 };
+ int bestsme;
+ int i;
+ int interval = sf->mesh_patterns[0].interval;
+ int range = sf->mesh_patterns[0].range;
+ int baseline_interval_divisor;
+
+ // Trap illegal values for interval and range for this function.
+ if ((range < MIN_RANGE) || (range > MAX_RANGE) || (interval < MIN_INTERVAL) ||
+ (interval > range))
+ return INT_MAX;
+
+ baseline_interval_divisor = range / interval;
+
+ // Check size of proposed first range against magnitude of the centre
+ // value used as a starting point.
+ range = VPXMAX(range, (5 * VPXMAX(abs(temp_mv.row), abs(temp_mv.col))) / 4);
+ range = VPXMIN(range, MAX_RANGE);
+ interval = VPXMAX(interval, range / baseline_interval_divisor);
+
+ // initial search
+ bestsme = exhaustive_mesh_search(x, &f_ref_mv, &temp_mv, range, interval,
+ sadpb, fn_ptr, &temp_mv);
+
+ if ((interval > MIN_INTERVAL) && (range > MIN_RANGE)) {
+ // Progressive searches with range and step size decreasing each time
+ // till we reach a step size of 1. Then break out.
+ for (i = 1; i < MAX_MESH_STEP; ++i) {
+ // First pass with coarser step and longer range
+ bestsme = exhaustive_mesh_search(
+ x, &f_ref_mv, &temp_mv, sf->mesh_patterns[i].range,
+ sf->mesh_patterns[i].interval, sadpb, fn_ptr, &temp_mv);
+
+ if (sf->mesh_patterns[i].interval == 1) break;
+ }
+ }
+
+ if (bestsme < INT_MAX)
+ bestsme = vp9_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1);
+ *dst_mv = temp_mv;
+
+ // Return cost list.
+ if (cost_list) {
+ calc_int_cost_list(x, ref_mv, sadpb, fn_ptr, dst_mv, cost_list);
+ }
+ return bestsme;
+}
+
+#if CONFIG_NON_GREEDY_MV
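+// 1-away refinement for the non-greedy search: the cost of a candidate is
+// its SAD (scaled to LOG2_PRECISION) plus lambda times the inconsistency of
+// the candidate MV with the neighbouring blocks' MVs, so the refinement
+// trades distortion against motion-field smoothness.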
+int64_t vp9_refining_search_sad_new(const MACROBLOCK *x, MV *best_full_mv,
+ int lambda, int search_range,
+ const vp9_variance_fn_ptr_t *fn_ptr,
+ const int_mv *nb_full_mvs,
+ int full_mv_num) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const MV neighbors[4] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 } };
+ const struct buf_2d *const what = &x->plane[0].src;
+ const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+ const uint8_t *best_address = get_buf_from_mv(in_what, best_full_mv);
+ int64_t best_sad;
+ int i, j;
+ vpx_clear_system_state();
+ {
+ const int64_t mv_dist = (int64_t)fn_ptr->sdf(what->buf, what->stride,
+ best_address, in_what->stride)
+ << LOG2_PRECISION;
+ const int64_t mv_cost =
+ vp9_nb_mvs_inconsistency(best_full_mv, nb_full_mvs, full_mv_num);
+ best_sad = mv_dist + lambda * mv_cost;
+ }
+
+ for (i = 0; i < search_range; i++) {
+ int best_site = -1;
+ const int all_in = ((best_full_mv->row - 1) > x->mv_limits.row_min) &
+ ((best_full_mv->row + 1) < x->mv_limits.row_max) &
+ ((best_full_mv->col - 1) > x->mv_limits.col_min) &
+ ((best_full_mv->col + 1) < x->mv_limits.col_max);
+
+ if (all_in) {
+ unsigned int sads[4];
+ const uint8_t *const positions[4] = { best_address - in_what->stride,
+ best_address - 1, best_address + 1,
+ best_address + in_what->stride };
+
+ fn_ptr->sdx4df(what->buf, what->stride, positions, in_what->stride, sads);
+
+ for (j = 0; j < 4; ++j) {
+ const MV mv = { best_full_mv->row + neighbors[j].row,
+ best_full_mv->col + neighbors[j].col };
+ const int64_t mv_dist = (int64_t)sads[j] << LOG2_PRECISION;
+ const int64_t mv_cost =
+ vp9_nb_mvs_inconsistency(&mv, nb_full_mvs, full_mv_num);
+ const int64_t thissad = mv_dist + lambda * mv_cost;
+ if (thissad < best_sad) {
+ best_sad = thissad;
+ best_site = j;
+ }
+ }
+ } else {
+ for (j = 0; j < 4; ++j) {
+ const MV mv = { best_full_mv->row + neighbors[j].row,
+ best_full_mv->col + neighbors[j].col };
+
+ if (is_mv_in(&x->mv_limits, &mv)) {
+ const int64_t mv_dist =
+ (int64_t)fn_ptr->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &mv),
+ in_what->stride)
+ << LOG2_PRECISION;
+ const int64_t mv_cost =
+ vp9_nb_mvs_inconsistency(&mv, nb_full_mvs, full_mv_num);
+ const int64_t thissad = mv_dist + lambda * mv_cost;
+ if (thissad < best_sad) {
+ best_sad = thissad;
+ best_site = j;
+ }
+ }
+ }
+ }
+
+ if (best_site == -1) {
+ break;
+ } else {
+ best_full_mv->row += neighbors[best_site].row;
+ best_full_mv->col += neighbors[best_site].col;
+ best_address = get_buf_from_mv(in_what, best_full_mv);
+ }
+ }
+
+ return best_sad;
+}
+#endif // CONFIG_NON_GREEDY_MV
+
+int vp9_refining_search_sad(const MACROBLOCK *x, MV *ref_mv, int error_per_bit,
+ int search_range,
+ const vp9_sad_fn_ptr_t *sad_fn_ptr,
+ const MV *center_mv) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const MV neighbors[4] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 } };
+ const struct buf_2d *const what = &x->plane[0].src;
+ const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+ const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };
+ const uint8_t *best_address = get_buf_from_mv(in_what, ref_mv);
+ unsigned int best_sad =
+ sad_fn_ptr->sdf(what->buf, what->stride, best_address, in_what->stride) +
+ mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit);
+ int i, j;
+
+ for (i = 0; i < search_range; i++) {
+ int best_site = -1;
+ const int all_in = ((ref_mv->row - 1) > x->mv_limits.row_min) &
+ ((ref_mv->row + 1) < x->mv_limits.row_max) &
+ ((ref_mv->col - 1) > x->mv_limits.col_min) &
+ ((ref_mv->col + 1) < x->mv_limits.col_max);
+
+ if (all_in) {
+ unsigned int sads[4];
+ const uint8_t *const positions[4] = { best_address - in_what->stride,
+ best_address - 1, best_address + 1,
+ best_address + in_what->stride };
+
+ sad_fn_ptr->sdx4df(what->buf, what->stride, positions, in_what->stride,
+ sads);
+
+ for (j = 0; j < 4; ++j) {
+ if (sads[j] < best_sad) {
+ const MV mv = { ref_mv->row + neighbors[j].row,
+ ref_mv->col + neighbors[j].col };
+ sads[j] += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit);
+ if (sads[j] < best_sad) {
+ best_sad = sads[j];
+ best_site = j;
+ }
+ }
+ }
+ } else {
+ for (j = 0; j < 4; ++j) {
+ const MV mv = { ref_mv->row + neighbors[j].row,
+ ref_mv->col + neighbors[j].col };
+
+ if (is_mv_in(&x->mv_limits, &mv)) {
+ unsigned int sad =
+ sad_fn_ptr->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &mv), in_what->stride);
+ if (sad < best_sad) {
+ sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit);
+ if (sad < best_sad) {
+ best_sad = sad;
+ best_site = j;
+ }
+ }
+ }
+ }
+ }
+
+ if (best_site == -1) {
+ break;
+ } else {
+ ref_mv->row += neighbors[best_site].row;
+ ref_mv->col += neighbors[best_site].col;
+ best_address = get_buf_from_mv(in_what, ref_mv);
+ }
+ }
+
+ return best_sad;
+}
+
+// This function is called when we do joint motion search in comp_inter_inter
+// mode.
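+// It refines ref_mv over an 8-point neighbourhood (including diagonals),
+// measuring each candidate with the compound-average SAD (sdaf), which
+// compares the source against the average of the reference block and
+// second_pred, plus the MV rate cost.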
+int vp9_refining_search_8p_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit,
+ int search_range,
+ const vp9_variance_fn_ptr_t *fn_ptr,
+ const MV *center_mv, const uint8_t *second_pred) {
+ const MV neighbors[8] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 },
+ { -1, -1 }, { 1, -1 }, { -1, 1 }, { 1, 1 } };
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct buf_2d *const what = &x->plane[0].src;
+ const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+ const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };
+ unsigned int best_sad = INT_MAX;
+ int i, j;
+ clamp_mv(ref_mv, x->mv_limits.col_min, x->mv_limits.col_max,
+ x->mv_limits.row_min, x->mv_limits.row_max);
+ best_sad =
+ fn_ptr->sdaf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv),
+ in_what->stride, second_pred) +
+ mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit);
+
+ for (i = 0; i < search_range; ++i) {
+ int best_site = -1;
+
+ for (j = 0; j < 8; ++j) {
+ const MV mv = { ref_mv->row + neighbors[j].row,
+ ref_mv->col + neighbors[j].col };
+
+ if (is_mv_in(&x->mv_limits, &mv)) {
+ unsigned int sad =
+ fn_ptr->sdaf(what->buf, what->stride, get_buf_from_mv(in_what, &mv),
+ in_what->stride, second_pred);
+ if (sad < best_sad) {
+ sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit);
+ if (sad < best_sad) {
+ best_sad = sad;
+ best_site = j;
+ }
+ }
+ }
+ }
+
+ if (best_site == -1) {
+ break;
+ } else {
+ ref_mv->row += neighbors[best_site].row;
+ ref_mv->col += neighbors[best_site].col;
+ }
+ }
+ return best_sad;
+}
+
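+// Dispatches the full-pixel search according to the requested method. For
+// NSTEP, an exhaustive mesh search is additionally run when the diamond
+// result exceeds a block-size-scaled threshold; MESH always runs the
+// exhaustive stage.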
+int vp9_full_pixel_search(const VP9_COMP *const cpi, const MACROBLOCK *const x,
+ BLOCK_SIZE bsize, MV *mvp_full, int step_param,
+ int search_method, int error_per_bit, int *cost_list,
+ const MV *ref_mv, MV *tmp_mv, int var_max, int rd) {
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ const SEARCH_METHODS method = (SEARCH_METHODS)search_method;
+ const vp9_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize];
+ int var = 0;
+ int run_exhaustive_search = 0;
+
+ if (cost_list) {
+ cost_list[0] = INT_MAX;
+ cost_list[1] = INT_MAX;
+ cost_list[2] = INT_MAX;
+ cost_list[3] = INT_MAX;
+ cost_list[4] = INT_MAX;
+ }
+
+ switch (method) {
+ case FAST_DIAMOND:
+ var = fast_dia_search(x, mvp_full, step_param, error_per_bit, 0,
+ cost_list, fn_ptr, 1, ref_mv, tmp_mv);
+ break;
+ case FAST_HEX:
+ var = fast_hex_search(x, mvp_full, step_param, error_per_bit, 0,
+ cost_list, fn_ptr, 1, ref_mv, tmp_mv);
+ break;
+ case HEX:
+ var = hex_search(x, mvp_full, step_param, error_per_bit, 1, cost_list,
+ fn_ptr, 1, ref_mv, tmp_mv);
+ break;
+ case SQUARE:
+ var = square_search(x, mvp_full, step_param, error_per_bit, 1, cost_list,
+ fn_ptr, 1, ref_mv, tmp_mv);
+ break;
+ case BIGDIA:
+ var = bigdia_search(x, mvp_full, step_param, error_per_bit, 1, cost_list,
+ fn_ptr, 1, ref_mv, tmp_mv);
+ break;
+ case NSTEP:
+ case MESH:
+ var = full_pixel_diamond(
+ cpi, x, bsize, mvp_full, step_param, error_per_bit,
+ MAX_MVSEARCH_STEPS - 1 - step_param, 1,
+ cpi->sf.mv.use_downsampled_sad, cost_list, fn_ptr, ref_mv, tmp_mv);
+ break;
+ default: assert(0 && "Unknown search method");
+ }
+
+ if (method == NSTEP) {
+ if (sf->exhaustive_searches_thresh < INT_MAX &&
+ !cpi->rc.is_src_frame_alt_ref) {
+ const int64_t exhaustive_thr =
+ get_exhaustive_threshold(sf->exhaustive_searches_thresh, bsize);
+ if (var > exhaustive_thr) {
+ run_exhaustive_search = 1;
+ }
+ }
+ } else if (method == MESH) {
+ run_exhaustive_search = 1;
+ }
+
+ if (run_exhaustive_search) {
+ int var_ex;
+ MV tmp_mv_ex;
+ var_ex = full_pixel_exhaustive(cpi, x, tmp_mv, error_per_bit, cost_list,
+ fn_ptr, ref_mv, &tmp_mv_ex);
+ if (var_ex < var) {
+ var = var_ex;
+ *tmp_mv = tmp_mv_ex;
+ }
+ }
+
+ if (method != NSTEP && method != MESH && rd && var < var_max)
+ var = vp9_get_mvpred_var(x, tmp_mv, ref_mv, fn_ptr, 1);
+
+ return var;
+}
+
+// Note(yunqingwang): The following 2 functions are only used in the motion
+// vector unit test; they return the extreme motion vectors allowed by the MV
+// limits.
+#define COMMON_MV_TEST \
+ SETUP_SUBPEL_SEARCH; \
+ \
+ (void)error_per_bit; \
+ (void)vfp; \
+ (void)z; \
+ (void)src_stride; \
+ (void)y; \
+ (void)y_stride; \
+ (void)second_pred; \
+ (void)w; \
+ (void)h; \
+ (void)offset; \
+ (void)mvjcost; \
+ (void)mvcost; \
+ (void)sse1; \
+ (void)distortion; \
+ \
+ (void)halfiters; \
+ (void)quarteriters; \
+ (void)eighthiters; \
+ (void)whichdir; \
+ (void)allow_hp; \
+ (void)forced_stop; \
+ (void)hstep; \
+ (void)rr; \
+ (void)rc; \
+ \
+ (void)tr; \
+ (void)tc; \
+ (void)sse; \
+ (void)thismse; \
+ (void)cost_list; \
+ (void)use_accurate_subpel_search
+
+// Return the maximum MV.
+uint32_t vp9_return_max_sub_pixel_mv(
+ const MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp,
+ int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop,
+ int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2],
+ uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w,
+ int h, int use_accurate_subpel_search) {
+ COMMON_MV_TEST;
+
+ (void)minr;
+ (void)minc;
+
+ bestmv->row = maxr;
+ bestmv->col = maxc;
+ besterr = 0;
+
+ // In the sub-pel motion search, if hp is not used, then the last bit of mv
+ // has to be 0.
+ lower_mv_precision(bestmv, allow_hp && use_mv_hp(ref_mv));
+
+ return besterr;
+}
+// Return the minimum MV.
+uint32_t vp9_return_min_sub_pixel_mv(
+ const MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp,
+ int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop,
+ int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2],
+ uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w,
+ int h, int use_accurate_subpel_search) {
+ COMMON_MV_TEST;
+
+ (void)maxr;
+ (void)maxc;
+
+ bestmv->row = minr;
+ bestmv->col = minc;
+ besterr = 0;
+
+ // In the sub-pel motion search, if hp is not used, then the last bit of mv
+ // has to be 0.
+ lower_mv_precision(bestmv, allow_hp && use_mv_hp(ref_mv));
+
+ return besterr;
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_mcomp.h b/media/libvpx/libvpx/vp9/encoder/vp9_mcomp.h
new file mode 100644
index 0000000000..fd6a8b9aca
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_mcomp.h
@@ -0,0 +1,178 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_VP9_MCOMP_H_
+#define VPX_VP9_ENCODER_VP9_MCOMP_H_
+
+#include "vp9/encoder/vp9_block.h"
+#if CONFIG_NON_GREEDY_MV
+#include "vp9/encoder/vp9_non_greedy_mv.h"
+#endif // CONFIG_NON_GREEDY_MV
+#include "vpx_dsp/variance.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// The maximum number of steps in a step search given the largest
+// allowed initial step
+#define MAX_MVSEARCH_STEPS 11
+// Max full-pel MV, specified in units of full pixels.
+// Enables the use of motion vectors in the range [-1023, 1023].
+#define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS - 1)) - 1)
+// Maximum size of the first step in full pel units
+#define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS - 1))
+// Allowed motion vector pixel distance outside image border
+// for Block_16x16
+#define BORDER_MV_PIXELS_B16 (16 + VP9_INTERP_EXTEND)
+
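+// Precomputed diamond search pattern: ss_mv holds the candidate MV offsets
+// for every step and ss_os the matching buffer offsets for a given stride,
+// initialized by vp9_init_dsmotion_compensation() or
+// vp9_init3smotion_compensation().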
+typedef struct search_site_config {
+ // motion search sites
+ MV ss_mv[8 * MAX_MVSEARCH_STEPS]; // Motion vector
+ intptr_t ss_os[8 * MAX_MVSEARCH_STEPS]; // Offset
+ int searches_per_step;
+ int total_steps;
+} search_site_config;
+
+typedef struct vp9_sad_table {
+ vpx_sad_fn_t sdf;
+ vpx_sad_multi_d_fn_t sdx4df;
+} vp9_sad_fn_ptr_t;
+
+static INLINE const uint8_t *get_buf_from_mv(const struct buf_2d *buf,
+ const MV *mv) {
+ return &buf->buf[mv->row * buf->stride + mv->col];
+}
+
+void vp9_init_dsmotion_compensation(search_site_config *cfg, int stride);
+void vp9_init3smotion_compensation(search_site_config *cfg, int stride);
+
+void vp9_set_mv_search_range(MvLimits *mv_limits, const MV *mv);
+int vp9_mv_bit_cost(const MV *mv, const MV *ref, const int *mvjcost,
+ int *mvcost[2], int weight);
+
+// Utility to compute variance + MV rate cost for a given MV
+int vp9_get_mvpred_var(const MACROBLOCK *x, const MV *best_mv,
+ const MV *center_mv, const vp9_variance_fn_ptr_t *vfp,
+ int use_mvcost);
+int vp9_get_mvpred_av_var(const MACROBLOCK *x, const MV *best_mv,
+ const MV *center_mv, const uint8_t *second_pred,
+ const vp9_variance_fn_ptr_t *vfp, int use_mvcost);
+
+struct VP9_COMP;
+struct SPEED_FEATURES;
+struct vp9_sad_table;
+
+int vp9_init_search_range(int size);
+
+int vp9_refining_search_sad(const struct macroblock *x, struct mv *ref_mv,
+ int error_per_bit, int search_range,
+ const struct vp9_sad_table *sad_fn_ptr,
+ const struct mv *center_mv);
+
+// Perform integral projection based motion estimation.
+unsigned int vp9_int_pro_motion_estimation(const struct VP9_COMP *cpi,
+ MACROBLOCK *x, BLOCK_SIZE bsize,
+ int mi_row, int mi_col,
+ const MV *ref_mv);
+
+typedef uint32_t(fractional_mv_step_fp)(
+ const MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp,
+ int error_per_bit, const vp9_variance_fn_ptr_t *vfp,
+ int forced_stop, // 0 - full, 1 - qtr only, 2 - half only
+ int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2],
+ uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w,
+ int h, int use_accurate_subpel_search);
+
+extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree;
+extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree_pruned;
+extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree_pruned_more;
+extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree_pruned_evenmore;
+extern fractional_mv_step_fp vp9_skip_sub_pixel_tree;
+extern fractional_mv_step_fp vp9_return_max_sub_pixel_mv;
+extern fractional_mv_step_fp vp9_return_min_sub_pixel_mv;
+
+typedef int (*vp9_diamond_search_fn_t)(
+ const MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv,
+ uint32_t start_mv_sad, MV *best_mv, int search_param, int sad_per_bit,
+ int *num00, const vp9_sad_fn_ptr_t *sad_fn_ptr, const MV *center_mv);
+
+int vp9_refining_search_8p_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit,
+ int search_range,
+ const vp9_variance_fn_ptr_t *fn_ptr,
+ const MV *center_mv, const uint8_t *second_pred);
+
+struct VP9_COMP;
+
+// "mvp_full" is the MV search starting point;
+// "ref_mv" is the context reference MV;
+// "tmp_mv" is the searched best MV.
+int vp9_full_pixel_search(const struct VP9_COMP *const cpi,
+ const MACROBLOCK *const x, BLOCK_SIZE bsize,
+ MV *mvp_full, int step_param, int search_method,
+ int error_per_bit, int *cost_list, const MV *ref_mv,
+ MV *tmp_mv, int var_max, int rd);
+
+void vp9_set_subpel_mv_search_range(MvLimits *subpel_mv_limits,
+ const MvLimits *umv_window_limits,
+ const MV *ref_mv);
+
+#if CONFIG_NON_GREEDY_MV
+struct TplDepStats;
+int64_t vp9_refining_search_sad_new(const MACROBLOCK *x, MV *best_full_mv,
+ int lambda, int search_range,
+ const vp9_variance_fn_ptr_t *fn_ptr,
+ const int_mv *nb_full_mvs, int full_mv_num);
+
+int vp9_full_pixel_diamond_new(const struct VP9_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, MV *mvp_full, int step_param,
+ int lambda, int do_refine,
+ const int_mv *nb_full_mvs, int full_mv_num,
+ MV *best_mv);
+
+static INLINE MV get_full_mv(const MV *mv) {
+ MV out_mv;
+ out_mv.row = mv->row >> 3;
+ out_mv.col = mv->col >> 3;
+ return out_mv;
+}
+struct TplDepFrame;
+int vp9_prepare_nb_full_mvs(const struct MotionField *motion_field, int mi_row,
+ int mi_col, int_mv *nb_full_mvs);
+
+static INLINE BLOCK_SIZE get_square_block_size(BLOCK_SIZE bsize) {
+ BLOCK_SIZE square_bsize;
+ switch (bsize) {
+ case BLOCK_4X4:
+ case BLOCK_4X8:
+ case BLOCK_8X4: square_bsize = BLOCK_4X4; break;
+ case BLOCK_8X8:
+ case BLOCK_8X16:
+ case BLOCK_16X8: square_bsize = BLOCK_8X8; break;
+ case BLOCK_16X16:
+ case BLOCK_16X32:
+ case BLOCK_32X16: square_bsize = BLOCK_16X16; break;
+ case BLOCK_32X32:
+ case BLOCK_32X64:
+ case BLOCK_64X32:
+ case BLOCK_64X64: square_bsize = BLOCK_32X32; break;
+ default:
+ square_bsize = BLOCK_INVALID;
+ assert(0 && "ERROR: invalid block size");
+ break;
+ }
+ return square_bsize;
+}
+#endif // CONFIG_NON_GREEDY_MV
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP9_ENCODER_VP9_MCOMP_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.c b/media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.c
new file mode 100644
index 0000000000..0843cd97e4
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.c
@@ -0,0 +1,334 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_ethread.h"
+#include "vp9/encoder/vp9_multi_thread.h"
+#include "vp9/encoder/vp9_temporal_filter.h"
+
+void *vp9_enc_grp_get_next_job(MultiThreadHandle *multi_thread_ctxt,
+ int tile_id) {
+ RowMTInfo *row_mt_info;
+ JobQueueHandle *job_queue_hdl = NULL;
+ void *next = NULL;
+ JobNode *job_info = NULL;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *mutex_handle = NULL;
+#endif
+
+ row_mt_info = (RowMTInfo *)(&multi_thread_ctxt->row_mt_info[tile_id]);
+ job_queue_hdl = (JobQueueHandle *)&row_mt_info->job_queue_hdl;
+#if CONFIG_MULTITHREAD
+ mutex_handle = &row_mt_info->job_mutex;
+#endif
+
+// lock the mutex for queue access
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(mutex_handle);
+#endif
+ next = job_queue_hdl->next;
+ if (next != NULL) {
+ JobQueue *job_queue = (JobQueue *)next;
+ job_info = &job_queue->job_info;
+ // Update the next job in the queue
+ job_queue_hdl->next = job_queue->next;
+ job_queue_hdl->num_jobs_acquired++;
+ }
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(mutex_handle);
+#endif
+
+ return job_info;
+}
+
+void vp9_row_mt_alloc_rd_thresh(VP9_COMP *const cpi,
+ TileDataEnc *const this_tile) {
+ VP9_COMMON *const cm = &cpi->common;
+ const int sb_rows =
+ (mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2) + 1;
+ int i;
+
+ CHECK_MEM_ERROR(
+ &cm->error, this_tile->row_base_thresh_freq_fact,
+ (int *)vpx_calloc(sb_rows * BLOCK_SIZES * MAX_MODES,
+ sizeof(*(this_tile->row_base_thresh_freq_fact))));
+ for (i = 0; i < sb_rows * BLOCK_SIZES * MAX_MODES; i++)
+ this_tile->row_base_thresh_freq_fact[i] = RD_THRESH_INIT_FACT;
+}
+
+void vp9_row_mt_mem_alloc(VP9_COMP *cpi) {
+ struct VP9Common *cm = &cpi->common;
+ MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt;
+ int tile_row, tile_col;
+ const int tile_cols = 1 << cm->log2_tile_cols;
+ const int tile_rows = 1 << cm->log2_tile_rows;
+ const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
+ int jobs_per_tile_col, total_jobs;
+
+ // Allocate memory that is large enough for all row_mt stages. First pass
+ // uses 16x16 block size.
+ jobs_per_tile_col = VPXMAX(cm->mb_rows, sb_rows);
+ // Calculate the total number of jobs
+ total_jobs = jobs_per_tile_col * tile_cols;
+
+ multi_thread_ctxt->allocated_tile_cols = tile_cols;
+ multi_thread_ctxt->allocated_tile_rows = tile_rows;
+ multi_thread_ctxt->allocated_vert_unit_rows = jobs_per_tile_col;
+
+ CHECK_MEM_ERROR(&cm->error, multi_thread_ctxt->job_queue,
+ (JobQueue *)vpx_memalign(32, total_jobs * sizeof(JobQueue)));
+
+#if CONFIG_MULTITHREAD
+ // Create mutex for each tile
+ for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+ RowMTInfo *row_mt_info = &multi_thread_ctxt->row_mt_info[tile_col];
+ pthread_mutex_init(&row_mt_info->job_mutex, NULL);
+ }
+#endif
+
+ // Allocate memory for row based multi-threading
+ for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+ TileDataEnc *this_tile = &cpi->tile_data[tile_col];
+ vp9_row_mt_sync_mem_alloc(&this_tile->row_mt_sync, cm, jobs_per_tile_col);
+ if (cpi->sf.adaptive_rd_thresh_row_mt) {
+ if (this_tile->row_base_thresh_freq_fact != NULL) {
+ vpx_free(this_tile->row_base_thresh_freq_fact);
+ this_tile->row_base_thresh_freq_fact = NULL;
+ }
+ vp9_row_mt_alloc_rd_thresh(cpi, this_tile);
+ }
+ }
+
+ // Assign the sync pointer of tile row zero for every tile row > 0
+ for (tile_row = 1; tile_row < tile_rows; tile_row++) {
+ for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+ TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col];
+ TileDataEnc *this_col_tile = &cpi->tile_data[tile_col];
+ this_tile->row_mt_sync = this_col_tile->row_mt_sync;
+ }
+ }
+
+ // Calculate the number of vertical units in the given tile row
+ for (tile_row = 0; tile_row < tile_rows; tile_row++) {
+ TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols];
+ TileInfo *tile_info = &this_tile->tile_info;
+ multi_thread_ctxt->num_tile_vert_sbs[tile_row] =
+ get_num_vert_units(*tile_info, MI_BLOCK_SIZE_LOG2);
+ }
+}
+
+void vp9_row_mt_mem_dealloc(VP9_COMP *cpi) {
+ MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt;
+ int tile_col;
+#if CONFIG_MULTITHREAD
+ int tile_row;
+#endif
+
+ // Deallocate memory for job queue
+ if (multi_thread_ctxt->job_queue) vpx_free(multi_thread_ctxt->job_queue);
+
+#if CONFIG_MULTITHREAD
+ // Destroy mutex for each tile
+ for (tile_col = 0; tile_col < multi_thread_ctxt->allocated_tile_cols;
+ tile_col++) {
+ RowMTInfo *row_mt_info = &multi_thread_ctxt->row_mt_info[tile_col];
+ if (row_mt_info) pthread_mutex_destroy(&row_mt_info->job_mutex);
+ }
+#endif
+
+ // Free row based multi-threading sync memory
+ for (tile_col = 0; tile_col < multi_thread_ctxt->allocated_tile_cols;
+ tile_col++) {
+ TileDataEnc *this_tile = &cpi->tile_data[tile_col];
+ vp9_row_mt_sync_mem_dealloc(&this_tile->row_mt_sync);
+ }
+
+#if CONFIG_MULTITHREAD
+ for (tile_row = 0; tile_row < multi_thread_ctxt->allocated_tile_rows;
+ tile_row++) {
+ for (tile_col = 0; tile_col < multi_thread_ctxt->allocated_tile_cols;
+ tile_col++) {
+ TileDataEnc *this_tile =
+ &cpi->tile_data[tile_row * multi_thread_ctxt->allocated_tile_cols +
+ tile_col];
+ if (this_tile->row_base_thresh_freq_fact != NULL) {
+ vpx_free(this_tile->row_base_thresh_freq_fact);
+ this_tile->row_base_thresh_freq_fact = NULL;
+ }
+ }
+ }
+#endif
+}
+
+void vp9_multi_thread_tile_init(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ const int tile_cols = 1 << cm->log2_tile_cols;
+ const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
+ int i;
+
+ for (i = 0; i < tile_cols; i++) {
+ TileDataEnc *this_tile = &cpi->tile_data[i];
+ int jobs_per_tile_col = cpi->oxcf.pass == 1 ? cm->mb_rows : sb_rows;
+
+ // Initialize cur_col to -1 for all rows.
+ memset(this_tile->row_mt_sync.cur_col, -1,
+ sizeof(*this_tile->row_mt_sync.cur_col) * jobs_per_tile_col);
+ vp9_zero(this_tile->fp_data);
+ this_tile->fp_data.image_data_start_row = INVALID_ROW;
+ }
+}
+
+void vp9_assign_tile_to_thread(MultiThreadHandle *multi_thread_ctxt,
+ int tile_cols, int num_workers) {
+ int tile_id = 0;
+ int i;
+
+ // Assign tiles to the worker threads in round-robin order
+ for (i = 0; i < num_workers; i++) {
+ multi_thread_ctxt->thread_id_to_tile_id[i] = tile_id++;
+ if (tile_id == tile_cols) tile_id = 0;
+ }
+}
+
+int vp9_get_job_queue_status(MultiThreadHandle *multi_thread_ctxt,
+ int cur_tile_id) {
+ RowMTInfo *row_mt_info;
+ JobQueueHandle *job_queue_hndl;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *mutex;
+#endif
+ int num_jobs_remaining;
+
+ row_mt_info = &multi_thread_ctxt->row_mt_info[cur_tile_id];
+ job_queue_hndl = &row_mt_info->job_queue_hdl;
+#if CONFIG_MULTITHREAD
+ mutex = &row_mt_info->job_mutex;
+#endif
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(mutex);
+#endif
+ num_jobs_remaining =
+ multi_thread_ctxt->jobs_per_tile_col - job_queue_hndl->num_jobs_acquired;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(mutex);
+#endif
+
+ return (num_jobs_remaining);
+}
+
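+// Builds one singly linked job chain per tile column: every job carries its
+// vertical unit row and tile position, the per-tile queue handle points at
+// the first job, and the last job's next pointer is NULL.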
+void vp9_prepare_job_queue(VP9_COMP *cpi, JOB_TYPE job_type) {
+ VP9_COMMON *const cm = &cpi->common;
+ MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt;
+ JobQueue *job_queue = multi_thread_ctxt->job_queue;
+ const int tile_cols = 1 << cm->log2_tile_cols;
+ int job_row_num, jobs_per_tile, jobs_per_tile_col = 0, total_jobs;
+ const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
+ int tile_col, i;
+
+ switch (job_type) {
+ case ENCODE_JOB: jobs_per_tile_col = sb_rows; break;
+ case FIRST_PASS_JOB: jobs_per_tile_col = cm->mb_rows; break;
+ case ARNR_JOB:
+ jobs_per_tile_col = ((cm->mi_rows + TF_ROUND) >> TF_SHIFT);
+ break;
+ default: assert(0);
+ }
+
+ total_jobs = jobs_per_tile_col * tile_cols;
+
+ multi_thread_ctxt->jobs_per_tile_col = jobs_per_tile_col;
+ // memset the entire job queue buffer to zero
+ memset(job_queue, 0, total_jobs * sizeof(JobQueue));
+
+ // Job queue preparation
+ for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+ RowMTInfo *tile_ctxt = &multi_thread_ctxt->row_mt_info[tile_col];
+ JobQueue *job_queue_curr, *job_queue_temp;
+ int tile_row = 0;
+
+ tile_ctxt->job_queue_hdl.next = (void *)job_queue;
+ tile_ctxt->job_queue_hdl.num_jobs_acquired = 0;
+
+ job_queue_curr = job_queue;
+ job_queue_temp = job_queue;
+
+ // loop over all the vertical rows
+ for (job_row_num = 0, jobs_per_tile = 0; job_row_num < jobs_per_tile_col;
+ job_row_num++, jobs_per_tile++) {
+ job_queue_curr->job_info.vert_unit_row_num = job_row_num;
+ job_queue_curr->job_info.tile_col_id = tile_col;
+ job_queue_curr->job_info.tile_row_id = tile_row;
+ job_queue_curr->next = (void *)(job_queue_temp + 1);
+ job_queue_curr = ++job_queue_temp;
+
+ if (ENCODE_JOB == job_type) {
+ if (jobs_per_tile >=
+ multi_thread_ctxt->num_tile_vert_sbs[tile_row] - 1) {
+ tile_row++;
+ jobs_per_tile = -1;
+ }
+ }
+ }
+
+ // Set the last pointer to NULL
+ job_queue_curr += -1;
+ job_queue_curr->next = (void *)NULL;
+
+ // Move to the next tile
+ job_queue += jobs_per_tile_col;
+ }
+
+ for (i = 0; i < cpi->num_workers; i++) {
+ EncWorkerData *thread_data;
+ thread_data = &cpi->tile_thr_data[i];
+ thread_data->thread_id = i;
+
+ for (tile_col = 0; tile_col < tile_cols; tile_col++)
+ thread_data->tile_completion_status[tile_col] = 0;
+ }
+}
+
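+// Marks the current tile as complete and scans the remaining tiles; returns 1
+// when every tile is done, otherwise selects the tile with the most jobs left
+// as the next *cur_tile_id and returns 0.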
+int vp9_get_tiles_proc_status(MultiThreadHandle *multi_thread_ctxt,
+ int *tile_completion_status, int *cur_tile_id,
+ int tile_cols) {
+ int tile_col;
+ int tile_id = -1; // Stores the ID of the least-processed tile (most jobs remaining)
+ int max_num_jobs_remaining = 0;
+ int num_jobs_remaining;
+
+ // Mark the completion to avoid check in the loop
+ tile_completion_status[*cur_tile_id] = 1;
+ // Check for the status of all the tiles
+ for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+ if (tile_completion_status[tile_col] == 0) {
+ num_jobs_remaining =
+ vp9_get_job_queue_status(multi_thread_ctxt, tile_col);
+ // Mark the completion to avoid checks during future switches across tiles
+ if (num_jobs_remaining == 0) tile_completion_status[tile_col] = 1;
+ if (num_jobs_remaining > max_num_jobs_remaining) {
+ max_num_jobs_remaining = num_jobs_remaining;
+ tile_id = tile_col;
+ }
+ }
+ }
+
+ if (-1 == tile_id) {
+ return 1;
+ } else {
+ // Update the cur ID to the next tile ID that will be processed,
+ // which will be the least processed tile
+ *cur_tile_id = tile_id;
+ return 0;
+ }
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.h b/media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.h
new file mode 100644
index 0000000000..a2276f4fe6
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_multi_thread.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_VP9_MULTI_THREAD_H_
+#define VPX_VP9_ENCODER_VP9_MULTI_THREAD_H_
+
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_job_queue.h"
+
+void *vp9_enc_grp_get_next_job(MultiThreadHandle *multi_thread_ctxt,
+ int tile_id);
+
+void vp9_prepare_job_queue(VP9_COMP *cpi, JOB_TYPE job_type);
+
+int vp9_get_job_queue_status(MultiThreadHandle *multi_thread_ctxt,
+ int cur_tile_id);
+
+void vp9_assign_tile_to_thread(MultiThreadHandle *multi_thread_ctxt,
+ int tile_cols, int num_workers);
+
+void vp9_multi_thread_tile_init(VP9_COMP *cpi);
+
+void vp9_row_mt_mem_alloc(VP9_COMP *cpi);
+
+void vp9_row_mt_alloc_rd_thresh(VP9_COMP *const cpi,
+ TileDataEnc *const this_tile);
+
+void vp9_row_mt_mem_dealloc(VP9_COMP *cpi);
+
+int vp9_get_tiles_proc_status(MultiThreadHandle *multi_thread_ctxt,
+ int *tile_completion_status, int *cur_tile_id,
+ int tile_cols);
+
+#endif // VPX_VP9_ENCODER_VP9_MULTI_THREAD_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_noise_estimate.c b/media/libvpx/libvpx/vp9/encoder/vp9_noise_estimate.c
new file mode 100644
index 0000000000..9696529c50
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_noise_estimate.c
@@ -0,0 +1,302 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_scale/yv12config.h"
+#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/encoder/vp9_context_tree.h"
+#include "vp9/encoder/vp9_noise_estimate.h"
+#include "vp9/encoder/vp9_encoder.h"
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+// For SVC: only do noise estimation on top spatial layer.
+static INLINE int noise_est_svc(const struct VP9_COMP *const cpi) {
+ return (!cpi->use_svc ||
+ (cpi->use_svc &&
+ cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1));
+}
+#endif
+
+void vp9_noise_estimate_init(NOISE_ESTIMATE *const ne, int width, int height) {
+ ne->enabled = 0;
+ ne->level = (width * height < 1280 * 720) ? kLowLow : kLow;
+ ne->value = 0;
+ ne->count = 0;
+ ne->thresh = 90;
+ ne->last_w = 0;
+ ne->last_h = 0;
+ if (width * height >= 1920 * 1080) {
+ ne->thresh = 200;
+ } else if (width * height >= 1280 * 720) {
+ ne->thresh = 140;
+ } else if (width * height >= 640 * 360) {
+ ne->thresh = 115;
+ }
+ ne->num_frames_estimate = 15;
+ ne->adapt_thresh = (3 * ne->thresh) >> 1;
+}
+
+static int enable_noise_estimation(VP9_COMP *const cpi) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cpi->common.use_highbitdepth) return 0;
+#endif
+// Enable noise estimation if denoising is on.
+#if CONFIG_VP9_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi) &&
+ cpi->common.width >= 320 && cpi->common.height >= 180)
+ return 1;
+#endif
+ // Only allow noise estimation under certain encoding modes.
+ // Enabled for 1 pass CBR, speed >=5, and if resolution is same as original.
+ // Not enabled for SVC mode and screen_content_mode.
+ // Not enabled for low resolutions.
+ if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR &&
+ cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->oxcf.speed >= 5 &&
+ cpi->resize_state == ORIG && cpi->resize_pending == 0 && !cpi->use_svc &&
+ cpi->oxcf.content != VP9E_CONTENT_SCREEN &&
+ cpi->common.width * cpi->common.height >= 640 * 360)
+ return 1;
+ else
+ return 0;
+}
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+static void copy_frame(YV12_BUFFER_CONFIG *const dest,
+ const YV12_BUFFER_CONFIG *const src) {
+ int r;
+ const uint8_t *srcbuf = src->y_buffer;
+ uint8_t *destbuf = dest->y_buffer;
+
+ assert(dest->y_width == src->y_width);
+ assert(dest->y_height == src->y_height);
+
+ for (r = 0; r < dest->y_height; ++r) {
+ memcpy(destbuf, srcbuf, dest->y_width);
+ destbuf += dest->y_stride;
+ srcbuf += src->y_stride;
+ }
+}
+#endif // CONFIG_VP9_TEMPORAL_DENOISING
+
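+// Map the running noise value to a level: > 2 * thresh -> kHigh,
+// > thresh -> kMedium, > thresh / 2 -> kLow, otherwise kLowLow.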
+NOISE_LEVEL vp9_noise_estimate_extract_level(NOISE_ESTIMATE *const ne) {
+ int noise_level = kLowLow;
+ if (ne->value > (ne->thresh << 1)) {
+ noise_level = kHigh;
+ } else {
+ if (ne->value > ne->thresh)
+ noise_level = kMedium;
+ else if (ne->value > (ne->thresh >> 1))
+ noise_level = kLow;
+ else
+ noise_level = kLowLow;
+ }
+ return noise_level;
+}
+
+void vp9_update_noise_estimate(VP9_COMP *const cpi) {
+ const VP9_COMMON *const cm = &cpi->common;
+ NOISE_ESTIMATE *const ne = &cpi->noise_estimate;
+ const int low_res = (cm->width <= 352 && cm->height <= 288);
+ // Estimate of noise level every frame_period frames.
+ int frame_period = 8;
+ int thresh_consec_zeromv = 6;
+ int frame_counter = cm->current_video_frame;
+ // Estimate is between current source and last source.
+ YV12_BUFFER_CONFIG *last_source = cpi->Last_Source;
+#if CONFIG_VP9_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi)) {
+ last_source = &cpi->denoiser.last_source;
+ // Tune these thresholds for different resolutions when denoising is
+ // enabled.
+ if (cm->width > 640 && cm->width <= 1920) {
+ thresh_consec_zeromv = 2;
+ }
+ }
+#endif
+ ne->enabled = enable_noise_estimation(cpi);
+ if (cpi->svc.number_spatial_layers > 1)
+ frame_counter = cpi->svc.current_superframe;
+ if (!ne->enabled || frame_counter % frame_period != 0 ||
+ last_source == NULL ||
+ (cpi->svc.number_spatial_layers == 1 &&
+ (ne->last_w != cm->width || ne->last_h != cm->height))) {
+#if CONFIG_VP9_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi))
+ copy_frame(&cpi->denoiser.last_source, cpi->Source);
+#endif
+ if (last_source != NULL) {
+ ne->last_w = cm->width;
+ ne->last_h = cm->height;
+ }
+ return;
+ } else if (frame_counter > 60 && cpi->svc.num_encoded_top_layer > 1 &&
+ cpi->rc.frames_since_key > cpi->svc.number_spatial_layers &&
+ cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1 &&
+ cpi->rc.avg_frame_low_motion < (low_res ? 60 : 40)) {
+ // Force noise estimation to 0 and denoiser off if content has high motion.
+ ne->level = kLowLow;
+ ne->count = 0;
+ ne->num_frames_estimate = 10;
+#if CONFIG_VP9_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi) &&
+ cpi->svc.current_superframe > 1) {
+ vp9_denoiser_set_noise_level(cpi, ne->level);
+ copy_frame(&cpi->denoiser.last_source, cpi->Source);
+ }
+#endif
+ return;
+ } else {
+ unsigned int bin_size = 100;
+ unsigned int hist[MAX_VAR_HIST_BINS] = { 0 };
+ unsigned int hist_avg[MAX_VAR_HIST_BINS];
+ unsigned int max_bin = 0;
+ unsigned int max_bin_count = 0;
+ unsigned int bin_cnt;
+ int bsize = BLOCK_16X16;
+ // Loop over a sub-sample of the frame's 16x16 blocks; for blocks that have
+ // been encoded with zero/small mv for at least x consecutive frames, compute
+ // the variance to update the estimate of noise in the source.
+ const uint8_t *src_y = cpi->Source->y_buffer;
+ const int src_ystride = cpi->Source->y_stride;
+ const uint8_t *last_src_y = last_source->y_buffer;
+ const int last_src_ystride = last_source->y_stride;
+ const uint8_t *src_u = cpi->Source->u_buffer;
+ const uint8_t *src_v = cpi->Source->v_buffer;
+ const int src_uvstride = cpi->Source->uv_stride;
+ int mi_row, mi_col;
+ int num_low_motion = 0;
+ int frame_low_motion = 1;
+ for (mi_row = 0; mi_row < cm->mi_rows; mi_row++) {
+ for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) {
+ int bl_index = mi_row * cm->mi_cols + mi_col;
+ if (cpi->consec_zero_mv[bl_index] > thresh_consec_zeromv)
+ num_low_motion++;
+ }
+ }
+ if (num_low_motion < ((3 * cm->mi_rows * cm->mi_cols) >> 3))
+ frame_low_motion = 0;
+ for (mi_row = 0; mi_row < cm->mi_rows; mi_row++) {
+ for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) {
+ // 16x16 blocks, 1/4 sample of frame.
+ if (mi_row % 4 == 0 && mi_col % 4 == 0 && mi_row < cm->mi_rows - 1 &&
+ mi_col < cm->mi_cols - 1) {
+ int bl_index = mi_row * cm->mi_cols + mi_col;
+ int bl_index1 = bl_index + 1;
+ int bl_index2 = bl_index + cm->mi_cols;
+ int bl_index3 = bl_index2 + 1;
+ int consec_zeromv =
+ VPXMIN(cpi->consec_zero_mv[bl_index],
+ VPXMIN(cpi->consec_zero_mv[bl_index1],
+ VPXMIN(cpi->consec_zero_mv[bl_index2],
+ cpi->consec_zero_mv[bl_index3])));
+ // Only consider blocks that are likely steady background, i.e., have
+ // been encoded as zero/low motion for x (= thresh_consec_zeromv) frames
+ // in a row. consec_zero_mv[] is defined for 8x8 blocks, so consider all
+ // 4 sub-blocks for the 16x16 block. And exclude this frame if
+ // high_source_sad is true (i.e., scene/content change).
+ if (frame_low_motion && consec_zeromv > thresh_consec_zeromv &&
+ !cpi->rc.high_source_sad &&
+ !cpi->svc.high_source_sad_superframe) {
+ int is_skin = 0;
+ if (cpi->use_skin_detection) {
+ is_skin =
+ vp9_compute_skin_block(src_y, src_u, src_v, src_ystride,
+ src_uvstride, bsize, consec_zeromv, 0);
+ }
+ if (!is_skin) {
+ unsigned int sse;
+ // Compute variance between co-located blocks from current and
+ // last input frames.
+ unsigned int variance = cpi->fn_ptr[bsize].vf(
+ src_y, src_ystride, last_src_y, last_src_ystride, &sse);
+ unsigned int hist_index = variance / bin_size;
+ if (hist_index < MAX_VAR_HIST_BINS)
+ hist[hist_index]++;
+ else if (hist_index < 3 * (MAX_VAR_HIST_BINS >> 1))
+ hist[MAX_VAR_HIST_BINS - 1]++; // Account for the tail
+ }
+ }
+ }
+ src_y += 8;
+ last_src_y += 8;
+ src_u += 4;
+ src_v += 4;
+ }
+ src_y += (src_ystride << 3) - (cm->mi_cols << 3);
+ last_src_y += (last_src_ystride << 3) - (cm->mi_cols << 3);
+ src_u += (src_uvstride << 2) - (cm->mi_cols << 2);
+ src_v += (src_uvstride << 2) - (cm->mi_cols << 2);
+ }
+ ne->last_w = cm->width;
+ ne->last_h = cm->height;
+ // Adjust histogram to account for effect that histogram flattens
+ // and shifts to zero as scene darkens.
+ if (hist[0] > 10 && (hist[MAX_VAR_HIST_BINS - 1] > hist[0] >> 2)) {
+ hist[0] = 0;
+ hist[1] >>= 2;
+ hist[2] >>= 2;
+ hist[3] >>= 2;
+ hist[4] >>= 1;
+ hist[5] >>= 1;
+ hist[6] = 3 * hist[6] >> 1;
+ hist[MAX_VAR_HIST_BINS - 1] >>= 1;
+ }
+
+ // Average hist[] and find largest bin
+ for (bin_cnt = 0; bin_cnt < MAX_VAR_HIST_BINS; bin_cnt++) {
+ if (bin_cnt == 0)
+ hist_avg[bin_cnt] = (hist[0] + hist[1] + hist[2]) / 3;
+ else if (bin_cnt == MAX_VAR_HIST_BINS - 1)
+ hist_avg[bin_cnt] = hist[MAX_VAR_HIST_BINS - 1] >> 2;
+ else if (bin_cnt == MAX_VAR_HIST_BINS - 2)
+ hist_avg[bin_cnt] = (hist[bin_cnt - 1] + 2 * hist[bin_cnt] +
+ (hist[bin_cnt + 1] >> 1) + 2) >>
+ 2;
+ else
+ hist_avg[bin_cnt] =
+ (hist[bin_cnt - 1] + 2 * hist[bin_cnt] + hist[bin_cnt + 1] + 2) >>
+ 2;
+
+ if (hist_avg[bin_cnt] > max_bin_count) {
+ max_bin_count = hist_avg[bin_cnt];
+ max_bin = bin_cnt;
+ }
+ }
+
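+ // Update the running noise estimate as a weighted average: keep 3/4 of the
+ // previous value and add 1/4 of the new sample, taken from the most
+ // populated (averaged) variance bin.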
+ // Scale by 40 to work with existing thresholds
+ ne->value = (int)((3 * ne->value + max_bin * 40) >> 2);
+ // Quickly increase VNR strength when the noise level increases suddenly.
+ if (ne->level < kMedium && ne->value > ne->adapt_thresh) {
+ ne->count = ne->num_frames_estimate;
+ } else {
+ ne->count++;
+ }
+ if (ne->count == ne->num_frames_estimate) {
+ // Reset counter and check noise level condition.
+ ne->num_frames_estimate = 30;
+ ne->count = 0;
+ ne->level = vp9_noise_estimate_extract_level(ne);
+#if CONFIG_VP9_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi))
+ vp9_denoiser_set_noise_level(cpi, ne->level);
+#endif
+ }
+ }
+#if CONFIG_VP9_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi))
+ copy_frame(&cpi->denoiser.last_source, cpi->Source);
+#endif
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_noise_estimate.h b/media/libvpx/libvpx/vp9/encoder/vp9_noise_estimate.h
new file mode 100644
index 0000000000..7fc94ff8c9
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_noise_estimate.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_VP9_NOISE_ESTIMATE_H_
+#define VPX_VP9_ENCODER_VP9_NOISE_ESTIMATE_H_
+
+#include "vp9/encoder/vp9_block.h"
+#include "vp9/encoder/vp9_skin_detection.h"
+#include "vpx_scale/yv12config.h"
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+#include "vp9/encoder/vp9_denoiser.h"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_VAR_HIST_BINS 20
+
+typedef enum noise_level { kLowLow, kLow, kMedium, kHigh } NOISE_LEVEL;
+
+typedef struct noise_estimate {
+ int enabled;
+ NOISE_LEVEL level;
+ int value;
+ int thresh;
+ int adapt_thresh;
+ int count;
+ int last_w;
+ int last_h;
+ int num_frames_estimate;
+} NOISE_ESTIMATE;
+
+struct VP9_COMP;
+
+void vp9_noise_estimate_init(NOISE_ESTIMATE *const ne, int width, int height);
+
+NOISE_LEVEL vp9_noise_estimate_extract_level(NOISE_ESTIMATE *const ne);
+
+void vp9_update_noise_estimate(struct VP9_COMP *const cpi);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP9_ENCODER_VP9_NOISE_ESTIMATE_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_non_greedy_mv.c b/media/libvpx/libvpx/vp9/encoder/vp9_non_greedy_mv.c
new file mode 100644
index 0000000000..d52801c845
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_non_greedy_mv.c
@@ -0,0 +1,536 @@
+/*
+ * Copyright (c) 2019 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp9/common/vp9_mv.h"
+#include "vp9/encoder/vp9_non_greedy_mv.h"
+// TODO(angiebird): move non_greedy_mv related functions to this file
+
+#define LOG2_TABLE_SIZE 1024
+static const int log2_table[LOG2_TABLE_SIZE] = {
+  0,  // Dummy entry: index 0 is never used (the caller asserts v > 0).
+ 0, 1048576, 1661954, 2097152, 2434718, 2710530, 2943725,
+ 3145728, 3323907, 3483294, 3627477, 3759106, 3880192, 3992301,
+ 4096672, 4194304, 4286015, 4372483, 4454275, 4531870, 4605679,
+ 4676053, 4743299, 4807682, 4869436, 4928768, 4985861, 5040877,
+ 5093962, 5145248, 5194851, 5242880, 5289431, 5334591, 5378443,
+ 5421059, 5462508, 5502851, 5542146, 5580446, 5617800, 5654255,
+ 5689851, 5724629, 5758625, 5791875, 5824409, 5856258, 5887450,
+ 5918012, 5947969, 5977344, 6006160, 6034437, 6062195, 6089453,
+ 6116228, 6142538, 6168398, 6193824, 6218829, 6243427, 6267632,
+ 6291456, 6314910, 6338007, 6360756, 6383167, 6405252, 6427019,
+ 6448477, 6469635, 6490501, 6511084, 6531390, 6551427, 6571202,
+ 6590722, 6609993, 6629022, 6647815, 6666376, 6684713, 6702831,
+ 6720734, 6738427, 6755916, 6773205, 6790299, 6807201, 6823917,
+ 6840451, 6856805, 6872985, 6888993, 6904834, 6920510, 6936026,
+ 6951384, 6966588, 6981641, 6996545, 7011304, 7025920, 7040397,
+ 7054736, 7068940, 7083013, 7096956, 7110771, 7124461, 7138029,
+ 7151476, 7164804, 7178017, 7191114, 7204100, 7216974, 7229740,
+ 7242400, 7254954, 7267405, 7279754, 7292003, 7304154, 7316208,
+ 7328167, 7340032, 7351805, 7363486, 7375079, 7386583, 7398000,
+ 7409332, 7420579, 7431743, 7442826, 7453828, 7464751, 7475595,
+ 7486362, 7497053, 7507669, 7518211, 7528680, 7539077, 7549404,
+ 7559660, 7569847, 7579966, 7590017, 7600003, 7609923, 7619778,
+ 7629569, 7639298, 7648964, 7658569, 7668114, 7677598, 7687023,
+ 7696391, 7705700, 7714952, 7724149, 7733289, 7742375, 7751407,
+ 7760385, 7769310, 7778182, 7787003, 7795773, 7804492, 7813161,
+ 7821781, 7830352, 7838875, 7847350, 7855777, 7864158, 7872493,
+ 7880782, 7889027, 7897226, 7905381, 7913492, 7921561, 7929586,
+ 7937569, 7945510, 7953410, 7961268, 7969086, 7976864, 7984602,
+ 7992301, 7999960, 8007581, 8015164, 8022709, 8030217, 8037687,
+ 8045121, 8052519, 8059880, 8067206, 8074496, 8081752, 8088973,
+ 8096159, 8103312, 8110431, 8117516, 8124569, 8131589, 8138576,
+ 8145532, 8152455, 8159347, 8166208, 8173037, 8179836, 8186605,
+ 8193343, 8200052, 8206731, 8213380, 8220001, 8226593, 8233156,
+ 8239690, 8246197, 8252676, 8259127, 8265550, 8271947, 8278316,
+ 8284659, 8290976, 8297266, 8303530, 8309768, 8315981, 8322168,
+ 8328330, 8334467, 8340579, 8346667, 8352730, 8358769, 8364784,
+ 8370775, 8376743, 8382687, 8388608, 8394506, 8400381, 8406233,
+ 8412062, 8417870, 8423655, 8429418, 8435159, 8440878, 8446576,
+ 8452252, 8457908, 8463542, 8469155, 8474748, 8480319, 8485871,
+ 8491402, 8496913, 8502404, 8507875, 8513327, 8518759, 8524171,
+ 8529564, 8534938, 8540293, 8545629, 8550947, 8556245, 8561525,
+ 8566787, 8572031, 8577256, 8582464, 8587653, 8592825, 8597980,
+ 8603116, 8608236, 8613338, 8618423, 8623491, 8628542, 8633576,
+ 8638593, 8643594, 8648579, 8653547, 8658499, 8663434, 8668354,
+ 8673258, 8678145, 8683017, 8687874, 8692715, 8697540, 8702350,
+ 8707145, 8711925, 8716690, 8721439, 8726174, 8730894, 8735599,
+ 8740290, 8744967, 8749628, 8754276, 8758909, 8763528, 8768134,
+ 8772725, 8777302, 8781865, 8786415, 8790951, 8795474, 8799983,
+ 8804478, 8808961, 8813430, 8817886, 8822328, 8826758, 8831175,
+ 8835579, 8839970, 8844349, 8848715, 8853068, 8857409, 8861737,
+ 8866053, 8870357, 8874649, 8878928, 8883195, 8887451, 8891694,
+ 8895926, 8900145, 8904353, 8908550, 8912734, 8916908, 8921069,
+ 8925220, 8929358, 8933486, 8937603, 8941708, 8945802, 8949885,
+ 8953957, 8958018, 8962068, 8966108, 8970137, 8974155, 8978162,
+ 8982159, 8986145, 8990121, 8994086, 8998041, 9001986, 9005920,
+ 9009844, 9013758, 9017662, 9021556, 9025440, 9029314, 9033178,
+ 9037032, 9040877, 9044711, 9048536, 9052352, 9056157, 9059953,
+ 9063740, 9067517, 9071285, 9075044, 9078793, 9082533, 9086263,
+ 9089985, 9093697, 9097400, 9101095, 9104780, 9108456, 9112123,
+ 9115782, 9119431, 9123072, 9126704, 9130328, 9133943, 9137549,
+ 9141146, 9144735, 9148316, 9151888, 9155452, 9159007, 9162554,
+ 9166092, 9169623, 9173145, 9176659, 9180165, 9183663, 9187152,
+ 9190634, 9194108, 9197573, 9201031, 9204481, 9207923, 9211357,
+ 9214784, 9218202, 9221613, 9225017, 9228412, 9231800, 9235181,
+ 9238554, 9241919, 9245277, 9248628, 9251971, 9255307, 9258635,
+ 9261956, 9265270, 9268577, 9271876, 9275169, 9278454, 9281732,
+ 9285002, 9288266, 9291523, 9294773, 9298016, 9301252, 9304481,
+ 9307703, 9310918, 9314126, 9317328, 9320523, 9323711, 9326892,
+ 9330067, 9333235, 9336397, 9339552, 9342700, 9345842, 9348977,
+ 9352106, 9355228, 9358344, 9361454, 9364557, 9367654, 9370744,
+ 9373828, 9376906, 9379978, 9383043, 9386102, 9389155, 9392202,
+ 9395243, 9398278, 9401306, 9404329, 9407345, 9410356, 9413360,
+ 9416359, 9419351, 9422338, 9425319, 9428294, 9431263, 9434226,
+ 9437184, 9440136, 9443082, 9446022, 9448957, 9451886, 9454809,
+ 9457726, 9460638, 9463545, 9466446, 9469341, 9472231, 9475115,
+ 9477994, 9480867, 9483735, 9486597, 9489454, 9492306, 9495152,
+ 9497993, 9500828, 9503659, 9506484, 9509303, 9512118, 9514927,
+ 9517731, 9520530, 9523324, 9526112, 9528895, 9531674, 9534447,
+ 9537215, 9539978, 9542736, 9545489, 9548237, 9550980, 9553718,
+ 9556451, 9559179, 9561903, 9564621, 9567335, 9570043, 9572747,
+ 9575446, 9578140, 9580830, 9583514, 9586194, 9588869, 9591540,
+ 9594205, 9596866, 9599523, 9602174, 9604821, 9607464, 9610101,
+ 9612735, 9615363, 9617987, 9620607, 9623222, 9625832, 9628438,
+ 9631040, 9633637, 9636229, 9638818, 9641401, 9643981, 9646556,
+ 9649126, 9651692, 9654254, 9656812, 9659365, 9661914, 9664459,
+ 9666999, 9669535, 9672067, 9674594, 9677118, 9679637, 9682152,
+ 9684663, 9687169, 9689672, 9692170, 9694665, 9697155, 9699641,
+ 9702123, 9704601, 9707075, 9709545, 9712010, 9714472, 9716930,
+ 9719384, 9721834, 9724279, 9726721, 9729159, 9731593, 9734024,
+ 9736450, 9738872, 9741291, 9743705, 9746116, 9748523, 9750926,
+ 9753326, 9755721, 9758113, 9760501, 9762885, 9765266, 9767642,
+ 9770015, 9772385, 9774750, 9777112, 9779470, 9781825, 9784175,
+ 9786523, 9788866, 9791206, 9793543, 9795875, 9798204, 9800530,
+ 9802852, 9805170, 9807485, 9809797, 9812104, 9814409, 9816710,
+ 9819007, 9821301, 9823591, 9825878, 9828161, 9830441, 9832718,
+ 9834991, 9837261, 9839527, 9841790, 9844050, 9846306, 9848559,
+ 9850808, 9853054, 9855297, 9857537, 9859773, 9862006, 9864235,
+ 9866462, 9868685, 9870904, 9873121, 9875334, 9877544, 9879751,
+ 9881955, 9884155, 9886352, 9888546, 9890737, 9892925, 9895109,
+ 9897291, 9899469, 9901644, 9903816, 9905985, 9908150, 9910313,
+ 9912473, 9914629, 9916783, 9918933, 9921080, 9923225, 9925366,
+ 9927504, 9929639, 9931771, 9933900, 9936027, 9938150, 9940270,
+ 9942387, 9944502, 9946613, 9948721, 9950827, 9952929, 9955029,
+ 9957126, 9959219, 9961310, 9963398, 9965484, 9967566, 9969645,
+ 9971722, 9973796, 9975866, 9977934, 9980000, 9982062, 9984122,
+ 9986179, 9988233, 9990284, 9992332, 9994378, 9996421, 9998461,
+ 10000498, 10002533, 10004565, 10006594, 10008621, 10010644, 10012665,
+ 10014684, 10016700, 10018713, 10020723, 10022731, 10024736, 10026738,
+ 10028738, 10030735, 10032729, 10034721, 10036710, 10038697, 10040681,
+ 10042662, 10044641, 10046617, 10048591, 10050562, 10052530, 10054496,
+ 10056459, 10058420, 10060379, 10062334, 10064287, 10066238, 10068186,
+ 10070132, 10072075, 10074016, 10075954, 10077890, 10079823, 10081754,
+ 10083682, 10085608, 10087532, 10089453, 10091371, 10093287, 10095201,
+ 10097112, 10099021, 10100928, 10102832, 10104733, 10106633, 10108529,
+ 10110424, 10112316, 10114206, 10116093, 10117978, 10119861, 10121742,
+ 10123620, 10125495, 10127369, 10129240, 10131109, 10132975, 10134839,
+ 10136701, 10138561, 10140418, 10142273, 10144126, 10145976, 10147825,
+ 10149671, 10151514, 10153356, 10155195, 10157032, 10158867, 10160699,
+ 10162530, 10164358, 10166184, 10168007, 10169829, 10171648, 10173465,
+ 10175280, 10177093, 10178904, 10180712, 10182519, 10184323, 10186125,
+ 10187925, 10189722, 10191518, 10193311, 10195103, 10196892, 10198679,
+ 10200464, 10202247, 10204028, 10205806, 10207583, 10209357, 10211130,
+ 10212900, 10214668, 10216435, 10218199, 10219961, 10221721, 10223479,
+ 10225235, 10226989, 10228741, 10230491, 10232239, 10233985, 10235728,
+ 10237470, 10239210, 10240948, 10242684, 10244417, 10246149, 10247879,
+ 10249607, 10251333, 10253057, 10254779, 10256499, 10258217, 10259933,
+ 10261647, 10263360, 10265070, 10266778, 10268485, 10270189, 10271892,
+ 10273593, 10275292, 10276988, 10278683, 10280376, 10282068, 10283757,
+ 10285444, 10287130, 10288814, 10290495, 10292175, 10293853, 10295530,
+ 10297204, 10298876, 10300547, 10302216, 10303883, 10305548, 10307211,
+ 10308873, 10310532, 10312190, 10313846, 10315501, 10317153, 10318804,
+ 10320452, 10322099, 10323745, 10325388, 10327030, 10328670, 10330308,
+ 10331944, 10333578, 10335211, 10336842, 10338472, 10340099, 10341725,
+ 10343349, 10344971, 10346592, 10348210, 10349828, 10351443, 10353057,
+ 10354668, 10356279, 10357887, 10359494, 10361099, 10362702, 10364304,
+ 10365904, 10367502, 10369099, 10370694, 10372287, 10373879, 10375468,
+ 10377057, 10378643, 10380228, 10381811, 10383393, 10384973, 10386551,
+ 10388128, 10389703, 10391276, 10392848, 10394418, 10395986, 10397553,
+ 10399118, 10400682, 10402244, 10403804, 10405363, 10406920, 10408476,
+ 10410030, 10411582, 10413133, 10414682, 10416230, 10417776, 10419320,
+ 10420863, 10422404, 10423944, 10425482, 10427019, 10428554, 10430087,
+ 10431619, 10433149, 10434678, 10436206, 10437731, 10439256, 10440778,
+ 10442299, 10443819, 10445337, 10446854, 10448369, 10449882, 10451394,
+ 10452905, 10454414, 10455921, 10457427, 10458932, 10460435, 10461936,
+ 10463436, 10464935, 10466432, 10467927, 10469422, 10470914, 10472405,
+ 10473895, 10475383, 10476870, 10478355, 10479839, 10481322, 10482802,
+ 10484282,
+};
+
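+// Number of blocks of mi_bsize MIs needed to cover mi_num MIs (ceiling
+// division).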
+static int mi_size_to_block_size(int mi_bsize, int mi_num) {
+ return (mi_num % mi_bsize) ? mi_num / mi_bsize + 1 : mi_num / mi_bsize;
+}
+
+Status vp9_alloc_motion_field_info(MotionFieldInfo *motion_field_info,
+ int frame_num, int mi_rows, int mi_cols) {
+ int frame_idx, rf_idx, square_block_idx;
+ if (motion_field_info->allocated) {
+ // TODO(angiebird): Avoid re-allocate buffer if possible
+ vp9_free_motion_field_info(motion_field_info);
+ }
+ motion_field_info->frame_num = frame_num;
+ motion_field_info->motion_field_array =
+ vpx_calloc(frame_num, sizeof(*motion_field_info->motion_field_array));
+ if (!motion_field_info->motion_field_array) return STATUS_FAILED;
+ for (frame_idx = 0; frame_idx < frame_num; ++frame_idx) {
+ for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) {
+ for (square_block_idx = 0; square_block_idx < SQUARE_BLOCK_SIZES;
+ ++square_block_idx) {
+ BLOCK_SIZE bsize = square_block_idx_to_bsize(square_block_idx);
+ const int mi_height = num_8x8_blocks_high_lookup[bsize];
+ const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+ const int block_rows = mi_size_to_block_size(mi_height, mi_rows);
+ const int block_cols = mi_size_to_block_size(mi_width, mi_cols);
+ MotionField *motion_field =
+ &motion_field_info
+ ->motion_field_array[frame_idx][rf_idx][square_block_idx];
+ Status status =
+ vp9_alloc_motion_field(motion_field, bsize, block_rows, block_cols);
+ if (status == STATUS_FAILED) {
+ return STATUS_FAILED;
+ }
+ }
+ }
+ }
+ motion_field_info->allocated = 1;
+ return STATUS_OK;
+}
+
+Status vp9_alloc_motion_field(MotionField *motion_field, BLOCK_SIZE bsize,
+ int block_rows, int block_cols) {
+ Status status = STATUS_OK;
+ motion_field->ready = 0;
+ motion_field->bsize = bsize;
+ motion_field->block_rows = block_rows;
+ motion_field->block_cols = block_cols;
+ motion_field->block_num = block_rows * block_cols;
+ motion_field->mf =
+ vpx_calloc(motion_field->block_num, sizeof(*motion_field->mf));
+ if (motion_field->mf == NULL) {
+ status = STATUS_FAILED;
+ }
+ motion_field->set_mv =
+ vpx_calloc(motion_field->block_num, sizeof(*motion_field->set_mv));
+ if (motion_field->set_mv == NULL) {
+ vpx_free(motion_field->mf);
+ motion_field->mf = NULL;
+ status = STATUS_FAILED;
+ }
+ motion_field->local_structure = vpx_calloc(
+ motion_field->block_num, sizeof(*motion_field->local_structure));
+ if (motion_field->local_structure == NULL) {
+ vpx_free(motion_field->mf);
+ motion_field->mf = NULL;
+ vpx_free(motion_field->set_mv);
+ motion_field->set_mv = NULL;
+ status = STATUS_FAILED;
+ }
+ return status;
+}
+
+void vp9_free_motion_field(MotionField *motion_field) {
+ vpx_free(motion_field->mf);
+ vpx_free(motion_field->set_mv);
+ vpx_free(motion_field->local_structure);
+ vp9_zero(*motion_field);
+}
+
+void vp9_free_motion_field_info(MotionFieldInfo *motion_field_info) {
+ if (motion_field_info->allocated) {
+ int frame_idx, rf_idx, square_block_idx;
+ for (frame_idx = 0; frame_idx < motion_field_info->frame_num; ++frame_idx) {
+ for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) {
+ for (square_block_idx = 0; square_block_idx < SQUARE_BLOCK_SIZES;
+ ++square_block_idx) {
+ MotionField *motion_field =
+ &motion_field_info
+ ->motion_field_array[frame_idx][rf_idx][square_block_idx];
+ vp9_free_motion_field(motion_field);
+ }
+ }
+ }
+ vpx_free(motion_field_info->motion_field_array);
+ motion_field_info->motion_field_array = NULL;
+ motion_field_info->frame_num = 0;
+ motion_field_info->allocated = 0;
+ }
+}
+
+MotionField *vp9_motion_field_info_get_motion_field(
+ MotionFieldInfo *motion_field_info, int frame_idx, int rf_idx,
+ BLOCK_SIZE bsize) {
+ int square_block_idx = get_square_block_idx(bsize);
+ assert(frame_idx < motion_field_info->frame_num);
+ assert(motion_field_info->allocated == 1);
+ return &motion_field_info
+ ->motion_field_array[frame_idx][rf_idx][square_block_idx];
+}
+
+int vp9_motion_field_is_mv_set(const MotionField *motion_field, int brow,
+ int bcol) {
+ assert(brow >= 0 && brow < motion_field->block_rows);
+ assert(bcol >= 0 && bcol < motion_field->block_cols);
+ return motion_field->set_mv[brow * motion_field->block_cols + bcol];
+}
+
+int_mv vp9_motion_field_get_mv(const MotionField *motion_field, int brow,
+ int bcol) {
+ assert(brow >= 0 && brow < motion_field->block_rows);
+ assert(bcol >= 0 && bcol < motion_field->block_cols);
+ return motion_field->mf[brow * motion_field->block_cols + bcol];
+}
+
+int_mv vp9_motion_field_mi_get_mv(const MotionField *motion_field, int mi_row,
+ int mi_col) {
+ const int mi_height = num_8x8_blocks_high_lookup[motion_field->bsize];
+ const int mi_width = num_8x8_blocks_wide_lookup[motion_field->bsize];
+ const int brow = mi_row / mi_height;
+ const int bcol = mi_col / mi_width;
+ assert(mi_row % mi_height == 0);
+ assert(mi_col % mi_width == 0);
+ return vp9_motion_field_get_mv(motion_field, brow, bcol);
+}
+
+void vp9_motion_field_mi_set_mv(MotionField *motion_field, int mi_row,
+ int mi_col, int_mv mv) {
+ const int mi_height = num_8x8_blocks_high_lookup[motion_field->bsize];
+ const int mi_width = num_8x8_blocks_wide_lookup[motion_field->bsize];
+ const int brow = mi_row / mi_height;
+ const int bcol = mi_col / mi_width;
+ assert(mi_row % mi_height == 0);
+ assert(mi_col % mi_width == 0);
+ assert(brow >= 0 && brow < motion_field->block_rows);
+ assert(bcol >= 0 && bcol < motion_field->block_cols);
+ motion_field->mf[brow * motion_field->block_cols + bcol] = mv;
+ motion_field->set_mv[brow * motion_field->block_cols + bcol] = 1;
+}
+
+void vp9_motion_field_reset_mvs(MotionField *motion_field) {
+ memset(motion_field->set_mv, 0,
+ motion_field->block_num * sizeof(*motion_field->set_mv));
+}
+
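+// log2_approximation() returns log2(v) in LOG2_PRECISION (Q20) fixed point.
+// log2_table[v] holds log2(v) * 2^20 for v in [1, 1023], e.g.
+// log2_table[2] == 1048576 == 1 << LOG2_PRECISION; for larger inputs the
+// function falls back to a linear extrapolation from v == 1024.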
+static int64_t log2_approximation(int64_t v) {
+ assert(v > 0);
+ if (v < LOG2_TABLE_SIZE) {
+ return log2_table[v];
+ } else {
+ // use linear approximation when v >= 2^10
+ const int slope =
+ 1477; // slope = 1 / (log(2) * 1024) * (1 << LOG2_PRECISION)
+ assert(LOG2_TABLE_SIZE == 1 << 10);
+
+ return slope * (v - LOG2_TABLE_SIZE) + (10 << LOG2_PRECISION);
+ }
+}
+
+int64_t vp9_nb_mvs_inconsistency(const MV *mv, const int_mv *nb_full_mvs,
+ int mv_num) {
+  // This function computes the log2 of the minimum squared mv difference
+  // against the available neighbor mvs, i.e.
+  //   min over neighbors of log2(1 + row_diff^2 + col_diff^2).
+  // Since log2 is monotonically increasing, we can take the minimum of
+  // row_diff^2 + col_diff^2 first and apply log2 at the end.
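+  // The returned value is in the same LOG2_PRECISION fixed-point format as
+  // log2_approximation().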
+ int i;
+ int64_t min_abs_diff = INT64_MAX;
+ int cnt = 0;
+ assert(mv_num <= NB_MVS_NUM);
+ for (i = 0; i < mv_num; ++i) {
+ MV nb_mv = nb_full_mvs[i].as_mv;
+ const int64_t row_diff = abs(mv->row - nb_mv.row);
+ const int64_t col_diff = abs(mv->col - nb_mv.col);
+ const int64_t abs_diff = row_diff * row_diff + col_diff * col_diff;
+ assert(nb_full_mvs[i].as_int != INVALID_MV);
+ min_abs_diff = VPXMIN(abs_diff, min_abs_diff);
+ ++cnt;
+ }
+ if (cnt) {
+ return log2_approximation(1 + min_abs_diff);
+ }
+ return 0;
+}
+
+static FloatMV get_smooth_motion_vector(const FloatMV scaled_search_mv,
+ const FloatMV *tmp_mf,
+ const int (*M)[MF_LOCAL_STRUCTURE_SIZE],
+ int rows, int cols, int row, int col,
+ float alpha) {
+ const FloatMV tmp_mv = tmp_mf[row * cols + col];
+ int idx_row, idx_col;
+ FloatMV avg_nb_mv = { 0.0f, 0.0f };
+ FloatMV mv = { 0.0f, 0.0f };
+ float filter[3][3] = { { 1.0f / 12.0f, 1.0f / 6.0f, 1.0f / 12.0f },
+ { 1.0f / 6.0f, 0.0f, 1.0f / 6.0f },
+ { 1.0f / 12.0f, 1.0f / 6.0f, 1.0f / 12.0f } };
+ for (idx_row = 0; idx_row < 3; ++idx_row) {
+ int nb_row = row + idx_row - 1;
+ for (idx_col = 0; idx_col < 3; ++idx_col) {
+ int nb_col = col + idx_col - 1;
+ if (nb_row < 0 || nb_col < 0 || nb_row >= rows || nb_col >= cols) {
+ avg_nb_mv.row += (tmp_mv.row) * filter[idx_row][idx_col];
+ avg_nb_mv.col += (tmp_mv.col) * filter[idx_row][idx_col];
+ } else {
+ const FloatMV nb_mv = tmp_mf[nb_row * cols + nb_col];
+ avg_nb_mv.row += (nb_mv.row) * filter[idx_row][idx_col];
+ avg_nb_mv.col += (nb_mv.col) * filter[idx_row][idx_col];
+ }
+ }
+ }
+ {
+    // M is the local structure matrix of the reference frame (see
+    // vp9_get_local_structure()).
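+    // With s = scaled_search_mv and a = avg_nb_mv, the expressions below
+    // implement the closed-form minimizer of
+    //   (v - s)^T M (v - s) + alpha * |v - a|^2,
+    // i.e. v = (M + alpha * I)^-1 * (M * s + alpha * a), using an explicit
+    // 2x2 matrix inverse.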
+ float M00 = M[row * cols + col][0];
+ float M01 = M[row * cols + col][1];
+ float M10 = M[row * cols + col][2];
+ float M11 = M[row * cols + col][3];
+
+ float det = (M00 + alpha) * (M11 + alpha) - M01 * M10;
+
+ float inv_M00 = (M11 + alpha) / det;
+ float inv_M01 = -M01 / det;
+ float inv_M10 = -M10 / det;
+ float inv_M11 = (M00 + alpha) / det;
+
+ float inv_MM00 = inv_M00 * M00 + inv_M01 * M10;
+ float inv_MM01 = inv_M00 * M01 + inv_M01 * M11;
+ float inv_MM10 = inv_M10 * M00 + inv_M11 * M10;
+ float inv_MM11 = inv_M10 * M01 + inv_M11 * M11;
+
+ mv.row = inv_M00 * avg_nb_mv.row * alpha + inv_M01 * avg_nb_mv.col * alpha +
+ inv_MM00 * scaled_search_mv.row + inv_MM01 * scaled_search_mv.col;
+ mv.col = inv_M10 * avg_nb_mv.row * alpha + inv_M11 * avg_nb_mv.col * alpha +
+ inv_MM10 * scaled_search_mv.row + inv_MM11 * scaled_search_mv.col;
+ }
+ return mv;
+}
+
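+// Iteratively smooth the motion field: each pass recomputes every block's
+// vector from a 3x3-weighted average of its neighbors (Jacobi style, using
+// the two ping-pong buffers) blended with the original search result according
+// to the local structure M and the regularization weight alpha. Vectors are
+// normalized by the block width/height while iterating and rescaled on output.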
+void vp9_get_smooth_motion_field(const MV *search_mf,
+ const int (*M)[MF_LOCAL_STRUCTURE_SIZE],
+ int rows, int cols, BLOCK_SIZE bsize,
+ float alpha, int num_iters, MV *smooth_mf) {
+  // M is the local structure matrix of the reference frame.
+  // Allocate two ping-pong buffers for the iterative smoothing.
+ FloatMV *input = (FloatMV *)malloc(rows * cols * sizeof(FloatMV));
+ FloatMV *output = (FloatMV *)malloc(rows * cols * sizeof(FloatMV));
+ int idx;
+ int row, col;
+ int bw = 4 << b_width_log2_lookup[bsize];
+ int bh = 4 << b_height_log2_lookup[bsize];
+ if (!(input && output)) goto fail;
+ // copy search results to input buffer
+ for (idx = 0; idx < rows * cols; ++idx) {
+ input[idx].row = (float)search_mf[idx].row / bh;
+ input[idx].col = (float)search_mf[idx].col / bw;
+ }
+ for (idx = 0; idx < num_iters; ++idx) {
+ FloatMV *tmp;
+ for (row = 0; row < rows; ++row) {
+ for (col = 0; col < cols; ++col) {
+        // Note: scaled_search_mv and the intermediate buffers hold motion
+        // vectors normalized by the block width/height.
+ const MV search_mv = search_mf[row * cols + col];
+ FloatMV scaled_search_mv = { (float)search_mv.row / bh,
+ (float)search_mv.col / bw };
+ output[row * cols + col] = get_smooth_motion_vector(
+ scaled_search_mv, input, M, rows, cols, row, col, alpha);
+ }
+ }
+ // swap buffers
+ tmp = input;
+ input = output;
+ output = tmp;
+ }
+ // copy smoothed results to output
+ for (idx = 0; idx < rows * cols; ++idx) {
+ smooth_mf[idx].row = (int)(input[idx].row * bh);
+ smooth_mf[idx].col = (int)(input[idx].col * bw);
+ }
+fail:
+ free(input);
+ free(output);
+}
+
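+// For each block, measure how the SAD against the matched reference block
+// changes when the reference position is shifted by one block vertically
+// (I_row) and horizontally (I_col), then store the outer product
+// [I_row, I_col]^T [I_row, I_col] in M. This is the local structure that
+// vp9_get_smooth_motion_field() uses to weight the data term.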
+void vp9_get_local_structure(const YV12_BUFFER_CONFIG *cur_frame,
+ const YV12_BUFFER_CONFIG *ref_frame,
+ const MV *search_mf,
+ const vp9_variance_fn_ptr_t *fn_ptr, int rows,
+ int cols, BLOCK_SIZE bsize,
+ int (*M)[MF_LOCAL_STRUCTURE_SIZE]) {
+ const int bw = 4 << b_width_log2_lookup[bsize];
+ const int bh = 4 << b_height_log2_lookup[bsize];
+ const int cur_stride = cur_frame->y_stride;
+ const int ref_stride = ref_frame->y_stride;
+ const int width = ref_frame->y_width;
+ const int height = ref_frame->y_height;
+ int row, col;
+ for (row = 0; row < rows; ++row) {
+ for (col = 0; col < cols; ++col) {
+ int cur_offset = row * bh * cur_stride + col * bw;
+ uint8_t *center = cur_frame->y_buffer + cur_offset;
+ int ref_h = row * bh + search_mf[row * cols + col].row;
+ int ref_w = col * bw + search_mf[row * cols + col].col;
+ int ref_offset;
+ uint8_t *target;
+ uint8_t *nb;
+ int search_dist;
+ int nb_dist;
+ int I_row = 0, I_col = 0;
+      // TODO(Dan): handle the case where the reference block lies beyond the
+      // frame boundary.
+ ref_h = ref_h < 0 ? 0 : (ref_h >= height - bh ? height - bh - 1 : ref_h);
+ ref_w = ref_w < 0 ? 0 : (ref_w >= width - bw ? width - bw - 1 : ref_w);
+ // compute search results distortion
+      // TODO(Dan): consider using a vp9 helper to locate the reference block;
+      // for now it is computed directly so the results can be compared against
+      // the Python reference implementation.
+ ref_offset = ref_h * ref_stride + ref_w;
+ target = ref_frame->y_buffer + ref_offset;
+ search_dist = fn_ptr->sdf(center, cur_stride, target, ref_stride);
+ // compute target's neighbors' distortions
+ // TODO(Dan): if using padding, the boundary condition may vary
+ // up
+ if (ref_h - bh >= 0) {
+ nb = target - ref_stride * bh;
+ nb_dist = fn_ptr->sdf(center, cur_stride, nb, ref_stride);
+ I_row += nb_dist - search_dist;
+ }
+ // down
+ if (ref_h + bh < height - bh) {
+ nb = target + ref_stride * bh;
+ nb_dist = fn_ptr->sdf(center, cur_stride, nb, ref_stride);
+ I_row += nb_dist - search_dist;
+ }
+ if (ref_h - bh >= 0 && ref_h + bh < height - bh) {
+ I_row /= 2;
+ }
+ I_row /= (bw * bh);
+ // left
+ if (ref_w - bw >= 0) {
+ nb = target - bw;
+ nb_dist = fn_ptr->sdf(center, cur_stride, nb, ref_stride);
+ I_col += nb_dist - search_dist;
+ }
+      // right
+ if (ref_w + bw < width - bw) {
+ nb = target + bw;
+ nb_dist = fn_ptr->sdf(center, cur_stride, nb, ref_stride);
+ I_col += nb_dist - search_dist;
+ }
+ if (ref_w - bw >= 0 && ref_w + bw < width - bw) {
+ I_col /= 2;
+ }
+ I_col /= (bw * bh);
+ M[row * cols + col][0] = I_row * I_row;
+ M[row * cols + col][1] = I_row * I_col;
+ M[row * cols + col][2] = I_col * I_row;
+ M[row * cols + col][3] = I_col * I_col;
+ }
+ }
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_non_greedy_mv.h b/media/libvpx/libvpx/vp9/encoder/vp9_non_greedy_mv.h
new file mode 100644
index 0000000000..c2bd69722a
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_non_greedy_mv.h
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2019 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_VP9_NON_GREEDY_MV_H_
+#define VPX_VP9_ENCODER_VP9_NON_GREEDY_MV_H_
+
+#include "vp9/common/vp9_enums.h"
+#include "vp9/common/vp9_blockd.h"
+#include "vpx_scale/yv12config.h"
+#include "vpx_dsp/variance.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#define NB_MVS_NUM 4
+#define LOG2_PRECISION 20
+#define MF_LOCAL_STRUCTURE_SIZE 4
+#define SQUARE_BLOCK_SIZES 4
+
+typedef enum Status { STATUS_OK = 0, STATUS_FAILED = 1 } Status;
+
+typedef struct MotionField {
+ int ready;
+ BLOCK_SIZE bsize;
+ int block_rows;
+ int block_cols;
+ int block_num; // block_num == block_rows * block_cols
+ int (*local_structure)[MF_LOCAL_STRUCTURE_SIZE];
+ int_mv *mf;
+ int *set_mv;
+ int mv_log_scale;
+} MotionField;
+
+typedef struct MotionFieldInfo {
+ int frame_num;
+ int allocated;
+ MotionField (*motion_field_array)[MAX_INTER_REF_FRAMES][SQUARE_BLOCK_SIZES];
+} MotionFieldInfo;
+
+typedef struct {
+ float row, col;
+} FloatMV;
+
+static INLINE int get_square_block_idx(BLOCK_SIZE bsize) {
+ if (bsize == BLOCK_4X4) {
+ return 0;
+ }
+ if (bsize == BLOCK_8X8) {
+ return 1;
+ }
+ if (bsize == BLOCK_16X16) {
+ return 2;
+ }
+ if (bsize == BLOCK_32X32) {
+ return 3;
+ }
+ assert(0 && "ERROR: non-square block size");
+ return -1;
+}
+
+static INLINE BLOCK_SIZE square_block_idx_to_bsize(int square_block_idx) {
+ if (square_block_idx == 0) {
+ return BLOCK_4X4;
+ }
+ if (square_block_idx == 1) {
+ return BLOCK_8X8;
+ }
+ if (square_block_idx == 2) {
+ return BLOCK_16X16;
+ }
+ if (square_block_idx == 3) {
+ return BLOCK_32X32;
+ }
+ assert(0 && "ERROR: invalid square_block_idx");
+ return BLOCK_INVALID;
+}
+
+Status vp9_alloc_motion_field_info(MotionFieldInfo *motion_field_info,
+ int frame_num, int mi_rows, int mi_cols);
+
+Status vp9_alloc_motion_field(MotionField *motion_field, BLOCK_SIZE bsize,
+ int block_rows, int block_cols);
+
+void vp9_free_motion_field(MotionField *motion_field);
+
+void vp9_free_motion_field_info(MotionFieldInfo *motion_field_info);
+
+int64_t vp9_nb_mvs_inconsistency(const MV *mv, const int_mv *nb_full_mvs,
+ int mv_num);
+
+void vp9_get_smooth_motion_field(const MV *search_mf,
+ const int (*M)[MF_LOCAL_STRUCTURE_SIZE],
+                                 int rows, int cols, BLOCK_SIZE bsize,
+ float alpha, int num_iters, MV *smooth_mf);
+
+void vp9_get_local_structure(const YV12_BUFFER_CONFIG *cur_frame,
+ const YV12_BUFFER_CONFIG *ref_frame,
+ const MV *search_mf,
+ const vp9_variance_fn_ptr_t *fn_ptr, int rows,
+ int cols, BLOCK_SIZE bsize,
+ int (*M)[MF_LOCAL_STRUCTURE_SIZE]);
+
+MotionField *vp9_motion_field_info_get_motion_field(
+ MotionFieldInfo *motion_field_info, int frame_idx, int rf_idx,
+ BLOCK_SIZE bsize);
+
+void vp9_motion_field_mi_set_mv(MotionField *motion_field, int mi_row,
+ int mi_col, int_mv mv);
+
+void vp9_motion_field_reset_mvs(MotionField *motion_field);
+
+int_mv vp9_motion_field_get_mv(const MotionField *motion_field, int brow,
+ int bcol);
+int_mv vp9_motion_field_mi_get_mv(const MotionField *motion_field, int mi_row,
+ int mi_col);
+int vp9_motion_field_is_mv_set(const MotionField *motion_field, int brow,
+ int bcol);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+#endif // VPX_VP9_ENCODER_VP9_NON_GREEDY_MV_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_partition_models.h b/media/libvpx/libvpx/vp9/encoder/vp9_partition_models.h
new file mode 100644
index 0000000000..09c0e30a47
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_partition_models.h
@@ -0,0 +1,975 @@
+/*
+ * Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_VP9_PARTITION_MODELS_H_
+#define VPX_VP9_ENCODER_VP9_PARTITION_MODELS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define NN_MAX_HIDDEN_LAYERS 10
+#define NN_MAX_NODES_PER_LAYER 128
+
+// Neural net model config. It defines the layout of a neural net model, such as
+// the number of inputs/outputs, number of layers, the number of nodes in each
+// layer, as well as the weights and bias of each node.
+typedef struct {
+ int num_inputs; // Number of input nodes, i.e. features.
+ int num_outputs; // Number of output nodes.
+ int num_hidden_layers; // Number of hidden layers, maximum 10.
+ // Number of nodes for each hidden layer.
+ int num_hidden_nodes[NN_MAX_HIDDEN_LAYERS];
+ // Weight parameters, indexed by layer.
+ const float *weights[NN_MAX_HIDDEN_LAYERS + 1];
+ // Bias parameters, indexed by layer.
+ const float *bias[NN_MAX_HIDDEN_LAYERS + 1];
+} NN_CONFIG;
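+// A NN_CONFIG describes a small fully connected network. The encoder's
+// prediction helper (defined elsewhere) is assumed to evaluate it layer by
+// layer, computing weights * input + bias per node, with a clamp-at-zero
+// (ReLU-style) activation on hidden layers and a linear output layer.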
+
+// Partition search breakout model.
+#define FEATURES 4
+#define Q_CTX 3
+#define RESOLUTION_CTX 2
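+// Breakout decision weights: for each resolution context and quantizer
+// context, FEATURES linear weights followed by a constant term (a trained
+// value plus a hand-tuned offset).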
+static const float
+ vp9_partition_breakout_weights_64[RESOLUTION_CTX][Q_CTX][FEATURES + 1] = {
+ {
+ {
+ -0.016673f,
+ -0.001025f,
+ -0.000032f,
+ 0.000833f,
+ 1.94261885f - 2.1f,
+ },
+ {
+ -0.160867f,
+ -0.002101f,
+ 0.000011f,
+ 0.002448f,
+ 1.65738142f - 2.5f,
+ },
+ {
+ -0.628934f,
+ -0.011459f,
+ -0.000009f,
+ 0.013833f,
+ 1.47982645f - 1.6f,
+ },
+ },
+ {
+ {
+ -0.064309f,
+ -0.006121f,
+ 0.000232f,
+ 0.005778f,
+ 0.7989465f - 5.0f,
+ },
+ {
+ -0.314957f,
+ -0.009346f,
+ -0.000225f,
+ 0.010072f,
+ 2.80695581f - 5.5f,
+ },
+ {
+ -0.635535f,
+ -0.015135f,
+ 0.000091f,
+ 0.015247f,
+ 2.90381241f - 5.0f,
+ },
+ },
+ };
+
+static const float
+ vp9_partition_breakout_weights_32[RESOLUTION_CTX][Q_CTX][FEATURES + 1] = {
+ {
+ {
+ -0.010554f,
+ -0.003081f,
+ -0.000134f,
+ 0.004491f,
+ 1.68445992f - 3.5f,
+ },
+ {
+ -0.051489f,
+ -0.007609f,
+ 0.000016f,
+ 0.009792f,
+ 1.28089404f - 2.5f,
+ },
+ {
+ -0.163097f,
+ -0.013081f,
+ 0.000022f,
+ 0.019006f,
+ 1.36129403f - 3.2f,
+ },
+ },
+ {
+ {
+ -0.024629f,
+ -0.006492f,
+ -0.000254f,
+ 0.004895f,
+ 1.27919173f - 4.5f,
+ },
+ {
+ -0.083936f,
+ -0.009827f,
+ -0.000200f,
+ 0.010399f,
+ 2.73731065f - 4.5f,
+ },
+ {
+ -0.279052f,
+ -0.013334f,
+ 0.000289f,
+ 0.023203f,
+ 2.43595719f - 3.5f,
+ },
+ },
+ };
+
+static const float
+ vp9_partition_breakout_weights_16[RESOLUTION_CTX][Q_CTX][FEATURES + 1] = {
+ {
+ {
+ -0.013154f,
+ -0.002404f,
+ -0.000977f,
+ 0.008450f,
+ 2.57404566f - 5.5f,
+ },
+ {
+ -0.019146f,
+ -0.004018f,
+ 0.000064f,
+ 0.008187f,
+ 2.15043926f - 2.5f,
+ },
+ {
+ -0.075755f,
+ -0.010858f,
+ 0.000030f,
+ 0.024505f,
+ 2.06848121f - 2.5f,
+ },
+ },
+ {
+ {
+ -0.007636f,
+ -0.002751f,
+ -0.000682f,
+ 0.005968f,
+ 0.19225763f - 4.5f,
+ },
+ {
+ -0.047306f,
+ -0.009113f,
+ -0.000518f,
+ 0.016007f,
+ 2.61068869f - 4.0f,
+ },
+ {
+ -0.069336f,
+ -0.010448f,
+ -0.001120f,
+ 0.023083f,
+ 1.47591054f - 5.5f,
+ },
+ },
+ };
+
+static const float vp9_partition_breakout_weights_8[RESOLUTION_CTX][Q_CTX]
+ [FEATURES + 1] = {
+ {
+ {
+ -0.011807f,
+ -0.009873f,
+ -0.000931f,
+ 0.034768f,
+ 1.32254851f - 2.0f,
+ },
+ {
+ -0.003861f,
+ -0.002701f,
+ 0.000100f,
+ 0.013876f,
+ 1.96755111f - 1.5f,
+ },
+ {
+ -0.013522f,
+ -0.008677f,
+ -0.000562f,
+ 0.034468f,
+ 1.53440356f - 1.5f,
+ },
+ },
+ {
+ {
+ -0.003221f,
+ -0.002125f,
+ 0.000993f,
+ 0.012768f,
+ 0.03541421f - 2.0f,
+ },
+ {
+ -0.006069f,
+ -0.007335f,
+ 0.000229f,
+ 0.026104f,
+ 0.17135315f - 1.5f,
+ },
+ {
+ -0.039894f,
+ -0.011419f,
+ 0.000070f,
+ 0.061817f,
+ 0.6739977f - 1.5f,
+ },
+ },
+ };
+#undef FEATURES
+#undef Q_CTX
+#undef RESOLUTION_CTX
+
+// Rectangular partition search pruning model.
+#define FEATURES 8
+#define LABELS 4
+#define NODES 16
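+// Rectangular-partition pruning models: 8 features, one hidden layer
+// (16 nodes here, widened to 24 for the 64x64 model below), 4 output scores.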
+static const float vp9_rect_part_nn_weights_16_layer0[FEATURES * NODES] = {
+ -0.432522f, 0.133070f, -0.169187f, 0.768340f, 0.891228f, 0.554458f,
+ 0.356000f, 0.403621f, 0.809165f, 0.778214f, -0.520357f, 0.301451f,
+ -0.386972f, -0.314402f, 0.021878f, 1.148746f, -0.462258f, -0.175524f,
+ -0.344589f, -0.475159f, -0.232322f, 0.471147f, -0.489948f, 0.467740f,
+ -0.391550f, 0.208601f, 0.054138f, 0.076859f, -0.309497f, -0.095927f,
+ 0.225917f, 0.011582f, -0.520730f, -0.585497f, 0.174036f, 0.072521f,
+ 0.120771f, -0.517234f, -0.581908f, -0.034003f, -0.694722f, -0.364368f,
+ 0.290584f, 0.038373f, 0.685654f, 0.394019f, 0.759667f, 1.257502f,
+ -0.610516f, -0.185434f, 0.211997f, -0.172458f, 0.044605f, 0.145316f,
+ -0.182525f, -0.147376f, 0.578742f, 0.312412f, -0.446135f, -0.389112f,
+ 0.454033f, 0.260490f, 0.664285f, 0.395856f, -0.231827f, 0.215228f,
+ 0.014856f, -0.395462f, 0.479646f, -0.391445f, -0.357788f, 0.166238f,
+ -0.056818f, -0.027783f, 0.060880f, -1.604710f, 0.531268f, 0.282184f,
+ 0.714944f, 0.093523f, -0.218312f, -0.095546f, -0.285621f, -0.190871f,
+ -0.448340f, -0.016611f, 0.413913f, -0.286720f, -0.158828f, -0.092635f,
+ -0.279551f, 0.166509f, -0.088162f, 0.446543f, -0.276830f, -0.065642f,
+ -0.176346f, -0.984754f, 0.338738f, 0.403809f, 0.738065f, 1.154439f,
+ 0.750764f, 0.770959f, -0.269403f, 0.295651f, -0.331858f, 0.367144f,
+ 0.279279f, 0.157419f, -0.348227f, -0.168608f, -0.956000f, -0.647136f,
+ 0.250516f, 0.858084f, 0.809802f, 0.492408f, 0.804841f, 0.282802f,
+ 0.079395f, -0.291771f, -0.024382f, -1.615880f, -0.445166f, -0.407335f,
+ -0.483044f, 0.141126f,
+};
+
+static const float vp9_rect_part_nn_bias_16_layer0[NODES] = {
+ 0.275384f, -0.053745f, 0.000000f, 0.000000f, -0.178103f, 0.513965f,
+ -0.161352f, 0.228551f, 0.000000f, 1.013712f, 0.000000f, 0.000000f,
+ -1.144009f, -0.000006f, -0.241727f, 2.048764f,
+};
+
+static const float vp9_rect_part_nn_weights_16_layer1[NODES * LABELS] = {
+ -1.435278f, 2.204691f, -0.410718f, 0.202708f, 0.109208f, 1.059142f,
+ -0.306360f, 0.845906f, 0.489654f, -1.121915f, -0.169133f, -0.003385f,
+ 0.660590f, -0.018711f, 1.227158f, -2.967504f, 1.407345f, -1.293243f,
+ -0.386921f, 0.300492f, 0.338824f, -0.083250f, -0.069454f, -1.001827f,
+ -0.327891f, 0.899353f, 0.367397f, -0.118601f, -0.171936f, -0.420646f,
+ -0.803319f, 2.029634f, 0.940268f, -0.664484f, 0.339916f, 0.315944f,
+ 0.157374f, -0.402482f, -0.491695f, 0.595827f, 0.015031f, 0.255887f,
+ -0.466327f, -0.212598f, 0.136485f, 0.033363f, -0.796921f, 1.414304f,
+ -0.282185f, -2.673571f, -0.280994f, 0.382658f, -0.350902f, 0.227926f,
+ 0.062602f, -1.000199f, 0.433731f, 1.176439f, -0.163216f, -0.229015f,
+ -0.640098f, -0.438852f, -0.947700f, 2.203434f,
+};
+
+static const float vp9_rect_part_nn_bias_16_layer1[LABELS] = {
+ -0.875510f,
+ 0.982408f,
+ 0.560854f,
+ -0.415209f,
+};
+
+static const NN_CONFIG vp9_rect_part_nnconfig_16 = {
+ FEATURES, // num_inputs
+ LABELS, // num_outputs
+ 1, // num_hidden_layers
+ {
+ NODES,
+ }, // num_hidden_nodes
+ {
+ vp9_rect_part_nn_weights_16_layer0,
+ vp9_rect_part_nn_weights_16_layer1,
+ },
+ {
+ vp9_rect_part_nn_bias_16_layer0,
+ vp9_rect_part_nn_bias_16_layer1,
+ },
+};
+
+static const float vp9_rect_part_nn_weights_32_layer0[FEATURES * NODES] = {
+ -0.147312f, -0.753248f, 0.540206f, 0.661415f, 0.484117f, -0.341609f,
+ 0.016183f, 0.064177f, 0.781580f, 0.902232f, -0.505342f, 0.325183f,
+ -0.231072f, -0.120107f, -0.076216f, 0.120038f, 0.403695f, -0.463301f,
+ -0.192158f, 0.407442f, 0.106633f, 1.072371f, -0.446779f, 0.467353f,
+ 0.318812f, -0.505996f, -0.008768f, -0.239598f, 0.085480f, 0.284640f,
+ -0.365045f, -0.048083f, -0.112090f, -0.067089f, 0.304138f, -0.228809f,
+ 0.383651f, -0.196882f, 0.477039f, -0.217978f, -0.506931f, -0.125675f,
+ 0.050456f, 1.086598f, 0.732128f, 0.326941f, 0.103952f, 0.121769f,
+ -0.154487f, -0.255514f, 0.030591f, -0.382797f, -0.019981f, -0.326570f,
+ 0.149691f, -0.435633f, -0.070795f, 0.167691f, 0.251413f, -0.153405f,
+ 0.160347f, 0.455107f, -0.968580f, -0.575879f, 0.623115f, -0.069793f,
+ -0.379768f, -0.965807f, -0.062057f, 0.071312f, 0.457098f, 0.350372f,
+ -0.460659f, -0.985393f, 0.359963f, -0.093677f, 0.404272f, -0.326896f,
+ -0.277752f, 0.609322f, -0.114193f, -0.230701f, 0.089208f, 0.645381f,
+ 0.494485f, 0.467876f, -0.166187f, 0.251044f, -0.394661f, 0.192895f,
+ -0.344777f, -0.041893f, -0.111163f, 0.066347f, 0.378158f, -0.455465f,
+ 0.339839f, -0.418207f, -0.356515f, -0.227536f, -0.211091f, -0.122945f,
+ 0.361772f, -0.338095f, 0.004564f, -0.398510f, 0.060876f, -2.132504f,
+ -0.086776f, -0.029166f, 0.039241f, 0.222534f, -0.188565f, -0.288792f,
+ -0.160789f, -0.123905f, 0.397916f, -0.063779f, 0.167210f, -0.445004f,
+ 0.056889f, 0.207280f, 0.000101f, 0.384507f, -1.721239f, -2.036402f,
+ -2.084403f, -2.060483f,
+};
+
+static const float vp9_rect_part_nn_bias_32_layer0[NODES] = {
+ -0.859251f, -0.109938f, 0.091838f, 0.187817f, -0.728265f, 0.253080f,
+ 0.000000f, -0.357195f, -0.031290f, -1.373237f, -0.761086f, 0.000000f,
+ -0.024504f, 1.765711f, 0.000000f, 1.505390f,
+};
+
+static const float vp9_rect_part_nn_weights_32_layer1[NODES * LABELS] = {
+ 0.680940f, 1.367178f, 0.403075f, 0.029957f, 0.500917f, 1.407776f,
+ -0.354002f, 0.011667f, 1.663767f, 0.959155f, 0.428323f, -0.205345f,
+ -0.081850f, -3.920103f, -0.243802f, -4.253933f, -0.034020f, -1.361057f,
+ 0.128236f, -0.138422f, -0.025790f, -0.563518f, -0.148715f, -0.344381f,
+ -1.677389f, -0.868332f, -0.063792f, 0.052052f, 0.359591f, 2.739808f,
+ -0.414304f, 3.036597f, -0.075368f, -1.019680f, 0.642501f, 0.209779f,
+ -0.374539f, -0.718294f, -0.116616f, -0.043212f, -1.787809f, -0.773262f,
+ 0.068734f, 0.508309f, 0.099334f, 1.802239f, -0.333538f, 2.708645f,
+ -0.447682f, -2.355555f, -0.506674f, -0.061028f, -0.310305f, -0.375475f,
+ 0.194572f, 0.431788f, -0.789624f, -0.031962f, 0.358353f, 0.382937f,
+ 0.232002f, 2.321813f, -0.037523f, 2.104652f,
+};
+
+static const float vp9_rect_part_nn_bias_32_layer1[LABELS] = {
+ -0.693383f,
+ 0.773661f,
+ 0.426878f,
+ -0.070619f,
+};
+
+static const NN_CONFIG vp9_rect_part_nnconfig_32 = {
+ FEATURES, // num_inputs
+ LABELS, // num_outputs
+ 1, // num_hidden_layers
+ {
+ NODES,
+ }, // num_hidden_nodes
+ {
+ vp9_rect_part_nn_weights_32_layer0,
+ vp9_rect_part_nn_weights_32_layer1,
+ },
+ {
+ vp9_rect_part_nn_bias_32_layer0,
+ vp9_rect_part_nn_bias_32_layer1,
+ },
+};
+#undef NODES
+
+#define NODES 24
+static const float vp9_rect_part_nn_weights_64_layer0[FEATURES * NODES] = {
+ 0.024671f, -0.220610f, -0.284362f, -0.069556f, -0.315700f, 0.187861f,
+ 0.139782f, 0.063110f, 0.796561f, 0.172868f, -0.662194f, -1.393074f,
+ 0.085003f, 0.393381f, 0.358477f, -0.187268f, -0.370745f, 0.218287f,
+ 0.027271f, -0.254089f, -0.048236f, -0.459137f, 0.253171f, 0.122598f,
+ -0.550107f, -0.568456f, 0.159866f, -0.246534f, 0.096384f, -0.255460f,
+ 0.077864f, -0.334837f, 0.026921f, -0.697252f, 0.345262f, 1.343578f,
+ 0.815984f, 1.118211f, 1.574016f, 0.578476f, -0.285967f, -0.508672f,
+ 0.118137f, 0.037695f, 1.540510f, 1.256648f, 1.163819f, 1.172027f,
+ 0.661551f, -0.111980f, -0.434204f, -0.894217f, 0.570524f, 0.050292f,
+ -0.113680f, 0.000784f, -0.211554f, -0.369394f, 0.158306f, -0.512505f,
+ -0.238696f, 0.091498f, -0.448490f, -0.491268f, -0.353112f, -0.303315f,
+ -0.428438f, 0.127998f, -0.406790f, -0.401786f, -0.279888f, -0.384223f,
+ 0.026100f, 0.041621f, -0.315818f, -0.087888f, 0.353497f, 0.163123f,
+ -0.380128f, -0.090334f, -0.216647f, -0.117849f, -0.173502f, 0.301871f,
+ 0.070854f, 0.114627f, -0.050545f, -0.160381f, 0.595294f, 0.492696f,
+ -0.453858f, -1.154139f, 0.126000f, 0.034550f, 0.456665f, -0.236618f,
+ -0.112640f, 0.050759f, -0.449162f, 0.110059f, 0.147116f, 0.249358f,
+ -0.049894f, 0.063351f, -0.004467f, 0.057242f, -0.482015f, -0.174335f,
+ -0.085617f, -0.333808f, -0.358440f, -0.069006f, 0.099260f, -1.243430f,
+ -0.052963f, 0.112088f, -2.661115f, -2.445893f, -2.688174f, -2.624232f,
+ 0.030494f, 0.161311f, 0.012136f, 0.207564f, -2.776856f, -2.791940f,
+ -2.623962f, -2.918820f, 1.231619f, -0.376692f, -0.698078f, 0.110336f,
+ -0.285378f, 0.258367f, -0.180159f, -0.376608f, -0.034348f, -0.130206f,
+ 0.160020f, 0.852977f, 0.580573f, 1.450782f, 1.357596f, 0.787382f,
+ -0.544004f, -0.014795f, 0.032121f, -0.557696f, 0.159994f, -0.540908f,
+ 0.180380f, -0.398045f, 0.705095f, 0.515103f, -0.511521f, -1.271374f,
+ -0.231019f, 0.423647f, 0.064907f, -0.255338f, -0.877748f, -0.667205f,
+ 0.267847f, 0.135229f, 0.617844f, 1.349849f, 1.012623f, 0.730506f,
+ -0.078571f, 0.058401f, 0.053221f, -2.426146f, -0.098808f, -0.138508f,
+ -0.153299f, 0.149116f, -0.444243f, 0.301807f, 0.065066f, 0.092929f,
+ -0.372784f, -0.095540f, 0.192269f, 0.237894f, 0.080228f, -0.214074f,
+ -0.011426f, -2.352367f, -0.085394f, -0.190361f, -0.001177f, 0.089197f,
+};
+
+static const float vp9_rect_part_nn_bias_64_layer0[NODES] = {
+ 0.000000f, -0.057652f, -0.175413f, -0.175389f, -1.084097f, -1.423801f,
+ -0.076307f, -0.193803f, 0.000000f, -0.066474f, -0.050318f, -0.019832f,
+ -0.038814f, -0.144184f, 2.652451f, 2.415006f, 0.197464f, -0.729842f,
+ -0.173774f, 0.239171f, 0.486425f, 2.463304f, -0.175279f, 2.352637f,
+};
+
+static const float vp9_rect_part_nn_weights_64_layer1[NODES * LABELS] = {
+ -0.063237f, 1.925696f, -0.182145f, -0.226687f, 0.602941f, -0.941140f,
+ 0.814598f, -0.117063f, 0.282988f, 0.066369f, 0.096951f, 1.049735f,
+ -0.188188f, -0.281227f, -4.836746f, -5.047797f, 0.892358f, 0.417145f,
+ -0.279849f, 1.335945f, 0.660338f, -2.757938f, -0.115714f, -1.862183f,
+ -0.045980f, -1.597624f, -0.586822f, -0.615589f, -0.330537f, 1.068496f,
+ -0.167290f, 0.141290f, -0.112100f, 0.232761f, 0.252307f, -0.399653f,
+ 0.353118f, 0.241583f, 2.635241f, 4.026119f, -1.137327f, -0.052446f,
+ -0.139814f, -1.104256f, -0.759391f, 2.508457f, -0.526297f, 2.095348f,
+ -0.444473f, -1.090452f, 0.584122f, 0.468729f, -0.368865f, 1.041425f,
+ -1.079504f, 0.348837f, 0.390091f, 0.416191f, 0.212906f, -0.660255f,
+ 0.053630f, 0.209476f, 3.595525f, 2.257293f, -0.514030f, 0.074203f,
+ -0.375862f, -1.998307f, -0.930310f, 1.866686f, -0.247137f, 1.087789f,
+ 0.100186f, 0.298150f, 0.165265f, 0.050478f, 0.249167f, 0.371789f,
+ -0.294497f, 0.202954f, 0.037310f, 0.193159f, 0.161551f, 0.301597f,
+ 0.299286f, 0.185946f, 0.822976f, 2.066130f, -1.724588f, 0.055977f,
+ -0.330747f, -0.067747f, -0.475801f, 1.555958f, -0.025808f, -0.081516f,
+};
+
+static const float vp9_rect_part_nn_bias_64_layer1[LABELS] = {
+ -0.090723f,
+ 0.894968f,
+ 0.844754f,
+ -3.496194f,
+};
+
+static const NN_CONFIG vp9_rect_part_nnconfig_64 = {
+ FEATURES, // num_inputs
+ LABELS, // num_outputs
+ 1, // num_hidden_layers
+ {
+ NODES,
+ }, // num_hidden_nodes
+ {
+ vp9_rect_part_nn_weights_64_layer0,
+ vp9_rect_part_nn_weights_64_layer1,
+ },
+ {
+ vp9_rect_part_nn_bias_64_layer0,
+ vp9_rect_part_nn_bias_64_layer1,
+ },
+};
+#undef FEATURES
+#undef LABELS
+#undef NODES
+
+#define FEATURES 7
+// Partition pruning models (neural nets).
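+// Each partition pruning model below takes 7 features, uses one 8-node hidden
+// layer, and produces a single score.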
+static const float vp9_partition_nn_weights_64x64_layer0[FEATURES * 8] = {
+ -3.571348f, 0.014835f, -3.255393f, -0.098090f, -0.013120f, 0.000221f,
+ 0.056273f, 0.190179f, -0.268130f, -1.828242f, -0.010655f, 0.937244f,
+ -0.435120f, 0.512125f, 1.610679f, 0.190816f, -0.799075f, -0.377348f,
+ -0.144232f, 0.614383f, -0.980388f, 1.754150f, -0.185603f, -0.061854f,
+ -0.807172f, 1.240177f, 1.419531f, -0.438544f, -5.980774f, 0.139045f,
+ -0.032359f, -0.068887f, -1.237918f, 0.115706f, 0.003164f, 2.924212f,
+ 1.246838f, -0.035833f, 0.810011f, -0.805894f, 0.010966f, 0.076463f,
+ -4.226380f, -2.437764f, -0.010619f, -0.020935f, -0.451494f, 0.300079f,
+ -0.168961f, -3.326450f, -2.731094f, 0.002518f, 0.018840f, -1.656815f,
+ 0.068039f, 0.010586f,
+};
+
+static const float vp9_partition_nn_bias_64x64_layer0[8] = {
+ -3.469882f, 0.683989f, 0.194010f, 0.313782f,
+ -3.153335f, 2.245849f, -1.946190f, -3.740020f,
+};
+
+static const float vp9_partition_nn_weights_64x64_layer1[8] = {
+ -8.058566f, 0.108306f, -0.280620f, -0.818823f,
+ -6.445117f, 0.865364f, -1.127127f, -8.808660f,
+};
+
+static const float vp9_partition_nn_bias_64x64_layer1[1] = {
+ 6.46909416f,
+};
+
+static const NN_CONFIG vp9_partition_nnconfig_64x64 = {
+ FEATURES, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 8,
+ }, // num_hidden_nodes
+ {
+ vp9_partition_nn_weights_64x64_layer0,
+ vp9_partition_nn_weights_64x64_layer1,
+ },
+ {
+ vp9_partition_nn_bias_64x64_layer0,
+ vp9_partition_nn_bias_64x64_layer1,
+ },
+};
+
+static const float vp9_partition_nn_weights_32x32_layer0[FEATURES * 8] = {
+ -0.295437f, -4.002648f, -0.205399f, -0.060919f, 0.708037f, 0.027221f,
+ -0.039137f, -0.907724f, -3.151662f, 0.007106f, 0.018726f, -0.534928f,
+ 0.022744f, 0.000159f, -1.717189f, -3.229031f, -0.027311f, 0.269863f,
+ -0.400747f, -0.394366f, -0.108878f, 0.603027f, 0.455369f, -0.197170f,
+ 1.241746f, -1.347820f, -0.575636f, -0.462879f, -2.296426f, 0.196696f,
+ -0.138347f, -0.030754f, -0.200774f, 0.453795f, 0.055625f, -3.163116f,
+ -0.091003f, -0.027028f, -0.042984f, -0.605185f, 0.143240f, -0.036439f,
+ -0.801228f, 0.313409f, -0.159942f, 0.031267f, 0.886454f, -1.531644f,
+ -0.089655f, 0.037683f, -0.163441f, -0.130454f, -0.058344f, 0.060011f,
+ 0.275387f, 1.552226f,
+};
+
+static const float vp9_partition_nn_bias_32x32_layer0[8] = {
+ -0.838372f, -2.609089f, -0.055763f, 1.329485f,
+ -1.297638f, -2.636622f, -0.826909f, 1.012644f,
+};
+
+static const float vp9_partition_nn_weights_32x32_layer1[8] = {
+ -1.792632f, -7.322353f, -0.683386f, 0.676564f,
+ -1.488118f, -7.527719f, 1.240163f, 0.614309f,
+};
+
+static const float vp9_partition_nn_bias_32x32_layer1[1] = {
+ 4.97422546f,
+};
+
+static const NN_CONFIG vp9_partition_nnconfig_32x32 = {
+ FEATURES, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 8,
+ }, // num_hidden_nodes
+ {
+ vp9_partition_nn_weights_32x32_layer0,
+ vp9_partition_nn_weights_32x32_layer1,
+ },
+ {
+ vp9_partition_nn_bias_32x32_layer0,
+ vp9_partition_nn_bias_32x32_layer1,
+ },
+};
+
+static const float vp9_partition_nn_weights_16x16_layer0[FEATURES * 8] = {
+ -1.717673f, -4.718130f, -0.125725f, -0.183427f, -0.511764f, 0.035328f,
+ 0.130891f, -3.096753f, 0.174968f, -0.188769f, -0.640796f, 1.305661f,
+ 1.700638f, -0.073806f, -4.006781f, -1.630999f, -0.064863f, -0.086410f,
+ -0.148617f, 0.172733f, -0.018619f, 2.152595f, 0.778405f, -0.156455f,
+ 0.612995f, -0.467878f, 0.152022f, -0.236183f, 0.339635f, -0.087119f,
+ -3.196610f, -1.080401f, -0.637704f, -0.059974f, 1.706298f, -0.793705f,
+ -6.399260f, 0.010624f, -0.064199f, -0.650621f, 0.338087f, -0.001531f,
+ 1.023655f, -3.700272f, -0.055281f, -0.386884f, 0.375504f, -0.898678f,
+ 0.281156f, -0.314611f, 0.863354f, -0.040582f, -0.145019f, 0.029329f,
+ -2.197880f, -0.108733f,
+};
+
+static const float vp9_partition_nn_bias_16x16_layer0[8] = {
+ 0.411516f, -2.143737f, -3.693192f, 2.123142f,
+ -1.356910f, -3.561016f, -0.765045f, -2.417082f,
+};
+
+static const float vp9_partition_nn_weights_16x16_layer1[8] = {
+ -0.619755f, -2.202391f, -4.337171f, 0.611319f,
+ 0.377677f, -4.998723f, -1.052235f, 1.949922f,
+};
+
+static const float vp9_partition_nn_bias_16x16_layer1[1] = {
+ 3.20981717f,
+};
+
+static const NN_CONFIG vp9_partition_nnconfig_16x16 = {
+ FEATURES, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 8,
+ }, // num_hidden_nodes
+ {
+ vp9_partition_nn_weights_16x16_layer0,
+ vp9_partition_nn_weights_16x16_layer1,
+ },
+ {
+ vp9_partition_nn_bias_16x16_layer0,
+ vp9_partition_nn_bias_16x16_layer1,
+ },
+};
+#undef FEATURES
+
+#define FEATURES 6
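+// Variance-based partition models: 6 features, one 8-node hidden layer, and a
+// single output score.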
+static const float vp9_var_part_nn_weights_64_layer0[FEATURES * 8] = {
+ -0.249572f, 0.205532f, -2.175608f, 1.094836f, -2.986370f, 0.193160f,
+ -0.143823f, 0.378511f, -1.997788f, -2.166866f, -1.930158f, -1.202127f,
+ -0.611875f, -0.506422f, -0.432487f, 0.071205f, 0.578172f, -0.154285f,
+ -0.051830f, 0.331681f, -1.457177f, -2.443546f, -2.000302f, -1.389283f,
+ 0.372084f, -0.464917f, 2.265235f, 2.385787f, 2.312722f, 2.127868f,
+ -0.403963f, -0.177860f, -0.436751f, -0.560539f, 0.254903f, 0.193976f,
+ -0.305611f, 0.256632f, 0.309388f, -0.437439f, 1.702640f, -5.007069f,
+ -0.323450f, 0.294227f, 1.267193f, 1.056601f, 0.387181f, -0.191215f,
+};
+
+static const float vp9_var_part_nn_bias_64_layer0[8] = {
+ -0.044396f, -0.938166f, 0.000000f, -0.916375f,
+ 1.242299f, 0.000000f, -0.405734f, 0.014206f,
+};
+
+static const float vp9_var_part_nn_weights_64_layer1[8] = {
+ 1.635945f, 0.979557f, 0.455315f, 1.197199f,
+ -2.251024f, -0.464953f, 1.378676f, -0.111927f,
+};
+
+static const float vp9_var_part_nn_bias_64_layer1[1] = {
+ -0.37972447f,
+};
+
+static const NN_CONFIG vp9_var_part_nnconfig_64 = {
+ FEATURES, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 8,
+ }, // num_hidden_nodes
+ {
+ vp9_var_part_nn_weights_64_layer0,
+ vp9_var_part_nn_weights_64_layer1,
+ },
+ {
+ vp9_var_part_nn_bias_64_layer0,
+ vp9_var_part_nn_bias_64_layer1,
+ },
+};
+
+static const float vp9_var_part_nn_weights_32_layer0[FEATURES * 8] = {
+ 0.067243f, -0.083598f, -2.191159f, 2.726434f, -3.324013f, 3.477977f,
+ 0.323736f, -0.510199f, 2.960693f, 2.937661f, 2.888476f, 2.938315f,
+ -0.307602f, -0.503353f, -0.080725f, -0.473909f, -0.417162f, 0.457089f,
+ 0.665153f, -0.273210f, 0.028279f, 0.972220f, -0.445596f, 1.756611f,
+ -0.177892f, -0.091758f, 0.436661f, -0.521506f, 0.133786f, 0.266743f,
+ 0.637367f, -0.160084f, -1.396269f, 1.020841f, -1.112971f, 0.919496f,
+ -0.235883f, 0.651954f, 0.109061f, -0.429463f, 0.740839f, -0.962060f,
+ 0.299519f, -0.386298f, 1.550231f, 2.464915f, 1.311969f, 2.561612f,
+};
+
+static const float vp9_var_part_nn_bias_32_layer0[8] = {
+ 0.368242f, 0.736617f, 0.000000f, 0.757287f,
+ 0.000000f, 0.613248f, -0.776390f, 0.928497f,
+};
+
+static const float vp9_var_part_nn_weights_32_layer1[8] = {
+ 0.939884f, -2.420850f, -0.410489f, -0.186690f,
+ 0.063287f, -0.522011f, 0.484527f, -0.639625f,
+};
+
+static const float vp9_var_part_nn_bias_32_layer1[1] = {
+ -0.6455006f,
+};
+
+static const NN_CONFIG vp9_var_part_nnconfig_32 = {
+ FEATURES, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 8,
+ }, // num_hidden_nodes
+ {
+ vp9_var_part_nn_weights_32_layer0,
+ vp9_var_part_nn_weights_32_layer1,
+ },
+ {
+ vp9_var_part_nn_bias_32_layer0,
+ vp9_var_part_nn_bias_32_layer1,
+ },
+};
+
+static const float vp9_var_part_nn_weights_16_layer0[FEATURES * 8] = {
+ 0.742567f, -0.580624f, -0.244528f, 0.331661f, -0.113949f, -0.559295f,
+ -0.386061f, 0.438653f, 1.467463f, 0.211589f, 0.513972f, 1.067855f,
+ -0.876679f, 0.088560f, -0.687483f, -0.380304f, -0.016412f, 0.146380f,
+ 0.015318f, 0.000351f, -2.764887f, 3.269717f, 2.752428f, -2.236754f,
+ 0.561539f, -0.852050f, -0.084667f, 0.202057f, 0.197049f, 0.364922f,
+ -0.463801f, 0.431790f, 1.872096f, -0.091887f, -0.055034f, 2.443492f,
+ -0.156958f, -0.189571f, -0.542424f, -0.589804f, -0.354422f, 0.401605f,
+ 0.642021f, -0.875117f, 2.040794f, 1.921070f, 1.792413f, 1.839727f,
+};
+
+static const float vp9_var_part_nn_bias_16_layer0[8] = {
+ 2.901234f, -1.940932f, -0.198970f, -0.406524f,
+ 0.059422f, -1.879207f, -0.232340f, 2.979821f,
+};
+
+static const float vp9_var_part_nn_weights_16_layer1[8] = {
+ -0.528731f, 0.375234f, -0.088422f, 0.668629f,
+ 0.870449f, 0.578735f, 0.546103f, -1.957207f,
+};
+
+static const float vp9_var_part_nn_bias_16_layer1[1] = {
+ -1.95769405f,
+};
+
+static const NN_CONFIG vp9_var_part_nnconfig_16 = {
+ FEATURES, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 8,
+ }, // num_hidden_nodes
+ {
+ vp9_var_part_nn_weights_16_layer0,
+ vp9_var_part_nn_weights_16_layer1,
+ },
+ {
+ vp9_var_part_nn_bias_16_layer0,
+ vp9_var_part_nn_bias_16_layer1,
+ },
+};
+#undef FEATURES
+
+#define FEATURES 12
+#define LABELS 1
+#define NODES 8
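+// Partition split models: 12 features, one 8-node hidden layer, and a single
+// output score.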
+static const float vp9_part_split_nn_weights_64_layer0[FEATURES * NODES] = {
+ -0.609728f, -0.409099f, -0.472449f, 0.183769f, -0.457740f, 0.081089f,
+ 0.171003f, 0.578696f, -0.019043f, -0.856142f, 0.557369f, -1.779424f,
+ -0.274044f, -0.320632f, -0.392531f, -0.359462f, -0.404106f, -0.288357f,
+ 0.200620f, 0.038013f, -0.430093f, 0.235083f, -0.487442f, 0.424814f,
+ -0.232758f, -0.442943f, 0.229397f, -0.540301f, -0.648421f, -0.649747f,
+ -0.171638f, 0.603824f, 0.468497f, -0.421580f, 0.178840f, -0.533838f,
+ -0.029471f, -0.076296f, 0.197426f, -0.187908f, -0.003950f, -0.065740f,
+ 0.085165f, -0.039674f, -5.640702f, 1.909538f, -1.434604f, 3.294606f,
+ -0.788812f, 0.196864f, 0.057012f, -0.019757f, 0.336233f, 0.075378f,
+ 0.081503f, 0.491864f, -1.899470f, -1.764173f, -1.888137f, -1.762343f,
+ 0.845542f, 0.202285f, 0.381948f, -0.150996f, 0.556893f, -0.305354f,
+ 0.561482f, -0.021974f, -0.703117f, 0.268638f, -0.665736f, 1.191005f,
+ -0.081568f, -0.115653f, 0.272029f, -0.140074f, 0.072683f, 0.092651f,
+ -0.472287f, -0.055790f, -0.434425f, 0.352055f, 0.048246f, 0.372865f,
+ 0.111499f, -0.338304f, 0.739133f, 0.156519f, -0.594644f, 0.137295f,
+ 0.613350f, -0.165102f, -1.003731f, 0.043070f, -0.887896f, -0.174202f,
+};
+
+static const float vp9_part_split_nn_bias_64_layer0[NODES] = {
+ 1.182714f, 0.000000f, 0.902019f, 0.953115f,
+ -1.372486f, -1.288740f, -0.155144f, -3.041362f,
+};
+
+static const float vp9_part_split_nn_weights_64_layer1[NODES * LABELS] = {
+ 0.841214f, 0.456016f, 0.869270f, 1.692999f,
+ -1.700494f, -0.911761f, 0.030111f, -1.447548f,
+};
+
+static const float vp9_part_split_nn_bias_64_layer1[LABELS] = {
+ 1.17782545f,
+};
+
+static const NN_CONFIG vp9_part_split_nnconfig_64 = {
+ FEATURES, // num_inputs
+ LABELS, // num_outputs
+ 1, // num_hidden_layers
+ {
+ NODES,
+ }, // num_hidden_nodes
+ {
+ vp9_part_split_nn_weights_64_layer0,
+ vp9_part_split_nn_weights_64_layer1,
+ },
+ {
+ vp9_part_split_nn_bias_64_layer0,
+ vp9_part_split_nn_bias_64_layer1,
+ },
+};
+
+static const float vp9_part_split_nn_weights_32_layer0[FEATURES * NODES] = {
+ -0.105488f, -0.218662f, 0.010980f, -0.226979f, 0.028076f, 0.743430f,
+ 0.789266f, 0.031907f, -1.464200f, 0.222336f, -1.068493f, -0.052712f,
+ -0.176181f, -0.102654f, -0.973932f, -0.182637f, -0.198000f, 0.335977f,
+ 0.271346f, 0.133005f, 1.674203f, 0.689567f, 0.657133f, 0.283524f,
+ 0.115529f, 0.738327f, 0.317184f, -0.179736f, 0.403691f, 0.679350f,
+ 0.048925f, 0.271338f, -1.538921f, -0.900737f, -1.377845f, 0.084245f,
+ 0.803122f, -0.107806f, 0.103045f, -0.023335f, -0.098116f, -0.127809f,
+ 0.037665f, -0.523225f, 1.622185f, 1.903999f, 1.358889f, 1.680785f,
+ 0.027743f, 0.117906f, -0.158810f, 0.057775f, 0.168257f, 0.062414f,
+ 0.086228f, -0.087381f, -3.066082f, 3.021855f, -4.092155f, 2.550104f,
+ -0.230022f, -0.207445f, -0.000347f, 0.034042f, 0.097057f, 0.220088f,
+ -0.228841f, -0.029405f, -1.507174f, -1.455184f, 2.624904f, 2.643355f,
+ 0.319912f, 0.585531f, -1.018225f, -0.699606f, 1.026490f, 0.169952f,
+ -0.093579f, -0.142352f, -0.107256f, 0.059598f, 0.043190f, 0.507543f,
+ -0.138617f, 0.030197f, 0.059574f, -0.634051f, -0.586724f, -0.148020f,
+ -0.334380f, 0.459547f, 1.620600f, 0.496850f, 0.639480f, -0.465715f,
+};
+
+static const float vp9_part_split_nn_bias_32_layer0[NODES] = {
+ -1.125885f, 0.753197f, -0.825808f, 0.004839f,
+ 0.583920f, 0.718062f, 0.976741f, 0.796188f,
+};
+
+static const float vp9_part_split_nn_weights_32_layer1[NODES * LABELS] = {
+ -0.458745f, 0.724624f, -0.479720f, -2.199872f,
+ 1.162661f, 1.194153f, -0.716896f, 0.824080f,
+};
+
+static const float vp9_part_split_nn_bias_32_layer1[LABELS] = {
+ 0.71644074f,
+};
+
+static const NN_CONFIG vp9_part_split_nnconfig_32 = {
+ FEATURES, // num_inputs
+ LABELS, // num_outputs
+ 1, // num_hidden_layers
+ {
+ NODES,
+ }, // num_hidden_nodes
+ {
+ vp9_part_split_nn_weights_32_layer0,
+ vp9_part_split_nn_weights_32_layer1,
+ },
+ {
+ vp9_part_split_nn_bias_32_layer0,
+ vp9_part_split_nn_bias_32_layer1,
+ },
+};
+
+static const float vp9_part_split_nn_weights_16_layer0[FEATURES * NODES] = {
+ -0.003629f, -0.046852f, 0.220428f, -0.033042f, 0.049365f, 0.112818f,
+ -0.306149f, -0.005872f, 1.066947f, -2.290226f, 2.159505f, -0.618714f,
+ -0.213294f, 0.451372f, -0.199459f, 0.223730f, -0.321709f, 0.063364f,
+ 0.148704f, -0.293371f, 0.077225f, -0.421947f, -0.515543f, -0.240975f,
+ -0.418516f, 1.036523f, -0.009165f, 0.032484f, 1.086549f, 0.220322f,
+ -0.247585f, -0.221232f, -0.225050f, 0.993051f, 0.285907f, 1.308846f,
+ 0.707456f, 0.335152f, 0.234556f, 0.264590f, -0.078033f, 0.542226f,
+ 0.057777f, 0.163471f, 0.039245f, -0.725960f, 0.963780f, -0.972001f,
+ 0.252237f, -0.192745f, -0.836571f, -0.460539f, -0.528713f, -0.160198f,
+ -0.621108f, 0.486405f, -0.221923f, 1.519426f, -0.857871f, 0.411595f,
+ 0.947188f, 0.203339f, 0.174526f, 0.016382f, 0.256879f, 0.049818f,
+ 0.057836f, -0.659096f, 0.459894f, 0.174695f, 0.379359f, 0.062530f,
+ -0.210201f, -0.355788f, -0.208432f, -0.401723f, -0.115373f, 0.191336f,
+ -0.109342f, 0.002455f, -0.078746f, -0.391871f, 0.149892f, -0.239615f,
+ -0.520709f, 0.118568f, -0.437975f, 0.118116f, -0.565426f, -0.206446f,
+ 0.113407f, 0.558894f, 0.534627f, 1.154350f, -0.116833f, 1.723311f,
+};
+
+static const float vp9_part_split_nn_bias_16_layer0[NODES] = {
+ 0.013109f, -0.034341f, 0.679845f, -0.035781f,
+ -0.104183f, 0.098055f, -0.041130f, 0.160107f,
+};
+
+static const float vp9_part_split_nn_weights_16_layer1[NODES * LABELS] = {
+ 1.499564f, -0.403259f, 1.366532f, -0.469868f,
+ 0.482227f, -2.076697f, 0.527691f, 0.540495f,
+};
+
+static const float vp9_part_split_nn_bias_16_layer1[LABELS] = {
+ 0.01134653f,
+};
+
+static const NN_CONFIG vp9_part_split_nnconfig_16 = {
+ FEATURES, // num_inputs
+ LABELS, // num_outputs
+ 1, // num_hidden_layers
+ {
+ NODES,
+ }, // num_hidden_nodes
+ {
+ vp9_part_split_nn_weights_16_layer0,
+ vp9_part_split_nn_weights_16_layer1,
+ },
+ {
+ vp9_part_split_nn_bias_16_layer0,
+ vp9_part_split_nn_bias_16_layer1,
+ },
+};
+
+static const float vp9_part_split_nn_weights_8_layer0[FEATURES * NODES] = {
+ -0.668875f, -0.159078f, -0.062663f, -0.483785f, -0.146814f, -0.608975f,
+ -0.589145f, 0.203704f, -0.051007f, -0.113769f, -0.477511f, -0.122603f,
+ -1.329890f, 1.403386f, 0.199636f, -0.161139f, 2.182090f, -0.014307f,
+ 0.015755f, -0.208468f, 0.884353f, 0.815920f, 0.632464f, 0.838225f,
+ 1.369483f, -0.029068f, 0.570213f, -0.573546f, 0.029617f, 0.562054f,
+ -0.653093f, -0.211910f, -0.661013f, -0.384418f, -0.574038f, -0.510069f,
+ 0.173047f, -0.274231f, -1.044008f, -0.422040f, -0.810296f, 0.144069f,
+ -0.406704f, 0.411230f, -0.144023f, 0.745651f, -0.595091f, 0.111787f,
+ 0.840651f, 0.030123f, -0.242155f, 0.101486f, -0.017889f, -0.254467f,
+ -0.285407f, -0.076675f, -0.549542f, -0.013544f, -0.686566f, -0.755150f,
+ 1.623949f, -0.286369f, 0.170976f, 0.016442f, -0.598353f, -0.038540f,
+ 0.202597f, -0.933582f, 0.599510f, 0.362273f, 0.577722f, 0.477603f,
+ 0.767097f, 0.431532f, 0.457034f, 0.223279f, 0.381349f, 0.033777f,
+ 0.423923f, -0.664762f, 0.385662f, 0.075744f, 0.182681f, 0.024118f,
+ 0.319408f, -0.528864f, 0.976537f, -0.305971f, -0.189380f, -0.241689f,
+ -1.318092f, 0.088647f, -0.109030f, -0.945654f, 1.082797f, 0.184564f,
+};
+
+static const float vp9_part_split_nn_bias_8_layer0[NODES] = {
+ -0.237472f, 2.051396f, 0.297062f, -0.730194f,
+ 0.060472f, -0.565959f, 0.560869f, -0.395448f,
+};
+
+static const float vp9_part_split_nn_weights_8_layer1[NODES * LABELS] = {
+ 0.568121f, 1.575915f, -0.544309f, 0.751595f,
+ -0.117911f, -1.340730f, -0.739671f, 0.661216f,
+};
+
+static const float vp9_part_split_nn_bias_8_layer1[LABELS] = {
+ -0.63375306f,
+};
+
+static const NN_CONFIG vp9_part_split_nnconfig_8 = {
+ FEATURES, // num_inputs
+ LABELS, // num_outputs
+ 1, // num_hidden_layers
+ {
+ NODES,
+ }, // num_hidden_nodes
+ {
+ vp9_part_split_nn_weights_8_layer0,
+ vp9_part_split_nn_weights_8_layer1,
+ },
+ {
+ vp9_part_split_nn_bias_8_layer0,
+ vp9_part_split_nn_bias_8_layer1,
+ },
+};
+#undef NODES
+#undef FEATURES
+#undef LABELS
+
+// Partition pruning model (linear).
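+// The per-feature means and standard deviations below are presumably used to
+// normalize each feature as (f - mean) / std before the linear weights at the
+// end of this file are applied.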
+static const float vp9_partition_feature_mean[24] = {
+ 303501.697372f, 3042630.372158f, 24.694696f, 1.392182f,
+ 689.413511f, 162.027012f, 1.478213f, 0.0f,
+ 135382.260230f, 912738.513263f, 28.845217f, 1.515230f,
+ 544.158492f, 131.807995f, 1.436863f, 0.0f,
+ 43682.377587f, 208131.711766f, 28.084737f, 1.356677f,
+ 138.254122f, 119.522553f, 1.252322f, 0.0f,
+};
+
+static const float vp9_partition_feature_std[24] = {
+ 673689.212982f, 5996652.516628f, 0.024449f, 1.989792f,
+ 985.880847f, 0.014638f, 2.001898f, 0.0f,
+ 208798.775332f, 1812548.443284f, 0.018693f, 1.838009f,
+ 396.986910f, 0.015657f, 1.332541f, 0.0f,
+ 55888.847031f, 448587.962714f, 0.017900f, 1.904776f,
+ 98.652832f, 0.016598f, 1.320992f, 0.0f,
+};
+
+// Error tolerance: 0.01%-0.05%-0.1%
+static const float vp9_partition_linear_weights[24] = {
+ 0.111736f, 0.289977f, 0.042219f, 0.204765f, 0.120410f, -0.143863f,
+ 0.282376f, 0.847811f, 0.637161f, 0.131570f, 0.018636f, 0.202134f,
+ 0.112797f, 0.028162f, 0.182450f, 1.124367f, 0.386133f, 0.083700f,
+ 0.050028f, 0.150873f, 0.061119f, 0.109318f, 0.127255f, 0.625211f,
+};
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP9_ENCODER_VP9_PARTITION_MODELS_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_picklpf.c b/media/libvpx/libvpx/vp9/encoder/vp9_picklpf.c
new file mode 100644
index 0000000000..3a620df693
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_picklpf.c
@@ -0,0 +1,203 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <limits.h>
+
+#include "./vpx_scale_rtcd.h"
+#include "vpx_dsp/psnr.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+
+#include "vp9/common/vp9_loopfilter.h"
+#include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/common/vp9_quant_common.h"
+
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_picklpf.h"
+#include "vp9/encoder/vp9_quantize.h"
+
+static unsigned int get_section_intra_rating(const VP9_COMP *cpi) {
+ unsigned int section_intra_rating;
+
+ section_intra_rating = (cpi->common.frame_type == KEY_FRAME)
+ ? cpi->twopass.key_frame_section_intra_rating
+ : cpi->twopass.section_intra_rating;
+
+ return section_intra_rating;
+}
+
+static int get_max_filter_level(const VP9_COMP *cpi) {
+ if (cpi->oxcf.pass == 2) {
+ unsigned int section_intra_rating = get_section_intra_rating(cpi);
+ return section_intra_rating > 8 ? MAX_LOOP_FILTER * 3 / 4 : MAX_LOOP_FILTER;
+ } else {
+ return MAX_LOOP_FILTER;
+ }
+}
+
+static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd,
+ VP9_COMP *const cpi, int filt_level,
+ int partial_frame) {
+ VP9_COMMON *const cm = &cpi->common;
+ int64_t filt_err;
+
+ vp9_build_mask_frame(cm, filt_level, partial_frame);
+
+ if (cpi->num_workers > 1)
+ vp9_loop_filter_frame_mt(cm->frame_to_show, cm, cpi->td.mb.e_mbd.plane,
+ filt_level, 1, partial_frame, cpi->workers,
+ cpi->num_workers, &cpi->lf_row_sync);
+ else
+ vp9_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filt_level,
+ 1, partial_frame);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cm->use_highbitdepth) {
+ filt_err = vpx_highbd_get_y_sse(sd, cm->frame_to_show);
+ } else {
+ filt_err = vpx_get_y_sse(sd, cm->frame_to_show);
+ }
+#else
+ filt_err = vpx_get_y_sse(sd, cm->frame_to_show);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ // Re-instate the unfiltered frame
+ vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
+
+ return filt_err;
+}
+
+static int search_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi,
+ int partial_frame) {
+ const VP9_COMMON *const cm = &cpi->common;
+ const struct loopfilter *const lf = &cm->lf;
+ const int min_filter_level = 0;
+ const int max_filter_level = get_max_filter_level(cpi);
+ int filt_direction = 0;
+ int64_t best_err;
+ int filt_best;
+
+ // Start the search at the previous frame filter level unless it is now out of
+ // range.
+ int filt_mid = clamp(lf->last_filt_level, min_filter_level, max_filter_level);
+ int filter_step = filt_mid < 16 ? 4 : filt_mid / 4;
+ // Sum squared error at each filter level
+ int64_t ss_err[MAX_LOOP_FILTER + 1];
+ unsigned int section_intra_rating = get_section_intra_rating(cpi);
+
+ // Set each entry to -1
+ memset(ss_err, 0xFF, sizeof(ss_err));
+
+ // Make a copy of the unfiltered / processed recon buffer
+ vpx_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf);
+
+ best_err = try_filter_frame(sd, cpi, filt_mid, partial_frame);
+ filt_best = filt_mid;
+ ss_err[filt_mid] = best_err;
+
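+ // Coarse-to-fine search: probe one filter_step above and below the current
+ // level, move toward whichever level gives a lower filtered-frame SSE, and
+ // halve the step when neither direction improves, until the step reaches 0.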
+ while (filter_step > 0) {
+ const int filt_high = VPXMIN(filt_mid + filter_step, max_filter_level);
+ const int filt_low = VPXMAX(filt_mid - filter_step, min_filter_level);
+
+ // Bias against raising loop filter in favor of lowering it.
+ int64_t bias = (best_err >> (15 - (filt_mid / 8))) * filter_step;
+
+ if ((cpi->oxcf.pass == 2) && (section_intra_rating < 20))
+ bias = (bias * section_intra_rating) / 20;
+
+ // Bias less for larger block/transform sizes.
+ if (cm->tx_mode != ONLY_4X4) bias >>= 1;
+
+ if (filt_direction <= 0 && filt_low != filt_mid) {
+ // Get Low filter error score
+ if (ss_err[filt_low] < 0) {
+ ss_err[filt_low] = try_filter_frame(sd, cpi, filt_low, partial_frame);
+ }
+ // If value is close to the best so far then bias towards a lower loop
+ // filter value.
+ if ((ss_err[filt_low] - bias) < best_err) {
+ // Was it actually better than the previous best?
+ if (ss_err[filt_low] < best_err) best_err = ss_err[filt_low];
+
+ filt_best = filt_low;
+ }
+ }
+
+ // Now look at filt_high
+ if (filt_direction >= 0 && filt_high != filt_mid) {
+ if (ss_err[filt_high] < 0) {
+ ss_err[filt_high] = try_filter_frame(sd, cpi, filt_high, partial_frame);
+ }
+ // Was it better than the previous best?
+ if (ss_err[filt_high] < (best_err - bias)) {
+ best_err = ss_err[filt_high];
+ filt_best = filt_high;
+ }
+ }
+
+ // Halve the step distance if the best filter value was the same as last time.
+ if (filt_best == filt_mid) {
+ filter_step /= 2;
+ filt_direction = 0;
+ } else {
+ filt_direction = (filt_best < filt_mid) ? -1 : 1;
+ filt_mid = filt_best;
+ }
+ }
+
+ return filt_best;
+}
+
+void vp9_pick_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi,
+ LPF_PICK_METHOD method) {
+ VP9_COMMON *const cm = &cpi->common;
+ struct loopfilter *const lf = &cm->lf;
+
+ lf->sharpness_level = 0;
+
+ if (method == LPF_PICK_MINIMAL_LPF && lf->filter_level) {
+ lf->filter_level = 0;
+ } else if (method >= LPF_PICK_FROM_Q) {
+ const int min_filter_level = 0;
+ const int max_filter_level = get_max_filter_level(cpi);
+ const int q = vp9_ac_quant(cm->base_qindex, 0, cm->bit_depth);
+// These values were determined by linear fitting the result of the
+// searched level, filt_guess = q * 0.316206 + 3.87252
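+// The ROUND_POWER_OF_TWO() expressions below compute that guess in fixed-point
+// integer arithmetic; the shift grows by 2 bits per higher bit depth to match
+// the 4x / 16x larger quantizer step values at 10 and 12 bits.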
+#if CONFIG_VP9_HIGHBITDEPTH
+ int filt_guess;
+ switch (cm->bit_depth) {
+ case VPX_BITS_8:
+ filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 1015158, 18);
+ break;
+ case VPX_BITS_10:
+ filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 4060632, 20);
+ break;
+ default:
+ assert(cm->bit_depth == VPX_BITS_12);
+ filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 16242526, 22);
+ break;
+ }
+#else
+ int filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 1015158, 18);
+#endif // CONFIG_VP9_HIGHBITDEPTH
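+ // For cyclic-refresh CBR coding of non-screen content, scale the guess down
+ // to 5/8 of the fitted value on non-key frames.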
+ if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR &&
+ cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled &&
+ (cm->base_qindex < 200 || cm->width * cm->height > 320 * 240) &&
+ cpi->oxcf.content != VP9E_CONTENT_SCREEN && cm->frame_type != KEY_FRAME)
+ filt_guess = 5 * filt_guess >> 3;
+
+ if (cm->frame_type == KEY_FRAME) filt_guess -= 4;
+ lf->filter_level = clamp(filt_guess, min_filter_level, max_filter_level);
+ } else {
+ lf->filter_level =
+ search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE);
+ }
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_picklpf.h b/media/libvpx/libvpx/vp9/encoder/vp9_picklpf.h
new file mode 100644
index 0000000000..8881b44daa
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_picklpf.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_VP9_PICKLPF_H_
+#define VPX_VP9_ENCODER_VP9_PICKLPF_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "vp9/encoder/vp9_encoder.h"
+
+struct yv12_buffer_config;
+struct VP9_COMP;
+
+void vp9_pick_filter_level(const struct yv12_buffer_config *sd,
+ struct VP9_COMP *cpi, LPF_PICK_METHOD method);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP9_ENCODER_VP9_PICKLPF_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_pickmode.c b/media/libvpx/libvpx/vp9/encoder/vp9_pickmode.c
new file mode 100644
index 0000000000..fa88cd79da
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_pickmode.c
@@ -0,0 +1,2995 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx/vpx_codec.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/compiler_attributes.h"
+
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_mvref_common.h"
+#include "vp9/common/vp9_pred_common.h"
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/common/vp9_reconintra.h"
+#include "vp9/common/vp9_scan.h"
+
+#include "vp9/encoder/vp9_cost.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_pickmode.h"
+#include "vp9/encoder/vp9_ratectrl.h"
+#include "vp9/encoder/vp9_rd.h"
+
+typedef struct {
+ uint8_t *data;
+ int stride;
+ int in_use;
+} PRED_BUFFER;
+
+typedef struct {
+ PRED_BUFFER *best_pred;
+ PREDICTION_MODE best_mode;
+ TX_SIZE best_tx_size;
+ TX_SIZE best_intra_tx_size;
+ MV_REFERENCE_FRAME best_ref_frame;
+ MV_REFERENCE_FRAME best_second_ref_frame;
+ uint8_t best_mode_skip_txfm;
+ INTERP_FILTER best_pred_filter;
+} BEST_PICKMODE;
+
+static const int pos_shift_16x16[4][4] = {
+ { 9, 10, 13, 14 }, { 11, 12, 15, 16 }, { 17, 18, 21, 22 }, { 19, 20, 23, 24 }
+};
+
+static int mv_refs_rt(VP9_COMP *cpi, const VP9_COMMON *cm, const MACROBLOCK *x,
+ const MACROBLOCKD *xd, const TileInfo *const tile,
+ MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
+ int_mv *mv_ref_list, int_mv *base_mv, int mi_row,
+ int mi_col, int use_base_mv) {
+ const int *ref_sign_bias = cm->ref_frame_sign_bias;
+ int i, refmv_count = 0;
+
+ const POSITION *const mv_ref_search = mv_ref_blocks[mi->sb_type];
+
+ int different_ref_found = 0;
+ int context_counter = 0;
+ int const_motion = 0;
+
+ // Blank the reference vector list
+ memset(mv_ref_list, 0, sizeof(*mv_ref_list) * MAX_MV_REF_CANDIDATES);
+
+ // The nearest 2 blocks are treated differently: if the size is < 8x8 we get
+ // the mv from the bmi substructure, and we also need to keep a mode count.
+ for (i = 0; i < 2; ++i) {
+ const POSITION *const mv_ref = &mv_ref_search[i];
+ if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
+ const MODE_INFO *const candidate_mi =
+ xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride];
+ // Keep counts for entropy encoding.
+ context_counter += mode_2_counter[candidate_mi->mode];
+ different_ref_found = 1;
+
+ if (candidate_mi->ref_frame[0] == ref_frame)
+ ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 0, mv_ref->col, -1),
+ refmv_count, mv_ref_list, Done);
+ }
+ }
+
+ const_motion = 1;
+
+ // Check the rest of the neighbors in much the same way
+ // as before except we don't need to keep track of sub blocks or
+ // mode counts.
+ for (; i < MVREF_NEIGHBOURS && !refmv_count; ++i) {
+ const POSITION *const mv_ref = &mv_ref_search[i];
+ if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
+ const MODE_INFO *const candidate_mi =
+ xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride];
+ different_ref_found = 1;
+
+ if (candidate_mi->ref_frame[0] == ref_frame)
+ ADD_MV_REF_LIST(candidate_mi->mv[0], refmv_count, mv_ref_list, Done);
+ }
+ }
+
+ // Since we couldn't find 2 mvs from the same reference frame
+ // go back through the neighbors and find motion vectors from
+ // different reference frames.
+ if (different_ref_found && !refmv_count) {
+ for (i = 0; i < MVREF_NEIGHBOURS; ++i) {
+ const POSITION *mv_ref = &mv_ref_search[i];
+ if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
+ const MODE_INFO *const candidate_mi =
+ xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride];
+
+ // If the candidate is INTRA we don't want to consider its mv.
+ IF_DIFF_REF_FRAME_ADD_MV(candidate_mi, ref_frame, ref_sign_bias,
+ refmv_count, mv_ref_list, Done);
+ }
+ }
+ }
+ if (use_base_mv &&
+ !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame &&
+ ref_frame == LAST_FRAME) {
+ // Get base layer mv.
+ MV_REF *candidate =
+ &cm->prev_frame
+ ->mvs[(mi_col >> 1) + (mi_row >> 1) * (cm->mi_cols >> 1)];
+ if (candidate->mv[0].as_int != INVALID_MV) {
+ base_mv->as_mv.row = (candidate->mv[0].as_mv.row * 2);
+ base_mv->as_mv.col = (candidate->mv[0].as_mv.col * 2);
+ clamp_mv_ref(&base_mv->as_mv, xd);
+ } else {
+ base_mv->as_int = INVALID_MV;
+ }
+ }
+
+Done:
+
+ x->mbmi_ext->mode_context[ref_frame] = counter_to_context[context_counter];
+
+ // Clamp vectors
+ for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i)
+ clamp_mv_ref(&mv_ref_list[i].as_mv, xd);
+
+ return const_motion;
+}
+
+static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int mi_row, int mi_col,
+ int_mv *tmp_mv, int *rate_mv,
+ int64_t best_rd_sofar, int use_base_mv) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ MODE_INFO *mi = xd->mi[0];
+ struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0 } };
+ const int step_param = cpi->sf.mv.fullpel_search_step_param;
+ const int sadpb = x->sadperbit16;
+ MV mvp_full;
+ const int ref = mi->ref_frame[0];
+ const MV ref_mv = x->mbmi_ext->ref_mvs[ref][0].as_mv;
+ MV center_mv;
+ uint32_t dis;
+ int rate_mode;
+ const MvLimits tmp_mv_limits = x->mv_limits;
+ int rv = 0;
+ int cost_list[5];
+ int search_subpel = 1;
+ const YV12_BUFFER_CONFIG *scaled_ref_frame =
+ vp9_get_scaled_ref_frame(cpi, ref);
+ if (scaled_ref_frame) {
+ int i;
+ // Swap out the reference frame for a version that's been scaled to
+ // match the resolution of the current frame, allowing the existing
+ // motion search code to be used without additional modifications.
+ for (i = 0; i < MAX_MB_PLANE; i++) backup_yv12[i] = xd->plane[i].pre[0];
+ vp9_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL);
+ }
+ vp9_set_mv_search_range(&x->mv_limits, &ref_mv);
+
+ // Limit the motion vector range for large lighting changes.
+ if (cpi->oxcf.speed > 5 && x->lowvar_highsumdiff) {
+ x->mv_limits.col_min = VPXMAX(x->mv_limits.col_min, -10);
+ x->mv_limits.row_min = VPXMAX(x->mv_limits.row_min, -10);
+ x->mv_limits.col_max = VPXMIN(x->mv_limits.col_max, 10);
+ x->mv_limits.row_max = VPXMIN(x->mv_limits.row_max, 10);
+ }
+
+ assert(x->mv_best_ref_index[ref] <= 2);
+ if (x->mv_best_ref_index[ref] < 2)
+ mvp_full = x->mbmi_ext->ref_mvs[ref][x->mv_best_ref_index[ref]].as_mv;
+ else
+ mvp_full = x->pred_mv[ref];
+
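+ // The MV predictor is stored in 1/8-pel units; convert it to full-pel units
+ // for the integer full-pixel search.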
+ mvp_full.col >>= 3;
+ mvp_full.row >>= 3;
+
+ if (!use_base_mv)
+ center_mv = ref_mv;
+ else
+ center_mv = tmp_mv->as_mv;
+
+ if (x->sb_use_mv_part) {
+ tmp_mv->as_mv.row = x->sb_mvrow_part >> 3;
+ tmp_mv->as_mv.col = x->sb_mvcol_part >> 3;
+ } else {
+ vp9_full_pixel_search(
+ cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, sadpb,
+ cond_cost_list(cpi, cost_list), &center_mv, &tmp_mv->as_mv, INT_MAX, 0);
+ }
+
+ x->mv_limits = tmp_mv_limits;
+
+ // Calculate the bit cost of the motion vector.
+ mvp_full.row = tmp_mv->as_mv.row * 8;
+ mvp_full.col = tmp_mv->as_mv.col * 8;
+
+ *rate_mv = vp9_mv_bit_cost(&mvp_full, &ref_mv, x->nmvjointcost, x->mvcost,
+ MV_COST_WEIGHT);
+
+ rate_mode =
+ cpi->inter_mode_cost[x->mbmi_ext->mode_context[ref]][INTER_OFFSET(NEWMV)];
+ rv =
+ !(RDCOST(x->rdmult, x->rddiv, (*rate_mv + rate_mode), 0) > best_rd_sofar);
+
+ // For SVC on non-reference frames, avoid subpel search for (0, 0) motion.
+ if (cpi->use_svc && cpi->svc.non_reference_frame) {
+ if (mvp_full.row == 0 && mvp_full.col == 0) search_subpel = 0;
+ }
+
+ if (rv && search_subpel) {
+ SUBPEL_FORCE_STOP subpel_force_stop = cpi->sf.mv.subpel_force_stop;
+ if (use_base_mv && cpi->sf.base_mv_aggressive) subpel_force_stop = HALF_PEL;
+ if (cpi->sf.mv.enable_adaptive_subpel_force_stop) {
+ const int mv_thresh = cpi->sf.mv.adapt_subpel_force_stop.mv_thresh;
+ if (abs(tmp_mv->as_mv.row) >= mv_thresh ||
+ abs(tmp_mv->as_mv.col) >= mv_thresh)
+ subpel_force_stop = cpi->sf.mv.adapt_subpel_force_stop.force_stop_above;
+ else
+ subpel_force_stop = cpi->sf.mv.adapt_subpel_force_stop.force_stop_below;
+ }
+ cpi->find_fractional_mv_step(
+ x, &tmp_mv->as_mv, &ref_mv, cpi->common.allow_high_precision_mv,
+ x->errorperbit, &cpi->fn_ptr[bsize], subpel_force_stop,
+ cpi->sf.mv.subpel_search_level, cond_cost_list(cpi, cost_list),
+ x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, 0, 0,
+ cpi->sf.use_accurate_subpel_search);
+ *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv, x->nmvjointcost,
+ x->mvcost, MV_COST_WEIGHT);
+ }
+
+ if (scaled_ref_frame) {
+ int i;
+ for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i];
+ }
+ return rv;
+}
+
+static void block_variance(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride, int w, int h,
+ unsigned int *sse, int *sum, int block_size,
+#if CONFIG_VP9_HIGHBITDEPTH
+ int use_highbitdepth, vpx_bit_depth_t bd,
+#endif
+ uint32_t *sse8x8, int *sum8x8, uint32_t *var8x8) {
+ int i, j, k = 0;
+ uint32_t k_sqr = 0;
+
+ *sse = 0;
+ *sum = 0;
+
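+ // For each 8x8 sub-block, accumulate sse and sum and derive its variance as
+ // sse - sum * sum / 64 (taken as an absolute difference to keep the unsigned
+ // result well defined).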
+ for (i = 0; i < h; i += block_size) {
+ for (j = 0; j < w; j += block_size) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (use_highbitdepth) {
+ switch (bd) {
+ case VPX_BITS_8:
+ vpx_highbd_8_get8x8var(src + src_stride * i + j, src_stride,
+ ref + ref_stride * i + j, ref_stride,
+ &sse8x8[k], &sum8x8[k]);
+ break;
+ case VPX_BITS_10:
+ vpx_highbd_10_get8x8var(src + src_stride * i + j, src_stride,
+ ref + ref_stride * i + j, ref_stride,
+ &sse8x8[k], &sum8x8[k]);
+ break;
+ case VPX_BITS_12:
+ vpx_highbd_12_get8x8var(src + src_stride * i + j, src_stride,
+ ref + ref_stride * i + j, ref_stride,
+ &sse8x8[k], &sum8x8[k]);
+ break;
+ }
+ } else {
+ vpx_get8x8var(src + src_stride * i + j, src_stride,
+ ref + ref_stride * i + j, ref_stride, &sse8x8[k],
+ &sum8x8[k]);
+ }
+#else
+ vpx_get8x8var(src + src_stride * i + j, src_stride,
+ ref + ref_stride * i + j, ref_stride, &sse8x8[k],
+ &sum8x8[k]);
+#endif
+ *sse += sse8x8[k];
+ *sum += sum8x8[k];
+ k_sqr = (uint32_t)(((int64_t)sum8x8[k] * sum8x8[k]) >> 6);
+ var8x8[k] = sse8x8[k] > k_sqr ? sse8x8[k] - k_sqr : k_sqr - sse8x8[k];
+ k++;
+ }
+ }
+}
+
+static void calculate_variance(int bw, int bh, TX_SIZE tx_size,
+ unsigned int *sse_i, int *sum_i,
+ unsigned int *var_o, unsigned int *sse_o,
+ int *sum_o) {
+ const BLOCK_SIZE unit_size = txsize_to_bsize[tx_size];
+ const int nw = 1 << (bw - b_width_log2_lookup[unit_size]);
+ const int nh = 1 << (bh - b_height_log2_lookup[unit_size]);
+ int i, j, k = 0;
+ uint32_t k_sqr = 0;
+
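+ // Merge each 2x2 group of sub-block statistics into one block of the next
+ // larger transform size and recompute its variance as sse - sum^2 / pixels.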
+ for (i = 0; i < nh; i += 2) {
+ for (j = 0; j < nw; j += 2) {
+ sse_o[k] = sse_i[i * nw + j] + sse_i[i * nw + j + 1] +
+ sse_i[(i + 1) * nw + j] + sse_i[(i + 1) * nw + j + 1];
+ sum_o[k] = sum_i[i * nw + j] + sum_i[i * nw + j + 1] +
+ sum_i[(i + 1) * nw + j] + sum_i[(i + 1) * nw + j + 1];
+ k_sqr = (uint32_t)(((int64_t)sum_o[k] * sum_o[k]) >>
+ (b_width_log2_lookup[unit_size] +
+ b_height_log2_lookup[unit_size] + 6));
+ var_o[k] = sse_o[k] > k_sqr ? sse_o[k] - k_sqr : k_sqr - sse_o[k];
+ k++;
+ }
+ }
+}
+
+// Adjust the ac_thr according to speed, width, height and normalized sum
+static int ac_thr_factor(const int speed, const int width, const int height,
+ const int norm_sum) {
+ if (speed >= 8 && norm_sum < 5) {
+ if (width <= 640 && height <= 480)
+ return 4;
+ else
+ return 2;
+ }
+ return 1;
+}
+
+static TX_SIZE calculate_tx_size(VP9_COMP *const cpi, BLOCK_SIZE bsize,
+ MACROBLOCKD *const xd, unsigned int var,
+ unsigned int sse, int64_t ac_thr,
+ unsigned int source_variance, int is_intra) {
+ // TODO(marpan): Tune selection for intra-modes, screen content, etc.
+ TX_SIZE tx_size;
+ unsigned int var_thresh = is_intra ? (unsigned int)ac_thr : 1;
+ int limit_tx = 1;
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+ (source_variance == 0 || var < var_thresh))
+ limit_tx = 0;
+ if (cpi->common.tx_mode == TX_MODE_SELECT) {
+ if (sse > (var << 2))
+ tx_size = VPXMIN(max_txsize_lookup[bsize],
+ tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
+ else
+ tx_size = TX_8X8;
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && limit_tx &&
+ cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id))
+ tx_size = TX_8X8;
+ else if (tx_size > TX_16X16 && limit_tx)
+ tx_size = TX_16X16;
+ // For screen content, force 4X4 tx_size over 8X8 when the variance is large.
+ if (cpi->oxcf.content == VP9E_CONTENT_SCREEN && tx_size == TX_8X8 &&
+ bsize <= BLOCK_16X16 && ((var >> 5) > (unsigned int)ac_thr))
+ tx_size = TX_4X4;
+ } else {
+ tx_size = VPXMIN(max_txsize_lookup[bsize],
+ tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
+ }
+ return tx_size;
+}
+
+static void compute_intra_yprediction(PREDICTION_MODE mode, BLOCK_SIZE bsize,
+ MACROBLOCK *x, MACROBLOCKD *xd) {
+ struct macroblockd_plane *const pd = &xd->plane[0];
+ struct macroblock_plane *const p = &x->plane[0];
+ uint8_t *const src_buf_base = p->src.buf;
+ uint8_t *const dst_buf_base = pd->dst.buf;
+ const int src_stride = p->src.stride;
+ const int dst_stride = pd->dst.stride;
+ // Block and transform sizes, expressed as log2 of the number of 4x4 blocks
+ // ("*_b"): 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8.
+ const TX_SIZE tx_size = max_txsize_lookup[bsize];
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
+ int row, col;
+ // If mb_to_right_edge is < 0 we are in a situation in which
+ // the current block size extends into the UMV and we won't
+ // visit the sub blocks that are wholly within the UMV.
+ const int max_blocks_wide =
+ num_4x4_w + (xd->mb_to_right_edge >= 0
+ ? 0
+ : xd->mb_to_right_edge >> (5 + pd->subsampling_x));
+ const int max_blocks_high =
+ num_4x4_h + (xd->mb_to_bottom_edge >= 0
+ ? 0
+ : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
+
+ // Keep track of the row and column of the blocks we use so that we know
+ // if we are in the unrestricted motion border.
+ for (row = 0; row < max_blocks_high; row += (1 << tx_size)) {
+ // Skip visiting the sub blocks that are wholly within the UMV.
+ for (col = 0; col < max_blocks_wide; col += (1 << tx_size)) {
+ p->src.buf = &src_buf_base[4 * (row * (int64_t)src_stride + col)];
+ pd->dst.buf = &dst_buf_base[4 * (row * (int64_t)dst_stride + col)];
+ vp9_predict_intra_block(xd, b_width_log2_lookup[bsize], tx_size, mode,
+ x->skip_encode ? p->src.buf : pd->dst.buf,
+ x->skip_encode ? src_stride : dst_stride,
+ pd->dst.buf, dst_stride, col, row, 0);
+ }
+ }
+ p->src.buf = src_buf_base;
+ pd->dst.buf = dst_buf_base;
+}
+
+static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize,
+ MACROBLOCK *x, MACROBLOCKD *xd,
+ int *out_rate_sum, int64_t *out_dist_sum,
+ unsigned int *var_y, unsigned int *sse_y,
+ int mi_row, int mi_col, int *early_term,
+ int *flag_preduv_computed) {
+ // Note that our transform coefficients are 8 times those of an orthogonal
+ // transform, so the quantizer step is also 8 times larger. To get the
+ // effective quantizer we need to divide by 8 before calling the modeling
+ // function.
+ unsigned int sse;
+ int rate;
+ int64_t dist;
+ struct macroblock_plane *const p = &x->plane[0];
+ struct macroblockd_plane *const pd = &xd->plane[0];
+ const uint32_t dc_quant = pd->dequant[0];
+ const uint32_t ac_quant = pd->dequant[1];
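+ // Skip thresholds are based on the squared dequant step; the >> 6 removes
+ // the squared 8x transform scaling noted above (8 * 8 = 64).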
+ int64_t dc_thr = dc_quant * dc_quant >> 6;
+ int64_t ac_thr = ac_quant * ac_quant >> 6;
+ unsigned int var;
+ int sum;
+ int skip_dc = 0;
+
+ const int bw = b_width_log2_lookup[bsize];
+ const int bh = b_height_log2_lookup[bsize];
+ const int num8x8 = 1 << (bw + bh - 2);
+ unsigned int sse8x8[64] = { 0 };
+ int sum8x8[64] = { 0 };
+ unsigned int var8x8[64] = { 0 };
+ TX_SIZE tx_size;
+ int i, k;
+ uint32_t sum_sqr;
+#if CONFIG_VP9_HIGHBITDEPTH
+ const vpx_bit_depth_t bd = cpi->common.bit_depth;
+#endif
+ // Calculate the variance for the whole partition, and also save each 8x8
+ // block's variance to be used in the transform-skipping test that follows.
+ block_variance(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
+ 4 << bw, 4 << bh, &sse, &sum, 8,
+#if CONFIG_VP9_HIGHBITDEPTH
+ cpi->common.use_highbitdepth, bd,
+#endif
+ sse8x8, sum8x8, var8x8);
+ sum_sqr = (uint32_t)((int64_t)sum * sum) >> (bw + bh + 4);
+ var = sse > sum_sqr ? sse - sum_sqr : sum_sqr - sse;
+
+ *var_y = var;
+ *sse_y = sse;
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) &&
+ cpi->oxcf.speed > 5)
+ ac_thr = vp9_scale_acskip_thresh(ac_thr, cpi->denoiser.denoising_level,
+ (abs(sum) >> (bw + bh)),
+ cpi->svc.temporal_layer_id);
+ else
+ ac_thr *= ac_thr_factor(cpi->oxcf.speed, cpi->common.width,
+ cpi->common.height, abs(sum) >> (bw + bh));
+#else
+ ac_thr *= ac_thr_factor(cpi->oxcf.speed, cpi->common.width,
+ cpi->common.height, abs(sum) >> (bw + bh));
+#endif
+
+ tx_size = calculate_tx_size(cpi, bsize, xd, var, sse, ac_thr,
+ x->source_variance, 0);
+ // The code below for setting the skip flag assumes a transform size of at
+ // least 8x8, so force this lower limit on the transform size.
+ if (tx_size < TX_8X8) tx_size = TX_8X8;
+ xd->mi[0]->tx_size = tx_size;
+
+ if (cpi->oxcf.content == VP9E_CONTENT_SCREEN && x->zero_temp_sad_source &&
+ x->source_variance == 0)
+ dc_thr = dc_thr << 1;
+
+ // Evaluate if the partition block is a skippable block in Y plane.
+ {
+ unsigned int sse16x16[16] = { 0 };
+ int sum16x16[16] = { 0 };
+ unsigned int var16x16[16] = { 0 };
+ const int num16x16 = num8x8 >> 2;
+
+ unsigned int sse32x32[4] = { 0 };
+ int sum32x32[4] = { 0 };
+ unsigned int var32x32[4] = { 0 };
+ const int num32x32 = num8x8 >> 4;
+
+ int ac_test = 1;
+ int dc_test = 1;
+ const int num = (tx_size == TX_8X8)
+ ? num8x8
+ : ((tx_size == TX_16X16) ? num16x16 : num32x32);
+ const unsigned int *sse_tx =
+ (tx_size == TX_8X8) ? sse8x8
+ : ((tx_size == TX_16X16) ? sse16x16 : sse32x32);
+ const unsigned int *var_tx =
+ (tx_size == TX_8X8) ? var8x8
+ : ((tx_size == TX_16X16) ? var16x16 : var32x32);
+
+ // Calculate variance if tx_size > TX_8X8
+ if (tx_size >= TX_16X16)
+ calculate_variance(bw, bh, TX_8X8, sse8x8, sum8x8, var16x16, sse16x16,
+ sum16x16);
+ if (tx_size == TX_32X32)
+ calculate_variance(bw, bh, TX_16X16, sse16x16, sum16x16, var32x32,
+ sse32x32, sum32x32);
+
+ // Skipping test
+ x->skip_txfm[0] = SKIP_TXFM_NONE;
+ for (k = 0; k < num; k++)
+ // Check if all ac coefficients can be quantized to zero.
+ if (!(var_tx[k] < ac_thr || var == 0)) {
+ ac_test = 0;
+ break;
+ }
+
+ for (k = 0; k < num; k++)
+ // Check if dc coefficient can be quantized to zero.
+ if (!(sse_tx[k] - var_tx[k] < dc_thr || sse == var)) {
+ dc_test = 0;
+ break;
+ }
+
+ if (ac_test) {
+ x->skip_txfm[0] = SKIP_TXFM_AC_ONLY;
+
+ if (dc_test) x->skip_txfm[0] = SKIP_TXFM_AC_DC;
+ } else if (dc_test) {
+ skip_dc = 1;
+ }
+ }
+
+ if (x->skip_txfm[0] == SKIP_TXFM_AC_DC) {
+ int skip_uv[2] = { 0 };
+ unsigned int var_uv[2];
+ unsigned int sse_uv[2];
+
+ *out_rate_sum = 0;
+ *out_dist_sum = sse << 4;
+
+ // Transform skipping test in UV planes.
+ for (i = 1; i <= 2; i++) {
+ struct macroblock_plane *const p_uv = &x->plane[i];
+ struct macroblockd_plane *const pd_uv = &xd->plane[i];
+ const TX_SIZE uv_tx_size = get_uv_tx_size(xd->mi[0], pd_uv);
+ const BLOCK_SIZE unit_size = txsize_to_bsize[uv_tx_size];
+ const BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, pd_uv);
+ const int uv_bw = b_width_log2_lookup[uv_bsize];
+ const int uv_bh = b_height_log2_lookup[uv_bsize];
+ const int sf = (uv_bw - b_width_log2_lookup[unit_size]) +
+ (uv_bh - b_height_log2_lookup[unit_size]);
+ const uint32_t uv_dc_thr =
+ pd_uv->dequant[0] * pd_uv->dequant[0] >> (6 - sf);
+ const uint32_t uv_ac_thr =
+ pd_uv->dequant[1] * pd_uv->dequant[1] >> (6 - sf);
+ int j = i - 1;
+
+ vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, i);
+ flag_preduv_computed[i - 1] = 1;
+ var_uv[j] = cpi->fn_ptr[uv_bsize].vf(p_uv->src.buf, p_uv->src.stride,
+ pd_uv->dst.buf, pd_uv->dst.stride,
+ &sse_uv[j]);
+
+ if ((var_uv[j] < uv_ac_thr || var_uv[j] == 0) &&
+ (sse_uv[j] - var_uv[j] < uv_dc_thr || sse_uv[j] == var_uv[j]))
+ skip_uv[j] = 1;
+ else
+ break;
+ }
+
+ // If the transforms in all YUV planes are skippable, the mode search checks
+ // fewer inter modes and doesn't check intra modes.
+ if (skip_uv[0] & skip_uv[1]) {
+ *early_term = 1;
+ }
+ return;
+ }
+
+ if (!skip_dc) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize],
+ dc_quant >> (xd->bd - 5), &rate, &dist);
+#else
+ vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize],
+ dc_quant >> 3, &rate, &dist);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ }
+
+ if (!skip_dc) {
+ *out_rate_sum = rate >> 1;
+ *out_dist_sum = dist << 3;
+ } else {
+ *out_rate_sum = 0;
+ *out_dist_sum = (sse - var) << 4;
+ }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize],
+ ac_quant >> (xd->bd - 5), &rate, &dist);
+#else
+ vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize], ac_quant >> 3,
+ &rate, &dist);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ *out_rate_sum += rate;
+ *out_dist_sum += dist << 4;
+}
+
+static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x,
+ MACROBLOCKD *xd, int *out_rate_sum,
+ int64_t *out_dist_sum, unsigned int *var_y,
+ unsigned int *sse_y, int is_intra) {
+ // Note that our transform coefficients are 8 times those of an orthogonal
+ // transform, so the quantizer step is also 8 times larger. To get the
+ // effective quantizer we need to divide by 8 before calling the modeling
+ // function.
+ unsigned int sse;
+ int rate;
+ int64_t dist;
+ struct macroblock_plane *const p = &x->plane[0];
+ struct macroblockd_plane *const pd = &xd->plane[0];
+ const int64_t dc_thr = p->quant_thred[0] >> 6;
+ const int64_t ac_thr = p->quant_thred[1] >> 6;
+ const uint32_t dc_quant = pd->dequant[0];
+ const uint32_t ac_quant = pd->dequant[1];
+ unsigned int var = cpi->fn_ptr[bsize].vf(p->src.buf, p->src.stride,
+ pd->dst.buf, pd->dst.stride, &sse);
+ int skip_dc = 0;
+
+ *var_y = var;
+ *sse_y = sse;
+
+ xd->mi[0]->tx_size = calculate_tx_size(cpi, bsize, xd, var, sse, ac_thr,
+ x->source_variance, is_intra);
+
+ // Evaluate if the partition block is a skippable block in Y plane.
+ {
+ const BLOCK_SIZE unit_size = txsize_to_bsize[xd->mi[0]->tx_size];
+ const unsigned int num_blk_log2 =
+ (b_width_log2_lookup[bsize] - b_width_log2_lookup[unit_size]) +
+ (b_height_log2_lookup[bsize] - b_height_log2_lookup[unit_size]);
+ const unsigned int sse_tx = sse >> num_blk_log2;
+ const unsigned int var_tx = var >> num_blk_log2;
+
+ x->skip_txfm[0] = SKIP_TXFM_NONE;
+ // Check if all ac coefficients can be quantized to zero.
+ if (var_tx < ac_thr || var == 0) {
+ x->skip_txfm[0] = SKIP_TXFM_AC_ONLY;
+ // Check if dc coefficient can be quantized to zero.
+ if (sse_tx - var_tx < dc_thr || sse == var)
+ x->skip_txfm[0] = SKIP_TXFM_AC_DC;
+ } else {
+ if (sse_tx - var_tx < dc_thr || sse == var) skip_dc = 1;
+ }
+ }
+
+ if (x->skip_txfm[0] == SKIP_TXFM_AC_DC) {
+ *out_rate_sum = 0;
+ *out_dist_sum = sse << 4;
+ return;
+ }
+
+ if (!skip_dc) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize],
+ dc_quant >> (xd->bd - 5), &rate, &dist);
+#else
+ vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize],
+ dc_quant >> 3, &rate, &dist);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ }
+
+ if (!skip_dc) {
+ *out_rate_sum = rate >> 1;
+ *out_dist_sum = dist << 3;
+ } else {
+ *out_rate_sum = 0;
+ *out_dist_sum = (sse - var) << 4;
+ }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize],
+ ac_quant >> (xd->bd - 5), &rate, &dist);
+#else
+ vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize], ac_quant >> 3,
+ &rate, &dist);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ *out_rate_sum += rate;
+ *out_dist_sum += dist << 4;
+}
+
+static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc,
+ int *skippable, int64_t *sse, BLOCK_SIZE bsize,
+ TX_SIZE tx_size, int rd_computed, int is_intra) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ const struct macroblockd_plane *pd = &xd->plane[0];
+ struct macroblock_plane *const p = &x->plane[0];
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
+ const int step = 1 << (tx_size << 1);
+ const int block_step = (1 << tx_size);
+ int block = 0, r, c;
+ const int max_blocks_wide =
+ num_4x4_w + (xd->mb_to_right_edge >= 0 ? 0 : xd->mb_to_right_edge >> 5);
+ const int max_blocks_high =
+ num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 0 : xd->mb_to_bottom_edge >> 5);
+ int eob_cost = 0;
+ const int bw = 4 * num_4x4_w;
+ const int bh = 4 * num_4x4_h;
+
+ if (cpi->sf.use_simple_block_yrd && cpi->common.frame_type != KEY_FRAME &&
+ (bsize < BLOCK_32X32 ||
+ (cpi->use_svc &&
+ (bsize < BLOCK_32X32 || cpi->svc.temporal_layer_id > 0)))) {
+ unsigned int var_y, sse_y;
+ (void)tx_size;
+ if (!rd_computed)
+ model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc->rate, &this_rdc->dist,
+ &var_y, &sse_y, is_intra);
+ *sse = INT_MAX;
+ *skippable = 0;
+ return;
+ }
+
+ (void)cpi;
+
+ // The max tx_size passed in is TX_16X16.
+ assert(tx_size != TX_32X32);
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ vpx_highbd_subtract_block(bh, bw, p->src_diff, bw, p->src.buf,
+ p->src.stride, pd->dst.buf, pd->dst.stride,
+ x->e_mbd.bd);
+ } else {
+ vpx_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
+ pd->dst.buf, pd->dst.stride);
+ }
+#else
+ vpx_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
+ pd->dst.buf, pd->dst.stride);
+#endif
+ *skippable = 1;
+ // Keep track of the row and column of the blocks we use so that we know
+ // if we are in the unrestricted motion border.
+ for (r = 0; r < max_blocks_high; r += block_step) {
+ for (c = 0; c < num_4x4_w; c += block_step) {
+ if (c < max_blocks_wide) {
+ const ScanOrder *const scan_order = &vp9_default_scan_orders[tx_size];
+ tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+ tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ uint16_t *const eob = &p->eobs[block];
+ const int diff_stride = bw;
+ const int16_t *src_diff;
+ src_diff = &p->src_diff[(r * diff_stride + c) << 2];
+
+ // The skip-block condition should be handled before this is called.
+ assert(!x->skip_block);
+
+ switch (tx_size) {
+ case TX_16X16:
+ vpx_hadamard_16x16(src_diff, diff_stride, coeff);
+ vp9_quantize_fp(coeff, 256, p->round_fp, p->quant_fp, qcoeff,
+ dqcoeff, pd->dequant, eob, scan_order->scan,
+ scan_order->iscan);
+ break;
+ case TX_8X8:
+ vpx_hadamard_8x8(src_diff, diff_stride, coeff);
+ vp9_quantize_fp(coeff, 64, p->round_fp, p->quant_fp, qcoeff,
+ dqcoeff, pd->dequant, eob, scan_order->scan,
+ scan_order->iscan);
+ break;
+ default:
+ assert(tx_size == TX_4X4);
+ x->fwd_txfm4x4(src_diff, coeff, diff_stride);
+ vp9_quantize_fp(coeff, 16, p->round_fp, p->quant_fp, qcoeff,
+ dqcoeff, pd->dequant, eob, scan_order->scan,
+ scan_order->iscan);
+ break;
+ }
+ *skippable &= (*eob == 0);
+ eob_cost += 1;
+ }
+ block += step;
+ }
+ }
+
+ this_rdc->rate = 0;
+ if (*sse < INT64_MAX) {
+ *sse = (*sse << 6) >> 2;
+ if (*skippable) {
+ this_rdc->dist = *sse;
+ return;
+ }
+ }
+
+ block = 0;
+ this_rdc->dist = 0;
+ for (r = 0; r < max_blocks_high; r += block_step) {
+ for (c = 0; c < num_4x4_w; c += block_step) {
+ if (c < max_blocks_wide) {
+ tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+ tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ uint16_t *const eob = &p->eobs[block];
+
+ if (*eob == 1)
+ this_rdc->rate += (int)abs(qcoeff[0]);
+ else if (*eob > 1)
+ this_rdc->rate += vpx_satd(qcoeff, step << 4);
+
+ this_rdc->dist += vp9_block_error_fp(coeff, dqcoeff, step << 4) >> 2;
+ }
+ block += step;
+ }
+ }
+
+ // If skippable is set, rate gets clobbered later.
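+ // The accumulated sum of absolute quantized coefficients acts as a rate
+ // proxy; the shifts below scale it into the probability-cost domain, and the
+ // eob count adds a small per-coded-block overhead.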
+ this_rdc->rate <<= (2 + VP9_PROB_COST_SHIFT);
+ this_rdc->rate += (eob_cost << VP9_PROB_COST_SHIFT);
+}
+
+static void model_rd_for_sb_uv(VP9_COMP *cpi, BLOCK_SIZE plane_bsize,
+ MACROBLOCK *x, MACROBLOCKD *xd,
+ RD_COST *this_rdc, unsigned int *var_y,
+ unsigned int *sse_y, int start_plane,
+ int stop_plane) {
+ // Note that our transform coefficients are 8 times those of an orthogonal
+ // transform, so the quantizer step is also 8 times larger. To get the
+ // effective quantizer we need to divide by 8 before calling the modeling
+ // function.
+ unsigned int sse;
+ int rate;
+ int64_t dist;
+ int i;
+#if CONFIG_VP9_HIGHBITDEPTH
+ uint64_t tot_var = *var_y;
+ uint64_t tot_sse = *sse_y;
+#else
+ uint32_t tot_var = *var_y;
+ uint32_t tot_sse = *sse_y;
+#endif
+
+ this_rdc->rate = 0;
+ this_rdc->dist = 0;
+
+ for (i = start_plane; i <= stop_plane; ++i) {
+ struct macroblock_plane *const p = &x->plane[i];
+ struct macroblockd_plane *const pd = &xd->plane[i];
+ const uint32_t dc_quant = pd->dequant[0];
+ const uint32_t ac_quant = pd->dequant[1];
+ const BLOCK_SIZE bs = plane_bsize;
+ unsigned int var;
+ if (!x->color_sensitivity[i - 1]) continue;
+
+ var = cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf,
+ pd->dst.stride, &sse);
+ assert(sse >= var);
+ tot_var += var;
+ tot_sse += sse;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bs],
+ dc_quant >> (xd->bd - 5), &rate, &dist);
+#else
+ vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bs],
+ dc_quant >> 3, &rate, &dist);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ this_rdc->rate += rate >> 1;
+ this_rdc->dist += dist << 3;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bs],
+ ac_quant >> (xd->bd - 5), &rate, &dist);
+#else
+ vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bs], ac_quant >> 3,
+ &rate, &dist);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ this_rdc->rate += rate;
+ this_rdc->dist += dist << 4;
+ }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ *var_y = tot_var > UINT32_MAX ? UINT32_MAX : (uint32_t)tot_var;
+ *sse_y = tot_sse > UINT32_MAX ? UINT32_MAX : (uint32_t)tot_sse;
+#else
+ *var_y = tot_var;
+ *sse_y = tot_sse;
+#endif
+}
+
+static int get_pred_buffer(PRED_BUFFER *p, int len) {
+ int i;
+
+ for (i = 0; i < len; i++) {
+ if (!p[i].in_use) {
+ p[i].in_use = 1;
+ return i;
+ }
+ }
+ return -1;
+}
+
+static void free_pred_buffer(PRED_BUFFER *p) {
+ if (p != NULL) p->in_use = 0;
+}
+
+static void encode_breakout_test(
+ VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col,
+ MV_REFERENCE_FRAME ref_frame, PREDICTION_MODE this_mode, unsigned int var_y,
+ unsigned int sse_y, struct buf_2d yv12_mb[][MAX_MB_PLANE], int *rate,
+ int64_t *dist, int *flag_preduv_computed) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ MODE_INFO *const mi = xd->mi[0];
+ const BLOCK_SIZE uv_size = get_plane_block_size(bsize, &xd->plane[1]);
+ unsigned int var = var_y, sse = sse_y;
+ // Skipping threshold for ac.
+ unsigned int thresh_ac;
+ // Skipping threshold for dc.
+ unsigned int thresh_dc;
+ int motion_low = 1;
+
+ if (cpi->use_svc && ref_frame == GOLDEN_FRAME) return;
+ if (mi->mv[0].as_mv.row > 64 || mi->mv[0].as_mv.row < -64 ||
+ mi->mv[0].as_mv.col > 64 || mi->mv[0].as_mv.col < -64)
+ motion_low = 0;
+ if (x->encode_breakout > 0 && motion_low == 1) {
+ // Set a maximum for the threshold to avoid a big PSNR loss in the low bit
+ // rate case. Use an extremely low threshold for static frames to limit
+ // skipping.
+ const unsigned int max_thresh = 36000;
+ // The encode_breakout input
+ const unsigned int min_thresh =
+ VPXMIN(((unsigned int)x->encode_breakout << 4), max_thresh);
+#if CONFIG_VP9_HIGHBITDEPTH
+ const int shift = (xd->bd << 1) - 16;
+#endif
+
+ // Calculate threshold according to dequant value.
+ thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) >> 3;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) && shift > 0) {
+ thresh_ac = ROUND_POWER_OF_TWO(thresh_ac, shift);
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ thresh_ac = clamp(thresh_ac, min_thresh, max_thresh);
+
+ // Adjust ac threshold according to partition size.
+ thresh_ac >>=
+ 8 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
+
+ thresh_dc = (xd->plane[0].dequant[0] * xd->plane[0].dequant[0] >> 6);
+#if CONFIG_VP9_HIGHBITDEPTH
+ if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) && shift > 0) {
+ thresh_dc = ROUND_POWER_OF_TWO(thresh_dc, shift);
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ } else {
+ thresh_ac = 0;
+ thresh_dc = 0;
+ }
+
+ // Y skipping condition checking for ac and dc.
+ if (var <= thresh_ac && (sse - var) <= thresh_dc) {
+ unsigned int sse_u, sse_v;
+ unsigned int var_u, var_v;
+ unsigned int thresh_ac_uv = thresh_ac;
+ unsigned int thresh_dc_uv = thresh_dc;
+ if (x->sb_is_skin) {
+ thresh_ac_uv = 0;
+ thresh_dc_uv = 0;
+ }
+
+ if (!flag_preduv_computed[0] || !flag_preduv_computed[1]) {
+ xd->plane[1].pre[0] = yv12_mb[ref_frame][1];
+ xd->plane[2].pre[0] = yv12_mb[ref_frame][2];
+ vp9_build_inter_predictors_sbuv(xd, mi_row, mi_col, bsize);
+ }
+
+ var_u = cpi->fn_ptr[uv_size].vf(x->plane[1].src.buf, x->plane[1].src.stride,
+ xd->plane[1].dst.buf,
+ xd->plane[1].dst.stride, &sse_u);
+
+ // U skipping condition checking
+ if (((var_u << 2) <= thresh_ac_uv) && (sse_u - var_u <= thresh_dc_uv)) {
+ var_v = cpi->fn_ptr[uv_size].vf(
+ x->plane[2].src.buf, x->plane[2].src.stride, xd->plane[2].dst.buf,
+ xd->plane[2].dst.stride, &sse_v);
+
+ // V skipping condition checking
+ if (((var_v << 2) <= thresh_ac_uv) && (sse_v - var_v <= thresh_dc_uv)) {
+ x->skip = 1;
+
+ // The cost of skip bit needs to be added.
+ *rate = cpi->inter_mode_cost[x->mbmi_ext->mode_context[ref_frame]]
+ [INTER_OFFSET(this_mode)];
+
+ // More on this part of rate
+ // rate += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
+
+ // Scaling factor for SSE from spatial domain to frequency
+ // domain is 16. Adjust distortion accordingly.
+ // TODO(yunqingwang): In this function, only y-plane dist is
+ // calculated.
+ *dist = (sse << 4); // + ((sse_u + sse_v) << 4);
+
+ // *disable_skip = 1;
+ }
+ }
+ }
+}
+
+struct estimate_block_intra_args {
+ VP9_COMP *cpi;
+ MACROBLOCK *x;
+ PREDICTION_MODE mode;
+ int skippable;
+ RD_COST *rdc;
+};
+
+static void estimate_block_intra(int plane, int block, int row, int col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ void *arg) {
+ struct estimate_block_intra_args *const args = arg;
+ VP9_COMP *const cpi = args->cpi;
+ MACROBLOCK *const x = args->x;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblock_plane *const p = &x->plane[plane];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE bsize_tx = txsize_to_bsize[tx_size];
+ uint8_t *const src_buf_base = p->src.buf;
+ uint8_t *const dst_buf_base = pd->dst.buf;
+ const int src_stride = p->src.stride;
+ const int dst_stride = pd->dst.stride;
+ RD_COST this_rdc;
+
+ (void)block;
+
+ p->src.buf = &src_buf_base[4 * (row * (int64_t)src_stride + col)];
+ pd->dst.buf = &dst_buf_base[4 * (row * (int64_t)dst_stride + col)];
+ // Use source buffer as an approximation for the fully reconstructed buffer.
+ vp9_predict_intra_block(xd, b_width_log2_lookup[plane_bsize], tx_size,
+ args->mode, x->skip_encode ? p->src.buf : pd->dst.buf,
+ x->skip_encode ? src_stride : dst_stride, pd->dst.buf,
+ dst_stride, col, row, plane);
+
+ if (plane == 0) {
+ int64_t this_sse = INT64_MAX;
+ block_yrd(cpi, x, &this_rdc, &args->skippable, &this_sse, bsize_tx,
+ VPXMIN(tx_size, TX_16X16), 0, 1);
+ } else {
+ unsigned int var = 0;
+ unsigned int sse = 0;
+ model_rd_for_sb_uv(cpi, bsize_tx, x, xd, &this_rdc, &var, &sse, plane,
+ plane);
+ }
+
+ p->src.buf = src_buf_base;
+ pd->dst.buf = dst_buf_base;
+ args->rdc->rate += this_rdc.rate;
+ args->rdc->dist += this_rdc.dist;
+}
+
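+// Per-(reference frame, mode) index into the rd-threshold tables: row 0 holds
+// the intra modes and rows 1-3 hold the LAST / GOLDEN / ALTREF inter modes.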
+static const THR_MODES mode_idx[MAX_REF_FRAMES][4] = {
+ { THR_DC, THR_V_PRED, THR_H_PRED, THR_TM },
+ { THR_NEARESTMV, THR_NEARMV, THR_ZEROMV, THR_NEWMV },
+ { THR_NEARESTG, THR_NEARG, THR_ZEROG, THR_NEWG },
+ { THR_NEARESTA, THR_NEARA, THR_ZEROA, THR_NEWA },
+};
+
+static const PREDICTION_MODE intra_mode_list[] = { DC_PRED, V_PRED, H_PRED,
+ TM_PRED };
+
+static int mode_offset(const PREDICTION_MODE mode) {
+ if (mode >= NEARESTMV) {
+ return INTER_OFFSET(mode);
+ } else {
+ switch (mode) {
+ case DC_PRED: return 0;
+ case V_PRED: return 1;
+ case H_PRED: return 2;
+ case TM_PRED: return 3;
+ default: return -1;
+ }
+ }
+}
+
+static INLINE int rd_less_than_thresh_row_mt(int64_t best_rd, int thresh,
+ const int *const thresh_fact) {
+ int is_rd_less_than_thresh;
+ is_rd_less_than_thresh =
+ best_rd < ((int64_t)thresh * (*thresh_fact) >> 5) || thresh == INT_MAX;
+ return is_rd_less_than_thresh;
+}
+
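+// The two helpers below adapt the per-mode rd thresholds: the factor shrinks
+// when the mode was picked as best and otherwise grows toward a cap, so modes
+// that rarely win are pruned earlier in subsequent blocks.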
+static INLINE void update_thresh_freq_fact_row_mt(
+ VP9_COMP *cpi, TileDataEnc *tile_data, unsigned int source_variance,
+ int thresh_freq_fact_idx, MV_REFERENCE_FRAME ref_frame,
+ THR_MODES best_mode_idx, PREDICTION_MODE mode) {
+ THR_MODES thr_mode_idx = mode_idx[ref_frame][mode_offset(mode)];
+ int freq_fact_idx = thresh_freq_fact_idx + thr_mode_idx;
+ int *freq_fact = &tile_data->row_base_thresh_freq_fact[freq_fact_idx];
+ if (thr_mode_idx == best_mode_idx)
+ *freq_fact -= (*freq_fact >> 4);
+ else if (cpi->sf.limit_newmv_early_exit && mode == NEWMV &&
+ ref_frame == LAST_FRAME && source_variance < 5) {
+ *freq_fact = VPXMIN(*freq_fact + RD_THRESH_INC, 32);
+ } else {
+ *freq_fact = VPXMIN(*freq_fact + RD_THRESH_INC,
+ cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT);
+ }
+}
+
+static INLINE void update_thresh_freq_fact(
+ VP9_COMP *cpi, TileDataEnc *tile_data, unsigned int source_variance,
+ BLOCK_SIZE bsize, MV_REFERENCE_FRAME ref_frame, THR_MODES best_mode_idx,
+ PREDICTION_MODE mode) {
+ THR_MODES thr_mode_idx = mode_idx[ref_frame][mode_offset(mode)];
+ int *freq_fact = &tile_data->thresh_freq_fact[bsize][thr_mode_idx];
+ if (thr_mode_idx == best_mode_idx)
+ *freq_fact -= (*freq_fact >> 4);
+ else if (cpi->sf.limit_newmv_early_exit && mode == NEWMV &&
+ ref_frame == LAST_FRAME && source_variance < 5) {
+ *freq_fact = VPXMIN(*freq_fact + RD_THRESH_INC, 32);
+ } else {
+ *freq_fact = VPXMIN(*freq_fact + RD_THRESH_INC,
+ cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT);
+ }
+}
+
+void vp9_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *rd_cost,
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MODE_INFO *const mi = xd->mi[0];
+ RD_COST this_rdc, best_rdc;
+ PREDICTION_MODE this_mode;
+ struct estimate_block_intra_args args = { cpi, x, DC_PRED, 1, 0 };
+ const TX_SIZE intra_tx_size =
+ VPXMIN(max_txsize_lookup[bsize],
+ tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
+ MODE_INFO *const mic = xd->mi[0];
+ int *bmode_costs;
+ const MODE_INFO *above_mi = xd->above_mi;
+ const MODE_INFO *left_mi = xd->left_mi;
+ const PREDICTION_MODE A = vp9_above_block_mode(mic, above_mi, 0);
+ const PREDICTION_MODE L = vp9_left_block_mode(mic, left_mi, 0);
+ bmode_costs = cpi->y_mode_costs[A][L];
+
+ (void)ctx;
+ vp9_rd_cost_reset(&best_rdc);
+ vp9_rd_cost_reset(&this_rdc);
+
+ mi->ref_frame[0] = INTRA_FRAME;
+ // Initialize interp_filter here so we do not have to check for inter block
+ // modes in get_pred_context_switchable_interp()
+ mi->interp_filter = SWITCHABLE_FILTERS;
+
+ mi->mv[0].as_int = INVALID_MV;
+ mi->uv_mode = DC_PRED;
+ memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
+
+ // Change the limit of this loop to add other intra prediction
+ // mode tests.
+ for (this_mode = DC_PRED; this_mode <= H_PRED; ++this_mode) {
+ this_rdc.dist = this_rdc.rate = 0;
+ args.mode = this_mode;
+ args.skippable = 1;
+ args.rdc = &this_rdc;
+ mi->tx_size = intra_tx_size;
+ vp9_foreach_transformed_block_in_plane(xd, bsize, 0, estimate_block_intra,
+ &args);
+ if (args.skippable) {
+ x->skip_txfm[0] = SKIP_TXFM_AC_DC;
+ this_rdc.rate = vp9_cost_bit(vp9_get_skip_prob(&cpi->common, xd), 1);
+ } else {
+ x->skip_txfm[0] = SKIP_TXFM_NONE;
+ this_rdc.rate += vp9_cost_bit(vp9_get_skip_prob(&cpi->common, xd), 0);
+ }
+ this_rdc.rate += bmode_costs[this_mode];
+ this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist);
+
+ if (this_rdc.rdcost < best_rdc.rdcost) {
+ best_rdc = this_rdc;
+ mi->mode = this_mode;
+ }
+ }
+
+ *rd_cost = best_rdc;
+}
+
+static void init_ref_frame_cost(VP9_COMMON *const cm, MACROBLOCKD *const xd,
+ int ref_frame_cost[MAX_REF_FRAMES]) {
+ vpx_prob intra_inter_p = vp9_get_intra_inter_prob(cm, xd);
+ vpx_prob ref_single_p1 = vp9_get_pred_prob_single_ref_p1(cm, xd);
+ vpx_prob ref_single_p2 = vp9_get_pred_prob_single_ref_p2(cm, xd);
+
+ ref_frame_cost[INTRA_FRAME] = vp9_cost_bit(intra_inter_p, 0);
+ ref_frame_cost[LAST_FRAME] = ref_frame_cost[GOLDEN_FRAME] =
+ ref_frame_cost[ALTREF_FRAME] = vp9_cost_bit(intra_inter_p, 1);
+
+ ref_frame_cost[LAST_FRAME] += vp9_cost_bit(ref_single_p1, 0);
+ ref_frame_cost[GOLDEN_FRAME] += vp9_cost_bit(ref_single_p1, 1);
+ ref_frame_cost[ALTREF_FRAME] += vp9_cost_bit(ref_single_p1, 1);
+ ref_frame_cost[GOLDEN_FRAME] += vp9_cost_bit(ref_single_p2, 0);
+ ref_frame_cost[ALTREF_FRAME] += vp9_cost_bit(ref_single_p2, 1);
+}
+
+typedef struct {
+ MV_REFERENCE_FRAME ref_frame;
+ PREDICTION_MODE pred_mode;
+} REF_MODE;
+
+#define RT_INTER_MODES 12
+static const REF_MODE ref_mode_set[RT_INTER_MODES] = {
+ { LAST_FRAME, ZEROMV }, { LAST_FRAME, NEARESTMV },
+ { GOLDEN_FRAME, ZEROMV }, { LAST_FRAME, NEARMV },
+ { LAST_FRAME, NEWMV }, { GOLDEN_FRAME, NEARESTMV },
+ { GOLDEN_FRAME, NEARMV }, { GOLDEN_FRAME, NEWMV },
+ { ALTREF_FRAME, ZEROMV }, { ALTREF_FRAME, NEARESTMV },
+ { ALTREF_FRAME, NEARMV }, { ALTREF_FRAME, NEWMV }
+};
+
+#define RT_INTER_MODES_SVC 8
+static const REF_MODE ref_mode_set_svc[RT_INTER_MODES_SVC] = {
+ { LAST_FRAME, ZEROMV }, { LAST_FRAME, NEARESTMV },
+ { LAST_FRAME, NEARMV }, { GOLDEN_FRAME, ZEROMV },
+ { GOLDEN_FRAME, NEARESTMV }, { GOLDEN_FRAME, NEARMV },
+ { LAST_FRAME, NEWMV }, { GOLDEN_FRAME, NEWMV }
+};
+
+static INLINE void find_predictors(
+ VP9_COMP *cpi, MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame,
+ int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
+ int const_motion[MAX_REF_FRAMES], int *ref_frame_skip_mask,
+ TileDataEnc *tile_data, int mi_row, int mi_col,
+ struct buf_2d yv12_mb[4][MAX_MB_PLANE], BLOCK_SIZE bsize,
+ int force_skip_low_temp_var, int comp_pred_allowed) {
+ VP9_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
+ TileInfo *const tile_info = &tile_data->tile_info;
+ // TODO(jingning) placeholder for inter-frame non-RD mode decision.
+ x->pred_mv_sad[ref_frame] = INT_MAX;
+ frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
+ frame_mv[ZEROMV][ref_frame].as_int = 0;
+ // This needs various further optimizations; to be continued.
+ if ((cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) && (yv12 != NULL)) {
+ int_mv *const candidates = x->mbmi_ext->ref_mvs[ref_frame];
+ const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf;
+ vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf);
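+ // With previous-frame MVs available, or when compound prediction is allowed,
+ // do the full MV reference scan; otherwise use the cheaper RT variant, which
+ // can also flag the reference as having constant motion.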
+ if (cm->use_prev_frame_mvs || comp_pred_allowed) {
+ vp9_find_mv_refs(cm, xd, xd->mi[0], ref_frame, candidates, mi_row, mi_col,
+ x->mbmi_ext->mode_context);
+ } else {
+ const_motion[ref_frame] =
+ mv_refs_rt(cpi, cm, x, xd, tile_info, xd->mi[0], ref_frame,
+ candidates, &frame_mv[NEWMV][ref_frame], mi_row, mi_col,
+ (int)(cpi->svc.use_base_mv && cpi->svc.spatial_layer_id));
+ }
+ vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv, candidates,
+ &frame_mv[NEARESTMV][ref_frame],
+ &frame_mv[NEARMV][ref_frame]);
+ // Early exit for golden frame if force_skip_low_temp_var is set.
+ if (!vp9_is_scaled(sf) && bsize >= BLOCK_8X8 &&
+ !(force_skip_low_temp_var && ref_frame == GOLDEN_FRAME)) {
+ vp9_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride, ref_frame,
+ bsize);
+ }
+ } else {
+ *ref_frame_skip_mask |= (1 << ref_frame);
+ }
+}
+
+static void vp9_NEWMV_diff_bias(const NOISE_ESTIMATE *ne, MACROBLOCKD *xd,
+ PREDICTION_MODE this_mode, RD_COST *this_rdc,
+ BLOCK_SIZE bsize, int mv_row, int mv_col,
+ int is_last_frame, int lowvar_highsumdiff,
+ int is_skin) {
+ // Bias against MVs associated with NEWMV mode that are very different from
+ // top/left neighbors.
+ if (this_mode == NEWMV) {
+ int al_mv_average_row;
+ int al_mv_average_col;
+ int left_row, left_col;
+ int row_diff, col_diff;
+ int above_mv_valid = 0;
+ int left_mv_valid = 0;
+ int above_row = 0;
+ int above_col = 0;
+
+ if (xd->above_mi) {
+ above_mv_valid = xd->above_mi->mv[0].as_int != INVALID_MV;
+ above_row = xd->above_mi->mv[0].as_mv.row;
+ above_col = xd->above_mi->mv[0].as_mv.col;
+ }
+ if (xd->left_mi) {
+ left_mv_valid = xd->left_mi->mv[0].as_int != INVALID_MV;
+ left_row = xd->left_mi->mv[0].as_mv.row;
+ left_col = xd->left_mi->mv[0].as_mv.col;
+ }
+ if (above_mv_valid && left_mv_valid) {
+ al_mv_average_row = (above_row + left_row + 1) >> 1;
+ al_mv_average_col = (above_col + left_col + 1) >> 1;
+ } else if (above_mv_valid) {
+ al_mv_average_row = above_row;
+ al_mv_average_col = above_col;
+ } else if (left_mv_valid) {
+ al_mv_average_row = left_row;
+ al_mv_average_col = left_col;
+ } else {
+ al_mv_average_row = al_mv_average_col = 0;
+ }
+ row_diff = (al_mv_average_row - mv_row);
+ col_diff = (al_mv_average_col - mv_col);
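+ // An average-neighbor MV difference of more than 48 (in 1/8-pel units, i.e.
+ // 6 pixels) triggers the bias: double the rdcost for blocks larger than
+ // 32x32, otherwise scale it by 3/2.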
+ if (row_diff > 48 || row_diff < -48 || col_diff > 48 || col_diff < -48) {
+ if (bsize > BLOCK_32X32)
+ this_rdc->rdcost = this_rdc->rdcost << 1;
+ else
+ this_rdc->rdcost = 3 * this_rdc->rdcost >> 1;
+ }
+ }
+ // If noise estimation is enabled, and estimated level is above threshold,
+ // add a bias to LAST reference with small motion, for large blocks.
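+ // A similar bias is applied for low-variance, high-sum-diff (non-skin)
+ // blocks of 16x16 and above; both cases scale rdcost by 7/8.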
+ if (ne->enabled && ne->level >= kMedium && bsize >= BLOCK_32X32 &&
+ is_last_frame && mv_row < 8 && mv_row > -8 && mv_col < 8 && mv_col > -8)
+ this_rdc->rdcost = 7 * (this_rdc->rdcost >> 3);
+ else if (lowvar_highsumdiff && !is_skin && bsize >= BLOCK_16X16 &&
+ is_last_frame && mv_row < 16 && mv_row > -16 && mv_col < 16 &&
+ mv_col > -16)
+ this_rdc->rdcost = 7 * (this_rdc->rdcost >> 3);
+}
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+static void vp9_pickmode_ctx_den_update(
+ VP9_PICKMODE_CTX_DEN *ctx_den, int64_t zero_last_cost_orig,
+ int ref_frame_cost[MAX_REF_FRAMES],
+ int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES], int reuse_inter_pred,
+ BEST_PICKMODE *bp) {
+ ctx_den->zero_last_cost_orig = zero_last_cost_orig;
+ ctx_den->ref_frame_cost = ref_frame_cost;
+ ctx_den->frame_mv = frame_mv;
+ ctx_den->reuse_inter_pred = reuse_inter_pred;
+ ctx_den->best_tx_size = bp->best_tx_size;
+ ctx_den->best_mode = bp->best_mode;
+ ctx_den->best_ref_frame = bp->best_ref_frame;
+ ctx_den->best_pred_filter = bp->best_pred_filter;
+ ctx_den->best_mode_skip_txfm = bp->best_mode_skip_txfm;
+}
+
+static void recheck_zeromv_after_denoising(
+ VP9_COMP *cpi, MODE_INFO *const mi, MACROBLOCK *x, MACROBLOCKD *const xd,
+ VP9_DENOISER_DECISION decision, VP9_PICKMODE_CTX_DEN *ctx_den,
+ struct buf_2d yv12_mb[4][MAX_MB_PLANE], RD_COST *best_rdc, BLOCK_SIZE bsize,
+ int mi_row, int mi_col) {
+ // If INTRA or GOLDEN reference was selected, re-evaluate ZEROMV on
+ // denoised result. Only do this under noise conditions, and if rdcost of
+ // ZEROMV on original source is not significantly higher than rdcost of best
+ // mode.
+ if (cpi->noise_estimate.enabled && cpi->noise_estimate.level > kLow &&
+ ctx_den->zero_last_cost_orig < (best_rdc->rdcost << 3) &&
+ ((ctx_den->best_ref_frame == INTRA_FRAME && decision >= FILTER_BLOCK) ||
+ (ctx_den->best_ref_frame == GOLDEN_FRAME &&
+ cpi->svc.number_spatial_layers == 1 &&
+ decision == FILTER_ZEROMV_BLOCK))) {
+ // Check if we should pick ZEROMV on denoised signal.
+ VP9_COMMON *const cm = &cpi->common;
+ int rate = 0;
+ int64_t dist = 0;
+ uint32_t var_y = UINT_MAX;
+ uint32_t sse_y = UINT_MAX;
+ RD_COST this_rdc;
+ mi->mode = ZEROMV;
+ mi->ref_frame[0] = LAST_FRAME;
+ mi->ref_frame[1] = NONE;
+ set_ref_ptrs(cm, xd, mi->ref_frame[0], NONE);
+ mi->mv[0].as_int = 0;
+ mi->interp_filter = EIGHTTAP;
+ if (cpi->sf.default_interp_filter == BILINEAR) mi->interp_filter = BILINEAR;
+ xd->plane[0].pre[0] = yv12_mb[LAST_FRAME][0];
+ vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
+ model_rd_for_sb_y(cpi, bsize, x, xd, &rate, &dist, &var_y, &sse_y, 0);
+ this_rdc.rate = rate + ctx_den->ref_frame_cost[LAST_FRAME] +
+ cpi->inter_mode_cost[x->mbmi_ext->mode_context[LAST_FRAME]]
+ [INTER_OFFSET(ZEROMV)];
+ this_rdc.dist = dist;
+ this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, rate, dist);
+ // Don't switch to ZEROMV if the rdcost for ZEROMV on denoised source
+ // is higher than best_ref mode (on original source).
+ if (this_rdc.rdcost > best_rdc->rdcost) {
+ this_rdc = *best_rdc;
+ mi->mode = ctx_den->best_mode;
+ mi->ref_frame[0] = ctx_den->best_ref_frame;
+ set_ref_ptrs(cm, xd, mi->ref_frame[0], NONE);
+ mi->interp_filter = ctx_den->best_pred_filter;
+ if (ctx_den->best_ref_frame == INTRA_FRAME) {
+ mi->mv[0].as_int = INVALID_MV;
+ mi->interp_filter = SWITCHABLE_FILTERS;
+ } else if (ctx_den->best_ref_frame == GOLDEN_FRAME) {
+ mi->mv[0].as_int =
+ ctx_den->frame_mv[ctx_den->best_mode][ctx_den->best_ref_frame]
+ .as_int;
+ if (ctx_den->reuse_inter_pred) {
+ xd->plane[0].pre[0] = yv12_mb[GOLDEN_FRAME][0];
+ vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
+ }
+ }
+ mi->tx_size = ctx_den->best_tx_size;
+ x->skip_txfm[0] = ctx_den->best_mode_skip_txfm;
+ } else {
+ ctx_den->best_ref_frame = LAST_FRAME;
+ *best_rdc = this_rdc;
+ }
+ }
+}
+#endif // CONFIG_VP9_TEMPORAL_DENOISING
+
+static INLINE int get_force_skip_low_temp_var(uint8_t *variance_low, int mi_row,
+ int mi_col, BLOCK_SIZE bsize) {
+ const int i = (mi_row & 0x7) >> 1;
+ const int j = (mi_col & 0x7) >> 1;
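+ // i/j index the 16x16 sub-block position (in steps of 2 MI units) within
+ // the 64x64 superblock; variance_low[] holds per-size/per-position flags.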
+ int force_skip_low_temp_var = 0;
+ // Set force_skip_low_temp_var based on the block size and block offset.
+ if (bsize == BLOCK_64X64) {
+ force_skip_low_temp_var = variance_low[0];
+ } else if (bsize == BLOCK_64X32) {
+ if (!(mi_col & 0x7) && !(mi_row & 0x7)) {
+ force_skip_low_temp_var = variance_low[1];
+ } else if (!(mi_col & 0x7) && (mi_row & 0x7)) {
+ force_skip_low_temp_var = variance_low[2];
+ }
+ } else if (bsize == BLOCK_32X64) {
+ if (!(mi_col & 0x7) && !(mi_row & 0x7)) {
+ force_skip_low_temp_var = variance_low[3];
+ } else if ((mi_col & 0x7) && !(mi_row & 0x7)) {
+ force_skip_low_temp_var = variance_low[4];
+ }
+ } else if (bsize == BLOCK_32X32) {
+ if (!(mi_col & 0x7) && !(mi_row & 0x7)) {
+ force_skip_low_temp_var = variance_low[5];
+ } else if ((mi_col & 0x7) && !(mi_row & 0x7)) {
+ force_skip_low_temp_var = variance_low[6];
+ } else if (!(mi_col & 0x7) && (mi_row & 0x7)) {
+ force_skip_low_temp_var = variance_low[7];
+ } else if ((mi_col & 0x7) && (mi_row & 0x7)) {
+ force_skip_low_temp_var = variance_low[8];
+ }
+ } else if (bsize == BLOCK_16X16) {
+ force_skip_low_temp_var = variance_low[pos_shift_16x16[i][j]];
+ } else if (bsize == BLOCK_32X16) {
+ // The col shift index for the second 16x16 block.
+ const int j2 = ((mi_col + 2) & 0x7) >> 1;
+ // Only if each 16x16 block inside has low temporal variance.
+ force_skip_low_temp_var = variance_low[pos_shift_16x16[i][j]] &&
+ variance_low[pos_shift_16x16[i][j2]];
+ } else if (bsize == BLOCK_16X32) {
+ // The row shift index for the second 16x16 block.
+ const int i2 = ((mi_row + 2) & 0x7) >> 1;
+ force_skip_low_temp_var = variance_low[pos_shift_16x16[i][j]] &&
+ variance_low[pos_shift_16x16[i2][j]];
+ }
+ return force_skip_low_temp_var;
+}
+
+static void search_filter_ref(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc,
+ int mi_row, int mi_col, PRED_BUFFER *tmp,
+ BLOCK_SIZE bsize, int reuse_inter_pred,
+ PRED_BUFFER **this_mode_pred, unsigned int *var_y,
+ unsigned int *sse_y, int force_smooth_filter,
+ int *this_early_term, int *flag_preduv_computed,
+ int use_model_yrd_large) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MODE_INFO *const mi = xd->mi[0];
+ struct macroblockd_plane *const pd = &xd->plane[0];
+ const int bw = num_4x4_blocks_wide_lookup[bsize] << 2;
+
+ int pf_rate[3] = { 0 };
+ int64_t pf_dist[3] = { 0 };
+ int curr_rate[3] = { 0 };
+ unsigned int pf_var[3] = { 0 };
+ unsigned int pf_sse[3] = { 0 };
+ TX_SIZE pf_tx_size[3] = { 0 };
+ int64_t best_cost = INT64_MAX;
+ INTERP_FILTER best_filter = SWITCHABLE, filter;
+ PRED_BUFFER *current_pred = *this_mode_pred;
+ uint8_t skip_txfm = SKIP_TXFM_NONE;
+ int best_early_term = 0;
+ int best_flag_preduv_computed[2] = { 0 };
+ INTERP_FILTER filter_start = force_smooth_filter ? EIGHTTAP_SMOOTH : EIGHTTAP;
+ INTERP_FILTER filter_end = EIGHTTAP_SMOOTH;
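+ // Evaluate each candidate interpolation filter (EIGHTTAP and
+ // EIGHTTAP_SMOOTH, or only EIGHTTAP_SMOOTH when a smooth filter is forced)
+ // and keep the one with the lowest RD cost.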
+ for (filter = filter_start; filter <= filter_end; ++filter) {
+ int64_t cost;
+ mi->interp_filter = filter;
+ vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
+ // For large partition blocks, extra testing is done.
+ if (use_model_yrd_large)
+ model_rd_for_sb_y_large(cpi, bsize, x, xd, &pf_rate[filter],
+ &pf_dist[filter], &pf_var[filter],
+ &pf_sse[filter], mi_row, mi_col, this_early_term,
+ flag_preduv_computed);
+ else
+ model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rate[filter], &pf_dist[filter],
+ &pf_var[filter], &pf_sse[filter], 0);
+ curr_rate[filter] = pf_rate[filter];
+ pf_rate[filter] += vp9_get_switchable_rate(cpi, xd);
+ cost = RDCOST(x->rdmult, x->rddiv, pf_rate[filter], pf_dist[filter]);
+ pf_tx_size[filter] = mi->tx_size;
+ if (cost < best_cost) {
+ best_filter = filter;
+ best_cost = cost;
+ skip_txfm = x->skip_txfm[0];
+ best_early_term = *this_early_term;
+ best_flag_preduv_computed[0] = flag_preduv_computed[0];
+ best_flag_preduv_computed[1] = flag_preduv_computed[1];
+
+ if (reuse_inter_pred) {
+ if (*this_mode_pred != current_pred) {
+ free_pred_buffer(*this_mode_pred);
+ *this_mode_pred = current_pred;
+ }
+ if (filter != filter_end) {
+ current_pred = &tmp[get_pred_buffer(tmp, 3)];
+ pd->dst.buf = current_pred->data;
+ pd->dst.stride = bw;
+ }
+ }
+ }
+ }
+
+ if (reuse_inter_pred && *this_mode_pred != current_pred)
+ free_pred_buffer(current_pred);
+
+ mi->interp_filter = best_filter;
+ mi->tx_size = pf_tx_size[best_filter];
+ this_rdc->rate = curr_rate[best_filter];
+ this_rdc->dist = pf_dist[best_filter];
+ *var_y = pf_var[best_filter];
+ *sse_y = pf_sse[best_filter];
+ x->skip_txfm[0] = skip_txfm;
+ *this_early_term = best_early_term;
+ flag_preduv_computed[0] = best_flag_preduv_computed[0];
+ flag_preduv_computed[1] = best_flag_preduv_computed[1];
+ if (reuse_inter_pred) {
+ pd->dst.buf = (*this_mode_pred)->data;
+ pd->dst.stride = (*this_mode_pred)->stride;
+ } else if (best_filter < filter_end) {
+ mi->interp_filter = best_filter;
+ vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
+ }
+}
+
+static int search_new_mv(VP9_COMP *cpi, MACROBLOCK *x,
+ int_mv frame_mv[][MAX_REF_FRAMES],
+ MV_REFERENCE_FRAME ref_frame, int gf_temporal_ref,
+ BLOCK_SIZE bsize, int mi_row, int mi_col,
+ int best_pred_sad, int *rate_mv,
+ unsigned int best_sse_sofar, RD_COST *best_rdc) {
+ SVC *const svc = &cpi->svc;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MODE_INFO *const mi = xd->mi[0];
+ SPEED_FEATURES *const sf = &cpi->sf;
+
+ if (ref_frame > LAST_FRAME && gf_temporal_ref &&
+ cpi->oxcf.rc_mode == VPX_CBR) {
+ int tmp_sad;
+ uint32_t dis;
+ int cost_list[5] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX, INT_MAX };
+
+ if (bsize < BLOCK_16X16) return -1;
+
+ tmp_sad = vp9_int_pro_motion_estimation(
+ cpi, x, bsize, mi_row, mi_col,
+ &x->mbmi_ext->ref_mvs[ref_frame][0].as_mv);
+
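+ // Bail out of NEWMV for this reference if the integer-pel SAD is worse than
+ // LAST's predicted-MV SAD, or not clearly better than the best SAD so far.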
+ if (tmp_sad > x->pred_mv_sad[LAST_FRAME]) return -1;
+ if (tmp_sad + (num_pels_log2_lookup[bsize] << 4) > best_pred_sad) return -1;
+
+ frame_mv[NEWMV][ref_frame].as_int = mi->mv[0].as_int;
+ *rate_mv = vp9_mv_bit_cost(&frame_mv[NEWMV][ref_frame].as_mv,
+ &x->mbmi_ext->ref_mvs[ref_frame][0].as_mv,
+ x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+ frame_mv[NEWMV][ref_frame].as_mv.row >>= 3;
+ frame_mv[NEWMV][ref_frame].as_mv.col >>= 3;
+
+ cpi->find_fractional_mv_step(
+ x, &frame_mv[NEWMV][ref_frame].as_mv,
+ &x->mbmi_ext->ref_mvs[ref_frame][0].as_mv,
+ cpi->common.allow_high_precision_mv, x->errorperbit,
+ &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
+ cpi->sf.mv.subpel_search_level, cond_cost_list(cpi, cost_list),
+ x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref_frame], NULL, 0, 0,
+ cpi->sf.use_accurate_subpel_search);
+ } else if (svc->use_base_mv && svc->spatial_layer_id) {
+ if (frame_mv[NEWMV][ref_frame].as_int != INVALID_MV) {
+ const int pre_stride = xd->plane[0].pre[0].stride;
+ unsigned int base_mv_sse = UINT_MAX;
+ int scale = (cpi->rc.avg_frame_low_motion > 60) ? 2 : 4;
+ const uint8_t *const pre_buf =
+ xd->plane[0].pre[0].buf +
+ (frame_mv[NEWMV][ref_frame].as_mv.row >> 3) * pre_stride +
+ (frame_mv[NEWMV][ref_frame].as_mv.col >> 3);
+ cpi->fn_ptr[bsize].vf(x->plane[0].src.buf, x->plane[0].src.stride,
+ pre_buf, pre_stride, &base_mv_sse);
+
+ // Exit NEWMV search if base_mv is (0,0) && bsize < BLOCK_16X16,
+ // for SVC encoding.
+ if (cpi->use_svc && svc->use_base_mv && bsize < BLOCK_16X16 &&
+ frame_mv[NEWMV][ref_frame].as_mv.row == 0 &&
+ frame_mv[NEWMV][ref_frame].as_mv.col == 0)
+ return -1;
+
+ // Exit NEWMV search if base_mv_sse is large.
+ if (sf->base_mv_aggressive && (base_mv_sse >> scale) > best_sse_sofar)
+ return -1;
+ if ((base_mv_sse >> 1) < best_sse_sofar) {
+ // Base layer mv is good.
+ // Exit NEWMV search if the base_mv is (0, 0) and sse is low, since
+ // (0, 0) mode is already tested.
+ unsigned int base_mv_sse_normalized =
+ base_mv_sse >>
+ (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
+ if (sf->base_mv_aggressive && base_mv_sse <= best_sse_sofar &&
+ base_mv_sse_normalized < 400 &&
+ frame_mv[NEWMV][ref_frame].as_mv.row == 0 &&
+ frame_mv[NEWMV][ref_frame].as_mv.col == 0)
+ return -1;
+ if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col,
+ &frame_mv[NEWMV][ref_frame], rate_mv,
+ best_rdc->rdcost, 1)) {
+ return -1;
+ }
+ } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col,
+ &frame_mv[NEWMV][ref_frame], rate_mv,
+ best_rdc->rdcost, 0)) {
+ return -1;
+ }
+ } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col,
+ &frame_mv[NEWMV][ref_frame], rate_mv,
+ best_rdc->rdcost, 0)) {
+ return -1;
+ }
+ } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col,
+ &frame_mv[NEWMV][ref_frame], rate_mv,
+ best_rdc->rdcost, 0)) {
+ return -1;
+ }
+
+ return 0;
+}
+
+static INLINE void init_best_pickmode(BEST_PICKMODE *bp) {
+ bp->best_mode = ZEROMV;
+ bp->best_ref_frame = LAST_FRAME;
+ bp->best_tx_size = TX_SIZES;
+ bp->best_intra_tx_size = TX_SIZES;
+ bp->best_pred_filter = EIGHTTAP;
+ bp->best_mode_skip_txfm = SKIP_TXFM_NONE;
+ bp->best_second_ref_frame = NONE;
+ bp->best_pred = NULL;
+}
+
+void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
+ int mi_row, int mi_col, RD_COST *rd_cost,
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
+ VP9_COMMON *const cm = &cpi->common;
+ SPEED_FEATURES *const sf = &cpi->sf;
+ SVC *const svc = &cpi->svc;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MODE_INFO *const mi = xd->mi[0];
+ struct macroblockd_plane *const pd = &xd->plane[0];
+
+ BEST_PICKMODE best_pickmode;
+
+ MV_REFERENCE_FRAME ref_frame;
+ MV_REFERENCE_FRAME usable_ref_frame, second_ref_frame;
+ int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
+ uint8_t mode_checked[MB_MODE_COUNT][MAX_REF_FRAMES];
+ struct buf_2d yv12_mb[4][MAX_MB_PLANE];
+ RD_COST this_rdc, best_rdc;
+ // var_y and sse_y are saved for use in the skip check (encode breakout).
+ unsigned int var_y = UINT_MAX;
+ unsigned int sse_y = UINT_MAX;
+ const int intra_cost_penalty =
+ vp9_get_intra_cost_penalty(cpi, bsize, cm->base_qindex, cm->y_dc_delta_q);
+ int64_t inter_mode_thresh =
+ RDCOST(x->rdmult, x->rddiv, intra_cost_penalty, 0);
+ const int *const rd_threshes = cpi->rd.threshes[mi->segment_id][bsize];
+ const int sb_row = mi_row >> MI_BLOCK_SIZE_LOG2;
+ int thresh_freq_fact_idx = (sb_row * BLOCK_SIZES + bsize) * MAX_MODES;
+ const int *const rd_thresh_freq_fact =
+ (cpi->sf.adaptive_rd_thresh_row_mt)
+ ? &(tile_data->row_base_thresh_freq_fact[thresh_freq_fact_idx])
+ : tile_data->thresh_freq_fact[bsize];
+#if CONFIG_VP9_TEMPORAL_DENOISING
+ const int denoise_recheck_zeromv = 1;
+#endif
+ INTERP_FILTER filter_ref;
+ int pred_filter_search = cm->interp_filter == SWITCHABLE;
+ int const_motion[MAX_REF_FRAMES] = { 0 };
+ const int bh = num_4x4_blocks_high_lookup[bsize] << 2;
+ const int bw = num_4x4_blocks_wide_lookup[bsize] << 2;
+ // For speed 6, the result of the interp filter search is reused later in the
+ // actual encoding process.
+ // tmp[3] points to dst buffer, and the other 3 point to allocated buffers.
+ PRED_BUFFER tmp[4];
+ DECLARE_ALIGNED(16, uint8_t, pred_buf[3 * 64 * 64] VPX_UNINITIALIZED);
+#if CONFIG_VP9_HIGHBITDEPTH
+ DECLARE_ALIGNED(16, uint16_t, pred_buf_16[3 * 64 * 64] VPX_UNINITIALIZED);
+#endif
+ struct buf_2d orig_dst = pd->dst;
+ PRED_BUFFER *this_mode_pred = NULL;
+ const int pixels_in_block = bh * bw;
+ int reuse_inter_pred = cpi->sf.reuse_inter_pred_sby && ctx->pred_pixel_ready;
+ int ref_frame_skip_mask = 0;
+ int idx;
+ int best_pred_sad = INT_MAX;
+ int best_early_term = 0;
+ int ref_frame_cost[MAX_REF_FRAMES];
+ int svc_force_zero_mode[3] = { 0 };
+ int perform_intra_pred = 1;
+ int use_golden_nonzeromv = 1;
+ int force_skip_low_temp_var = 0;
+ int skip_ref_find_pred[4] = { 0 };
+ unsigned int sse_zeromv_normalized = UINT_MAX;
+ unsigned int best_sse_sofar = UINT_MAX;
+ int gf_temporal_ref = 0;
+ int force_test_gf_zeromv = 0;
+#if CONFIG_VP9_TEMPORAL_DENOISING
+ VP9_PICKMODE_CTX_DEN ctx_den;
+ int64_t zero_last_cost_orig = INT64_MAX;
+ int denoise_svc_pickmode = 1;
+#endif
+ INTERP_FILTER filter_gf_svc = EIGHTTAP;
+ MV_REFERENCE_FRAME inter_layer_ref = GOLDEN_FRAME;
+ const struct segmentation *const seg = &cm->seg;
+ int comp_modes = 0;
+ int num_inter_modes = (cpi->use_svc) ? RT_INTER_MODES_SVC : RT_INTER_MODES;
+ int flag_svc_subpel = 0;
+ int svc_mv_col = 0;
+ int svc_mv_row = 0;
+ int no_scaling = 0;
+ int large_block = 0;
+ int use_model_yrd_large = 0;
+ unsigned int thresh_svc_skip_golden = 500;
+ unsigned int thresh_skip_golden = 500;
+ int force_smooth_filter = cpi->sf.force_smooth_interpol;
+ int scene_change_detected =
+ cpi->rc.high_source_sad ||
+ (cpi->use_svc && cpi->svc.high_source_sad_superframe);
+
+ init_best_pickmode(&best_pickmode);
+
+ x->encode_breakout = seg->enabled
+ ? cpi->segment_encode_breakout[mi->segment_id]
+ : cpi->encode_breakout;
+
+ x->source_variance = UINT_MAX;
+ if (cpi->sf.default_interp_filter == BILINEAR) {
+ best_pickmode.best_pred_filter = BILINEAR;
+ filter_gf_svc = BILINEAR;
+ }
+ if (cpi->use_svc && svc->spatial_layer_id > 0) {
+ int layer =
+ LAYER_IDS_TO_IDX(svc->spatial_layer_id - 1, svc->temporal_layer_id,
+ svc->number_temporal_layers);
+ LAYER_CONTEXT *const lc = &svc->layer_context[layer];
+ if (lc->scaling_factor_num == lc->scaling_factor_den) no_scaling = 1;
+ }
+ if (svc->spatial_layer_id > 0 &&
+ (svc->high_source_sad_superframe || no_scaling))
+ thresh_svc_skip_golden = 0;
+ // Lower the skip threshold if lower spatial layer is better quality relative
+ // to current layer.
+ else if (svc->spatial_layer_id > 0 && cm->base_qindex > 150 &&
+ cm->base_qindex > svc->lower_layer_qindex + 15)
+ thresh_svc_skip_golden = 100;
+ // Increase skip threshold if lower spatial layer is lower quality relative
+ // to current layer.
+ else if (svc->spatial_layer_id > 0 && cm->base_qindex < 140 &&
+ cm->base_qindex < svc->lower_layer_qindex - 20)
+ thresh_svc_skip_golden = 1000;
+
+ if (!cpi->use_svc ||
+ (svc->use_gf_temporal_ref_current_layer &&
+ !svc->layer_context[svc->temporal_layer_id].is_key_frame)) {
+ struct scale_factors *const sf_last = &cm->frame_refs[LAST_FRAME - 1].sf;
+ struct scale_factors *const sf_golden =
+ &cm->frame_refs[GOLDEN_FRAME - 1].sf;
+ gf_temporal_ref = 1;
+ // For temporal long term prediction, check that the golden reference
+ // is the same scale as the last reference; otherwise disable it.
+ if ((sf_last->x_scale_fp != sf_golden->x_scale_fp) ||
+ (sf_last->y_scale_fp != sf_golden->y_scale_fp)) {
+ gf_temporal_ref = 0;
+ } else {
+ if (cpi->rc.avg_frame_low_motion > 70)
+ thresh_svc_skip_golden = 500;
+ else
+ thresh_svc_skip_golden = 0;
+ }
+ }
+
+ init_ref_frame_cost(cm, xd, ref_frame_cost);
+ memset(&mode_checked[0][0], 0, MB_MODE_COUNT * MAX_REF_FRAMES);
+
+ if (reuse_inter_pred) {
+ int i;
+ for (i = 0; i < 3; i++) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cm->use_highbitdepth)
+ tmp[i].data = CONVERT_TO_BYTEPTR(&pred_buf_16[pixels_in_block * i]);
+ else
+ tmp[i].data = &pred_buf[pixels_in_block * i];
+#else
+ tmp[i].data = &pred_buf[pixels_in_block * i];
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ tmp[i].stride = bw;
+ tmp[i].in_use = 0;
+ }
+ tmp[3].data = pd->dst.buf;
+ tmp[3].stride = pd->dst.stride;
+ tmp[3].in_use = 0;
+ }
+
+ x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
+ x->skip = 0;
+
+ if (cpi->sf.cb_pred_filter_search) {
+ const int bsl = mi_width_log2_lookup[bsize];
+ pred_filter_search = cm->interp_filter == SWITCHABLE
+ ? (((mi_row + mi_col) >> bsl) +
+ get_chessboard_index(cm->current_video_frame)) &
+ 0x1
+ : 0;
+ }
+ // Instead of using vp9_get_pred_context_switchable_interp(xd) to assign
+ // filter_ref, we use a less strict condition on assigning filter_ref.
+ // This is to reduce the probability of entering the flow of not assigning
+ // filter_ref and then skipping the filter search.
+ filter_ref = cm->interp_filter;
+ if (cpi->sf.default_interp_filter != BILINEAR) {
+ if (xd->above_mi && is_inter_block(xd->above_mi))
+ filter_ref = xd->above_mi->interp_filter;
+ else if (xd->left_mi && is_inter_block(xd->left_mi))
+ filter_ref = xd->left_mi->interp_filter;
+ }
+
+ // initialize mode decisions
+ vp9_rd_cost_reset(&best_rdc);
+ vp9_rd_cost_reset(rd_cost);
+ mi->sb_type = bsize;
+ mi->ref_frame[0] = NONE;
+ mi->ref_frame[1] = NONE;
+
+ mi->tx_size =
+ VPXMIN(max_txsize_lookup[bsize], tx_mode_to_biggest_tx_size[cm->tx_mode]);
+
+ if (sf->short_circuit_flat_blocks || sf->limit_newmv_early_exit) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ x->source_variance = vp9_high_get_sby_perpixel_variance(
+ cpi, &x->plane[0].src, bsize, xd->bd);
+ else
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ x->source_variance =
+ vp9_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
+
+ if (cpi->oxcf.content == VP9E_CONTENT_SCREEN &&
+ cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && mi->segment_id > 0 &&
+ x->zero_temp_sad_source && x->source_variance == 0) {
+ mi->segment_id = 0;
+ vp9_init_plane_quantizers(cpi, x);
+ }
+ }
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0) {
+ if (cpi->use_svc) denoise_svc_pickmode = vp9_denoise_svc_non_key(cpi);
+ if (cpi->denoiser.denoising_level > kDenLowLow && denoise_svc_pickmode)
+ vp9_denoiser_reset_frame_stats(ctx);
+ }
+#endif
+
+ if (cpi->rc.frames_since_golden == 0 && gf_temporal_ref &&
+ !cpi->rc.alt_ref_gf_group && !cpi->rc.last_frame_is_src_altref) {
+ usable_ref_frame = LAST_FRAME;
+ } else {
+ usable_ref_frame = GOLDEN_FRAME;
+ }
+
+ if (cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR) {
+ if (cpi->rc.alt_ref_gf_group || cpi->rc.is_src_frame_alt_ref)
+ usable_ref_frame = ALTREF_FRAME;
+
+ if (cpi->rc.is_src_frame_alt_ref) {
+ skip_ref_find_pred[LAST_FRAME] = 1;
+ skip_ref_find_pred[GOLDEN_FRAME] = 1;
+ }
+ if (!cm->show_frame) {
+ if (cpi->rc.frames_since_key == 1) {
+ usable_ref_frame = LAST_FRAME;
+ skip_ref_find_pred[GOLDEN_FRAME] = 1;
+ skip_ref_find_pred[ALTREF_FRAME] = 1;
+ }
+ }
+ }
+
+ // For svc mode, on spatial_layer_id > 0: if the reference has a different
+ // scale, constrain the inter mode to only test zero motion.
+ if (cpi->use_svc && svc->force_zero_mode_spatial_ref &&
+ svc->spatial_layer_id > 0 && !gf_temporal_ref) {
+ if (cpi->ref_frame_flags & VP9_LAST_FLAG) {
+ struct scale_factors *const ref_sf = &cm->frame_refs[LAST_FRAME - 1].sf;
+ if (vp9_is_scaled(ref_sf)) {
+ svc_force_zero_mode[LAST_FRAME - 1] = 1;
+ inter_layer_ref = LAST_FRAME;
+ }
+ }
+ if (cpi->ref_frame_flags & VP9_GOLD_FLAG) {
+ struct scale_factors *const ref_sf = &cm->frame_refs[GOLDEN_FRAME - 1].sf;
+ if (vp9_is_scaled(ref_sf)) {
+ svc_force_zero_mode[GOLDEN_FRAME - 1] = 1;
+ inter_layer_ref = GOLDEN_FRAME;
+ }
+ }
+ }
+
+ if (cpi->sf.short_circuit_low_temp_var) {
+ force_skip_low_temp_var =
+ get_force_skip_low_temp_var(&x->variance_low[0], mi_row, mi_col, bsize);
+ // If force_skip_low_temp_var is set and short_circuit_low_temp_var is 1 or 3,
+ // skip the golden reference.
+ if ((cpi->sf.short_circuit_low_temp_var == 1 ||
+ cpi->sf.short_circuit_low_temp_var == 3) &&
+ force_skip_low_temp_var) {
+ usable_ref_frame = LAST_FRAME;
+ }
+ }
+
+ if (sf->disable_golden_ref && (x->content_state_sb != kVeryHighSad ||
+ cpi->rc.avg_frame_low_motion < 60))
+ usable_ref_frame = LAST_FRAME;
+
+ if (!((cpi->ref_frame_flags & VP9_GOLD_FLAG) &&
+ !svc_force_zero_mode[GOLDEN_FRAME - 1] && !force_skip_low_temp_var))
+ use_golden_nonzeromv = 0;
+
+ if (cpi->oxcf.speed >= 8 && !cpi->use_svc &&
+ ((cpi->rc.frames_since_golden + 1) < x->last_sb_high_content ||
+ x->last_sb_high_content > 40 || cpi->rc.frames_since_golden > 120))
+ usable_ref_frame = LAST_FRAME;
+
+ // Compound prediction modes: (0,0) on LAST/GOLDEN and ARF.
+ if (cm->reference_mode == REFERENCE_MODE_SELECT &&
+ cpi->sf.use_compound_nonrd_pickmode && usable_ref_frame == ALTREF_FRAME)
+ comp_modes = 2;
+
+ // If the segment reference frame feature is enabled and it's set to GOLDEN
+ // reference, then make sure we don't skip checking GOLDEN; this is to
+ // prevent the possibility of not picking any mode.
+ if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME) &&
+ get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) == GOLDEN_FRAME) {
+ usable_ref_frame = GOLDEN_FRAME;
+ skip_ref_find_pred[GOLDEN_FRAME] = 0;
+ thresh_svc_skip_golden = 0;
+ }
+
+ for (ref_frame = LAST_FRAME; ref_frame <= usable_ref_frame; ++ref_frame) {
+ // Skip find_predictors() if the reference frame is not in the
+ // ref_frame_flags (i.e., not used as a reference for this frame).
+ skip_ref_find_pred[ref_frame] =
+ !(cpi->ref_frame_flags & ref_frame_to_flag(ref_frame));
+ if (!skip_ref_find_pred[ref_frame]) {
+ find_predictors(cpi, x, ref_frame, frame_mv, const_motion,
+ &ref_frame_skip_mask, tile_data, mi_row, mi_col, yv12_mb,
+ bsize, force_skip_low_temp_var, comp_modes > 0);
+ }
+ }
+
+ if (cpi->use_svc || cpi->oxcf.speed <= 7 || bsize < BLOCK_32X32)
+ x->sb_use_mv_part = 0;
+
+ // Set the flag_svc_subpel to 1 for SVC if the lower spatial layer used
+ // an averaging filter for downsampling (phase = 8). If so, we will test
+ // a nonzero motion mode on the spatial reference.
+ // The nonzero motion is shifted half a pixel to the left and top (-4, -4).
+ if (cpi->use_svc && svc->spatial_layer_id > 0 &&
+ svc_force_zero_mode[inter_layer_ref - 1] &&
+ svc->downsample_filter_phase[svc->spatial_layer_id - 1] == 8 &&
+ !gf_temporal_ref) {
+ svc_mv_col = -4;
+ svc_mv_row = -4;
+ flag_svc_subpel = 1;
+ }
+
+ // For SVC with quality layers, when QP of lower layer is lower
+ // than current layer: force check of GF-ZEROMV before early exit
+ // due to skip flag.
+ if (svc->spatial_layer_id > 0 && no_scaling &&
+ (cpi->ref_frame_flags & VP9_GOLD_FLAG) &&
+ cm->base_qindex > svc->lower_layer_qindex + 10)
+ force_test_gf_zeromv = 1;
+
+ // For low motion content use x->sb_is_skin in addition to VeryHighSad
+ // for setting large_block.
+ large_block = (x->content_state_sb == kVeryHighSad ||
+ (x->sb_is_skin && cpi->rc.avg_frame_low_motion > 70) ||
+ cpi->oxcf.speed < 7)
+ ? bsize > BLOCK_32X32
+ : bsize >= BLOCK_32X32;
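+ // The large-block RD model (which also checks for early termination) is
+ // only used for CBR, on non-boosted segments, and when base_qindex is
+ // nonzero.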
+ use_model_yrd_large =
+ cpi->oxcf.rc_mode == VPX_CBR && large_block &&
+ !cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id) &&
+ cm->base_qindex;
+
+ for (idx = 0; idx < num_inter_modes + comp_modes; ++idx) {
+ int rate_mv = 0;
+ int mode_rd_thresh;
+ int mode_index;
+ int i;
+ int64_t this_sse;
+ int is_skippable;
+ int this_early_term = 0;
+ int rd_computed = 0;
+ int flag_preduv_computed[2] = { 0 };
+ int inter_mv_mode = 0;
+ int skip_this_mv = 0;
+ int comp_pred = 0;
+ int force_mv_inter_layer = 0;
+ PREDICTION_MODE this_mode;
+ second_ref_frame = NONE;
+
+ if (idx < num_inter_modes) {
+ this_mode = ref_mode_set[idx].pred_mode;
+ ref_frame = ref_mode_set[idx].ref_frame;
+
+ if (cpi->use_svc) {
+ this_mode = ref_mode_set_svc[idx].pred_mode;
+ ref_frame = ref_mode_set_svc[idx].ref_frame;
+ }
+ } else {
+ // Add (0,0) compound modes.
+ this_mode = ZEROMV;
+ ref_frame = LAST_FRAME;
+ if (idx == num_inter_modes + comp_modes - 1) ref_frame = GOLDEN_FRAME;
+ second_ref_frame = ALTREF_FRAME;
+ comp_pred = 1;
+ }
+
+ if (ref_frame > usable_ref_frame) continue;
+ if (skip_ref_find_pred[ref_frame]) continue;
+
+ if (svc->previous_frame_is_intra_only) {
+ if (ref_frame != LAST_FRAME || frame_mv[this_mode][ref_frame].as_int != 0)
+ continue;
+ }
+
+ // If the segment reference frame feature is enabled then do nothing if the
+ // current ref frame is not allowed.
+ if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME) &&
+ get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame)
+ continue;
+
+ if (flag_svc_subpel && ref_frame == inter_layer_ref) {
+ force_mv_inter_layer = 1;
+ // Only test mode if NEARESTMV/NEARMV is (svc_mv_col, svc_mv_row),
+ // otherwise set NEWMV to (svc_mv_col, svc_mv_row).
+ if (this_mode == NEWMV) {
+ frame_mv[this_mode][ref_frame].as_mv.col = svc_mv_col;
+ frame_mv[this_mode][ref_frame].as_mv.row = svc_mv_row;
+ } else if (frame_mv[this_mode][ref_frame].as_mv.col != svc_mv_col ||
+ frame_mv[this_mode][ref_frame].as_mv.row != svc_mv_row) {
+ continue;
+ }
+ }
+
+ if (comp_pred) {
+ if (!cpi->allow_comp_inter_inter) continue;
+ // Skip compound inter modes if ARF is not available.
+ if (!(cpi->ref_frame_flags & ref_frame_to_flag(second_ref_frame)))
+ continue;
+ // Do not allow compound prediction if the segment level reference frame
+ // feature is in use as in this case there can only be one reference.
+ if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME)) continue;
+ }
+
+ // For CBR mode: skip the golden reference search if sse of zeromv_last is
+ // below threshold.
+ if (ref_frame == GOLDEN_FRAME && cpi->oxcf.rc_mode == VPX_CBR &&
+ ((cpi->use_svc && sse_zeromv_normalized < thresh_svc_skip_golden) ||
+ (!cpi->use_svc && sse_zeromv_normalized < thresh_skip_golden)))
+ continue;
+
+ if (!(cpi->ref_frame_flags & ref_frame_to_flag(ref_frame))) continue;
+
+ // For screen content: if zero_temp_sad_source is computed, skip the
+ // non-zero motion check for stationary blocks. If the superblock is
+ // non-stationary then for flat blocks skip the zero last check (keep golden
+ // as it may be inter-layer reference). Otherwise (if zero_temp_sad_source
+ // is not computed) skip non-zero motion check for flat blocks.
+ // TODO(marpan): Compute zero_temp_sad_source per coding block.
+ if (cpi->oxcf.content == VP9E_CONTENT_SCREEN) {
+ if (cpi->compute_source_sad_onepass && cpi->sf.use_source_sad) {
+ if ((frame_mv[this_mode][ref_frame].as_int != 0 &&
+ x->zero_temp_sad_source) ||
+ (frame_mv[this_mode][ref_frame].as_int == 0 &&
+ x->source_variance == 0 && ref_frame == LAST_FRAME &&
+ !x->zero_temp_sad_source))
+ continue;
+ } else if (frame_mv[this_mode][ref_frame].as_int != 0 &&
+ x->source_variance == 0) {
+ continue;
+ }
+ }
+
+ if (!(cpi->sf.inter_mode_mask[bsize] & (1 << this_mode))) continue;
+
+ if (cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR) {
+ if (cpi->rc.is_src_frame_alt_ref &&
+ (ref_frame != ALTREF_FRAME ||
+ frame_mv[this_mode][ref_frame].as_int != 0))
+ continue;
+
+ if (!cm->show_frame && ref_frame == ALTREF_FRAME &&
+ frame_mv[this_mode][ref_frame].as_int != 0)
+ continue;
+
+ if (cpi->rc.alt_ref_gf_group && cm->show_frame &&
+ cpi->rc.frames_since_golden > (cpi->rc.baseline_gf_interval >> 1) &&
+ ref_frame == GOLDEN_FRAME &&
+ frame_mv[this_mode][ref_frame].as_int != 0)
+ continue;
+
+ if (cpi->rc.alt_ref_gf_group && cm->show_frame &&
+ cpi->rc.frames_since_golden > 0 &&
+ cpi->rc.frames_since_golden < (cpi->rc.baseline_gf_interval >> 1) &&
+ ref_frame == ALTREF_FRAME &&
+ frame_mv[this_mode][ref_frame].as_int != 0)
+ continue;
+ }
+
+ if (const_motion[ref_frame] && this_mode == NEARMV) continue;
+
+ // Skip non-zeromv mode search for golden frame if force_skip_low_temp_var
+ // is set. If nearestmv for golden frame is 0, zeromv mode will be skipped
+ // later.
+ if (!force_mv_inter_layer && force_skip_low_temp_var &&
+ ref_frame == GOLDEN_FRAME &&
+ frame_mv[this_mode][ref_frame].as_int != 0) {
+ continue;
+ }
+
+ if (x->content_state_sb != kVeryHighSad &&
+ (cpi->sf.short_circuit_low_temp_var >= 2 ||
+ (cpi->sf.short_circuit_low_temp_var == 1 && bsize == BLOCK_64X64)) &&
+ force_skip_low_temp_var && ref_frame == LAST_FRAME &&
+ this_mode == NEWMV) {
+ continue;
+ }
+
+ if (cpi->use_svc) {
+ if (!force_mv_inter_layer && svc_force_zero_mode[ref_frame - 1] &&
+ frame_mv[this_mode][ref_frame].as_int != 0)
+ continue;
+ }
+
+ // Disable this drop out case if the ref frame segment level feature is
+ // enabled for this segment. This is to prevent the possibility that we end
+ // up unable to pick any mode.
+ if (!segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME)) {
+ if (sf->reference_masking &&
+ !(frame_mv[this_mode][ref_frame].as_int == 0 &&
+ ref_frame == LAST_FRAME)) {
+ if (usable_ref_frame < ALTREF_FRAME) {
+ if (!force_skip_low_temp_var && usable_ref_frame > LAST_FRAME) {
+ i = (ref_frame == LAST_FRAME) ? GOLDEN_FRAME : LAST_FRAME;
+ if ((cpi->ref_frame_flags & ref_frame_to_flag(i)))
+ if (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[i] << 1))
+ ref_frame_skip_mask |= (1 << ref_frame);
+ }
+ } else if (!cpi->rc.is_src_frame_alt_ref &&
+ !(frame_mv[this_mode][ref_frame].as_int == 0 &&
+ ref_frame == ALTREF_FRAME)) {
+ int ref1 = (ref_frame == GOLDEN_FRAME) ? LAST_FRAME : GOLDEN_FRAME;
+ int ref2 = (ref_frame == ALTREF_FRAME) ? LAST_FRAME : ALTREF_FRAME;
+ if (((cpi->ref_frame_flags & ref_frame_to_flag(ref1)) &&
+ (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref1] << 1))) ||
+ ((cpi->ref_frame_flags & ref_frame_to_flag(ref2)) &&
+ (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref2] << 1))))
+ ref_frame_skip_mask |= (1 << ref_frame);
+ }
+ }
+ if (ref_frame_skip_mask & (1 << ref_frame)) continue;
+ }
+
+ // Select prediction reference frames.
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
+ if (comp_pred) xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
+ }
+
+ mi->ref_frame[0] = ref_frame;
+ mi->ref_frame[1] = second_ref_frame;
+ set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
+
+ mode_index = mode_idx[ref_frame][INTER_OFFSET(this_mode)];
+ mode_rd_thresh = best_pickmode.best_mode_skip_txfm
+ ? rd_threshes[mode_index] << 1
+ : rd_threshes[mode_index];
+
+ // Increase mode_rd_thresh value for GOLDEN_FRAME for improved encoding
+ // speed with little/no subjective quality loss.
+ if (cpi->sf.bias_golden && ref_frame == GOLDEN_FRAME &&
+ cpi->rc.frames_since_golden > 4)
+ mode_rd_thresh = mode_rd_thresh << 3;
+
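+ // Skip modes with a non-zero MV whose adapted RD threshold indicates they
+ // rarely beat the current best rdcost.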
+ if ((cpi->sf.adaptive_rd_thresh_row_mt &&
+ rd_less_than_thresh_row_mt(best_rdc.rdcost, mode_rd_thresh,
+ &rd_thresh_freq_fact[mode_index])) ||
+ (!cpi->sf.adaptive_rd_thresh_row_mt &&
+ rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh,
+ &rd_thresh_freq_fact[mode_index])))
+ if (frame_mv[this_mode][ref_frame].as_int != 0) continue;
+
+ if (this_mode == NEWMV && !force_mv_inter_layer) {
+ if (search_new_mv(cpi, x, frame_mv, ref_frame, gf_temporal_ref, bsize,
+ mi_row, mi_col, best_pred_sad, &rate_mv, best_sse_sofar,
+ &best_rdc))
+ continue;
+ }
+
+ // TODO(jianj): Skipping the testing of (duplicate) non-zero motion vectors
+ // causes some regression; leave it for duplicate zero-mv for now, until the
+ // regression issue is resolved.
+ for (inter_mv_mode = NEARESTMV; inter_mv_mode <= NEWMV; inter_mv_mode++) {
+ if (inter_mv_mode == this_mode || comp_pred) continue;
+ if (mode_checked[inter_mv_mode][ref_frame] &&
+ frame_mv[this_mode][ref_frame].as_int ==
+ frame_mv[inter_mv_mode][ref_frame].as_int &&
+ frame_mv[inter_mv_mode][ref_frame].as_int == 0) {
+ skip_this_mv = 1;
+ break;
+ }
+ }
+
+ if (skip_this_mv) continue;
+
+ // If use_golden_nonzeromv is false, NEWMV mode is skipped for golden, so
+ // there is no need to compute best_pred_sad, which is only used to skip
+ // golden NEWMV.
+ if (use_golden_nonzeromv && this_mode == NEWMV && ref_frame == LAST_FRAME &&
+ frame_mv[NEWMV][LAST_FRAME].as_int != INVALID_MV) {
+ const int pre_stride = xd->plane[0].pre[0].stride;
+ const uint8_t *const pre_buf =
+ xd->plane[0].pre[0].buf +
+ (frame_mv[NEWMV][LAST_FRAME].as_mv.row >> 3) * pre_stride +
+ (frame_mv[NEWMV][LAST_FRAME].as_mv.col >> 3);
+ best_pred_sad = cpi->fn_ptr[bsize].sdf(
+ x->plane[0].src.buf, x->plane[0].src.stride, pre_buf, pre_stride);
+ x->pred_mv_sad[LAST_FRAME] = best_pred_sad;
+ }
+
+ if (this_mode != NEARESTMV && !comp_pred &&
+ frame_mv[this_mode][ref_frame].as_int ==
+ frame_mv[NEARESTMV][ref_frame].as_int)
+ continue;
+
+ mi->mode = this_mode;
+ mi->mv[0].as_int = frame_mv[this_mode][ref_frame].as_int;
+ mi->mv[1].as_int = 0;
+
+ // Search for the best prediction filter type, when the resulting
+ // motion vector is at sub-pixel accuracy for the luma component, i.e.,
+ // the last three bits are not all zeros.
+ if (reuse_inter_pred) {
+ if (!this_mode_pred) {
+ this_mode_pred = &tmp[3];
+ } else {
+ this_mode_pred = &tmp[get_pred_buffer(tmp, 3)];
+ pd->dst.buf = this_mode_pred->data;
+ pd->dst.stride = bw;
+ }
+ }
+
+ if ((this_mode == NEWMV || filter_ref == SWITCHABLE) &&
+ pred_filter_search &&
+ (ref_frame == LAST_FRAME ||
+ (ref_frame == GOLDEN_FRAME && !force_mv_inter_layer &&
+ (cpi->use_svc || cpi->oxcf.rc_mode == VPX_VBR))) &&
+ (((mi->mv[0].as_mv.row | mi->mv[0].as_mv.col) & 0x07) != 0)) {
+ rd_computed = 1;
+ search_filter_ref(cpi, x, &this_rdc, mi_row, mi_col, tmp, bsize,
+ reuse_inter_pred, &this_mode_pred, &var_y, &sse_y,
+ force_smooth_filter, &this_early_term,
+ flag_preduv_computed, use_model_yrd_large);
+ } else {
+ mi->interp_filter = (filter_ref == SWITCHABLE) ? EIGHTTAP : filter_ref;
+
+ if (cpi->use_svc && ref_frame == GOLDEN_FRAME &&
+ svc_force_zero_mode[ref_frame - 1])
+ mi->interp_filter = filter_gf_svc;
+
+ vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
+
+ // For large partition blocks, extra testing is done.
+ if (use_model_yrd_large) {
+ rd_computed = 1;
+ model_rd_for_sb_y_large(cpi, bsize, x, xd, &this_rdc.rate,
+ &this_rdc.dist, &var_y, &sse_y, mi_row, mi_col,
+ &this_early_term, flag_preduv_computed);
+ } else {
+ rd_computed = 1;
+ model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist,
+ &var_y, &sse_y, 0);
+ }
+ // Save normalized sse (between current and last frame) for (0, 0) motion.
+ if (ref_frame == LAST_FRAME &&
+ frame_mv[this_mode][ref_frame].as_int == 0) {
+ sse_zeromv_normalized =
+ sse_y >> (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
+ }
+ if (sse_y < best_sse_sofar) best_sse_sofar = sse_y;
+ }
+
+ if (!this_early_term) {
+ this_sse = (int64_t)sse_y;
+ block_yrd(cpi, x, &this_rdc, &is_skippable, &this_sse, bsize,
+ VPXMIN(mi->tx_size, TX_16X16), rd_computed, 0);
+ x->skip_txfm[0] = is_skippable;
+ if (is_skippable) {
+ this_rdc.rate = vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
+ } else {
+ if (RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist) <
+ RDCOST(x->rdmult, x->rddiv, 0, this_sse)) {
+ this_rdc.rate += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
+ } else {
+ this_rdc.rate = vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
+ this_rdc.dist = this_sse;
+ x->skip_txfm[0] = SKIP_TXFM_AC_DC;
+ }
+ }
+
+ if (cm->interp_filter == SWITCHABLE) {
+ if ((mi->mv[0].as_mv.row | mi->mv[0].as_mv.col) & 0x07)
+ this_rdc.rate += vp9_get_switchable_rate(cpi, xd);
+ }
+ } else {
+ if (cm->interp_filter == SWITCHABLE) {
+ if ((mi->mv[0].as_mv.row | mi->mv[0].as_mv.col) & 0x07)
+ this_rdc.rate += vp9_get_switchable_rate(cpi, xd);
+ }
+ this_rdc.rate += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
+ }
+
+ if (!this_early_term &&
+ (x->color_sensitivity[0] || x->color_sensitivity[1])) {
+ RD_COST rdc_uv;
+ const BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, &xd->plane[1]);
+ if (x->color_sensitivity[0] && !flag_preduv_computed[0]) {
+ vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 1);
+ flag_preduv_computed[0] = 1;
+ }
+ if (x->color_sensitivity[1] && !flag_preduv_computed[1]) {
+ vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 2);
+ flag_preduv_computed[1] = 1;
+ }
+ model_rd_for_sb_uv(cpi, uv_bsize, x, xd, &rdc_uv, &var_y, &sse_y, 1, 2);
+ this_rdc.rate += rdc_uv.rate;
+ this_rdc.dist += rdc_uv.dist;
+ }
+
+ this_rdc.rate += rate_mv;
+ this_rdc.rate += cpi->inter_mode_cost[x->mbmi_ext->mode_context[ref_frame]]
+ [INTER_OFFSET(this_mode)];
+ // TODO(marpan): Add costing for compound mode.
+ this_rdc.rate += ref_frame_cost[ref_frame];
+ this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist);
+
+ // Bias against NEWMV that is very different from its neighbors, and bias
+ // to small motion-lastref for noisy input.
+ if (cpi->oxcf.rc_mode == VPX_CBR && cpi->oxcf.speed >= 5 &&
+ cpi->oxcf.content != VP9E_CONTENT_SCREEN) {
+ vp9_NEWMV_diff_bias(&cpi->noise_estimate, xd, this_mode, &this_rdc, bsize,
+ frame_mv[this_mode][ref_frame].as_mv.row,
+ frame_mv[this_mode][ref_frame].as_mv.col,
+ ref_frame == LAST_FRAME, x->lowvar_highsumdiff,
+ x->sb_is_skin);
+ }
+
+ // Skip check: test to see if this block can be reconstructed by
+ // prediction only.
+ if (cpi->allow_encode_breakout && !xd->lossless && !scene_change_detected &&
+ !svc->high_num_blocks_with_motion) {
+ encode_breakout_test(cpi, x, bsize, mi_row, mi_col, ref_frame, this_mode,
+ var_y, sse_y, yv12_mb, &this_rdc.rate,
+ &this_rdc.dist, flag_preduv_computed);
+ if (x->skip) {
+ this_rdc.rate += rate_mv;
+ this_rdc.rdcost =
+ RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist);
+ }
+ }
+
+ // On spatially flat blocks for screen content: bias against zero-last
+ // if the sse_y is non-zero. Only on scene change or high motion frames.
+ if (cpi->oxcf.content == VP9E_CONTENT_SCREEN &&
+ (scene_change_detected || svc->high_num_blocks_with_motion) &&
+ ref_frame == LAST_FRAME && frame_mv[this_mode][ref_frame].as_int == 0 &&
+ svc->spatial_layer_id == 0 && x->source_variance == 0 && sse_y > 0) {
+ this_rdc.rdcost = this_rdc.rdcost << 2;
+ }
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc_pickmode &&
+ cpi->denoiser.denoising_level > kDenLowLow) {
+ vp9_denoiser_update_frame_stats(mi, sse_y, this_mode, ctx);
+ // Keep track of zero_last cost.
+ if (ref_frame == LAST_FRAME && frame_mv[this_mode][ref_frame].as_int == 0)
+ zero_last_cost_orig = this_rdc.rdcost;
+ }
+#else
+ (void)ctx;
+#endif
+
+ mode_checked[this_mode][ref_frame] = 1;
+
+ if (this_rdc.rdcost < best_rdc.rdcost || x->skip) {
+ best_rdc = this_rdc;
+ best_early_term = this_early_term;
+ best_pickmode.best_mode = this_mode;
+ best_pickmode.best_pred_filter = mi->interp_filter;
+ best_pickmode.best_tx_size = mi->tx_size;
+ best_pickmode.best_ref_frame = ref_frame;
+ best_pickmode.best_mode_skip_txfm = x->skip_txfm[0];
+ best_pickmode.best_second_ref_frame = second_ref_frame;
+
+ if (reuse_inter_pred) {
+ free_pred_buffer(best_pickmode.best_pred);
+ best_pickmode.best_pred = this_mode_pred;
+ }
+ } else {
+ if (reuse_inter_pred) free_pred_buffer(this_mode_pred);
+ }
+
+ if (x->skip &&
+ (!force_test_gf_zeromv || mode_checked[ZEROMV][GOLDEN_FRAME]))
+ break;
+
+ // If the early termination flag is set and at least 2 modes have been
+ // checked, terminate the mode search.
+ if (best_early_term && idx > 0 && !scene_change_detected &&
+ (!force_test_gf_zeromv || mode_checked[ZEROMV][GOLDEN_FRAME])) {
+ x->skip = 1;
+ break;
+ }
+ }
+
+ mi->mode = best_pickmode.best_mode;
+ mi->interp_filter = best_pickmode.best_pred_filter;
+ mi->tx_size = best_pickmode.best_tx_size;
+ mi->ref_frame[0] = best_pickmode.best_ref_frame;
+ mi->mv[0].as_int =
+ frame_mv[best_pickmode.best_mode][best_pickmode.best_ref_frame].as_int;
+ xd->mi[0]->bmi[0].as_mv[0].as_int = mi->mv[0].as_int;
+ x->skip_txfm[0] = best_pickmode.best_mode_skip_txfm;
+ mi->ref_frame[1] = best_pickmode.best_second_ref_frame;
+
+ // For spatial enhancement layer: perform intra prediction only if base
+ // layer is chosen as the reference. Always perform intra prediction if
+ // LAST is the only reference, or is_key_frame is set, or on base
+ // temporal layer.
+ if (svc->spatial_layer_id && !gf_temporal_ref) {
+ perform_intra_pred =
+ svc->temporal_layer_id == 0 ||
+ svc->layer_context[svc->temporal_layer_id].is_key_frame ||
+ !(cpi->ref_frame_flags & VP9_GOLD_FLAG) ||
+ (!svc->layer_context[svc->temporal_layer_id].is_key_frame &&
+ svc_force_zero_mode[best_pickmode.best_ref_frame - 1]);
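+ // Triple inter_mode_thresh so that intra search is attempted less often on
+ // spatial enhancement layers.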
+ inter_mode_thresh = (inter_mode_thresh << 1) + inter_mode_thresh;
+ }
+ if ((cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR &&
+ cpi->rc.is_src_frame_alt_ref) ||
+ svc->previous_frame_is_intra_only)
+ perform_intra_pred = 0;
+
+ // If the segment reference frame feature is enabled and set then
+ // skip the intra prediction.
+ if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME) &&
+ get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) > 0)
+ perform_intra_pred = 0;
+
+ // Perform intra prediction search, if the best SAD is above a certain
+ // threshold.
+ if (best_rdc.rdcost == INT64_MAX ||
+ (cpi->oxcf.content == VP9E_CONTENT_SCREEN && x->source_variance == 0) ||
+ (scene_change_detected && perform_intra_pred) ||
+ ((!force_skip_low_temp_var || bsize < BLOCK_32X32 ||
+ x->content_state_sb == kVeryHighSad) &&
+ perform_intra_pred && !x->skip && best_rdc.rdcost > inter_mode_thresh &&
+ bsize <= cpi->sf.max_intra_bsize && !x->skip_low_source_sad &&
+ !x->lowvar_highsumdiff)) {
+ struct estimate_block_intra_args args = { cpi, x, DC_PRED, 1, 0 };
+ int64_t this_sse = INT64_MAX;
+ int i;
+ PRED_BUFFER *const best_pred = best_pickmode.best_pred;
+ TX_SIZE intra_tx_size =
+ VPXMIN(max_txsize_lookup[bsize],
+ tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
+
+ if (reuse_inter_pred && best_pred != NULL) {
+ if (best_pred->data == orig_dst.buf) {
+ this_mode_pred = &tmp[get_pred_buffer(tmp, 3)];
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cm->use_highbitdepth)
+ vpx_highbd_convolve_copy(
+ CONVERT_TO_SHORTPTR(best_pred->data), best_pred->stride,
+ CONVERT_TO_SHORTPTR(this_mode_pred->data), this_mode_pred->stride,
+ NULL, 0, 0, 0, 0, bw, bh, xd->bd);
+ else
+ vpx_convolve_copy(best_pred->data, best_pred->stride,
+ this_mode_pred->data, this_mode_pred->stride, NULL,
+ 0, 0, 0, 0, bw, bh);
+#else
+ vpx_convolve_copy(best_pred->data, best_pred->stride,
+ this_mode_pred->data, this_mode_pred->stride, NULL, 0,
+ 0, 0, 0, bw, bh);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ best_pickmode.best_pred = this_mode_pred;
+ }
+ }
+ pd->dst = orig_dst;
+
+ for (i = 0; i < 4; ++i) {
+ const PREDICTION_MODE this_mode = intra_mode_list[i];
+ THR_MODES mode_index = mode_idx[INTRA_FRAME][mode_offset(this_mode)];
+ int mode_rd_thresh = rd_threshes[mode_index];
+ // For spatially flat blocks, under short_circuit_flat_blocks flag:
+ // only check DC mode for stationary blocks, otherwise also check
+ // H and V mode.
+ if (sf->short_circuit_flat_blocks && x->source_variance == 0 &&
+ ((x->zero_temp_sad_source && this_mode != DC_PRED) || i > 2)) {
+ continue;
+ }
+
+ if (!((1 << this_mode) & cpi->sf.intra_y_mode_bsize_mask[bsize]))
+ continue;
+
+ if (cpi->sf.rt_intra_dc_only_low_content && this_mode != DC_PRED &&
+ x->content_state_sb != kVeryHighSad)
+ continue;
+
+ if ((cpi->sf.adaptive_rd_thresh_row_mt &&
+ rd_less_than_thresh_row_mt(best_rdc.rdcost, mode_rd_thresh,
+ &rd_thresh_freq_fact[mode_index])) ||
+ (!cpi->sf.adaptive_rd_thresh_row_mt &&
+ rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh,
+ &rd_thresh_freq_fact[mode_index]))) {
+ // Avoid this early exit for screen on base layer, for scene
+ // changes or high motion frames.
+ if (cpi->oxcf.content != VP9E_CONTENT_SCREEN ||
+ svc->spatial_layer_id > 0 ||
+ (!scene_change_detected && !svc->high_num_blocks_with_motion))
+ continue;
+ }
+
+ mi->mode = this_mode;
+ mi->ref_frame[0] = INTRA_FRAME;
+ this_rdc.dist = this_rdc.rate = 0;
+ args.mode = this_mode;
+ args.skippable = 1;
+ args.rdc = &this_rdc;
+ mi->tx_size = intra_tx_size;
+
+ compute_intra_yprediction(this_mode, bsize, x, xd);
+ model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist,
+ &var_y, &sse_y, 1);
+ block_yrd(cpi, x, &this_rdc, &args.skippable, &this_sse, bsize,
+ VPXMIN(mi->tx_size, TX_16X16), 1, 1);
+
+ // Check skip cost here since skippable is not set for uv; this
+ // mirrors the behavior used by inter.
+ if (args.skippable) {
+ x->skip_txfm[0] = SKIP_TXFM_AC_DC;
+ this_rdc.rate = vp9_cost_bit(vp9_get_skip_prob(&cpi->common, xd), 1);
+ } else {
+ x->skip_txfm[0] = SKIP_TXFM_NONE;
+ this_rdc.rate += vp9_cost_bit(vp9_get_skip_prob(&cpi->common, xd), 0);
+ }
+ // Inter and intra RD will mismatch in scale for non-screen content.
+ if (cpi->oxcf.content == VP9E_CONTENT_SCREEN) {
+ if (x->color_sensitivity[0])
+ vp9_foreach_transformed_block_in_plane(xd, bsize, 1,
+ estimate_block_intra, &args);
+ if (x->color_sensitivity[1])
+ vp9_foreach_transformed_block_in_plane(xd, bsize, 2,
+ estimate_block_intra, &args);
+ }
+ this_rdc.rate += cpi->mbmode_cost[this_mode];
+ this_rdc.rate += ref_frame_cost[INTRA_FRAME];
+ this_rdc.rate += intra_cost_penalty;
+ this_rdc.rdcost =
+ RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist);
+
+ if (this_rdc.rdcost < best_rdc.rdcost) {
+ best_rdc = this_rdc;
+ best_pickmode.best_mode = this_mode;
+ best_pickmode.best_intra_tx_size = mi->tx_size;
+ best_pickmode.best_ref_frame = INTRA_FRAME;
+ best_pickmode.best_second_ref_frame = NONE;
+ mi->uv_mode = this_mode;
+ mi->mv[0].as_int = INVALID_MV;
+ mi->mv[1].as_int = INVALID_MV;
+ best_pickmode.best_mode_skip_txfm = x->skip_txfm[0];
+ }
+ }
+
+ // Reset mb_mode_info to the best inter mode.
+ if (best_pickmode.best_ref_frame != INTRA_FRAME) {
+ mi->tx_size = best_pickmode.best_tx_size;
+ } else {
+ mi->tx_size = best_pickmode.best_intra_tx_size;
+ }
+ }
+
+ pd->dst = orig_dst;
+ mi->mode = best_pickmode.best_mode;
+ mi->ref_frame[0] = best_pickmode.best_ref_frame;
+ mi->ref_frame[1] = best_pickmode.best_second_ref_frame;
+ x->skip_txfm[0] = best_pickmode.best_mode_skip_txfm;
+
+ if (!is_inter_block(mi)) {
+ mi->interp_filter = SWITCHABLE_FILTERS;
+ }
+
+ if (reuse_inter_pred && best_pickmode.best_pred != NULL) {
+ PRED_BUFFER *const best_pred = best_pickmode.best_pred;
+ if (best_pred->data != orig_dst.buf && is_inter_mode(mi->mode)) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cm->use_highbitdepth)
+ vpx_highbd_convolve_copy(
+ CONVERT_TO_SHORTPTR(best_pred->data), best_pred->stride,
+ CONVERT_TO_SHORTPTR(pd->dst.buf), pd->dst.stride, NULL, 0, 0, 0, 0,
+ bw, bh, xd->bd);
+ else
+ vpx_convolve_copy(best_pred->data, best_pred->stride, pd->dst.buf,
+ pd->dst.stride, NULL, 0, 0, 0, 0, bw, bh);
+#else
+ vpx_convolve_copy(best_pred->data, best_pred->stride, pd->dst.buf,
+ pd->dst.stride, NULL, 0, 0, 0, 0, bw, bh);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ }
+ }
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0 && cpi->resize_pending == 0 &&
+ denoise_svc_pickmode && cpi->denoiser.denoising_level > kDenLowLow &&
+ cpi->denoiser.reset == 0) {
+ VP9_DENOISER_DECISION decision = COPY_BLOCK;
+ ctx->sb_skip_denoising = 0;
+ // TODO(marpan): There is an issue with denoising when the
+ // superblock partitioning scheme is based on the pickmode.
+ // Remove this condition when the issue is resolved.
+ if (x->sb_pickmode_part) ctx->sb_skip_denoising = 1;
+ vp9_pickmode_ctx_den_update(&ctx_den, zero_last_cost_orig, ref_frame_cost,
+ frame_mv, reuse_inter_pred, &best_pickmode);
+ vp9_denoiser_denoise(cpi, x, mi_row, mi_col, bsize, ctx, &decision,
+ gf_temporal_ref);
+ if (denoise_recheck_zeromv)
+ recheck_zeromv_after_denoising(cpi, mi, x, xd, decision, &ctx_den,
+ yv12_mb, &best_rdc, bsize, mi_row, mi_col);
+ best_pickmode.best_ref_frame = ctx_den.best_ref_frame;
+ }
+#endif
+
+ if (best_pickmode.best_ref_frame == ALTREF_FRAME ||
+ best_pickmode.best_second_ref_frame == ALTREF_FRAME)
+ x->arf_frame_usage++;
+ else if (best_pickmode.best_ref_frame != INTRA_FRAME)
+ x->lastgolden_frame_usage++;
+
+ if (cpi->sf.adaptive_rd_thresh) {
+ THR_MODES best_mode_idx =
+ mode_idx[best_pickmode.best_ref_frame][mode_offset(mi->mode)];
+
+ if (best_pickmode.best_ref_frame == INTRA_FRAME) {
+ // Only consider the modes that are included in the intra_mode_list.
+ int intra_modes = sizeof(intra_mode_list) / sizeof(PREDICTION_MODE);
+ int i;
+
+ // TODO(yunqingwang): Check intra mode mask and only update freq_fact
+ // for those valid modes.
+ for (i = 0; i < intra_modes; i++) {
+ if (cpi->sf.adaptive_rd_thresh_row_mt)
+ update_thresh_freq_fact_row_mt(cpi, tile_data, x->source_variance,
+ thresh_freq_fact_idx, INTRA_FRAME,
+ best_mode_idx, intra_mode_list[i]);
+ else
+ update_thresh_freq_fact(cpi, tile_data, x->source_variance, bsize,
+ INTRA_FRAME, best_mode_idx,
+ intra_mode_list[i]);
+ }
+ } else {
+ for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ++ref_frame) {
+ PREDICTION_MODE this_mode;
+ if (best_pickmode.best_ref_frame != ref_frame) continue;
+ for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
+ if (cpi->sf.adaptive_rd_thresh_row_mt)
+ update_thresh_freq_fact_row_mt(cpi, tile_data, x->source_variance,
+ thresh_freq_fact_idx, ref_frame,
+ best_mode_idx, this_mode);
+ else
+ update_thresh_freq_fact(cpi, tile_data, x->source_variance, bsize,
+ ref_frame, best_mode_idx, this_mode);
+ }
+ }
+ }
+ }
+
+ *rd_cost = best_rdc;
+}
+
+void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row,
+ int mi_col, RD_COST *rd_cost, BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx) {
+ VP9_COMMON *const cm = &cpi->common;
+ SPEED_FEATURES *const sf = &cpi->sf;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MODE_INFO *const mi = xd->mi[0];
+ MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+ const struct segmentation *const seg = &cm->seg;
+ MV_REFERENCE_FRAME ref_frame, second_ref_frame = NONE;
+ MV_REFERENCE_FRAME best_ref_frame = NONE;
+ unsigned char segment_id = mi->segment_id;
+ struct buf_2d yv12_mb[4][MAX_MB_PLANE];
+ int64_t best_rd = INT64_MAX;
+ b_mode_info bsi[MAX_REF_FRAMES][4];
+ int ref_frame_skip_mask = 0;
+ const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
+ const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
+ int idx, idy;
+
+ x->skip_encode = sf->skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
+ ctx->pred_pixel_ready = 0;
+
+ for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ++ref_frame) {
+ const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
+ int_mv dummy_mv[2];
+ x->pred_mv_sad[ref_frame] = INT_MAX;
+
+ if ((cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) &&
+ (yv12 != NULL)) {
+ int_mv *const candidates = mbmi_ext->ref_mvs[ref_frame];
+ const struct scale_factors *const ref_sf =
+ &cm->frame_refs[ref_frame - 1].sf;
+ vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, ref_sf,
+ ref_sf);
+ vp9_find_mv_refs(cm, xd, xd->mi[0], ref_frame, candidates, mi_row, mi_col,
+ mbmi_ext->mode_context);
+
+ vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv, candidates,
+ &dummy_mv[0], &dummy_mv[1]);
+ } else {
+ ref_frame_skip_mask |= (1 << ref_frame);
+ }
+ }
+
+ mi->sb_type = bsize;
+ mi->tx_size = TX_4X4;
+ mi->uv_mode = DC_PRED;
+ mi->ref_frame[0] = LAST_FRAME;
+ mi->ref_frame[1] = NONE;
+ mi->interp_filter =
+ cm->interp_filter == SWITCHABLE ? EIGHTTAP : cm->interp_filter;
+
+ for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ++ref_frame) {
+ int64_t this_rd = 0;
+ int plane;
+
+ if (ref_frame_skip_mask & (1 << ref_frame)) continue;
+
+#if CONFIG_BETTER_HW_COMPATIBILITY
+ if ((bsize == BLOCK_8X4 || bsize == BLOCK_4X8) && ref_frame > INTRA_FRAME &&
+ vp9_is_scaled(&cm->frame_refs[ref_frame - 1].sf))
+ continue;
+#endif
+
+ // TODO(jingning, agrange): Scaling reference frame not supported for
+ // sub8x8 blocks. Is this supported now?
+ if (ref_frame > INTRA_FRAME &&
+ vp9_is_scaled(&cm->frame_refs[ref_frame - 1].sf))
+ continue;
+
+ // If the segment reference frame feature is enabled, then do nothing if
+ // the current ref frame is not allowed.
+ if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
+ get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame)
+ continue;
+
+ mi->ref_frame[0] = ref_frame;
+ x->skip = 0;
+ set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
+
+ // Select prediction reference frames.
+ for (plane = 0; plane < MAX_MB_PLANE; plane++)
+ xd->plane[plane].pre[0] = yv12_mb[ref_frame][plane];
+
+ for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
+ for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
+ int_mv b_mv[MB_MODE_COUNT];
+ int64_t b_best_rd = INT64_MAX;
+ const int i = idy * 2 + idx;
+ PREDICTION_MODE this_mode;
+ RD_COST this_rdc;
+ unsigned int var_y, sse_y;
+
+ struct macroblock_plane *p = &x->plane[0];
+ struct macroblockd_plane *pd = &xd->plane[0];
+
+ const struct buf_2d orig_src = p->src;
+ const struct buf_2d orig_dst = pd->dst;
+ struct buf_2d orig_pre[2];
+ memcpy(orig_pre, xd->plane[0].pre, sizeof(orig_pre));
+
+ // set buffer pointers for sub8x8 motion search.
+ p->src.buf =
+ &p->src.buf[vp9_raster_block_offset(BLOCK_8X8, i, p->src.stride)];
+ pd->dst.buf =
+ &pd->dst.buf[vp9_raster_block_offset(BLOCK_8X8, i, pd->dst.stride)];
+ pd->pre[0].buf =
+ &pd->pre[0]
+ .buf[vp9_raster_block_offset(BLOCK_8X8, i, pd->pre[0].stride)];
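+ // Editorial note: vp9_raster_block_offset() is defined elsewhere; for
+ // BLOCK_8X8 it presumably maps the 4x4 sub-block index i to a pixel
+ // offset of 4 * (i >> 1) * stride + 4 * (i & 1), i.e. i = 0..3 selects
+ // the top-left, top-right, bottom-left and bottom-right 4x4 quadrant.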
+
+ b_mv[ZEROMV].as_int = 0;
+ b_mv[NEWMV].as_int = INVALID_MV;
+ vp9_append_sub8x8_mvs_for_idx(cm, xd, i, 0, mi_row, mi_col,
+ &b_mv[NEARESTMV], &b_mv[NEARMV],
+ mbmi_ext->mode_context);
+
+ for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
+ int b_rate = 0;
+ xd->mi[0]->bmi[i].as_mv[0].as_int = b_mv[this_mode].as_int;
+
+ if (this_mode == NEWMV) {
+ const int step_param = cpi->sf.mv.fullpel_search_step_param;
+ MV mvp_full;
+ MV tmp_mv;
+ int cost_list[5];
+ const MvLimits tmp_mv_limits = x->mv_limits;
+ uint32_t dummy_dist;
+
+ if (i == 0) {
+ mvp_full.row = b_mv[NEARESTMV].as_mv.row >> 3;
+ mvp_full.col = b_mv[NEARESTMV].as_mv.col >> 3;
+ } else {
+ mvp_full.row = xd->mi[0]->bmi[0].as_mv[0].as_mv.row >> 3;
+ mvp_full.col = xd->mi[0]->bmi[0].as_mv[0].as_mv.col >> 3;
+ }
+
+ vp9_set_mv_search_range(&x->mv_limits,
+ &mbmi_ext->ref_mvs[ref_frame][0].as_mv);
+
+ vp9_full_pixel_search(
+ cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method,
+ x->sadperbit4, cond_cost_list(cpi, cost_list),
+ &mbmi_ext->ref_mvs[ref_frame][0].as_mv, &tmp_mv, INT_MAX, 0);
+
+ x->mv_limits = tmp_mv_limits;
+
+ // calculate the bit cost on motion vector
+ mvp_full.row = tmp_mv.row * 8;
+ mvp_full.col = tmp_mv.col * 8;
+
+ b_rate += vp9_mv_bit_cost(
+ &mvp_full, &mbmi_ext->ref_mvs[ref_frame][0].as_mv,
+ x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+
+ b_rate += cpi->inter_mode_cost[x->mbmi_ext->mode_context[ref_frame]]
+ [INTER_OFFSET(NEWMV)];
+ if (RDCOST(x->rdmult, x->rddiv, b_rate, 0) > b_best_rd) continue;
+
+ cpi->find_fractional_mv_step(
+ x, &tmp_mv, &mbmi_ext->ref_mvs[ref_frame][0].as_mv,
+ cpi->common.allow_high_precision_mv, x->errorperbit,
+ &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
+ cpi->sf.mv.subpel_search_level, cond_cost_list(cpi, cost_list),
+ x->nmvjointcost, x->mvcost, &dummy_dist,
+ &x->pred_sse[ref_frame], NULL, 0, 0,
+ cpi->sf.use_accurate_subpel_search);
+
+ xd->mi[0]->bmi[i].as_mv[0].as_mv = tmp_mv;
+ } else {
+ b_rate += cpi->inter_mode_cost[x->mbmi_ext->mode_context[ref_frame]]
+ [INTER_OFFSET(this_mode)];
+ }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ vp9_highbd_build_inter_predictor(
+ CONVERT_TO_SHORTPTR(pd->pre[0].buf), pd->pre[0].stride,
+ CONVERT_TO_SHORTPTR(pd->dst.buf), pd->dst.stride,
+ &xd->mi[0]->bmi[i].as_mv[0].as_mv, &xd->block_refs[0]->sf,
+ 4 * num_4x4_blocks_wide, 4 * num_4x4_blocks_high, 0,
+ vp9_filter_kernels[mi->interp_filter], MV_PRECISION_Q3,
+ mi_col * MI_SIZE + 4 * (i & 0x01),
+ mi_row * MI_SIZE + 4 * (i >> 1), xd->bd);
+ } else {
+#endif
+ vp9_build_inter_predictor(
+ pd->pre[0].buf, pd->pre[0].stride, pd->dst.buf, pd->dst.stride,
+ &xd->mi[0]->bmi[i].as_mv[0].as_mv, &xd->block_refs[0]->sf,
+ 4 * num_4x4_blocks_wide, 4 * num_4x4_blocks_high, 0,
+ vp9_filter_kernels[mi->interp_filter], MV_PRECISION_Q3,
+ mi_col * MI_SIZE + 4 * (i & 0x01),
+ mi_row * MI_SIZE + 4 * (i >> 1));
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ }
+#endif
+
+ model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist,
+ &var_y, &sse_y, 0);
+
+ this_rdc.rate += b_rate;
+ this_rdc.rdcost =
+ RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist);
+ if (this_rdc.rdcost < b_best_rd) {
+ b_best_rd = this_rdc.rdcost;
+ bsi[ref_frame][i].as_mode = this_mode;
+ bsi[ref_frame][i].as_mv[0].as_mv = xd->mi[0]->bmi[i].as_mv[0].as_mv;
+ }
+ } // mode search
+
+ // restore source and prediction buffer pointers.
+ p->src = orig_src;
+ pd->pre[0] = orig_pre[0];
+ pd->dst = orig_dst;
+ this_rd += b_best_rd;
+
+ xd->mi[0]->bmi[i] = bsi[ref_frame][i];
+ if (num_4x4_blocks_wide > 1) xd->mi[0]->bmi[i + 1] = xd->mi[0]->bmi[i];
+ if (num_4x4_blocks_high > 1) xd->mi[0]->bmi[i + 2] = xd->mi[0]->bmi[i];
+ }
+ } // loop through sub8x8 blocks
+
+ if (this_rd < best_rd) {
+ best_rd = this_rd;
+ best_ref_frame = ref_frame;
+ }
+ } // reference frames
+
+ mi->tx_size = TX_4X4;
+ mi->ref_frame[0] = best_ref_frame;
+ for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
+ for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
+ const int block = idy * 2 + idx;
+ xd->mi[0]->bmi[block] = bsi[best_ref_frame][block];
+ if (num_4x4_blocks_wide > 1)
+ xd->mi[0]->bmi[block + 1] = bsi[best_ref_frame][block];
+ if (num_4x4_blocks_high > 1)
+ xd->mi[0]->bmi[block + 2] = bsi[best_ref_frame][block];
+ }
+ }
+ mi->mode = xd->mi[0]->bmi[3].as_mode;
+ ctx->mic = *(xd->mi[0]);
+ ctx->mbmi_ext = *x->mbmi_ext;
+ ctx->skip_txfm[0] = SKIP_TXFM_NONE;
+ ctx->skip = 0;
+ // Dummy assignment for speed -5. No effect in speed -6.
+ rd_cost->rdcost = best_rd;
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_pickmode.h b/media/libvpx/libvpx/vp9/encoder/vp9_pickmode.h
new file mode 100644
index 0000000000..15207e6cf4
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_pickmode.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_VP9_PICKMODE_H_
+#define VPX_VP9_ENCODER_VP9_PICKMODE_H_
+
+#include "vp9/encoder/vp9_encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp9_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *rd_cost,
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx);
+
+void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
+ int mi_row, int mi_col, RD_COST *rd_cost,
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx);
+
+void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row,
+ int mi_col, RD_COST *rd_cost, BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP9_ENCODER_VP9_PICKMODE_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_quantize.c b/media/libvpx/libvpx/vp9/encoder/vp9_quantize.c
new file mode 100644
index 0000000000..115c66723d
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_quantize.c
@@ -0,0 +1,320 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <math.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+
+#include "vp9/common/vp9_quant_common.h"
+#include "vp9/common/vp9_seg_common.h"
+
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_quantize.h"
+#include "vp9/encoder/vp9_rd.h"
+
+void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ int i, eob = -1;
+ (void)iscan;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ // Quantization pass: quantize and dequantize each coefficient in scan
+ // order, tracking the index of the last nonzero (eob) coefficient.
+ for (i = 0; i < n_coeffs; i++) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+
+ int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
+ tmp = (tmp * quant_ptr[rc != 0]) >> 16;
+
+ qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+
+ if (tmp) eob = i;
+ }
+ *eob_ptr = eob + 1;
+}
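+ // Worked example (illustrative only; the step size and rounding below are
+ // hypothetical inputs, chosen to match what vp9_init_quantizer() later in
+ // this file would produce for an AC dequant step of 64): quant =
+ // (1 << 16) / 64 = 1024 and round = (42 * 64) >> 7 = 21. Then for
+ // coeff = 100:
+ //   tmp = clamp(100 + 21, ...) = 121
+ //   tmp = (121 * 1024) >> 16 = 1    -> qcoeff = 1, dqcoeff = 1 * 64 = 64
+ // and for coeff = 300:
+ //   tmp = (321 * 1024) >> 16 = 5    -> qcoeff = 5, dqcoeff = 320
+ // so the (round, quant) pair implements a rounded division by the dequant
+ // step using only integer multiplies and shifts.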
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ int i;
+ int eob = -1;
+
+ (void)iscan;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ // Quantization pass: quantize and dequantize each coefficient in scan
+ // order, tracking the index of the last nonzero (eob) coefficient.
+ for (i = 0; i < n_coeffs; i++) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int64_t tmp = abs_coeff + round_ptr[rc != 0];
+ const int abs_qcoeff = (int)((tmp * quant_ptr[rc != 0]) >> 16);
+ qcoeff_ptr[rc] = (tran_low_t)(abs_qcoeff ^ coeff_sign) - coeff_sign;
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+ if (abs_qcoeff) eob = i;
+ }
+ *eob_ptr = eob + 1;
+}
+#endif
+
+// TODO(jingning) Refactor this file and combine functions with similar
+// operations.
+void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ int i, eob = -1;
+ (void)iscan;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ for (i = 0; i < n_coeffs; i++) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = (coeff >> 31);
+ int tmp = 0;
+ int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+
+ if (abs_coeff >= (dequant_ptr[rc != 0] >> 2)) {
+ abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+ abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX);
+ tmp = (abs_coeff * quant_ptr[rc != 0]) >> 15;
+ qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+ }
+
+ if (tmp) eob = i;
+ }
+ *eob_ptr = eob + 1;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_quantize_fp_32x32_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr,
+ const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan,
+ const int16_t *iscan) {
+ int i, eob = -1;
+
+ (void)iscan;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ for (i = 0; i < n_coeffs; i++) {
+ int abs_qcoeff = 0;
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+
+ if (abs_coeff >= (dequant_ptr[rc != 0] >> 2)) {
+ const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+ abs_qcoeff = (int)((tmp * quant_ptr[rc != 0]) >> 15);
+ qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+ }
+
+ if (abs_qcoeff) eob = i;
+ }
+ *eob_ptr = eob + 1;
+}
+#endif
+
+static void invert_quant(int16_t *quant, int16_t *shift, int d) {
+ unsigned t;
+ int l, m;
+ t = d;
+ for (l = 0; t > 1; l++) t >>= 1;
+ m = 1 + (1 << (16 + l)) / d;
+ *quant = (int16_t)(m - (1 << 16));
+ *shift = 1 << (16 - l);
+}
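+ // Illustrative sketch of what the (quant, shift) pair encodes (editorial;
+ // the recombination shown is an assumption about how the non-fp block
+ // quantizer consumes it): for d = 100 the loop yields l = 6,
+ // m = 1 + (1 << 22) / 100 = 41944, so *quant = 41944 - 65536 and
+ // *shift = 1 << 10. A value x can then be divided by d with only
+ // multiplies and shifts:
+ //   q = ((((x * *quant) >> 16) + x) * *shift) >> 16   // == (x * m) >> 22
+ // e.g. x = 1000 gives ((-360 + 1000) * 1024) >> 16 = 10 ~= 1000 / 100.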
+
+static int get_qzbin_factor(int q, vpx_bit_depth_t bit_depth) {
+ const int quant = vp9_dc_quant(q, 0, bit_depth);
+#if CONFIG_VP9_HIGHBITDEPTH
+ switch (bit_depth) {
+ case VPX_BITS_8: return q == 0 ? 64 : (quant < 148 ? 84 : 80);
+ case VPX_BITS_10: return q == 0 ? 64 : (quant < 592 ? 84 : 80);
+ default:
+ assert(bit_depth == VPX_BITS_12);
+ return q == 0 ? 64 : (quant < 2368 ? 84 : 80);
+ }
+#else
+ (void)bit_depth;
+ return q == 0 ? 64 : (quant < 148 ? 84 : 80);
+#endif
+}
+
+void vp9_init_quantizer(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ QUANTS *const quants = &cpi->quants;
+ int i, q, quant;
+
+ for (q = 0; q < QINDEX_RANGE; q++) {
+ int qzbin_factor = get_qzbin_factor(q, cm->bit_depth);
+ int qrounding_factor = q == 0 ? 64 : 48;
+ const int sharpness_adjustment = 16 * (7 - cpi->oxcf.sharpness) / 7;
+
+ if (cpi->oxcf.sharpness > 0 && q > 0) {
+ qzbin_factor = 64 + sharpness_adjustment;
+ qrounding_factor = 64 - sharpness_adjustment;
+ }
+
+ for (i = 0; i < 2; ++i) {
+ int qrounding_factor_fp = i == 0 ? 48 : 42;
+ if (q == 0) qrounding_factor_fp = 64;
+ if (cpi->oxcf.sharpness > 0)
+ qrounding_factor_fp = 64 - sharpness_adjustment;
+ // y
+ quant = i == 0 ? vp9_dc_quant(q, cm->y_dc_delta_q, cm->bit_depth)
+ : vp9_ac_quant(q, 0, cm->bit_depth);
+ invert_quant(&quants->y_quant[q][i], &quants->y_quant_shift[q][i], quant);
+ quants->y_quant_fp[q][i] = (1 << 16) / quant;
+ quants->y_round_fp[q][i] = (qrounding_factor_fp * quant) >> 7;
+ quants->y_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7);
+ quants->y_round[q][i] = (qrounding_factor * quant) >> 7;
+ cpi->y_dequant[q][i] = quant;
+
+ // uv
+ quant = i == 0 ? vp9_dc_quant(q, cm->uv_dc_delta_q, cm->bit_depth)
+ : vp9_ac_quant(q, cm->uv_ac_delta_q, cm->bit_depth);
+ invert_quant(&quants->uv_quant[q][i], &quants->uv_quant_shift[q][i],
+ quant);
+ quants->uv_quant_fp[q][i] = (1 << 16) / quant;
+ quants->uv_round_fp[q][i] = (qrounding_factor_fp * quant) >> 7;
+ quants->uv_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7);
+ quants->uv_round[q][i] = (qrounding_factor * quant) >> 7;
+ cpi->uv_dequant[q][i] = quant;
+ }
+
+ for (i = 2; i < 8; i++) {
+ quants->y_quant[q][i] = quants->y_quant[q][1];
+ quants->y_quant_fp[q][i] = quants->y_quant_fp[q][1];
+ quants->y_round_fp[q][i] = quants->y_round_fp[q][1];
+ quants->y_quant_shift[q][i] = quants->y_quant_shift[q][1];
+ quants->y_zbin[q][i] = quants->y_zbin[q][1];
+ quants->y_round[q][i] = quants->y_round[q][1];
+ cpi->y_dequant[q][i] = cpi->y_dequant[q][1];
+
+ quants->uv_quant[q][i] = quants->uv_quant[q][1];
+ quants->uv_quant_fp[q][i] = quants->uv_quant_fp[q][1];
+ quants->uv_round_fp[q][i] = quants->uv_round_fp[q][1];
+ quants->uv_quant_shift[q][i] = quants->uv_quant_shift[q][1];
+ quants->uv_zbin[q][i] = quants->uv_zbin[q][1];
+ quants->uv_round[q][i] = quants->uv_round[q][1];
+ cpi->uv_dequant[q][i] = cpi->uv_dequant[q][1];
+ }
+ }
+}
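+ // Worked example of the derived tables (editorial illustration; assumes
+ // sharpness == 0, q > 0 and an AC quant step of 64, which is below the
+ // 8-bit threshold of 148, so qzbin_factor == 84 and qrounding_factor == 48):
+ //   invert_quant(..., 64) -> quant = 1, quant_shift = 1 << 10
+ //   quant_fp = (1 << 16) / 64           = 1024
+ //   round_fp = (42 * 64) >> 7           = 21   (AC; DC uses factor 48 -> 24)
+ //   zbin     = ROUND_POWER_OF_TWO(84 * 64, 7) = 42
+ //   round    = (48 * 64) >> 7           = 24
+ //   dequant  = 64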
+
+void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) {
+ const VP9_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ QUANTS *const quants = &cpi->quants;
+ const int segment_id = xd->mi[0]->segment_id;
+ const int qindex = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex);
+ const int rdmult = vp9_compute_rd_mult(cpi, qindex + cm->y_dc_delta_q);
+ int i;
+
+ // Y
+ x->plane[0].quant = quants->y_quant[qindex];
+ x->plane[0].quant_fp = quants->y_quant_fp[qindex];
+ x->plane[0].round_fp = quants->y_round_fp[qindex];
+ x->plane[0].quant_shift = quants->y_quant_shift[qindex];
+ x->plane[0].zbin = quants->y_zbin[qindex];
+ x->plane[0].round = quants->y_round[qindex];
+ xd->plane[0].dequant = cpi->y_dequant[qindex];
+ x->plane[0].quant_thred[0] = x->plane[0].zbin[0] * x->plane[0].zbin[0];
+ x->plane[0].quant_thred[1] = x->plane[0].zbin[1] * x->plane[0].zbin[1];
+
+ // UV
+ for (i = 1; i < 3; i++) {
+ x->plane[i].quant = quants->uv_quant[qindex];
+ x->plane[i].quant_fp = quants->uv_quant_fp[qindex];
+ x->plane[i].round_fp = quants->uv_round_fp[qindex];
+ x->plane[i].quant_shift = quants->uv_quant_shift[qindex];
+ x->plane[i].zbin = quants->uv_zbin[qindex];
+ x->plane[i].round = quants->uv_round[qindex];
+ xd->plane[i].dequant = cpi->uv_dequant[qindex];
+ x->plane[i].quant_thred[0] = x->plane[i].zbin[0] * x->plane[i].zbin[0];
+ x->plane[i].quant_thred[1] = x->plane[i].zbin[1] * x->plane[i].zbin[1];
+ }
+
+ x->skip_block = segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP);
+ x->q_index = qindex;
+
+ set_error_per_bit(x, rdmult);
+
+ vp9_initialize_me_consts(cpi, x, x->q_index);
+}
+
+void vp9_frame_init_quantizer(VP9_COMP *cpi) {
+ vp9_init_plane_quantizers(cpi, &cpi->td.mb);
+}
+
+void vp9_set_quantizer(VP9_COMP *cpi, int q) {
+ VP9_COMMON *cm = &cpi->common;
+ // quantizer has to be reinitialized with vp9_init_quantizer() if any
+ // delta_q changes.
+ cm->base_qindex = q;
+ cm->y_dc_delta_q = 0;
+ cm->uv_dc_delta_q = 0;
+ cm->uv_ac_delta_q = 0;
+ if (cpi->oxcf.delta_q_uv != 0) {
+ cm->uv_dc_delta_q = cm->uv_ac_delta_q = cpi->oxcf.delta_q_uv;
+ vp9_init_quantizer(cpi);
+ }
+}
+
+// Table that converts 0-63 Q-range values passed in outside to the Qindex
+// range used internally.
+static const int quantizer_to_qindex[] = {
+ 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48,
+ 52, 56, 60, 64, 68, 72, 76, 80, 84, 88, 92, 96, 100,
+ 104, 108, 112, 116, 120, 124, 128, 132, 136, 140, 144, 148, 152,
+ 156, 160, 164, 168, 172, 176, 180, 184, 188, 192, 196, 200, 204,
+ 208, 212, 216, 220, 224, 228, 232, 236, 240, 244, 249, 255,
+};
+
+int vp9_quantizer_to_qindex(int quantizer) {
+ return quantizer_to_qindex[quantizer];
+}
+
+int vp9_qindex_to_quantizer(int qindex) {
+ int quantizer;
+
+ for (quantizer = 0; quantizer < 64; ++quantizer)
+ if (quantizer_to_qindex[quantizer] >= qindex) return quantizer;
+
+ return 63;
+}
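+ // Usage example (editorial): the mapping is monotonic but not perfectly
+ // uniform at the top end, e.g.
+ //   vp9_quantizer_to_qindex(10)  == 40
+ //   vp9_qindex_to_quantizer(40)  == 10
+ //   vp9_qindex_to_quantizer(41)  == 11   // first table entry >= 41 is 44
+ //   vp9_qindex_to_quantizer(250) == 63   // last two entries are 249, 255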
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_quantize.h b/media/libvpx/libvpx/vp9/encoder/vp9_quantize.h
new file mode 100644
index 0000000000..f626f06566
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_quantize.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_VP9_QUANTIZE_H_
+#define VPX_VP9_ENCODER_VP9_QUANTIZE_H_
+
+#include "./vpx_config.h"
+#include "vp9/encoder/vp9_block.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+ DECLARE_ALIGNED(16, int16_t, y_quant[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, y_quant_shift[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, y_zbin[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, y_round[QINDEX_RANGE][8]);
+
+ // TODO(jingning): re-working of the quantization is in progress; will
+ // decide if we want to deprecate the current use of y_quant.
+ DECLARE_ALIGNED(16, int16_t, y_quant_fp[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, uv_quant_fp[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, y_round_fp[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, uv_round_fp[QINDEX_RANGE][8]);
+
+ DECLARE_ALIGNED(16, int16_t, uv_quant[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, uv_quant_shift[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, uv_zbin[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, uv_round[QINDEX_RANGE][8]);
+} QUANTS;
+
+struct VP9_COMP;
+struct VP9Common;
+
+void vp9_frame_init_quantizer(struct VP9_COMP *cpi);
+
+void vp9_init_plane_quantizers(struct VP9_COMP *cpi, MACROBLOCK *x);
+
+void vp9_init_quantizer(struct VP9_COMP *cpi);
+
+void vp9_set_quantizer(struct VP9_COMP *cm, int q);
+
+int vp9_quantizer_to_qindex(int quantizer);
+
+int vp9_qindex_to_quantizer(int qindex);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP9_ENCODER_VP9_QUANTIZE_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.c b/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.c
new file mode 100644
index 0000000000..13b43aa63a
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.c
@@ -0,0 +1,3339 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/system_state.h"
+
+#include "vp9/common/vp9_alloccommon.h"
+#include "vp9/encoder/vp9_aq_cyclicrefresh.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_entropymode.h"
+#include "vp9/common/vp9_quant_common.h"
+#include "vp9/common/vp9_seg_common.h"
+
+#include "vp9/encoder/vp9_encodemv.h"
+#include "vp9/encoder/vp9_ratectrl.h"
+
+// Max rate per frame for 1080P and below encodes if no level requirement given.
+ // For larger formats limit to MAX_MB_RATE bits per MB.
+ // 4Mbits is derived from the level requirement for level 4 (1080P 30), which
+ // requires that HW can sustain a rate of 16Mbits over a 4 frame group.
+ // If a lower level requirement is specified then it may override this value.
+#define MAX_MB_RATE 250
+#define MAXRATE_1080P 4000000
+
+#define LIMIT_QRANGE_FOR_ALTREF_AND_KEY 1
+
+#define MIN_BPB_FACTOR 0.005
+#define MAX_BPB_FACTOR 50
+
+#if CONFIG_VP9_HIGHBITDEPTH
+#define ASSIGN_MINQ_TABLE(bit_depth, name) \
+ do { \
+ switch (bit_depth) { \
+ case VPX_BITS_8: name = name##_8; break; \
+ case VPX_BITS_10: name = name##_10; break; \
+ default: \
+ assert(bit_depth == VPX_BITS_12); \
+ name = name##_12; \
+ break; \
+ } \
+ } while (0)
+#else
+#define ASSIGN_MINQ_TABLE(bit_depth, name) \
+ do { \
+ (void)bit_depth; \
+ name = name##_8; \
+ } while (0)
+#endif
+
+// Tables relating active max Q to active min Q
+static int kf_low_motion_minq_8[QINDEX_RANGE];
+static int kf_high_motion_minq_8[QINDEX_RANGE];
+static int arfgf_low_motion_minq_8[QINDEX_RANGE];
+static int arfgf_high_motion_minq_8[QINDEX_RANGE];
+static int inter_minq_8[QINDEX_RANGE];
+static int rtc_minq_8[QINDEX_RANGE];
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static int kf_low_motion_minq_10[QINDEX_RANGE];
+static int kf_high_motion_minq_10[QINDEX_RANGE];
+static int arfgf_low_motion_minq_10[QINDEX_RANGE];
+static int arfgf_high_motion_minq_10[QINDEX_RANGE];
+static int inter_minq_10[QINDEX_RANGE];
+static int rtc_minq_10[QINDEX_RANGE];
+static int kf_low_motion_minq_12[QINDEX_RANGE];
+static int kf_high_motion_minq_12[QINDEX_RANGE];
+static int arfgf_low_motion_minq_12[QINDEX_RANGE];
+static int arfgf_high_motion_minq_12[QINDEX_RANGE];
+static int inter_minq_12[QINDEX_RANGE];
+static int rtc_minq_12[QINDEX_RANGE];
+#endif
+
+#ifdef AGGRESSIVE_VBR
+static int gf_high = 2400;
+static int gf_low = 400;
+static int kf_high = 4000;
+static int kf_low = 400;
+#else
+static int gf_high = 2000;
+static int gf_low = 400;
+static int kf_high = 4800;
+static int kf_low = 300;
+#endif
+
+// Functions to compute the active minq lookup table entries based on a
+// formulaic approach to facilitate easier adjustment of the Q tables.
+// The formulae were derived from computing a 3rd order polynomial best
+// fit to the original data (after plotting real maxq vs minq (not q index))
+static int get_minq_index(double maxq, double x3, double x2, double x1,
+ vpx_bit_depth_t bit_depth) {
+ int i;
+ const double minqtarget = VPXMIN(((x3 * maxq + x2) * maxq + x1) * maxq, maxq);
+
+ // Special case handling to deal with the step from q2.0
+ // down to lossless mode represented by q 1.0.
+ if (minqtarget <= 2.0) return 0;
+
+ for (i = 0; i < QINDEX_RANGE; i++) {
+ if (minqtarget <= vp9_convert_qindex_to_q(i, bit_depth)) return i;
+ }
+
+ return QINDEX_RANGE - 1;
+}
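+ // Worked example (editorial): with the kf_low_motion coefficients used in
+ // init_minq_luts() below (x3 = 0.000001, x2 = -0.0004, x1 = 0.150) and
+ // maxq = 100.0, the polynomial gives
+ //   ((0.000001 * 100 - 0.0004) * 100 + 0.150) * 100 = 12.0
+ // so the function returns the smallest qindex whose real Q value
+ // (vp9_convert_qindex_to_q) is at least 12.0.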
+
+static void init_minq_luts(int *kf_low_m, int *kf_high_m, int *arfgf_low,
+ int *arfgf_high, int *inter, int *rtc,
+ vpx_bit_depth_t bit_depth) {
+ int i;
+ for (i = 0; i < QINDEX_RANGE; i++) {
+ const double maxq = vp9_convert_qindex_to_q(i, bit_depth);
+ kf_low_m[i] = get_minq_index(maxq, 0.000001, -0.0004, 0.150, bit_depth);
+ kf_high_m[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.45, bit_depth);
+#ifdef AGGRESSIVE_VBR
+ arfgf_low[i] = get_minq_index(maxq, 0.0000015, -0.0009, 0.275, bit_depth);
+ inter[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.80, bit_depth);
+#else
+ arfgf_low[i] = get_minq_index(maxq, 0.0000015, -0.0009, 0.30, bit_depth);
+ inter[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.70, bit_depth);
+#endif
+ arfgf_high[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.55, bit_depth);
+ rtc[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.70, bit_depth);
+ }
+}
+
+void vp9_rc_init_minq_luts(void) {
+ init_minq_luts(kf_low_motion_minq_8, kf_high_motion_minq_8,
+ arfgf_low_motion_minq_8, arfgf_high_motion_minq_8,
+ inter_minq_8, rtc_minq_8, VPX_BITS_8);
+#if CONFIG_VP9_HIGHBITDEPTH
+ init_minq_luts(kf_low_motion_minq_10, kf_high_motion_minq_10,
+ arfgf_low_motion_minq_10, arfgf_high_motion_minq_10,
+ inter_minq_10, rtc_minq_10, VPX_BITS_10);
+ init_minq_luts(kf_low_motion_minq_12, kf_high_motion_minq_12,
+ arfgf_low_motion_minq_12, arfgf_high_motion_minq_12,
+ inter_minq_12, rtc_minq_12, VPX_BITS_12);
+#endif
+}
+
+// These functions use formulaic calculations to make playing with the
+// quantizer tables easier. If necessary they can be replaced by lookup
+// tables if and when things settle down in the experimental bitstream
+double vp9_convert_qindex_to_q(int qindex, vpx_bit_depth_t bit_depth) {
+// Convert the index to a real Q value (scaled down to match old Q values)
+#if CONFIG_VP9_HIGHBITDEPTH
+ switch (bit_depth) {
+ case VPX_BITS_8: return vp9_ac_quant(qindex, 0, bit_depth) / 4.0;
+ case VPX_BITS_10: return vp9_ac_quant(qindex, 0, bit_depth) / 16.0;
+ default:
+ assert(bit_depth == VPX_BITS_12);
+ return vp9_ac_quant(qindex, 0, bit_depth) / 64.0;
+ }
+#else
+ return vp9_ac_quant(qindex, 0, bit_depth) / 4.0;
+#endif
+}
+
+int vp9_convert_q_to_qindex(double q_val, vpx_bit_depth_t bit_depth) {
+ int i;
+
+ for (i = 0; i < QINDEX_RANGE; ++i)
+ if (vp9_convert_qindex_to_q(i, bit_depth) >= q_val) break;
+
+ if (i == QINDEX_RANGE) i--;
+
+ return i;
+}
+
+int vp9_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex,
+ double correction_factor, vpx_bit_depth_t bit_depth) {
+ const double q = vp9_convert_qindex_to_q(qindex, bit_depth);
+ int enumerator = frame_type == KEY_FRAME ? 2700000 : 1800000;
+
+ assert(correction_factor <= MAX_BPB_FACTOR &&
+ correction_factor >= MIN_BPB_FACTOR);
+
+ // q based adjustment to baseline enumerator
+ enumerator += (int)(enumerator * q) >> 12;
+ return (int)(enumerator * correction_factor / q);
+}
+
+int vp9_estimate_bits_at_q(FRAME_TYPE frame_type, int q, int mbs,
+ double correction_factor,
+ vpx_bit_depth_t bit_depth) {
+ const int bpm =
+ (int)(vp9_rc_bits_per_mb(frame_type, q, correction_factor, bit_depth));
+ return VPXMAX(FRAME_OVERHEAD_BITS,
+ (int)(((uint64_t)bpm * mbs) >> BPER_MB_NORMBITS));
+}
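+ // Worked example (editorial; assumes BPER_MB_NORMBITS == 9 and a 1080p
+ // frame of roughly 8160 macroblocks): for an inter frame at a qindex whose
+ // real Q is 4.0 and correction_factor == 1.0,
+ //   enumerator = 1800000 + (7200000 >> 12) = 1801757
+ //   vp9_rc_bits_per_mb()     = 1801757 / 4.0          ~= 450439
+ //   vp9_estimate_bits_at_q() = (450439 * 8160) >> 9   ~= 7.2 Mbit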
+
+int vp9_rc_clamp_pframe_target_size(const VP9_COMP *const cpi, int target) {
+ const RATE_CONTROL *rc = &cpi->rc;
+ const VP9EncoderConfig *oxcf = &cpi->oxcf;
+
+ const int min_frame_target =
+ VPXMAX(rc->min_frame_bandwidth, rc->avg_frame_bandwidth >> 5);
+ if (target < min_frame_target) target = min_frame_target;
+ if (cpi->refresh_golden_frame && rc->is_src_frame_alt_ref) {
+ // If there is an active ARF at this location use the minimum
+ // bits on this frame even if it is a constructed arf.
+ // The active maximum quantizer ensures that an appropriate
+ // number of bits will be spent if needed for constructed ARFs.
+ target = min_frame_target;
+ }
+
+ // Clip the frame target to the maximum allowed value.
+ if (target > rc->max_frame_bandwidth) target = rc->max_frame_bandwidth;
+
+ if (oxcf->rc_max_inter_bitrate_pct) {
+ const int64_t max_rate =
+ (int64_t)rc->avg_frame_bandwidth * oxcf->rc_max_inter_bitrate_pct / 100;
+ // target is of type int and VPXMIN cannot evaluate to larger than target
+ target = (int)VPXMIN(target, max_rate);
+ }
+ return target;
+}
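+ // Worked example (editorial; hypothetical numbers): with
+ // avg_frame_bandwidth = 100000 and min_frame_bandwidth = 2000 the floor is
+ // max(2000, 100000 >> 5) = 3125 bits; an ARF overlay frame is forced down
+ // to that floor. With rc_max_inter_bitrate_pct = 300 the ceiling becomes
+ // min(rc->max_frame_bandwidth, 100000 * 300 / 100 = 300000) bits.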
+
+int vp9_rc_clamp_iframe_target_size(const VP9_COMP *const cpi, int target) {
+ const RATE_CONTROL *rc = &cpi->rc;
+ const VP9EncoderConfig *oxcf = &cpi->oxcf;
+ if (oxcf->rc_max_intra_bitrate_pct) {
+ const int64_t max_rate =
+ (int64_t)rc->avg_frame_bandwidth * oxcf->rc_max_intra_bitrate_pct / 100;
+ target = (int)VPXMIN(target, max_rate);
+ }
+ if (target > rc->max_frame_bandwidth) target = rc->max_frame_bandwidth;
+ return target;
+}
+
+// TODO(marpan/jianj): bits_off_target and buffer_level are used in the same
+// way for CBR mode, for the buffering updates below. Look into removing one
+// of these (i.e., bits_off_target).
+ // Update the buffer level before encoding with the per-frame-bandwidth.
+void vp9_update_buffer_level_preencode(VP9_COMP *cpi) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ rc->bits_off_target += rc->avg_frame_bandwidth;
+ // Clip the buffer level to the maximum specified buffer size.
+ rc->bits_off_target = VPXMIN(rc->bits_off_target, rc->maximum_buffer_size);
+ rc->buffer_level = rc->bits_off_target;
+}
+
+// Update the buffer level before encoding with the per-frame-bandwidth
+// for SVC. The current and all upper temporal layers are updated, needed
+// for the layered rate control which involves cumulative buffer levels for
+ // the temporal layers. Allow for using the timestamp (pts) delta for the
+// framerate when the set_ref_frame_config is used.
+static void update_buffer_level_svc_preencode(VP9_COMP *cpi) {
+ SVC *const svc = &cpi->svc;
+ int i;
+ // Set this to 1 to use timestamp delta for "framerate" under
+ // ref_frame_config usage.
+ int use_timestamp = 1;
+ const int64_t ts_delta =
+ svc->time_stamp_superframe - svc->time_stamp_prev[svc->spatial_layer_id];
+ for (i = svc->temporal_layer_id; i < svc->number_temporal_layers; ++i) {
+ const int layer =
+ LAYER_IDS_TO_IDX(svc->spatial_layer_id, i, svc->number_temporal_layers);
+ LAYER_CONTEXT *const lc = &svc->layer_context[layer];
+ RATE_CONTROL *const lrc = &lc->rc;
+ if (use_timestamp && cpi->svc.use_set_ref_frame_config &&
+ svc->number_temporal_layers == 1 && ts_delta > 0 &&
+ svc->current_superframe > 0) {
+ // TODO(marpan): This may need to be modified for temporal layers.
+ const double framerate_pts = 10000000.0 / ts_delta;
+ lrc->bits_off_target += (int)round(lc->target_bandwidth / framerate_pts);
+ } else {
+ lrc->bits_off_target += (int)round(lc->target_bandwidth / lc->framerate);
+ }
+ // Clip buffer level to maximum buffer size for the layer.
+ lrc->bits_off_target =
+ VPXMIN(lrc->bits_off_target, lrc->maximum_buffer_size);
+ lrc->buffer_level = lrc->bits_off_target;
+ if (i == svc->temporal_layer_id) {
+ cpi->rc.bits_off_target = lrc->bits_off_target;
+ cpi->rc.buffer_level = lrc->buffer_level;
+ }
+ }
+}
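+ // Worked example (editorial; hypothetical numbers): with a layer target of
+ // 600000 bps and ts_delta = 333333 ticks, framerate_pts = 10000000 / 333333
+ // ~= 30.0, so the layer buffer is credited round(600000 / 30.0) = 20000
+ // bits for this superframe instead of using the configured lc->framerate.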
+
+ // Update the buffer levels of the higher temporal layers, given the encoded
+ // frame size of the current temporal layer.
+static void update_layer_buffer_level_postencode(SVC *svc,
+ int encoded_frame_size) {
+ int i = 0;
+ const int current_temporal_layer = svc->temporal_layer_id;
+ for (i = current_temporal_layer + 1; i < svc->number_temporal_layers; ++i) {
+ const int layer =
+ LAYER_IDS_TO_IDX(svc->spatial_layer_id, i, svc->number_temporal_layers);
+ LAYER_CONTEXT *lc = &svc->layer_context[layer];
+ RATE_CONTROL *lrc = &lc->rc;
+ lrc->bits_off_target -= encoded_frame_size;
+ // Clip buffer level to maximum buffer size for the layer.
+ lrc->bits_off_target =
+ VPXMIN(lrc->bits_off_target, lrc->maximum_buffer_size);
+ lrc->buffer_level = lrc->bits_off_target;
+ }
+}
+
+// Update the buffer level after encoding with encoded frame size.
+static void update_buffer_level_postencode(VP9_COMP *cpi,
+ int encoded_frame_size) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ rc->bits_off_target -= encoded_frame_size;
+ // Clip the buffer level to the maximum specified buffer size.
+ rc->bits_off_target = VPXMIN(rc->bits_off_target, rc->maximum_buffer_size);
+ // For screen-content mode, and if the frame dropper is off, don't let the
+ // buffer level go below the threshold, given here as -rc->maximum_buffer_size.
+ if (cpi->oxcf.content == VP9E_CONTENT_SCREEN &&
+ cpi->oxcf.drop_frames_water_mark == 0)
+ rc->bits_off_target = VPXMAX(rc->bits_off_target, -rc->maximum_buffer_size);
+
+ rc->buffer_level = rc->bits_off_target;
+
+ if (is_one_pass_svc(cpi)) {
+ update_layer_buffer_level_postencode(&cpi->svc, encoded_frame_size);
+ }
+}
+
+int vp9_rc_get_default_min_gf_interval(int width, int height,
+ double framerate) {
+ // Assume we do not need any constraint lower than 4K 20 fps
+ static const double factor_safe = 3840 * 2160 * 20.0;
+ const double factor = width * height * framerate;
+ const int default_interval =
+ clamp((int)(framerate * 0.125), MIN_GF_INTERVAL, MAX_GF_INTERVAL);
+
+ if (factor <= factor_safe)
+ return default_interval;
+ else
+ return VPXMAX(default_interval,
+ (int)(MIN_GF_INTERVAL * factor / factor_safe + 0.5));
+ // Note this logic makes:
+ // 4K24: 5
+ // 4K30: 6
+ // 4K60: 12
+}
+
+int vp9_rc_get_default_max_gf_interval(double framerate, int min_gf_interval) {
+ int interval = VPXMIN(MAX_GF_INTERVAL, (int)(framerate * 0.75));
+ interval += (interval & 0x01); // Round to even value
+ return VPXMAX(interval, min_gf_interval);
+}
+
+void vp9_rc_init(const VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) {
+ int i;
+
+ if (pass == 0 && oxcf->rc_mode == VPX_CBR) {
+ rc->avg_frame_qindex[KEY_FRAME] = oxcf->worst_allowed_q;
+ rc->avg_frame_qindex[INTER_FRAME] = oxcf->worst_allowed_q;
+ } else {
+ rc->avg_frame_qindex[KEY_FRAME] =
+ (oxcf->worst_allowed_q + oxcf->best_allowed_q) / 2;
+ rc->avg_frame_qindex[INTER_FRAME] =
+ (oxcf->worst_allowed_q + oxcf->best_allowed_q) / 2;
+ }
+
+ rc->last_q[KEY_FRAME] = oxcf->best_allowed_q;
+ rc->last_q[INTER_FRAME] = oxcf->worst_allowed_q;
+
+ rc->buffer_level = rc->starting_buffer_level;
+ rc->bits_off_target = rc->starting_buffer_level;
+
+ rc->rolling_target_bits = rc->avg_frame_bandwidth;
+ rc->rolling_actual_bits = rc->avg_frame_bandwidth;
+ rc->long_rolling_target_bits = rc->avg_frame_bandwidth;
+ rc->long_rolling_actual_bits = rc->avg_frame_bandwidth;
+
+ rc->total_actual_bits = 0;
+ rc->total_target_bits = 0;
+ rc->total_target_vs_actual = 0;
+ rc->avg_frame_low_motion = 0;
+ rc->count_last_scene_change = 0;
+ rc->af_ratio_onepass_vbr = 10;
+ rc->prev_avg_source_sad_lag = 0;
+ rc->high_source_sad = 0;
+ rc->reset_high_source_sad = 0;
+ rc->high_source_sad_lagindex = -1;
+ rc->high_num_blocks_with_motion = 0;
+ rc->hybrid_intra_scene_change = 0;
+ rc->re_encode_maxq_scene_change = 0;
+ rc->alt_ref_gf_group = 0;
+ rc->last_frame_is_src_altref = 0;
+ rc->fac_active_worst_inter = 150;
+ rc->fac_active_worst_gf = 100;
+ rc->force_qpmin = 0;
+ for (i = 0; i < MAX_LAG_BUFFERS; ++i) rc->avg_source_sad[i] = 0;
+ rc->frames_to_key = 0;
+ rc->frames_since_key = 8; // Sensible default for first frame.
+ rc->this_key_frame_forced = 0;
+ rc->next_key_frame_forced = 0;
+ rc->source_alt_ref_pending = 0;
+ rc->source_alt_ref_active = 0;
+
+ rc->frames_till_gf_update_due = 0;
+ rc->constrain_gf_key_freq_onepass_vbr = 1;
+ rc->ni_av_qi = oxcf->worst_allowed_q;
+ rc->ni_tot_qi = 0;
+ rc->ni_frames = 0;
+
+ rc->tot_q = 0.0;
+ rc->avg_q = vp9_convert_qindex_to_q(oxcf->worst_allowed_q, oxcf->bit_depth);
+
+ for (i = 0; i < RATE_FACTOR_LEVELS; ++i) {
+ rc->rate_correction_factors[i] = 1.0;
+ rc->damped_adjustment[i] = 0;
+ }
+
+ rc->min_gf_interval = oxcf->min_gf_interval;
+ rc->max_gf_interval = oxcf->max_gf_interval;
+ if (rc->min_gf_interval == 0)
+ rc->min_gf_interval = vp9_rc_get_default_min_gf_interval(
+ oxcf->width, oxcf->height, oxcf->init_framerate);
+ if (rc->max_gf_interval == 0)
+ rc->max_gf_interval = vp9_rc_get_default_max_gf_interval(
+ oxcf->init_framerate, rc->min_gf_interval);
+ rc->baseline_gf_interval = (rc->min_gf_interval + rc->max_gf_interval) / 2;
+ if ((oxcf->pass == 0) && (oxcf->rc_mode == VPX_Q)) {
+ rc->static_scene_max_gf_interval = FIXED_GF_INTERVAL;
+ } else {
+ rc->static_scene_max_gf_interval = MAX_STATIC_GF_GROUP_LENGTH;
+ }
+
+ rc->force_max_q = 0;
+ rc->last_post_encode_dropped_scene_change = 0;
+ rc->use_post_encode_drop = 0;
+ rc->ext_use_post_encode_drop = 0;
+ rc->disable_overshoot_maxq_cbr = 0;
+ rc->arf_active_best_quality_adjustment_factor = 1.0;
+ rc->arf_increase_active_best_quality = 0;
+ rc->preserve_arf_as_gld = 0;
+ rc->preserve_next_arf_as_gld = 0;
+ rc->show_arf_as_gld = 0;
+}
+
+static int check_buffer_above_thresh(VP9_COMP *cpi, int drop_mark) {
+ SVC *svc = &cpi->svc;
+ if (!cpi->use_svc || cpi->svc.framedrop_mode != FULL_SUPERFRAME_DROP) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ return (rc->buffer_level > drop_mark);
+ } else {
+ int i;
+ // For SVC in the FULL_SUPERFRAME_DROP mode: the condition on the
+ // buffer (if it is above threshold, so no drop) is checked on the current
+ // and upper spatial layers. If any spatial layer is not above threshold
+ // then we return 0.
+ for (i = svc->spatial_layer_id; i < svc->number_spatial_layers; ++i) {
+ const int layer = LAYER_IDS_TO_IDX(i, svc->temporal_layer_id,
+ svc->number_temporal_layers);
+ LAYER_CONTEXT *lc = &svc->layer_context[layer];
+ RATE_CONTROL *lrc = &lc->rc;
+ // Exclude check for layer whose bitrate is 0.
+ if (lc->target_bandwidth > 0) {
+ const int drop_mark_layer = (int)(cpi->svc.framedrop_thresh[i] *
+ lrc->optimal_buffer_level / 100);
+ if (!(lrc->buffer_level > drop_mark_layer)) return 0;
+ }
+ }
+ return 1;
+ }
+}
+
+static int check_buffer_below_thresh(VP9_COMP *cpi, int drop_mark) {
+ SVC *svc = &cpi->svc;
+ if (!cpi->use_svc || cpi->svc.framedrop_mode == LAYER_DROP) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ return (rc->buffer_level <= drop_mark);
+ } else {
+ int i;
+ // For SVC in the constrained framedrop mode (svc->framedrop_mode =
+ // CONSTRAINED_LAYER_DROP or FULL_SUPERFRAME_DROP): the condition on
+ // buffer (if it is below threshold, so drop the frame) is checked on current
+ // and upper spatial layers. For FULL_SUPERFRAME_DROP mode if any
+ // spatial layer is <= threshold, then we return 1 (drop).
+ for (i = svc->spatial_layer_id; i < svc->number_spatial_layers; ++i) {
+ const int layer = LAYER_IDS_TO_IDX(i, svc->temporal_layer_id,
+ svc->number_temporal_layers);
+ LAYER_CONTEXT *lc = &svc->layer_context[layer];
+ RATE_CONTROL *lrc = &lc->rc;
+ // Exclude check for layer whose bitrate is 0.
+ if (lc->target_bandwidth > 0) {
+ const int drop_mark_layer = (int)(cpi->svc.framedrop_thresh[i] *
+ lrc->optimal_buffer_level / 100);
+ if (cpi->svc.framedrop_mode == FULL_SUPERFRAME_DROP) {
+ if (lrc->buffer_level <= drop_mark_layer) return 1;
+ } else {
+ if (!(lrc->buffer_level <= drop_mark_layer)) return 0;
+ }
+ }
+ }
+ if (cpi->svc.framedrop_mode == FULL_SUPERFRAME_DROP)
+ return 0;
+ else
+ return 1;
+ }
+}
+
+int vp9_test_drop(VP9_COMP *cpi) {
+ const VP9EncoderConfig *oxcf = &cpi->oxcf;
+ RATE_CONTROL *const rc = &cpi->rc;
+ SVC *svc = &cpi->svc;
+ int drop_frames_water_mark = oxcf->drop_frames_water_mark;
+ if (cpi->use_svc) {
+ // If we have dropped max_consec_drop frames, then we don't
+ // drop this spatial layer, and reset counter to 0.
+ if (svc->drop_count[svc->spatial_layer_id] == svc->max_consec_drop) {
+ svc->drop_count[svc->spatial_layer_id] = 0;
+ return 0;
+ } else {
+ drop_frames_water_mark = svc->framedrop_thresh[svc->spatial_layer_id];
+ }
+ }
+ if (!drop_frames_water_mark ||
+ (svc->spatial_layer_id > 0 &&
+ svc->framedrop_mode == FULL_SUPERFRAME_DROP)) {
+ return 0;
+ } else {
+ if ((rc->buffer_level < 0 && svc->framedrop_mode != FULL_SUPERFRAME_DROP) ||
+ (check_buffer_below_thresh(cpi, -1) &&
+ svc->framedrop_mode == FULL_SUPERFRAME_DROP)) {
+ // Always drop if buffer is below 0.
+ return 1;
+ } else {
+ // If buffer is below drop_mark, for now just drop every other frame
+ // (starting with the next frame) until it increases back over drop_mark.
+ int drop_mark =
+ (int)(drop_frames_water_mark * rc->optimal_buffer_level / 100);
+ if (check_buffer_above_thresh(cpi, drop_mark) &&
+ (rc->decimation_factor > 0)) {
+ --rc->decimation_factor;
+ } else if (check_buffer_below_thresh(cpi, drop_mark) &&
+ rc->decimation_factor == 0) {
+ rc->decimation_factor = 1;
+ }
+ if (rc->decimation_factor > 0) {
+ if (rc->decimation_count > 0) {
+ --rc->decimation_count;
+ return 1;
+ } else {
+ rc->decimation_count = rc->decimation_factor;
+ return 0;
+ }
+ } else {
+ rc->decimation_count = 0;
+ return 0;
+ }
+ }
+ }
+}
+
+int post_encode_drop_cbr(VP9_COMP *cpi, size_t *size) {
+ size_t frame_size = *size << 3;
+ int64_t new_buffer_level =
+ cpi->rc.buffer_level + cpi->rc.avg_frame_bandwidth - (int64_t)frame_size;
+
+ // For now we drop if new buffer level (given the encoded frame size) goes
+ // below 0.
+ if (new_buffer_level < 0) {
+ *size = 0;
+ vp9_rc_postencode_update_drop_frame(cpi);
+ // Update flag to use for next frame.
+ if (cpi->rc.high_source_sad ||
+ (cpi->use_svc && cpi->svc.high_source_sad_superframe))
+ cpi->rc.last_post_encode_dropped_scene_change = 1;
+ // Force max_q on the next frame.
+ cpi->rc.force_max_q = 1;
+ cpi->rc.avg_frame_qindex[INTER_FRAME] = cpi->rc.worst_quality;
+ cpi->last_frame_dropped = 1;
+ cpi->ext_refresh_frame_flags_pending = 0;
+ if (cpi->use_svc) {
+ SVC *svc = &cpi->svc;
+ int sl = 0;
+ int tl = 0;
+ svc->last_layer_dropped[svc->spatial_layer_id] = 1;
+ svc->drop_spatial_layer[svc->spatial_layer_id] = 1;
+ svc->drop_count[svc->spatial_layer_id]++;
+ svc->skip_enhancement_layer = 1;
+ // Postencode drop is only checked on base spatial layer,
+ // for now if max-q is set on base we force it on all layers.
+ for (sl = 0; sl < svc->number_spatial_layers; ++sl) {
+ for (tl = 0; tl < svc->number_temporal_layers; ++tl) {
+ const int layer =
+ LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
+ LAYER_CONTEXT *lc = &svc->layer_context[layer];
+ RATE_CONTROL *lrc = &lc->rc;
+ lrc->force_max_q = 1;
+ lrc->avg_frame_qindex[INTER_FRAME] = cpi->rc.worst_quality;
+ }
+ }
+ }
+ return 1;
+ }
+
+ cpi->rc.force_max_q = 0;
+ cpi->rc.last_post_encode_dropped_scene_change = 0;
+ return 0;
+}
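+ // Worked example (editorial; hypothetical numbers): with buffer_level =
+ // 20000, avg_frame_bandwidth = 40000 and an encoded size of 8000 bytes
+ // (64000 bits), new_buffer_level = 20000 + 40000 - 64000 = -4000 < 0, so
+ // the frame is dropped: *size is zeroed, force_max_q is set and the next
+ // frame starts from worst_quality.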
+
+int vp9_rc_drop_frame(VP9_COMP *cpi) {
+ SVC *svc = &cpi->svc;
+ int svc_prev_layer_dropped = 0;
+ // In the constrained or full_superframe framedrop modes for SVC
+ // (framedrop_mode is neither LAYER_DROP nor CONSTRAINED_FROM_ABOVE_DROP),
+ // if the previous spatial layer was dropped, drop the current spatial layer.
+ if (cpi->use_svc && svc->spatial_layer_id > 0 &&
+ svc->drop_spatial_layer[svc->spatial_layer_id - 1])
+ svc_prev_layer_dropped = 1;
+ if ((svc_prev_layer_dropped && svc->framedrop_mode != LAYER_DROP &&
+ svc->framedrop_mode != CONSTRAINED_FROM_ABOVE_DROP) ||
+ svc->force_drop_constrained_from_above[svc->spatial_layer_id] ||
+ vp9_test_drop(cpi)) {
+ vp9_rc_postencode_update_drop_frame(cpi);
+ cpi->ext_refresh_frame_flags_pending = 0;
+ cpi->last_frame_dropped = 1;
+ if (cpi->use_svc) {
+ svc->last_layer_dropped[svc->spatial_layer_id] = 1;
+ svc->drop_spatial_layer[svc->spatial_layer_id] = 1;
+ svc->drop_count[svc->spatial_layer_id]++;
+ svc->skip_enhancement_layer = 1;
+ if (svc->framedrop_mode == LAYER_DROP ||
+ (svc->framedrop_mode == CONSTRAINED_FROM_ABOVE_DROP &&
+ svc->force_drop_constrained_from_above[svc->number_spatial_layers -
+ 1] == 0) ||
+ svc->drop_spatial_layer[0] == 0) {
+ // For the case of constrained drop mode where full superframe is
+ // dropped, we don't increment the svc frame counters.
+ // In particular temporal layer counter (which is incremented in
+ // vp9_inc_frame_in_layer()) won't be incremented, so on a dropped
+ // frame we try the same temporal_layer_id on next incoming frame.
+ // This is to avoid an issue with temporal alignment with full
+ // superframe dropping.
+ vp9_inc_frame_in_layer(cpi);
+ }
+ if (svc->spatial_layer_id == svc->number_spatial_layers - 1) {
+ int i;
+ int all_layers_drop = 1;
+ for (i = 0; i < svc->spatial_layer_id; i++) {
+ if (svc->drop_spatial_layer[i] == 0) {
+ all_layers_drop = 0;
+ break;
+ }
+ }
+ if (all_layers_drop == 1) svc->skip_enhancement_layer = 0;
+ }
+ }
+ return 1;
+ }
+ return 0;
+}
+
+static int adjust_q_cbr(const VP9_COMP *cpi, int q) {
+ // This makes sure q is between oscillating Qs to prevent resonance.
+ if (!cpi->rc.reset_high_source_sad &&
+ (!cpi->oxcf.gf_cbr_boost_pct ||
+ !(cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame)) &&
+ (cpi->rc.rc_1_frame * cpi->rc.rc_2_frame == -1) &&
+ cpi->rc.q_1_frame != cpi->rc.q_2_frame) {
+ int qclamp = clamp(q, VPXMIN(cpi->rc.q_1_frame, cpi->rc.q_2_frame),
+ VPXMAX(cpi->rc.q_1_frame, cpi->rc.q_2_frame));
+ // If the previous frame had overshoot and the current q needs to increase
+ // above the clamped value, reduce the clamp for faster reaction to
+ // overshoot.
+ if (cpi->rc.rc_1_frame == -1 && q > qclamp)
+ q = (q + qclamp) >> 1;
+ else
+ q = qclamp;
+ }
+ if (cpi->oxcf.content == VP9E_CONTENT_SCREEN)
+ vp9_cyclic_refresh_limit_q(cpi, &q);
+ return VPXMAX(VPXMIN(q, cpi->rc.worst_quality), cpi->rc.best_quality);
+}
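+ // Worked example (editorial; hypothetical numbers): if the last two frames
+ // used q_1_frame = 40 and q_2_frame = 60 with rc_1_frame = -1 (overshoot)
+ // and rc_2_frame = 1 (undershoot), a proposed q of 70 is first clamped to
+ // qclamp = 60; because the previous frame overshot (rc_1_frame == -1) and
+ // 70 > 60, the result is relaxed to (70 + 60) >> 1 = 65 before the final
+ // clamp to [best_quality, worst_quality].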
+
+static double get_rate_correction_factor(const VP9_COMP *cpi) {
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const VP9_COMMON *const cm = &cpi->common;
+ double rcf;
+
+ if (frame_is_intra_only(cm)) {
+ rcf = rc->rate_correction_factors[KF_STD];
+ } else if (cpi->oxcf.pass == 2) {
+ RATE_FACTOR_LEVEL rf_lvl =
+ cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index];
+ rcf = rc->rate_correction_factors[rf_lvl];
+ } else {
+ if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) &&
+ !rc->is_src_frame_alt_ref && !cpi->use_svc &&
+ (cpi->oxcf.rc_mode != VPX_CBR || cpi->oxcf.gf_cbr_boost_pct > 100))
+ rcf = rc->rate_correction_factors[GF_ARF_STD];
+ else
+ rcf = rc->rate_correction_factors[INTER_NORMAL];
+ }
+ rcf *= rcf_mult[rc->frame_size_selector];
+ return fclamp(rcf, MIN_BPB_FACTOR, MAX_BPB_FACTOR);
+}
+
+static void set_rate_correction_factor(VP9_COMP *cpi, double factor) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ const VP9_COMMON *const cm = &cpi->common;
+
+ // Normalize RCF to account for the size-dependent scaling factor.
+ factor /= rcf_mult[cpi->rc.frame_size_selector];
+
+ factor = fclamp(factor, MIN_BPB_FACTOR, MAX_BPB_FACTOR);
+
+ if (frame_is_intra_only(cm)) {
+ rc->rate_correction_factors[KF_STD] = factor;
+ } else if (cpi->oxcf.pass == 2) {
+ RATE_FACTOR_LEVEL rf_lvl =
+ cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index];
+ rc->rate_correction_factors[rf_lvl] = factor;
+ } else {
+ if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) &&
+ !rc->is_src_frame_alt_ref && !cpi->use_svc &&
+ (cpi->oxcf.rc_mode != VPX_CBR || cpi->oxcf.gf_cbr_boost_pct > 100))
+ rc->rate_correction_factors[GF_ARF_STD] = factor;
+ else
+ rc->rate_correction_factors[INTER_NORMAL] = factor;
+ }
+}
+
+void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi) {
+ const VP9_COMMON *const cm = &cpi->common;
+ int correction_factor = 100;
+ double rate_correction_factor = get_rate_correction_factor(cpi);
+ double adjustment_limit;
+ RATE_FACTOR_LEVEL rf_lvl =
+ cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index];
+
+ int projected_size_based_on_q = 0;
+
+ // Do not update the rate factors for arf overlay frames.
+ if (cpi->rc.is_src_frame_alt_ref) return;
+
+ // Clear down mmx registers to allow floating point in what follows
+ vpx_clear_system_state();
+
+ // Work out how big we would have expected the frame to be at this Q given
+ // the current correction factor.
+ // Stay in double to avoid int overflow when values are large
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->common.seg.enabled) {
+ projected_size_based_on_q =
+ vp9_cyclic_refresh_estimate_bits_at_q(cpi, rate_correction_factor);
+ } else {
+ FRAME_TYPE frame_type = cm->intra_only ? KEY_FRAME : cm->frame_type;
+ projected_size_based_on_q =
+ vp9_estimate_bits_at_q(frame_type, cm->base_qindex, cm->MBs,
+ rate_correction_factor, cm->bit_depth);
+ }
+ // Work out a size correction factor.
+ if (projected_size_based_on_q > FRAME_OVERHEAD_BITS)
+ correction_factor = (int)((100 * (int64_t)cpi->rc.projected_frame_size) /
+ projected_size_based_on_q);
+
+ // Do not use damped adjustment for the first frame of each frame type
+ if (!cpi->rc.damped_adjustment[rf_lvl]) {
+ adjustment_limit = 1.0;
+ cpi->rc.damped_adjustment[rf_lvl] = 1;
+ } else {
+ // More heavily damped adjustment used if we have been oscillating either
+ // side of target.
+ adjustment_limit =
+ 0.25 + 0.5 * VPXMIN(1, fabs(log10(0.01 * correction_factor)));
+ }
+
+ cpi->rc.q_2_frame = cpi->rc.q_1_frame;
+ cpi->rc.q_1_frame = cm->base_qindex;
+ cpi->rc.rc_2_frame = cpi->rc.rc_1_frame;
+ if (correction_factor > 110)
+ cpi->rc.rc_1_frame = -1;
+ else if (correction_factor < 90)
+ cpi->rc.rc_1_frame = 1;
+ else
+ cpi->rc.rc_1_frame = 0;
+
+ // Turn off oscillation detection in the case of massive overshoot.
+ if (cpi->rc.rc_1_frame == -1 && cpi->rc.rc_2_frame == 1 &&
+ correction_factor > 1000) {
+ cpi->rc.rc_2_frame = 0;
+ }
+
+ if (correction_factor > 102) {
+ // We are not already at the worst allowable quality
+ correction_factor =
+ (int)(100 + ((correction_factor - 100) * adjustment_limit));
+ rate_correction_factor = (rate_correction_factor * correction_factor) / 100;
+ // Keep rate_correction_factor within limits
+ if (rate_correction_factor > MAX_BPB_FACTOR)
+ rate_correction_factor = MAX_BPB_FACTOR;
+ } else if (correction_factor < 99) {
+ // We are not already at the best allowable quality
+ correction_factor =
+ (int)(100 - ((100 - correction_factor) * adjustment_limit));
+ rate_correction_factor = (rate_correction_factor * correction_factor) / 100;
+
+ // Keep rate_correction_factor within limits
+ if (rate_correction_factor < MIN_BPB_FACTOR)
+ rate_correction_factor = MIN_BPB_FACTOR;
+ }
+
+ set_rate_correction_factor(cpi, rate_correction_factor);
+}
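+ // Worked example (editorial; hypothetical numbers): if the coded frame is
+ // 150000 bits but only 100000 were predicted at this Q, correction_factor
+ // starts at 150. With damping already enabled, adjustment_limit =
+ // 0.25 + 0.5 * |log10(1.5)| ~= 0.338, so correction_factor becomes
+ // (int)(100 + 50 * 0.338) = 116 and the stored rate correction factor is
+ // scaled by 1.16 (clamped to MAX_BPB_FACTOR). rc_1_frame is also set to
+ // -1 because 150 > 110.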
+
+int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame,
+ int active_best_quality, int active_worst_quality) {
+ const VP9_COMMON *const cm = &cpi->common;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ int q = active_worst_quality;
+ int last_error = INT_MAX;
+ int i, target_bits_per_mb, bits_per_mb_at_this_q;
+ const double correction_factor = get_rate_correction_factor(cpi);
+
+ // Calculate required scaling factor based on target frame size and size of
+ // frame produced using previous Q.
+ target_bits_per_mb =
+ (int)(((uint64_t)target_bits_per_frame << BPER_MB_NORMBITS) / cm->MBs);
+
+ i = active_best_quality;
+
+ do {
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cr->apply_cyclic_refresh &&
+ (!cpi->oxcf.gf_cbr_boost_pct || !cpi->refresh_golden_frame)) {
+ bits_per_mb_at_this_q =
+ (int)vp9_cyclic_refresh_rc_bits_per_mb(cpi, i, correction_factor);
+ } else {
+ FRAME_TYPE frame_type = cm->intra_only ? KEY_FRAME : cm->frame_type;
+ bits_per_mb_at_this_q = (int)vp9_rc_bits_per_mb(
+ frame_type, i, correction_factor, cm->bit_depth);
+ }
+
+ if (bits_per_mb_at_this_q <= target_bits_per_mb) {
+ if ((target_bits_per_mb - bits_per_mb_at_this_q) <= last_error)
+ q = i;
+ else
+ q = i - 1;
+
+ break;
+ } else {
+ last_error = bits_per_mb_at_this_q - target_bits_per_mb;
+ }
+ } while (++i <= active_worst_quality);
+
+ // Adjustment to q for CBR mode.
+ if (cpi->oxcf.rc_mode == VPX_CBR) return adjust_q_cbr(cpi, q);
+
+ return q;
+}
+
+static int get_active_quality(int q, int gfu_boost, int low, int high,
+ int *low_motion_minq, int *high_motion_minq) {
+ if (gfu_boost > high) {
+ return low_motion_minq[q];
+ } else if (gfu_boost < low) {
+ return high_motion_minq[q];
+ } else {
+ const int gap = high - low;
+ const int offset = high - gfu_boost;
+ const int qdiff = high_motion_minq[q] - low_motion_minq[q];
+ const int adjustment = ((offset * qdiff) + (gap >> 1)) / gap;
+ return low_motion_minq[q] + adjustment;
+ }
+}
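+ // Worked example (editorial): with the non-AGGRESSIVE_VBR GF bounds above
+ // (low = 400, high = 2000) and gfu_boost = 1200, gap = 1600, offset = 800,
+ // so the adjustment is roughly qdiff / 2 and the returned active quality
+ // lands about halfway between low_motion_minq[q] and high_motion_minq[q].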
+
+static int get_kf_active_quality(const RATE_CONTROL *const rc, int q,
+ vpx_bit_depth_t bit_depth) {
+ int *kf_low_motion_minq;
+ int *kf_high_motion_minq;
+ ASSIGN_MINQ_TABLE(bit_depth, kf_low_motion_minq);
+ ASSIGN_MINQ_TABLE(bit_depth, kf_high_motion_minq);
+ return get_active_quality(q, rc->kf_boost, kf_low, kf_high,
+ kf_low_motion_minq, kf_high_motion_minq);
+}
+
+static int get_gf_active_quality(const VP9_COMP *const cpi, int q,
+ vpx_bit_depth_t bit_depth) {
+ const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+ const RATE_CONTROL *const rc = &cpi->rc;
+
+ int *arfgf_low_motion_minq;
+ int *arfgf_high_motion_minq;
+ const int gfu_boost = cpi->multi_layer_arf
+ ? gf_group->gfu_boost[gf_group->index]
+ : rc->gfu_boost;
+ ASSIGN_MINQ_TABLE(bit_depth, arfgf_low_motion_minq);
+ ASSIGN_MINQ_TABLE(bit_depth, arfgf_high_motion_minq);
+ return get_active_quality(q, gfu_boost, gf_low, gf_high,
+ arfgf_low_motion_minq, arfgf_high_motion_minq);
+}
+
+static int calc_active_worst_quality_one_pass_vbr(const VP9_COMP *cpi) {
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const unsigned int curr_frame = cpi->common.current_video_frame;
+ int active_worst_quality;
+
+ if (cpi->common.frame_type == KEY_FRAME) {
+ active_worst_quality =
+ curr_frame == 0 ? rc->worst_quality : rc->last_q[KEY_FRAME] << 1;
+ } else {
+ if (!rc->is_src_frame_alt_ref && !cpi->use_svc &&
+ (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
+ active_worst_quality =
+ curr_frame == 1
+ ? rc->last_q[KEY_FRAME] * 5 >> 2
+ : rc->last_q[INTER_FRAME] * rc->fac_active_worst_gf / 100;
+ } else {
+ active_worst_quality = curr_frame == 1
+ ? rc->last_q[KEY_FRAME] << 1
+ : rc->avg_frame_qindex[INTER_FRAME] *
+ rc->fac_active_worst_inter / 100;
+ }
+ }
+ return VPXMIN(active_worst_quality, rc->worst_quality);
+}
+
+// Adjust active_worst_quality level based on buffer level.
+static int calc_active_worst_quality_one_pass_cbr(const VP9_COMP *cpi) {
+ // Adjust active_worst_quality: If buffer is above the optimal/target level,
+ // bring active_worst_quality down depending on fullness of buffer.
+ // If buffer is below the optimal level, let the active_worst_quality go from
+ // ambient Q (at buffer = optimal level) to worst_quality level
+ // (at buffer = critical level).
+ const VP9_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *rc = &cpi->rc;
+ // Buffer level below which we push active_worst to worst_quality.
+ int64_t critical_level = rc->optimal_buffer_level >> 3;
+ int64_t buff_lvl_step = 0;
+ int adjustment = 0;
+ int active_worst_quality;
+ int ambient_qp;
+ unsigned int num_frames_weight_key = 5 * cpi->svc.number_temporal_layers;
+ if (frame_is_intra_only(cm) || rc->reset_high_source_sad || rc->force_max_q)
+ return rc->worst_quality;
+ // For ambient_qp we use minimum of avg_frame_qindex[KEY_FRAME/INTER_FRAME]
+ // for the first few frames following key frame. These are both initialized
+ // to worst_quality and updated with (3/4, 1/4) average in postencode_update.
+ // So for the first few frames following a key frame, the qp of that key
+ // frame is weighted into the active_worst_quality setting.
+ ambient_qp = (cm->current_video_frame < num_frames_weight_key)
+ ? VPXMIN(rc->avg_frame_qindex[INTER_FRAME],
+ rc->avg_frame_qindex[KEY_FRAME])
+ : rc->avg_frame_qindex[INTER_FRAME];
+ active_worst_quality = VPXMIN(rc->worst_quality, (ambient_qp * 5) >> 2);
+ // For SVC, if the current base spatial layer was a key frame, use the QP
+ // from that base layer for ambient_qp.
+ if (cpi->use_svc && cpi->svc.spatial_layer_id > 0) {
+ int layer = LAYER_IDS_TO_IDX(0, cpi->svc.temporal_layer_id,
+ cpi->svc.number_temporal_layers);
+ const LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer];
+ if (lc->is_key_frame) {
+ const RATE_CONTROL *lrc = &lc->rc;
+ ambient_qp = VPXMIN(ambient_qp, lrc->last_q[KEY_FRAME]);
+ active_worst_quality = VPXMIN(rc->worst_quality, (ambient_qp * 9) >> 3);
+ }
+ }
+ if (rc->buffer_level > rc->optimal_buffer_level) {
+ // Adjust down.
+ // Maximum limit for down adjustment ~30%; make it lower for screen content.
+ int max_adjustment_down = active_worst_quality / 3;
+ if (cpi->oxcf.content == VP9E_CONTENT_SCREEN)
+ max_adjustment_down = active_worst_quality >> 3;
+ if (max_adjustment_down) {
+ buff_lvl_step = ((rc->maximum_buffer_size - rc->optimal_buffer_level) /
+ max_adjustment_down);
+ if (buff_lvl_step)
+ adjustment = (int)((rc->buffer_level - rc->optimal_buffer_level) /
+ buff_lvl_step);
+ active_worst_quality -= adjustment;
+ }
+ } else if (rc->buffer_level > critical_level) {
+ // Adjust up from ambient Q.
+ if (critical_level) {
+ buff_lvl_step = (rc->optimal_buffer_level - critical_level);
+ if (buff_lvl_step) {
+ adjustment = (int)((rc->worst_quality - ambient_qp) *
+ (rc->optimal_buffer_level - rc->buffer_level) /
+ buff_lvl_step);
+ }
+ active_worst_quality = ambient_qp + adjustment;
+ }
+ } else {
+ // Set to worst_quality if buffer is below critical level.
+ active_worst_quality = rc->worst_quality;
+ }
+ return active_worst_quality;
+}
+
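+ // Pick the frame Q and the active best/worst Q bounds (bottom_index,
+ // top_index) for one-pass CBR coding.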
+static int rc_pick_q_and_bounds_one_pass_cbr(const VP9_COMP *cpi,
+ int *bottom_index,
+ int *top_index) {
+ const VP9_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ int active_best_quality;
+ int active_worst_quality = calc_active_worst_quality_one_pass_cbr(cpi);
+ int q;
+ int *rtc_minq;
+ ASSIGN_MINQ_TABLE(cm->bit_depth, rtc_minq);
+
+ if (frame_is_intra_only(cm)) {
+ active_best_quality = rc->best_quality;
+ // Handle the special case for key frames forced when we have reached
+ // the maximum key frame interval. Here force the Q to a range
+ // based on the ambient Q to reduce the risk of popping.
+ if (rc->this_key_frame_forced) {
+ int qindex = rc->last_boosted_qindex;
+ double last_boosted_q = vp9_convert_qindex_to_q(qindex, cm->bit_depth);
+ int delta_qindex = vp9_compute_qdelta(
+ rc, last_boosted_q, (last_boosted_q * 0.75), cm->bit_depth);
+ active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality);
+ } else if (cm->current_video_frame > 0) {
+ // not first frame of one pass and kf_boost is set
+ double q_adj_factor = 1.0;
+ double q_val;
+
+ active_best_quality = get_kf_active_quality(
+ rc, rc->avg_frame_qindex[KEY_FRAME], cm->bit_depth);
+
+ // Allow somewhat lower kf minq with small image formats.
+ if ((cm->width * cm->height) <= (352 * 288)) {
+ q_adj_factor -= 0.25;
+ }
+
+ // Convert the adjustment factor to a qindex delta
+ // on active_best_quality.
+ q_val = vp9_convert_qindex_to_q(active_best_quality, cm->bit_depth);
+ active_best_quality +=
+ vp9_compute_qdelta(rc, q_val, q_val * q_adj_factor, cm->bit_depth);
+ }
+ } else if (!rc->is_src_frame_alt_ref && !cpi->use_svc &&
+ cpi->oxcf.gf_cbr_boost_pct &&
+ (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
+ // Use the lower of active_worst_quality and recent
+ // average Q as basis for GF/ARF best Q limit unless last frame was
+ // a key frame.
+ if (rc->frames_since_key > 1 &&
+ rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) {
+ q = rc->avg_frame_qindex[INTER_FRAME];
+ } else {
+ q = active_worst_quality;
+ }
+ active_best_quality = get_gf_active_quality(cpi, q, cm->bit_depth);
+ } else {
+ // Use the lower of active_worst_quality and recent/average Q.
+ if (cm->current_video_frame > 1) {
+ if (rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality)
+ active_best_quality = rtc_minq[rc->avg_frame_qindex[INTER_FRAME]];
+ else
+ active_best_quality = rtc_minq[active_worst_quality];
+ } else {
+ if (rc->avg_frame_qindex[KEY_FRAME] < active_worst_quality)
+ active_best_quality = rtc_minq[rc->avg_frame_qindex[KEY_FRAME]];
+ else
+ active_best_quality = rtc_minq[active_worst_quality];
+ }
+ }
+
+ // Clip the active best and worst quality values to limits
+ active_best_quality =
+ clamp(active_best_quality, rc->best_quality, rc->worst_quality);
+ active_worst_quality =
+ clamp(active_worst_quality, active_best_quality, rc->worst_quality);
+
+ *top_index = active_worst_quality;
+ *bottom_index = active_best_quality;
+
+ // Special case code to try and match quality with forced key frames
+ if (frame_is_intra_only(cm) && rc->this_key_frame_forced) {
+ q = rc->last_boosted_qindex;
+ } else {
+ q = vp9_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality,
+ active_worst_quality);
+ if (q > *top_index) {
+ // Special case when we are targeting the max allowed rate
+ if (rc->this_frame_target >= rc->max_frame_bandwidth)
+ *top_index = q;
+ else
+ q = *top_index;
+ }
+ }
+
+ assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality);
+ assert(*bottom_index <= rc->worst_quality &&
+ *bottom_index >= rc->best_quality);
+ assert(q <= rc->worst_quality && q >= rc->best_quality);
+ return q;
+}
+
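+ // For VPX_CQ mode, scale the configured cq_level down when the actual bits
+ // spent so far are running well below the target.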
+static int get_active_cq_level_one_pass(const RATE_CONTROL *rc,
+ const VP9EncoderConfig *const oxcf) {
+ static const double cq_adjust_threshold = 0.1;
+ int active_cq_level = oxcf->cq_level;
+ if (oxcf->rc_mode == VPX_CQ && rc->total_target_bits > 0) {
+ const double x = (double)rc->total_actual_bits / rc->total_target_bits;
+ if (x < cq_adjust_threshold) {
+ active_cq_level = (int)(active_cq_level * x / cq_adjust_threshold);
+ }
+ }
+ return active_cq_level;
+}
+
+#define SMOOTH_PCT_MIN 0.1
+#define SMOOTH_PCT_DIV 0.05
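+ // Two-pass variant of the above: additionally lower the CQ level for sources
+ // with a high percentage of smooth macroblocks (twopass->mb_smooth_pct).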
+static int get_active_cq_level_two_pass(const TWO_PASS *twopass,
+ const RATE_CONTROL *rc,
+ const VP9EncoderConfig *const oxcf) {
+ static const double cq_adjust_threshold = 0.1;
+ int active_cq_level = oxcf->cq_level;
+ if (oxcf->rc_mode == VPX_CQ) {
+ if (twopass->mb_smooth_pct > SMOOTH_PCT_MIN) {
+ active_cq_level -=
+ (int)((twopass->mb_smooth_pct - SMOOTH_PCT_MIN) / SMOOTH_PCT_DIV);
+ active_cq_level = VPXMAX(active_cq_level, 0);
+ }
+ if (rc->total_target_bits > 0) {
+ const double x = (double)rc->total_actual_bits / rc->total_target_bits;
+ if (x < cq_adjust_threshold) {
+ active_cq_level = (int)(active_cq_level * x / cq_adjust_threshold);
+ }
+ }
+ }
+ return active_cq_level;
+}
+
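+ // Pick the frame Q and the active best/worst Q bounds for one-pass VBR
+ // coding.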
+static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi,
+ int *bottom_index,
+ int *top_index) {
+ const VP9_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+ const int cq_level = get_active_cq_level_one_pass(rc, oxcf);
+ int active_best_quality;
+ int active_worst_quality = calc_active_worst_quality_one_pass_vbr(cpi);
+ int q;
+ int *inter_minq;
+ ASSIGN_MINQ_TABLE(cm->bit_depth, inter_minq);
+
+ if (frame_is_intra_only(cm)) {
+ if (oxcf->rc_mode == VPX_Q) {
+ int qindex = cq_level;
+ double qstart = vp9_convert_qindex_to_q(qindex, cm->bit_depth);
+ int delta_qindex =
+ vp9_compute_qdelta(rc, qstart, qstart * 0.25, cm->bit_depth);
+ active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality);
+ } else if (rc->this_key_frame_forced) {
+ // Handle the special case for key frames forced when we have reached
+ // the maximum key frame interval. Here force the Q to a range
+ // based on the ambient Q to reduce the risk of popping.
+ int qindex = rc->last_boosted_qindex;
+ double last_boosted_q = vp9_convert_qindex_to_q(qindex, cm->bit_depth);
+ int delta_qindex = vp9_compute_qdelta(
+ rc, last_boosted_q, last_boosted_q * 0.75, cm->bit_depth);
+ active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality);
+ } else {
+ // not first frame of one pass and kf_boost is set
+ double q_adj_factor = 1.0;
+ double q_val;
+
+ active_best_quality = get_kf_active_quality(
+ rc, rc->avg_frame_qindex[KEY_FRAME], cm->bit_depth);
+
+ // Allow somewhat lower kf minq with small image formats.
+ if ((cm->width * cm->height) <= (352 * 288)) {
+ q_adj_factor -= 0.25;
+ }
+
+ // Convert the adjustment factor to a qindex delta
+ // on active_best_quality.
+ q_val = vp9_convert_qindex_to_q(active_best_quality, cm->bit_depth);
+ active_best_quality +=
+ vp9_compute_qdelta(rc, q_val, q_val * q_adj_factor, cm->bit_depth);
+ }
+ } else if (!rc->is_src_frame_alt_ref &&
+ (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
+ // Use the lower of active_worst_quality and recent
+ // average Q as basis for GF/ARF best Q limit unless last frame was
+ // a key frame.
+ if (rc->frames_since_key > 1) {
+ if (rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) {
+ q = rc->avg_frame_qindex[INTER_FRAME];
+ } else {
+ q = active_worst_quality;
+ }
+ } else {
+ q = rc->avg_frame_qindex[KEY_FRAME];
+ }
+ // For constrained quality don't allow Q less than the cq level
+ if (oxcf->rc_mode == VPX_CQ) {
+ if (q < cq_level) q = cq_level;
+
+ active_best_quality = get_gf_active_quality(cpi, q, cm->bit_depth);
+
+ // Constrained quality use slightly lower active best.
+ active_best_quality = active_best_quality * 15 / 16;
+
+ } else if (oxcf->rc_mode == VPX_Q) {
+ int qindex = cq_level;
+ double qstart = vp9_convert_qindex_to_q(qindex, cm->bit_depth);
+ int delta_qindex;
+ if (cpi->refresh_alt_ref_frame)
+ delta_qindex =
+ vp9_compute_qdelta(rc, qstart, qstart * 0.40, cm->bit_depth);
+ else
+ delta_qindex =
+ vp9_compute_qdelta(rc, qstart, qstart * 0.50, cm->bit_depth);
+ active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality);
+ } else {
+ active_best_quality = get_gf_active_quality(cpi, q, cm->bit_depth);
+ }
+ } else {
+ if (oxcf->rc_mode == VPX_Q) {
+ int qindex = cq_level;
+ double qstart = vp9_convert_qindex_to_q(qindex, cm->bit_depth);
+ double delta_rate[FIXED_GF_INTERVAL] = { 0.50, 1.0, 0.85, 1.0,
+ 0.70, 1.0, 0.85, 1.0 };
+ int delta_qindex = vp9_compute_qdelta(
+ rc, qstart,
+ qstart * delta_rate[cm->current_video_frame % FIXED_GF_INTERVAL],
+ cm->bit_depth);
+ active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality);
+ } else {
+ // Use the min of the average Q and active_worst_quality as basis for
+ // active_best.
+ if (cm->current_video_frame > 1) {
+ q = VPXMIN(rc->avg_frame_qindex[INTER_FRAME], active_worst_quality);
+ active_best_quality = inter_minq[q];
+ } else {
+ active_best_quality = inter_minq[rc->avg_frame_qindex[KEY_FRAME]];
+ }
+ // For the constrained quality mode we don't want
+ // q to fall below the cq level.
+ if ((oxcf->rc_mode == VPX_CQ) && (active_best_quality < cq_level)) {
+ active_best_quality = cq_level;
+ }
+ }
+ }
+
+ // Clip the active best and worst quality values to limits
+ active_best_quality =
+ clamp(active_best_quality, rc->best_quality, rc->worst_quality);
+ active_worst_quality =
+ clamp(active_worst_quality, active_best_quality, rc->worst_quality);
+
+ *top_index = active_worst_quality;
+ *bottom_index = active_best_quality;
+
+#if LIMIT_QRANGE_FOR_ALTREF_AND_KEY
+ {
+ int qdelta = 0;
+ vpx_clear_system_state();
+
+ // Limit Q range for the adaptive loop.
+ if (cm->frame_type == KEY_FRAME && !rc->this_key_frame_forced &&
+ !(cm->current_video_frame == 0)) {
+ qdelta = vp9_compute_qdelta_by_rate(
+ &cpi->rc, cm->frame_type, active_worst_quality, 2.0, cm->bit_depth);
+ } else if (!rc->is_src_frame_alt_ref &&
+ (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
+ qdelta = vp9_compute_qdelta_by_rate(
+ &cpi->rc, cm->frame_type, active_worst_quality, 1.75, cm->bit_depth);
+ }
+ if (rc->high_source_sad && cpi->sf.use_altref_onepass) qdelta = 0;
+ *top_index = active_worst_quality + qdelta;
+ *top_index = (*top_index > *bottom_index) ? *top_index : *bottom_index;
+ }
+#endif
+
+ if (oxcf->rc_mode == VPX_Q) {
+ q = active_best_quality;
+ // Special case code to try and match quality with forced key frames
+ } else if ((cm->frame_type == KEY_FRAME) && rc->this_key_frame_forced) {
+ q = rc->last_boosted_qindex;
+ } else {
+ q = vp9_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality,
+ active_worst_quality);
+ if (q > *top_index) {
+ // Special case when we are targeting the max allowed rate
+ if (rc->this_frame_target >= rc->max_frame_bandwidth)
+ *top_index = q;
+ else
+ q = *top_index;
+ }
+ }
+
+ assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality);
+ assert(*bottom_index <= rc->worst_quality &&
+ *bottom_index >= rc->best_quality);
+ assert(q <= rc->worst_quality && q >= rc->best_quality);
+ return q;
+}
+
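+ // Return a qindex delta relative to q such that the projected bits per MB
+ // are scaled by rate_factor_deltas[rf_level].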
+int vp9_frame_type_qdelta(const VP9_COMP *cpi, int rf_level, int q) {
+ static const double rate_factor_deltas[RATE_FACTOR_LEVELS] = {
+ 1.00, // INTER_NORMAL
+ 1.00, // INTER_HIGH
+ 1.50, // GF_ARF_LOW
+ 1.75, // GF_ARF_STD
+ 2.00, // KF_STD
+ };
+ const VP9_COMMON *const cm = &cpi->common;
+
+ int qdelta = vp9_compute_qdelta_by_rate(
+ &cpi->rc, cm->frame_type, q, rate_factor_deltas[rf_level], cm->bit_depth);
+ return qdelta;
+}
+
+#define STATIC_MOTION_THRESH 95
+
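+ // Set the active best/worst Q bounds for a key frame in two-pass coding.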
+static void pick_kf_q_bound_two_pass(const VP9_COMP *cpi, int *bottom_index,
+ int *top_index) {
+ const VP9_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ int active_best_quality;
+ int active_worst_quality = cpi->twopass.active_worst_quality;
+
+ if (rc->this_key_frame_forced) {
+ // Handle the special case for key frames forced when we have reached
+ // the maximum key frame interval. Here force the Q to a range
+ // based on the ambient Q to reduce the risk of popping.
+ double last_boosted_q;
+ int delta_qindex;
+ int qindex;
+
+ if (cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) {
+ qindex = VPXMIN(rc->last_kf_qindex, rc->last_boosted_qindex);
+ active_best_quality = qindex;
+ last_boosted_q = vp9_convert_qindex_to_q(qindex, cm->bit_depth);
+ delta_qindex = vp9_compute_qdelta(rc, last_boosted_q,
+ last_boosted_q * 1.25, cm->bit_depth);
+ active_worst_quality =
+ VPXMIN(qindex + delta_qindex, active_worst_quality);
+ } else {
+ qindex = rc->last_boosted_qindex;
+ last_boosted_q = vp9_convert_qindex_to_q(qindex, cm->bit_depth);
+ delta_qindex = vp9_compute_qdelta(rc, last_boosted_q,
+ last_boosted_q * 0.75, cm->bit_depth);
+ active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality);
+ }
+ } else {
+ // Not forced keyframe.
+ double q_adj_factor = 1.0;
+ double q_val;
+ // Baseline value derived from cpi->active_worst_quality and kf boost.
+ active_best_quality =
+ get_kf_active_quality(rc, active_worst_quality, cm->bit_depth);
+ if (cpi->twopass.kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH) {
+ active_best_quality /= 4;
+ }
+
+ // Don't allow the active min to be lossless (q0) unless the max q
+ // already indicates lossless.
+ active_best_quality =
+ VPXMIN(active_worst_quality, VPXMAX(1, active_best_quality));
+
+ // Allow somewhat lower kf minq with small image formats.
+ if ((cm->width * cm->height) <= (352 * 288)) {
+ q_adj_factor -= 0.25;
+ }
+
+ // Make a further adjustment based on the kf zero motion measure.
+ q_adj_factor += 0.05 - (0.001 * (double)cpi->twopass.kf_zeromotion_pct);
+
+ // Convert the adjustment factor to a qindex delta
+ // on active_best_quality.
+ q_val = vp9_convert_qindex_to_q(active_best_quality, cm->bit_depth);
+ active_best_quality +=
+ vp9_compute_qdelta(rc, q_val, q_val * q_adj_factor, cm->bit_depth);
+ }
+ *top_index = active_worst_quality;
+ *bottom_index = active_best_quality;
+}
+
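+ // Q selection for two-pass VPX_Q (constant quality) mode: cq_level is the
+ // baseline, with adjusted bounds for key frames and ARF/GF frames.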
+static int rc_constant_q(const VP9_COMP *cpi, int *bottom_index, int *top_index,
+ int gf_group_index) {
+ const VP9_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+ const GF_GROUP *gf_group = &cpi->twopass.gf_group;
+ const int is_intra_frame = frame_is_intra_only(cm);
+
+ const int cq_level = get_active_cq_level_two_pass(&cpi->twopass, rc, oxcf);
+
+ int q = cq_level;
+ int active_best_quality = cq_level;
+ int active_worst_quality = cq_level;
+
+ // Key frame qp decision
+ if (is_intra_frame && rc->frames_to_key > 1)
+ pick_kf_q_bound_two_pass(cpi, &active_best_quality, &active_worst_quality);
+
+ // ARF / GF qp decision
+ if (!is_intra_frame && !rc->is_src_frame_alt_ref &&
+ cpi->refresh_alt_ref_frame) {
+ active_best_quality = get_gf_active_quality(cpi, q, cm->bit_depth);
+
+ // Modify best quality for second level arfs. For mode VPX_Q this
+ // becomes the baseline frame q.
+ if (gf_group->rf_level[gf_group_index] == GF_ARF_LOW) {
+ const int layer_depth = gf_group->layer_depth[gf_group_index];
+ // linearly fit the frame q depending on the layer depth index from
+ // the base layer ARF.
+ active_best_quality = ((layer_depth - 1) * cq_level +
+ active_best_quality + layer_depth / 2) /
+ layer_depth;
+ }
+ }
+
+ q = active_best_quality;
+ *top_index = active_worst_quality;
+ *bottom_index = active_best_quality;
+ return q;
+}
+
+static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index,
+ int *top_index, int gf_group_index) {
+ const VP9_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+ const GF_GROUP *gf_group = &cpi->twopass.gf_group;
+ const int cq_level = get_active_cq_level_two_pass(&cpi->twopass, rc, oxcf);
+ int active_best_quality;
+ int active_worst_quality = cpi->twopass.active_worst_quality;
+ int q;
+ int *inter_minq;
+ int arf_active_best_quality_hl;
+ int *arfgf_high_motion_minq, *arfgf_low_motion_minq;
+ const int boost_frame =
+ !rc->is_src_frame_alt_ref &&
+ (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame);
+
+ ASSIGN_MINQ_TABLE(cm->bit_depth, inter_minq);
+
+ if (oxcf->rc_mode == VPX_Q)
+ return rc_constant_q(cpi, bottom_index, top_index, gf_group_index);
+
+ if (frame_is_intra_only(cm)) {
+ pick_kf_q_bound_two_pass(cpi, &active_best_quality, &active_worst_quality);
+ } else if (boost_frame) {
+ // Use the lower of active_worst_quality and recent
+ // average Q as basis for GF/ARF best Q limit unless last frame was
+ // a key frame.
+ if (rc->frames_since_key > 1 &&
+ rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) {
+ q = rc->avg_frame_qindex[INTER_FRAME];
+ } else {
+ q = active_worst_quality;
+ }
+ // For constrained quality don't allow Q less than the cq level
+ if (oxcf->rc_mode == VPX_CQ) {
+ if (q < cq_level) q = cq_level;
+ }
+ active_best_quality = get_gf_active_quality(cpi, q, cm->bit_depth);
+ arf_active_best_quality_hl = active_best_quality;
+
+ if (rc->arf_increase_active_best_quality == 1) {
+ ASSIGN_MINQ_TABLE(cm->bit_depth, arfgf_high_motion_minq);
+ arf_active_best_quality_hl = arfgf_high_motion_minq[q];
+ } else if (rc->arf_increase_active_best_quality == -1) {
+ ASSIGN_MINQ_TABLE(cm->bit_depth, arfgf_low_motion_minq);
+ arf_active_best_quality_hl = arfgf_low_motion_minq[q];
+ }
+ active_best_quality =
+ (int)((double)active_best_quality *
+ rc->arf_active_best_quality_adjustment_factor +
+ (double)arf_active_best_quality_hl *
+ (1.0 - rc->arf_active_best_quality_adjustment_factor));
+
+ // Modify best quality for second level arfs. For mode VPX_Q this
+ // becomes the baseline frame q.
+ if (gf_group->rf_level[gf_group_index] == GF_ARF_LOW) {
+ const int layer_depth = gf_group->layer_depth[gf_group_index];
+ // linearly fit the frame q depending on the layer depth index from
+ // the base layer ARF.
+ active_best_quality =
+ ((layer_depth - 1) * q + active_best_quality + layer_depth / 2) /
+ layer_depth;
+ }
+ } else {
+ active_best_quality = inter_minq[active_worst_quality];
+
+ // For the constrained quality mode we don't want
+ // q to fall below the cq level.
+ if ((oxcf->rc_mode == VPX_CQ) && (active_best_quality < cq_level)) {
+ active_best_quality = cq_level;
+ }
+ }
+
+ // Extension to max or min Q if undershoot or overshoot is outside
+ // the permitted range.
+ if (frame_is_intra_only(cm) || boost_frame) {
+ const int layer_depth = gf_group->layer_depth[gf_group_index];
+ active_best_quality -=
+ (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast);
+ active_worst_quality += (cpi->twopass.extend_maxq / 2);
+
+ if (gf_group->rf_level[gf_group_index] == GF_ARF_LOW) {
+ assert(layer_depth > 1);
+ active_best_quality =
+ VPXMAX(active_best_quality,
+ cpi->twopass.last_qindex_of_arf_layer[layer_depth - 1]);
+ }
+ } else {
+ const int max_layer_depth = gf_group->max_layer_depth;
+ assert(max_layer_depth > 0);
+
+ active_best_quality -=
+ (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast) / 2;
+ active_worst_quality += cpi->twopass.extend_maxq;
+
+ // For normal frames do not allow an active minq lower than the q used for
+ // the last boosted frame.
+ active_best_quality =
+ VPXMAX(active_best_quality,
+ cpi->twopass.last_qindex_of_arf_layer[max_layer_depth - 1]);
+ }
+
+#if LIMIT_QRANGE_FOR_ALTREF_AND_KEY
+ vpx_clear_system_state();
+ // Q restrictions for static forced key frames are dealt with elsewhere.
+ if (!frame_is_intra_only(cm) || !rc->this_key_frame_forced ||
+ cpi->twopass.last_kfgroup_zeromotion_pct < STATIC_MOTION_THRESH) {
+ int qdelta = vp9_frame_type_qdelta(cpi, gf_group->rf_level[gf_group_index],
+ active_worst_quality);
+ active_worst_quality =
+ VPXMAX(active_worst_quality + qdelta, active_best_quality);
+ }
+#endif
+
+ // Modify active_best_quality for downscaled normal frames.
+ if (rc->frame_size_selector != UNSCALED && !frame_is_kf_gf_arf(cpi)) {
+ int qdelta = vp9_compute_qdelta_by_rate(
+ rc, cm->frame_type, active_best_quality, 2.0, cm->bit_depth);
+ active_best_quality =
+ VPXMAX(active_best_quality + qdelta, rc->best_quality);
+ }
+
+ active_best_quality =
+ clamp(active_best_quality, rc->best_quality, rc->worst_quality);
+ active_worst_quality =
+ clamp(active_worst_quality, active_best_quality, rc->worst_quality);
+
+ if (frame_is_intra_only(cm) && rc->this_key_frame_forced) {
+ // If static since last kf use better of last boosted and last kf q.
+ if (cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) {
+ q = VPXMIN(rc->last_kf_qindex, rc->last_boosted_qindex);
+ } else {
+ q = rc->last_boosted_qindex;
+ }
+ } else if (frame_is_intra_only(cm) && !rc->this_key_frame_forced) {
+ q = active_best_quality;
+ } else {
+ q = vp9_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality,
+ active_worst_quality);
+ if (q > active_worst_quality) {
+ // Special case when we are targeting the max allowed rate.
+ if (rc->this_frame_target >= rc->max_frame_bandwidth)
+ active_worst_quality = q;
+ else
+ q = active_worst_quality;
+ }
+ }
+ q = clamp(q, active_best_quality, active_worst_quality);
+
+ *top_index = active_worst_quality;
+ *bottom_index = active_best_quality;
+
+ assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality);
+ assert(*bottom_index <= rc->worst_quality &&
+ *bottom_index >= rc->best_quality);
+ assert(q <= rc->worst_quality && q >= rc->best_quality);
+ return q;
+}
+
+int vp9_rc_pick_q_and_bounds(const VP9_COMP *cpi, int *bottom_index,
+ int *top_index) {
+ int q;
+ const int gf_group_index = cpi->twopass.gf_group.index;
+ if (cpi->oxcf.pass == 0) {
+ if (cpi->oxcf.rc_mode == VPX_CBR)
+ q = rc_pick_q_and_bounds_one_pass_cbr(cpi, bottom_index, top_index);
+ else
+ q = rc_pick_q_and_bounds_one_pass_vbr(cpi, bottom_index, top_index);
+ } else {
+ q = rc_pick_q_and_bounds_two_pass(cpi, bottom_index, top_index,
+ gf_group_index);
+ }
+ if (cpi->sf.use_nonrd_pick_mode) {
+ if (cpi->sf.force_frame_boost == 1) q -= cpi->sf.max_delta_qindex;
+
+ if (q < *bottom_index)
+ *bottom_index = q;
+ else if (q > *top_index)
+ *top_index = q;
+ }
+ return q;
+}
+
+void vp9_configure_buffer_updates(VP9_COMP *cpi, int gf_group_index) {
+ VP9_COMMON *cm = &cpi->common;
+ TWO_PASS *const twopass = &cpi->twopass;
+
+ cpi->rc.is_src_frame_alt_ref = 0;
+ cm->show_existing_frame = 0;
+ cpi->rc.show_arf_as_gld = 0;
+ switch (twopass->gf_group.update_type[gf_group_index]) {
+ case KF_UPDATE:
+ cpi->refresh_last_frame = 1;
+ cpi->refresh_golden_frame = 1;
+ cpi->refresh_alt_ref_frame = 1;
+ break;
+ case LF_UPDATE:
+ cpi->refresh_last_frame = 1;
+ cpi->refresh_golden_frame = 0;
+ cpi->refresh_alt_ref_frame = 0;
+ break;
+ case GF_UPDATE:
+ cpi->refresh_last_frame = 1;
+ cpi->refresh_golden_frame = 1;
+ cpi->refresh_alt_ref_frame = 0;
+ break;
+ case OVERLAY_UPDATE:
+ cpi->refresh_last_frame = 0;
+ cpi->refresh_golden_frame = 1;
+ cpi->refresh_alt_ref_frame = 0;
+ cpi->rc.is_src_frame_alt_ref = 1;
+ if (cpi->rc.preserve_arf_as_gld) {
+ cpi->rc.show_arf_as_gld = 1;
+ cpi->refresh_golden_frame = 0;
+ cm->show_existing_frame = 1;
+ cm->refresh_frame_context = 0;
+ }
+ break;
+ case MID_OVERLAY_UPDATE:
+ cpi->refresh_last_frame = 1;
+ cpi->refresh_golden_frame = 0;
+ cpi->refresh_alt_ref_frame = 0;
+ cpi->rc.is_src_frame_alt_ref = 1;
+ break;
+ case USE_BUF_FRAME:
+ cpi->refresh_last_frame = 0;
+ cpi->refresh_golden_frame = 0;
+ cpi->refresh_alt_ref_frame = 0;
+ cpi->rc.is_src_frame_alt_ref = 1;
+ cm->show_existing_frame = 1;
+ cm->refresh_frame_context = 0;
+ break;
+ default:
+ assert(twopass->gf_group.update_type[gf_group_index] == ARF_UPDATE);
+ cpi->refresh_last_frame = 0;
+ cpi->refresh_golden_frame = 0;
+ cpi->refresh_alt_ref_frame = 1;
+ break;
+ }
+}
+
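+ // Run the two-pass Q selection for each frame in the GF group and store the
+ // resulting base qindex in the per-frame TPL stats.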
+void vp9_estimate_qp_gop(VP9_COMP *cpi) {
+ int gop_length = cpi->twopass.gf_group.gf_group_size;
+ int bottom_index, top_index;
+ int idx;
+ const int gf_index = cpi->twopass.gf_group.index;
+ const int is_src_frame_alt_ref = cpi->rc.is_src_frame_alt_ref;
+ const int refresh_frame_context = cpi->common.refresh_frame_context;
+
+ for (idx = 1; idx <= gop_length; ++idx) {
+ TplDepFrame *tpl_frame = &cpi->tpl_stats[idx];
+ int target_rate = cpi->twopass.gf_group.bit_allocation[idx];
+ cpi->twopass.gf_group.index = idx;
+ vp9_rc_set_frame_target(cpi, target_rate);
+ vp9_configure_buffer_updates(cpi, idx);
+ tpl_frame->base_qindex =
+ rc_pick_q_and_bounds_two_pass(cpi, &bottom_index, &top_index, idx);
+ tpl_frame->base_qindex = VPXMAX(tpl_frame->base_qindex, 1);
+ }
+ // Reset the actual index and frame update
+ cpi->twopass.gf_group.index = gf_index;
+ cpi->rc.is_src_frame_alt_ref = is_src_frame_alt_ref;
+ cpi->common.refresh_frame_context = refresh_frame_context;
+ vp9_configure_buffer_updates(cpi, gf_index);
+}
+
+void vp9_rc_compute_frame_size_bounds(const VP9_COMP *cpi, int frame_target,
+ int *frame_under_shoot_limit,
+ int *frame_over_shoot_limit) {
+ if (cpi->oxcf.rc_mode == VPX_Q) {
+ *frame_under_shoot_limit = 0;
+ *frame_over_shoot_limit = INT_MAX;
+ } else {
+ // For very small rate targets, where the fractional adjustment may be
+ // tiny, make sure there is at least a minimum range.
+ const int tol_low =
+ (int)(((int64_t)cpi->sf.recode_tolerance_low * frame_target) / 100);
+ const int tol_high =
+ (int)(((int64_t)cpi->sf.recode_tolerance_high * frame_target) / 100);
+ *frame_under_shoot_limit = VPXMAX(frame_target - tol_low - 100, 0);
+ *frame_over_shoot_limit =
+ VPXMIN(frame_target + tol_high + 100, cpi->rc.max_frame_bandwidth);
+ }
+}
+
+void vp9_rc_set_frame_target(VP9_COMP *cpi, int target) {
+ const VP9_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+
+ rc->this_frame_target = target;
+
+ // Modify frame size target when down-scaling.
+ if (cpi->oxcf.resize_mode == RESIZE_DYNAMIC &&
+ rc->frame_size_selector != UNSCALED) {
+ rc->this_frame_target = (int)(rc->this_frame_target *
+ rate_thresh_mult[rc->frame_size_selector]);
+ }
+
+#if CONFIG_RATE_CTRL
+ if (cpi->oxcf.use_simple_encode_api) {
+ if (cpi->encode_command.use_external_target_frame_bits) {
+ rc->this_frame_target = cpi->encode_command.target_frame_bits;
+ }
+ }
+#endif // CONFIG_RATE_CTRL
+
+ // Target rate per SB64 (including partial SB64s).
+ rc->sb64_target_rate = (int)(((int64_t)rc->this_frame_target * 64 * 64) /
+ (cm->width * cm->height));
+}
+
+static void update_alt_ref_frame_stats(VP9_COMP *cpi) {
+ // This frame refreshes; subsequent frames don't unless specified by the user.
+ RATE_CONTROL *const rc = &cpi->rc;
+ rc->frames_since_golden = 0;
+
+ // Mark the alt ref as done (setting to 0 means no further alt refs pending).
+ rc->source_alt_ref_pending = 0;
+
+ // Set the alternate reference frame active flag
+ rc->source_alt_ref_active = 1;
+}
+
+static void update_golden_frame_stats(VP9_COMP *cpi) {
+ RATE_CONTROL *const rc = &cpi->rc;
+
+ // Update the Golden frame usage counts.
+ if (cpi->refresh_golden_frame) {
+ // This frame refreshes; subsequent frames don't unless specified by the user.
+ rc->frames_since_golden = 0;
+
+ // If we are not using alt ref in the upcoming group, clear the arf
+ // active flag. In the multi arf group case, if the index is not 0 then
+ // we are overlaying a mid group arf, so we should not reset the flag.
+ if (cpi->oxcf.pass == 2) {
+ if (!rc->source_alt_ref_pending && (cpi->twopass.gf_group.index == 0))
+ rc->source_alt_ref_active = 0;
+ } else if (!rc->source_alt_ref_pending) {
+ rc->source_alt_ref_active = 0;
+ }
+
+ // Decrement count down till next gf
+ if (rc->frames_till_gf_update_due > 0) rc->frames_till_gf_update_due--;
+
+ } else if (!cpi->refresh_alt_ref_frame) {
+ // Decrement count down till next gf
+ if (rc->frames_till_gf_update_due > 0) rc->frames_till_gf_update_due--;
+
+ rc->frames_since_golden++;
+
+ if (rc->show_arf_as_gld) {
+ rc->frames_since_golden = 0;
+ // If we are not using alt ref in the upcoming group, clear the arf
+ // active flag. In the multi arf group case, if the index is not 0 then
+ // we are overlaying a mid group arf, so we should not reset the flag.
+ if (!rc->source_alt_ref_pending && (cpi->twopass.gf_group.index == 0))
+ rc->source_alt_ref_active = 0;
+ }
+ }
+}
+
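+ // Update the running percentage (rc->perc_arf_usage) of superblocks that
+ // used the alt-ref frame as a reference.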
+static void update_altref_usage(VP9_COMP *const cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ int sum_ref_frame_usage = 0;
+ int arf_frame_usage = 0;
+ int mi_row, mi_col;
+ if (cpi->rc.alt_ref_gf_group && !cpi->rc.is_src_frame_alt_ref &&
+ !cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame)
+ for (mi_row = 0; mi_row < cm->mi_rows; mi_row += 8) {
+ for (mi_col = 0; mi_col < cm->mi_cols; mi_col += 8) {
+ int sboffset = ((cm->mi_cols + 7) >> 3) * (mi_row >> 3) + (mi_col >> 3);
+ sum_ref_frame_usage += cpi->count_arf_frame_usage[sboffset] +
+ cpi->count_lastgolden_frame_usage[sboffset];
+ arf_frame_usage += cpi->count_arf_frame_usage[sboffset];
+ }
+ }
+ if (sum_ref_frame_usage > 0) {
+ double altref_count = 100.0 * arf_frame_usage / sum_ref_frame_usage;
+ cpi->rc.perc_arf_usage =
+ 0.75 * cpi->rc.perc_arf_usage + 0.25 * altref_count;
+ }
+}
+
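+ // Compute the percentage of blocks with small/zero motion against LAST and
+ // fold it into the running average rc->avg_frame_low_motion.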
+void vp9_compute_frame_low_motion(VP9_COMP *const cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ SVC *const svc = &cpi->svc;
+ int mi_row, mi_col;
+ MODE_INFO **mi = cm->mi_grid_visible;
+ RATE_CONTROL *const rc = &cpi->rc;
+ const int rows = cm->mi_rows, cols = cm->mi_cols;
+ int cnt_zeromv = 0;
+ for (mi_row = 0; mi_row < rows; mi_row++) {
+ for (mi_col = 0; mi_col < cols; mi_col++) {
+ if (mi[0]->ref_frame[0] == LAST_FRAME &&
+ abs(mi[0]->mv[0].as_mv.row) < 16 && abs(mi[0]->mv[0].as_mv.col) < 16)
+ cnt_zeromv++;
+ mi++;
+ }
+ mi += 8;
+ }
+ cnt_zeromv = 100 * cnt_zeromv / (rows * cols);
+ rc->avg_frame_low_motion = (3 * rc->avg_frame_low_motion + cnt_zeromv) >> 2;
+
+ // For SVC: set avg_frame_low_motion (only computed on top spatial layer)
+ // to all lower spatial layers.
+ if (cpi->use_svc && svc->spatial_layer_id == svc->number_spatial_layers - 1) {
+ int i;
+ for (i = 0; i < svc->number_spatial_layers - 1; ++i) {
+ const int layer = LAYER_IDS_TO_IDX(i, svc->temporal_layer_id,
+ svc->number_temporal_layers);
+ LAYER_CONTEXT *const lc = &svc->layer_context[layer];
+ RATE_CONTROL *const lrc = &lc->rc;
+ lrc->avg_frame_low_motion = rc->avg_frame_low_motion;
+ }
+ }
+}
+
+void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
+ const VP9_COMMON *const cm = &cpi->common;
+ const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+ RATE_CONTROL *const rc = &cpi->rc;
+ SVC *const svc = &cpi->svc;
+ const int qindex = cm->base_qindex;
+ const GF_GROUP *gf_group = &cpi->twopass.gf_group;
+ const int gf_group_index = cpi->twopass.gf_group.index;
+ const int layer_depth = gf_group->layer_depth[gf_group_index];
+
+ // Update rate control heuristics
+ rc->projected_frame_size = (int)(bytes_used << 3);
+
+ // Post encode loop adjustment of Q prediction.
+ vp9_rc_update_rate_correction_factors(cpi);
+
+ // Keep a record of last Q and ambient average Q.
+ if (frame_is_intra_only(cm)) {
+ rc->last_q[KEY_FRAME] = qindex;
+ rc->avg_frame_qindex[KEY_FRAME] =
+ ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[KEY_FRAME] + qindex, 2);
+ if (cpi->use_svc) {
+ int i;
+ for (i = 0; i < svc->number_temporal_layers; ++i) {
+ const int layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id, i,
+ svc->number_temporal_layers);
+ LAYER_CONTEXT *lc = &svc->layer_context[layer];
+ RATE_CONTROL *lrc = &lc->rc;
+ lrc->last_q[KEY_FRAME] = rc->last_q[KEY_FRAME];
+ lrc->avg_frame_qindex[KEY_FRAME] = rc->avg_frame_qindex[KEY_FRAME];
+ }
+ }
+ } else {
+ if ((cpi->use_svc) ||
+ (!rc->is_src_frame_alt_ref &&
+ !(cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) {
+ rc->last_q[INTER_FRAME] = qindex;
+ rc->avg_frame_qindex[INTER_FRAME] =
+ ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[INTER_FRAME] + qindex, 2);
+ rc->ni_frames++;
+ rc->tot_q += vp9_convert_qindex_to_q(qindex, cm->bit_depth);
+ rc->avg_q = rc->tot_q / rc->ni_frames;
+ // Calculate the average Q for normal inter frames (not key or GFU
+ // frames).
+ rc->ni_tot_qi += qindex;
+ rc->ni_av_qi = rc->ni_tot_qi / rc->ni_frames;
+ }
+ }
+
+ if (cpi->use_svc) vp9_svc_adjust_avg_frame_qindex(cpi);
+
+ // Keep a record of the last boosted (KF/GF/ARF) Q value.
+ // If the current frame is coded at a lower Q then we also update it.
+ // If all mbs in this group are skipped only update if the Q value is
+ // better than that already stored.
+ // This is used to help set quality in forced key frames to reduce popping
+ if ((qindex < rc->last_boosted_qindex) || (cm->frame_type == KEY_FRAME) ||
+ (!rc->constrained_gf_group &&
+ (cpi->refresh_alt_ref_frame ||
+ (cpi->refresh_golden_frame && !rc->is_src_frame_alt_ref)))) {
+ rc->last_boosted_qindex = qindex;
+ }
+
+ if ((qindex < cpi->twopass.last_qindex_of_arf_layer[layer_depth]) ||
+ (cm->frame_type == KEY_FRAME) ||
+ (!rc->constrained_gf_group &&
+ (cpi->refresh_alt_ref_frame ||
+ (cpi->refresh_golden_frame && !rc->is_src_frame_alt_ref)))) {
+ cpi->twopass.last_qindex_of_arf_layer[layer_depth] = qindex;
+ }
+
+ if (frame_is_intra_only(cm)) rc->last_kf_qindex = qindex;
+
+ update_buffer_level_postencode(cpi, rc->projected_frame_size);
+
+ // Rolling monitors of whether we are over- or under-spending, used to help
+ // regulate min and max Q in two pass.
+ if (!frame_is_intra_only(cm)) {
+ rc->rolling_target_bits = (int)ROUND64_POWER_OF_TWO(
+ (int64_t)rc->rolling_target_bits * 3 + rc->this_frame_target, 2);
+ rc->rolling_actual_bits = (int)ROUND64_POWER_OF_TWO(
+ (int64_t)rc->rolling_actual_bits * 3 + rc->projected_frame_size, 2);
+ rc->long_rolling_target_bits = (int)ROUND64_POWER_OF_TWO(
+ (int64_t)rc->long_rolling_target_bits * 31 + rc->this_frame_target, 5);
+ rc->long_rolling_actual_bits = (int)ROUND64_POWER_OF_TWO(
+ (int64_t)rc->long_rolling_actual_bits * 31 + rc->projected_frame_size,
+ 5);
+ }
+
+ // Actual bits spent
+ rc->total_actual_bits += rc->projected_frame_size;
+ rc->total_target_bits += cm->show_frame ? rc->avg_frame_bandwidth : 0;
+
+ rc->total_target_vs_actual = rc->total_actual_bits - rc->total_target_bits;
+
+ if (!cpi->use_svc) {
+ if (is_altref_enabled(cpi) && cpi->refresh_alt_ref_frame &&
+ (!frame_is_intra_only(cm)))
+ // Update the alternate reference frame stats as appropriate.
+ update_alt_ref_frame_stats(cpi);
+ else
+ // Update the Golden frame stats as appropriate.
+ update_golden_frame_stats(cpi);
+ }
+
+ // If second (long term) temporal reference is used for SVC,
+ // update the golden frame counter, only for base temporal layer.
+ if (cpi->use_svc && svc->use_gf_temporal_ref_current_layer &&
+ svc->temporal_layer_id == 0) {
+ int i = 0;
+ if (cpi->refresh_golden_frame)
+ rc->frames_since_golden = 0;
+ else
+ rc->frames_since_golden++;
+ // Decrement count down till next gf
+ if (rc->frames_till_gf_update_due > 0) rc->frames_till_gf_update_due--;
+ // Update the frames_since_golden for all upper temporal layers.
+ for (i = 1; i < svc->number_temporal_layers; ++i) {
+ const int layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id, i,
+ svc->number_temporal_layers);
+ LAYER_CONTEXT *const lc = &svc->layer_context[layer];
+ RATE_CONTROL *const lrc = &lc->rc;
+ lrc->frames_since_golden = rc->frames_since_golden;
+ }
+ }
+
+ if (frame_is_intra_only(cm)) rc->frames_since_key = 0;
+ if (cm->show_frame) {
+ rc->frames_since_key++;
+ rc->frames_to_key--;
+ }
+
+ // Trigger the resizing of the next frame if it is scaled.
+ if (oxcf->pass != 0) {
+ cpi->resize_pending =
+ rc->next_frame_size_selector != rc->frame_size_selector;
+ rc->frame_size_selector = rc->next_frame_size_selector;
+ }
+
+ if (oxcf->pass == 0) {
+ if (!frame_is_intra_only(cm))
+ if (cpi->sf.use_altref_onepass) update_altref_usage(cpi);
+ cpi->rc.last_frame_is_src_altref = cpi->rc.is_src_frame_alt_ref;
+ }
+
+ if (!frame_is_intra_only(cm)) rc->reset_high_source_sad = 0;
+
+ rc->last_avg_frame_bandwidth = rc->avg_frame_bandwidth;
+ if (cpi->use_svc && svc->spatial_layer_id < svc->number_spatial_layers - 1)
+ svc->lower_layer_qindex = cm->base_qindex;
+}
+
+void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi) {
+ cpi->common.current_video_frame++;
+ cpi->rc.frames_since_key++;
+ cpi->rc.frames_to_key--;
+ cpi->rc.rc_2_frame = 0;
+ cpi->rc.rc_1_frame = 0;
+ cpi->rc.last_avg_frame_bandwidth = cpi->rc.avg_frame_bandwidth;
+ cpi->rc.last_q[INTER_FRAME] = cpi->common.base_qindex;
+ // For SVC on a dropped frame when framedrop_mode != LAYER_DROP:
+ // in this mode the whole superframe may be dropped if only a single layer
+ // has buffer underflow (below threshold). Since this can then lead to
+ // increasing buffer levels/overflow for certain layers even though the whole
+ // superframe is dropped, we cap the buffer level if it's already stable.
+ if (cpi->use_svc && cpi->svc.framedrop_mode != LAYER_DROP &&
+ cpi->rc.buffer_level > cpi->rc.optimal_buffer_level) {
+ cpi->rc.buffer_level = cpi->rc.optimal_buffer_level;
+ cpi->rc.bits_off_target = cpi->rc.optimal_buffer_level;
+ }
+}
+
+int vp9_calc_pframe_target_size_one_pass_vbr(const VP9_COMP *cpi) {
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const int af_ratio = rc->af_ratio_onepass_vbr;
+ int64_t target =
+ (!rc->is_src_frame_alt_ref &&
+ (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))
+ ? ((int64_t)rc->avg_frame_bandwidth * rc->baseline_gf_interval *
+ af_ratio) /
+ (rc->baseline_gf_interval + af_ratio - 1)
+ : ((int64_t)rc->avg_frame_bandwidth * rc->baseline_gf_interval) /
+ (rc->baseline_gf_interval + af_ratio - 1);
+ // For SVC: refresh flags are used to define the pattern, so we can't
+ // use that for boosting the target size here.
+ // TODO(marpan): Consider adding internal boost on TL0 for VBR-SVC.
+ // For now just use the CBR logic for setting target size.
+ if (cpi->use_svc) target = vp9_calc_pframe_target_size_one_pass_cbr(cpi);
+ if (target > INT_MAX) target = INT_MAX;
+ return vp9_rc_clamp_pframe_target_size(cpi, (int)target);
+}
+
+int vp9_calc_iframe_target_size_one_pass_vbr(const VP9_COMP *cpi) {
+ static const int kf_ratio = 25;
+ const RATE_CONTROL *rc = &cpi->rc;
+ const int target = rc->avg_frame_bandwidth * kf_ratio;
+ return vp9_rc_clamp_iframe_target_size(cpi, target);
+}
+
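+ // Adjust the baseline GF interval so that GF updates are spaced more evenly
+ // within frame_constraint (e.g. the frames remaining to the next key frame).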
+static void adjust_gfint_frame_constraint(VP9_COMP *cpi, int frame_constraint) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ rc->constrained_gf_group = 0;
+ // Reset gf interval to make more equal spacing for frame_constraint.
+ if ((frame_constraint <= 7 * rc->baseline_gf_interval >> 2) &&
+ (frame_constraint > rc->baseline_gf_interval)) {
+ rc->baseline_gf_interval = frame_constraint >> 1;
+ if (rc->baseline_gf_interval < 5)
+ rc->baseline_gf_interval = frame_constraint;
+ rc->constrained_gf_group = 1;
+ } else {
+ // Reset to keep gf_interval <= frame_constraint.
+ if (rc->baseline_gf_interval > frame_constraint) {
+ rc->baseline_gf_interval = frame_constraint;
+ rc->constrained_gf_group = 1;
+ }
+ }
+}
+
+void vp9_set_gf_update_one_pass_vbr(VP9_COMP *const cpi) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ VP9_COMMON *const cm = &cpi->common;
+ if (rc->frames_till_gf_update_due == 0) {
+ double rate_err = 1.0;
+ rc->gfu_boost = DEFAULT_GF_BOOST;
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->oxcf.pass == 0) {
+ vp9_cyclic_refresh_set_golden_update(cpi);
+ } else {
+ rc->baseline_gf_interval = VPXMIN(
+ 20, VPXMAX(10, (rc->min_gf_interval + rc->max_gf_interval) / 2));
+ }
+ rc->af_ratio_onepass_vbr = 10;
+ if (rc->rolling_target_bits > 0)
+ rate_err =
+ (double)rc->rolling_actual_bits / (double)rc->rolling_target_bits;
+ if (cm->current_video_frame > 30) {
+ if (rc->avg_frame_qindex[INTER_FRAME] > (7 * rc->worst_quality) >> 3 &&
+ rate_err > 3.5) {
+ rc->baseline_gf_interval =
+ VPXMIN(15, (3 * rc->baseline_gf_interval) >> 1);
+ } else if (rc->avg_frame_low_motion > 0 &&
+ rc->avg_frame_low_motion < 20) {
+ // Decrease gf interval for high motion case.
+ rc->baseline_gf_interval = VPXMAX(6, rc->baseline_gf_interval >> 1);
+ }
+ // Adjust boost and af_ratio based on avg_frame_low_motion, which
+ // varies between 0 and 100 (stationary, 100% zero/small motion).
+ if (rc->avg_frame_low_motion > 0)
+ rc->gfu_boost =
+ VPXMAX(500, DEFAULT_GF_BOOST * (rc->avg_frame_low_motion << 1) /
+ (rc->avg_frame_low_motion + 100));
+ else if (rc->avg_frame_low_motion == 0 && rate_err > 1.0)
+ rc->gfu_boost = DEFAULT_GF_BOOST >> 1;
+ rc->af_ratio_onepass_vbr = VPXMIN(15, VPXMAX(5, 3 * rc->gfu_boost / 400));
+ }
+ if (rc->constrain_gf_key_freq_onepass_vbr)
+ adjust_gfint_frame_constraint(cpi, rc->frames_to_key);
+ rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+ cpi->refresh_golden_frame = 1;
+ rc->source_alt_ref_pending = 0;
+ rc->alt_ref_gf_group = 0;
+ if (cpi->sf.use_altref_onepass && cpi->oxcf.enable_auto_arf) {
+ rc->source_alt_ref_pending = 1;
+ rc->alt_ref_gf_group = 1;
+ }
+ }
+}
+
+void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ int target;
+ if (!cpi->refresh_alt_ref_frame &&
+ (cm->current_video_frame == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY) ||
+ rc->frames_to_key == 0)) {
+ cm->frame_type = KEY_FRAME;
+ rc->this_key_frame_forced =
+ cm->current_video_frame != 0 && rc->frames_to_key == 0;
+ rc->frames_to_key = cpi->oxcf.key_freq;
+ rc->kf_boost = DEFAULT_KF_BOOST;
+ rc->source_alt_ref_active = 0;
+ } else {
+ cm->frame_type = INTER_FRAME;
+ }
+ vp9_set_gf_update_one_pass_vbr(cpi);
+ if (cm->frame_type == KEY_FRAME)
+ target = vp9_calc_iframe_target_size_one_pass_vbr(cpi);
+ else
+ target = vp9_calc_pframe_target_size_one_pass_vbr(cpi);
+ vp9_rc_set_frame_target(cpi, target);
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->oxcf.pass == 0)
+ vp9_cyclic_refresh_update_parameters(cpi);
+}
+
+int vp9_calc_pframe_target_size_one_pass_cbr(const VP9_COMP *cpi) {
+ const VP9EncoderConfig *oxcf = &cpi->oxcf;
+ const RATE_CONTROL *rc = &cpi->rc;
+ const SVC *const svc = &cpi->svc;
+ const int64_t diff = rc->optimal_buffer_level - rc->buffer_level;
+ const int64_t one_pct_bits = 1 + rc->optimal_buffer_level / 100;
+ int min_frame_target =
+ VPXMAX(rc->avg_frame_bandwidth >> 4, FRAME_OVERHEAD_BITS);
+ int target;
+
+ if (oxcf->gf_cbr_boost_pct) {
+ const int af_ratio_pct = oxcf->gf_cbr_boost_pct + 100;
+ target = cpi->refresh_golden_frame
+ ? (rc->avg_frame_bandwidth * rc->baseline_gf_interval *
+ af_ratio_pct) /
+ (rc->baseline_gf_interval * 100 + af_ratio_pct - 100)
+ : (rc->avg_frame_bandwidth * rc->baseline_gf_interval * 100) /
+ (rc->baseline_gf_interval * 100 + af_ratio_pct - 100);
+ } else {
+ target = rc->avg_frame_bandwidth;
+ }
+ if (is_one_pass_svc(cpi)) {
+ // Note that for layers, avg_frame_bandwidth is the cumulative
+ // per-frame-bandwidth. For the target size of this frame, use the
+ // layer average frame size (i.e., non-cumulative per-frame-bw).
+ int layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id, svc->temporal_layer_id,
+ svc->number_temporal_layers);
+ const LAYER_CONTEXT *lc = &svc->layer_context[layer];
+ target = lc->avg_frame_size;
+ min_frame_target = VPXMAX(lc->avg_frame_size >> 4, FRAME_OVERHEAD_BITS);
+ }
+ if (diff > 0) {
+ // Lower the target bandwidth for this frame.
+ const int pct_low = (int)VPXMIN(diff / one_pct_bits, oxcf->under_shoot_pct);
+ target -= (target * pct_low) / 200;
+ } else if (diff < 0) {
+ // Increase the target bandwidth for this frame.
+ const int pct_high =
+ (int)VPXMIN(-diff / one_pct_bits, oxcf->over_shoot_pct);
+ target += (target * pct_high) / 200;
+ }
+ if (oxcf->rc_max_inter_bitrate_pct) {
+ const int max_rate =
+ rc->avg_frame_bandwidth * oxcf->rc_max_inter_bitrate_pct / 100;
+ target = VPXMIN(target, max_rate);
+ }
+ return VPXMAX(min_frame_target, target);
+}
+
+int vp9_calc_iframe_target_size_one_pass_cbr(const VP9_COMP *cpi) {
+ const RATE_CONTROL *rc = &cpi->rc;
+ const VP9EncoderConfig *oxcf = &cpi->oxcf;
+ const SVC *const svc = &cpi->svc;
+ int target;
+ if (cpi->common.current_video_frame == 0) {
+ target = ((rc->starting_buffer_level / 2) > INT_MAX)
+ ? INT_MAX
+ : (int)(rc->starting_buffer_level / 2);
+ } else {
+ int kf_boost = 32;
+ double framerate = cpi->framerate;
+ if (svc->number_temporal_layers > 1 && oxcf->rc_mode == VPX_CBR) {
+ // Use the layer framerate for temporal layers CBR mode.
+ const int layer =
+ LAYER_IDS_TO_IDX(svc->spatial_layer_id, svc->temporal_layer_id,
+ svc->number_temporal_layers);
+ const LAYER_CONTEXT *lc = &svc->layer_context[layer];
+ framerate = lc->framerate;
+ }
+ kf_boost = VPXMAX(kf_boost, (int)(2 * framerate - 16));
+ if (rc->frames_since_key < framerate / 2) {
+ kf_boost = (int)(kf_boost * rc->frames_since_key / (framerate / 2));
+ }
+ target = ((16 + kf_boost) * rc->avg_frame_bandwidth) >> 4;
+ }
+ return vp9_rc_clamp_iframe_target_size(cpi, target);
+}
+
+static void set_intra_only_frame(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ SVC *const svc = &cpi->svc;
+ // Don't allow intra_only frame for bypass/flexible SVC mode, or if number
+ // of spatial layers is 1 or if number of spatial or temporal layers > 3.
+ // Also, if intra-only is inserted on the very first frame, don't allow it
+ // if the number of temporal layers > 1. This is because on an intra-only
+ // frame only 3 reference buffers can be updated, but for temporal layers > 1
+ // we generally need to use buffer slots 4 and 5.
+ if ((cm->current_video_frame == 0 && svc->number_temporal_layers > 1) ||
+ svc->number_spatial_layers > 3 || svc->number_temporal_layers > 3 ||
+ svc->number_spatial_layers == 1)
+ return;
+ cm->show_frame = 0;
+ cm->intra_only = 1;
+ cm->frame_type = INTER_FRAME;
+ cpi->ext_refresh_frame_flags_pending = 1;
+ cpi->ext_refresh_last_frame = 1;
+ cpi->ext_refresh_golden_frame = 1;
+ cpi->ext_refresh_alt_ref_frame = 1;
+ if (cm->current_video_frame == 0) {
+ cpi->lst_fb_idx = 0;
+ cpi->gld_fb_idx = 1;
+ cpi->alt_fb_idx = 2;
+ } else {
+ int i;
+ int count = 0;
+ cpi->lst_fb_idx = -1;
+ cpi->gld_fb_idx = -1;
+ cpi->alt_fb_idx = -1;
+ svc->update_buffer_slot[0] = 0;
+ // For intra-only frame we need to refresh all slots that were
+ // being used for the base layer (fb_idx_base[i] == 1).
+ // Start with assigning last first, then golden and then alt.
+ for (i = 0; i < REF_FRAMES; ++i) {
+ if (svc->fb_idx_base[i] == 1) {
+ svc->update_buffer_slot[0] |= 1 << i;
+ count++;
+ }
+ if (count == 1 && cpi->lst_fb_idx == -1) cpi->lst_fb_idx = i;
+ if (count == 2 && cpi->gld_fb_idx == -1) cpi->gld_fb_idx = i;
+ if (count == 3 && cpi->alt_fb_idx == -1) cpi->alt_fb_idx = i;
+ }
+ // If golden or alt is not being used for base layer, then set them
+ // to the lst_fb_idx.
+ if (cpi->gld_fb_idx == -1) cpi->gld_fb_idx = cpi->lst_fb_idx;
+ if (cpi->alt_fb_idx == -1) cpi->alt_fb_idx = cpi->lst_fb_idx;
+ if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) {
+ cpi->ext_refresh_last_frame = 0;
+ cpi->ext_refresh_golden_frame = 0;
+ cpi->ext_refresh_alt_ref_frame = 0;
+ cpi->ref_frame_flags = 0;
+ }
+ }
+}
+
+void vp9_rc_get_svc_params(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ SVC *const svc = &cpi->svc;
+ int target = rc->avg_frame_bandwidth;
+ int layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id, svc->temporal_layer_id,
+ svc->number_temporal_layers);
+ if (svc->first_spatial_layer_to_encode)
+ svc->layer_context[svc->temporal_layer_id].is_key_frame = 0;
+ // Periodic key frames are based on the super-frame counter
+ // (svc.current_superframe); also only the base spatial layer is a key frame.
+ // Key frame is set for any of the following: very first frame, frame flags
+ // indicate key, superframe counter hits key frequency, or (non-intra) sync
+ // flag is set for spatial layer 0.
+ if ((cm->current_video_frame == 0 && !svc->previous_frame_is_intra_only) ||
+ (cpi->frame_flags & FRAMEFLAGS_KEY) ||
+ (cpi->oxcf.auto_key &&
+ (svc->current_superframe % cpi->oxcf.key_freq == 0) &&
+ !svc->previous_frame_is_intra_only && svc->spatial_layer_id == 0) ||
+ (svc->spatial_layer_sync[0] == 1 && svc->spatial_layer_id == 0)) {
+ cm->frame_type = KEY_FRAME;
+ rc->source_alt_ref_active = 0;
+ if (is_one_pass_svc(cpi)) {
+ if (cm->current_video_frame > 0) vp9_svc_reset_temporal_layers(cpi, 1);
+ layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id, svc->temporal_layer_id,
+ svc->number_temporal_layers);
+ svc->layer_context[layer].is_key_frame = 1;
+ cpi->ref_frame_flags &= (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG);
+ // Assumption here is that LAST_FRAME is being updated for a keyframe.
+ // Thus no change in update flags.
+ if (cpi->oxcf.rc_mode == VPX_CBR)
+ target = vp9_calc_iframe_target_size_one_pass_cbr(cpi);
+ else
+ target = vp9_calc_iframe_target_size_one_pass_vbr(cpi);
+ }
+ } else {
+ cm->frame_type = INTER_FRAME;
+ if (is_one_pass_svc(cpi)) {
+ LAYER_CONTEXT *lc = &svc->layer_context[layer];
+ // Add condition current_video_frame > 0 for the case where first frame
+ // is intra only followed by overlay/copy frame. In this case we don't
+ // want to reset is_key_frame to 0 on overlay/copy frame.
+ lc->is_key_frame =
+ (svc->spatial_layer_id == 0 && cm->current_video_frame > 0)
+ ? 0
+ : svc->layer_context[svc->temporal_layer_id].is_key_frame;
+ if (cpi->oxcf.rc_mode == VPX_CBR) {
+ target = vp9_calc_pframe_target_size_one_pass_cbr(cpi);
+ } else {
+ double rate_err = 0.0;
+ rc->fac_active_worst_inter = 140;
+ rc->fac_active_worst_gf = 100;
+ if (rc->rolling_target_bits > 0) {
+ rate_err =
+ (double)rc->rolling_actual_bits / (double)rc->rolling_target_bits;
+ if (rate_err < 1.0)
+ rc->fac_active_worst_inter = 120;
+ else if (rate_err > 2.0)
+ // Increase active_worst faster if rate fluctuation is high.
+ rc->fac_active_worst_inter = 160;
+ }
+ target = vp9_calc_pframe_target_size_one_pass_vbr(cpi);
+ }
+ }
+ }
+
+ if (svc->simulcast_mode) {
+ if (svc->spatial_layer_id > 0 &&
+ svc->layer_context[layer].is_key_frame == 1) {
+ cm->frame_type = KEY_FRAME;
+ cpi->ref_frame_flags &= (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG);
+ if (cpi->oxcf.rc_mode == VPX_CBR)
+ target = vp9_calc_iframe_target_size_one_pass_cbr(cpi);
+ else
+ target = vp9_calc_iframe_target_size_one_pass_vbr(cpi);
+ }
+ // Set the buffer idx and refresh flags for key frames in simulcast mode.
+ // Note the buffer slot for long-term reference is set below (line 2255),
+ // and alt_ref is used for that on key frame. So use last and golden for
+ // the other two normal slots.
+ if (cm->frame_type == KEY_FRAME) {
+ if (svc->number_spatial_layers == 2) {
+ if (svc->spatial_layer_id == 0) {
+ cpi->lst_fb_idx = 0;
+ cpi->gld_fb_idx = 2;
+ cpi->alt_fb_idx = 6;
+ } else if (svc->spatial_layer_id == 1) {
+ cpi->lst_fb_idx = 1;
+ cpi->gld_fb_idx = 3;
+ cpi->alt_fb_idx = 6;
+ }
+ } else if (svc->number_spatial_layers == 3) {
+ if (svc->spatial_layer_id == 0) {
+ cpi->lst_fb_idx = 0;
+ cpi->gld_fb_idx = 3;
+ cpi->alt_fb_idx = 6;
+ } else if (svc->spatial_layer_id == 1) {
+ cpi->lst_fb_idx = 1;
+ cpi->gld_fb_idx = 4;
+ cpi->alt_fb_idx = 6;
+ } else if (svc->spatial_layer_id == 2) {
+ cpi->lst_fb_idx = 2;
+ cpi->gld_fb_idx = 5;
+ cpi->alt_fb_idx = 7;
+ }
+ }
+ cpi->ext_refresh_last_frame = 1;
+ cpi->ext_refresh_golden_frame = 1;
+ cpi->ext_refresh_alt_ref_frame = 1;
+ }
+ }
+
+ // Check if superframe contains a sync layer request.
+ vp9_svc_check_spatial_layer_sync(cpi);
+
+ // If the long-term temporal feature is enabled, set the period of the update.
+ // The update/refresh of this reference frame is always on base temporal
+ // layer frame.
+ if (svc->use_gf_temporal_ref_current_layer) {
+ // Only use gf long-term prediction on non-key superframes.
+ if (!svc->layer_context[svc->temporal_layer_id].is_key_frame) {
+ // Use golden for this reference, which will be used for prediction.
+ int index = svc->spatial_layer_id;
+ if (svc->number_spatial_layers == 3) index = svc->spatial_layer_id - 1;
+ assert(index >= 0);
+ cpi->gld_fb_idx = svc->buffer_gf_temporal_ref[index].idx;
+ // Enable prediction off LAST (last reference) and golden (which will
+ // generally be further behind/long-term reference).
+ cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG;
+ }
+ // Check for update/refresh of reference: only refresh on base temporal
+ // layer.
+ if (svc->temporal_layer_id == 0) {
+ if (svc->layer_context[svc->temporal_layer_id].is_key_frame) {
+ // On key frame we update the buffer index used for long term reference.
+ // Use the alt_ref since it is not used or updated on key frames.
+ int index = svc->spatial_layer_id;
+ if (svc->number_spatial_layers == 3) index = svc->spatial_layer_id - 1;
+ assert(index >= 0);
+ cpi->alt_fb_idx = svc->buffer_gf_temporal_ref[index].idx;
+ cpi->ext_refresh_alt_ref_frame = 1;
+ } else if (rc->frames_till_gf_update_due == 0) {
+ // Set the period of the next update. Make it a multiple of 10, as the cyclic
+ // refresh is typically ~10%, and we'd like the update to happen after
+ // a few cycles of the refresh (so it is a better quality frame). Note the
+ // cyclic refresh for SVC only operates on base temporal layer frames.
+ // Choose 20 as the period for now (2 cycles).
+ rc->baseline_gf_interval = 20;
+ rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+ cpi->ext_refresh_golden_frame = 1;
+ rc->gfu_boost = DEFAULT_GF_BOOST;
+ }
+ }
+ } else if (!svc->use_gf_temporal_ref) {
+ rc->frames_till_gf_update_due = INT_MAX;
+ rc->baseline_gf_interval = INT_MAX;
+ }
+ if (svc->set_intra_only_frame) {
+ set_intra_only_frame(cpi);
+ if (cpi->oxcf.rc_mode == VPX_CBR)
+ target = vp9_calc_iframe_target_size_one_pass_cbr(cpi);
+ else
+ target = vp9_calc_iframe_target_size_one_pass_vbr(cpi);
+ }
+ // Overlay frame predicts from LAST (intra-only)
+ if (svc->previous_frame_is_intra_only) cpi->ref_frame_flags |= VP9_LAST_FLAG;
+
+ // Any update/change of global cyclic refresh parameters (amount/delta-qp)
+ // should be done here, before the frame qp is selected.
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+ vp9_cyclic_refresh_update_parameters(cpi);
+
+ vp9_rc_set_frame_target(cpi, target);
+ if (cm->show_frame) update_buffer_level_svc_preencode(cpi);
+
+ if (cpi->oxcf.resize_mode == RESIZE_DYNAMIC && svc->single_layer_svc == 1 &&
+ svc->spatial_layer_id == svc->first_spatial_layer_to_encode &&
+ svc->temporal_layer_id == 0) {
+ LAYER_CONTEXT *lc = NULL;
+ cpi->resize_pending = vp9_resize_one_pass_cbr(cpi);
+ if (cpi->resize_pending) {
+ int tl, width, height;
+ // Apply the same scale to all temporal layers.
+ for (tl = 0; tl < svc->number_temporal_layers; tl++) {
+ lc = &svc->layer_context[svc->spatial_layer_id *
+ svc->number_temporal_layers +
+ tl];
+ lc->scaling_factor_num_resize =
+ cpi->resize_scale_num * lc->scaling_factor_num;
+ lc->scaling_factor_den_resize =
+ cpi->resize_scale_den * lc->scaling_factor_den;
+ // Reset rate control for all temporal layers.
+ lc->rc.buffer_level = lc->rc.optimal_buffer_level;
+ lc->rc.bits_off_target = lc->rc.optimal_buffer_level;
+ lc->rc.rate_correction_factors[INTER_FRAME] =
+ rc->rate_correction_factors[INTER_FRAME];
+ }
+ // Set the size for this current temporal layer.
+ lc = &svc->layer_context[svc->spatial_layer_id *
+ svc->number_temporal_layers +
+ svc->temporal_layer_id];
+ get_layer_resolution(cpi->oxcf.width, cpi->oxcf.height,
+ lc->scaling_factor_num_resize,
+ lc->scaling_factor_den_resize, &width, &height);
+ vp9_set_size_literal(cpi, width, height);
+ svc->resize_set = 1;
+ }
+ } else {
+ cpi->resize_pending = 0;
+ svc->resize_set = 0;
+ }
+}
+
+void vp9_rc_get_one_pass_cbr_params(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ int target;
+ if ((cm->current_video_frame == 0) || (cpi->frame_flags & FRAMEFLAGS_KEY) ||
+ (cpi->oxcf.auto_key && rc->frames_to_key == 0)) {
+ cm->frame_type = KEY_FRAME;
+ rc->frames_to_key = cpi->oxcf.key_freq;
+ rc->kf_boost = DEFAULT_KF_BOOST;
+ rc->source_alt_ref_active = 0;
+ } else {
+ cm->frame_type = INTER_FRAME;
+ }
+ if (rc->frames_till_gf_update_due == 0) {
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+ vp9_cyclic_refresh_set_golden_update(cpi);
+ else
+ rc->baseline_gf_interval =
+ (rc->min_gf_interval + rc->max_gf_interval) / 2;
+ rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+ // NOTE: frames_till_gf_update_due must be <= frames_to_key.
+ if (rc->frames_till_gf_update_due > rc->frames_to_key)
+ rc->frames_till_gf_update_due = rc->frames_to_key;
+ cpi->refresh_golden_frame = 1;
+ rc->gfu_boost = DEFAULT_GF_BOOST;
+ }
+
+ // Any update/change of global cyclic refresh parameters (amount/delta-qp)
+ // should be done here, before the frame qp is selected.
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+ vp9_cyclic_refresh_update_parameters(cpi);
+
+ if (frame_is_intra_only(cm))
+ target = vp9_calc_iframe_target_size_one_pass_cbr(cpi);
+ else
+ target = vp9_calc_pframe_target_size_one_pass_cbr(cpi);
+
+ vp9_rc_set_frame_target(cpi, target);
+
+ if (cm->show_frame) vp9_update_buffer_level_preencode(cpi);
+
+ if (cpi->oxcf.resize_mode == RESIZE_DYNAMIC)
+ cpi->resize_pending = vp9_resize_one_pass_cbr(cpi);
+ else
+ cpi->resize_pending = 0;
+}
+
+int vp9_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget,
+ vpx_bit_depth_t bit_depth) {
+ int start_index = rc->worst_quality;
+ int target_index = rc->worst_quality;
+ int i;
+
+ // Convert the average q value to an index.
+ for (i = rc->best_quality; i < rc->worst_quality; ++i) {
+ start_index = i;
+ if (vp9_convert_qindex_to_q(i, bit_depth) >= qstart) break;
+ }
+
+ // Convert the q target to an index
+ for (i = rc->best_quality; i < rc->worst_quality; ++i) {
+ target_index = i;
+ if (vp9_convert_qindex_to_q(i, bit_depth) >= qtarget) break;
+ }
+
+ return target_index - start_index;
+}
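
The qdelta computation above reduces to two searches for the first q index whose real q value reaches a target. A minimal standalone sketch of that search (illustrative only, not part of the patch; find_qindex and q_of are hypothetical names):

// Hypothetical helper mirroring the loops in vp9_compute_qdelta(); q_of must
// be a monotonically increasing mapping from q index to real q value, the
// role vp9_convert_qindex_to_q() plays above.
static int find_qindex(double q_target, int best_q, int worst_q,
                       double (*q_of)(int)) {
  int i, index = worst_q;
  for (i = best_q; i < worst_q; ++i) {
    index = i;
    if (q_of(i) >= q_target) break;  // first index reaching the target
  }
  return index;
}
// The qdelta is then find_qindex(qtarget, ...) - find_qindex(qstart, ...);
// a negative delta means the target q is below the starting q.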
+
+int vp9_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type,
+ int qindex, double rate_target_ratio,
+ vpx_bit_depth_t bit_depth) {
+ int target_index = rc->worst_quality;
+ int i;
+
+ // Look up the current projected bits per block for the base index
+ const int base_bits_per_mb =
+ vp9_rc_bits_per_mb(frame_type, qindex, 1.0, bit_depth);
+
+ // Find the target bits per mb based on the base value and given ratio.
+ const int target_bits_per_mb = (int)(rate_target_ratio * base_bits_per_mb);
+
+ // Convert the q target to an index
+ for (i = rc->best_quality; i < rc->worst_quality; ++i) {
+ if (vp9_rc_bits_per_mb(frame_type, i, 1.0, bit_depth) <=
+ target_bits_per_mb) {
+ target_index = i;
+ break;
+ }
+ }
+ return target_index - qindex;
+}
+
+void vp9_rc_set_gf_interval_range(const VP9_COMP *const cpi,
+ RATE_CONTROL *const rc) {
+ const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+
+ // Special case code for 1 pass fixed Q mode tests
+ if ((oxcf->pass == 0) && (oxcf->rc_mode == VPX_Q)) {
+ rc->max_gf_interval = FIXED_GF_INTERVAL;
+ rc->min_gf_interval = FIXED_GF_INTERVAL;
+ rc->static_scene_max_gf_interval = FIXED_GF_INTERVAL;
+ } else {
+ double framerate = cpi->framerate;
+ // Set Maximum gf/arf interval
+ rc->max_gf_interval = oxcf->max_gf_interval;
+ rc->min_gf_interval = oxcf->min_gf_interval;
+#if CONFIG_RATE_CTRL
+ if (oxcf->use_simple_encode_api) {
+      // In this experiment, we avoid the framerate being changed dynamically
+      // during encoding.
+ framerate = oxcf->init_framerate;
+ }
+#endif // CONFIG_RATE_CTRL
+ if (rc->min_gf_interval == 0) {
+ rc->min_gf_interval = vp9_rc_get_default_min_gf_interval(
+ oxcf->width, oxcf->height, framerate);
+ }
+ if (rc->max_gf_interval == 0) {
+ rc->max_gf_interval =
+ vp9_rc_get_default_max_gf_interval(framerate, rc->min_gf_interval);
+ }
+
+ // Extended max interval for genuinely static scenes like slide shows.
+ rc->static_scene_max_gf_interval = MAX_STATIC_GF_GROUP_LENGTH;
+
+ if (rc->max_gf_interval > rc->static_scene_max_gf_interval)
+ rc->max_gf_interval = rc->static_scene_max_gf_interval;
+
+ // Clamp min to max
+ rc->min_gf_interval = VPXMIN(rc->min_gf_interval, rc->max_gf_interval);
+
+ if (oxcf->target_level == LEVEL_AUTO) {
+ const uint32_t pic_size = cpi->common.width * cpi->common.height;
+ const uint32_t pic_breadth =
+ VPXMAX(cpi->common.width, cpi->common.height);
+ int i;
+ for (i = 0; i < VP9_LEVELS; ++i) {
+ if (vp9_level_defs[i].max_luma_picture_size >= pic_size &&
+ vp9_level_defs[i].max_luma_picture_breadth >= pic_breadth) {
+ if (rc->min_gf_interval <=
+ (int)vp9_level_defs[i].min_altref_distance) {
+ rc->min_gf_interval = (int)vp9_level_defs[i].min_altref_distance;
+ rc->max_gf_interval =
+ VPXMAX(rc->max_gf_interval, rc->min_gf_interval);
+ }
+ break;
+ }
+ }
+ }
+ }
+}
+
+void vp9_rc_update_framerate(VP9_COMP *cpi) {
+ const VP9_COMMON *const cm = &cpi->common;
+ const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+ RATE_CONTROL *const rc = &cpi->rc;
+ int vbr_max_bits;
+
+ rc->avg_frame_bandwidth = (int)(oxcf->target_bandwidth / cpi->framerate);
+ rc->min_frame_bandwidth =
+ (int)(rc->avg_frame_bandwidth * oxcf->two_pass_vbrmin_section / 100);
+
+ rc->min_frame_bandwidth =
+ VPXMAX(rc->min_frame_bandwidth, FRAME_OVERHEAD_BITS);
+
+  // A maximum bitrate for a frame is defined.
+  // However, this limit is extended if a very high rate is given on the
+  // command line or the rate cannot be achieved because of a user-specified
+  // max q (e.g. when the user requests a lossless encode).
+  //
+  // If a level is specified that requires a lower maximum rate, then the
+  // level value takes precedence.
+ vbr_max_bits =
+ (int)(((int64_t)rc->avg_frame_bandwidth * oxcf->two_pass_vbrmax_section) /
+ 100);
+ rc->max_frame_bandwidth =
+ VPXMAX(VPXMAX((cm->MBs * MAX_MB_RATE), MAXRATE_1080P), vbr_max_bits);
+
+ vp9_rc_set_gf_interval_range(cpi, rc);
+}
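
As a worked example of the bandwidth bookkeeping above (standalone sketch, not part of the patch; the 1 Mbps / 30 fps target and the 2000% two_pass_vbrmax_section are assumed example values):

#include <stdio.h>

int main(void) {
  const double target_bandwidth = 1000000.0;  // bits per second (assumed)
  const double framerate = 30.0;              // frames per second (assumed)
  const int vbrmax_section = 2000;            // max section rate, % of average
  const int avg_frame_bandwidth = (int)(target_bandwidth / framerate);
  const int vbr_max_bits =
      (int)(((long long)avg_frame_bandwidth * vbrmax_section) / 100);
  // Prints: avg=33333 bits/frame, vbr_max=666660 bits/frame
  printf("avg=%d bits/frame, vbr_max=%d bits/frame\n", avg_frame_bandwidth,
         vbr_max_bits);
  return 0;
}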
+
+#define VBR_PCT_ADJUSTMENT_LIMIT 50
+// For VBR...adjustment to the frame target based on error from previous frames
+static void vbr_rate_correction(VP9_COMP *cpi, int *this_frame_target) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ int64_t vbr_bits_off_target = rc->vbr_bits_off_target;
+ int max_delta;
+ int frame_window = VPXMIN(16, ((int)cpi->twopass.total_stats.count -
+ cpi->common.current_video_frame));
+
+  // Calculate the adjustment to the rate for this frame.
+ if (frame_window > 0) {
+ max_delta = (vbr_bits_off_target > 0)
+ ? (int)(vbr_bits_off_target / frame_window)
+ : (int)(-vbr_bits_off_target / frame_window);
+
+ max_delta = VPXMIN(max_delta,
+ ((*this_frame_target * VBR_PCT_ADJUSTMENT_LIMIT) / 100));
+
+ // vbr_bits_off_target > 0 means we have extra bits to spend
+ if (vbr_bits_off_target > 0) {
+ *this_frame_target += (vbr_bits_off_target > max_delta)
+ ? max_delta
+ : (int)vbr_bits_off_target;
+ } else {
+ *this_frame_target -= (vbr_bits_off_target < -max_delta)
+ ? max_delta
+ : (int)-vbr_bits_off_target;
+ }
+ }
+
+  // Fast redistribution of bits arising from massive local undershoot.
+  // Don't do it for kf, arf, gf or overlay frames.
+ if (!frame_is_kf_gf_arf(cpi) && !rc->is_src_frame_alt_ref &&
+ rc->vbr_bits_off_target_fast) {
+ int one_frame_bits = VPXMAX(rc->avg_frame_bandwidth, *this_frame_target);
+ int fast_extra_bits;
+ fast_extra_bits = (int)VPXMIN(rc->vbr_bits_off_target_fast, one_frame_bits);
+ fast_extra_bits = (int)VPXMIN(
+ fast_extra_bits,
+ VPXMAX(one_frame_bits / 8, rc->vbr_bits_off_target_fast / 8));
+ *this_frame_target += (int)fast_extra_bits;
+ rc->vbr_bits_off_target_fast -= fast_extra_bits;
+ }
+}
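
The VBR correction above amounts to spreading the accumulated bit surplus or deficit over a short window of frames, with the per-frame change capped at VBR_PCT_ADJUSTMENT_LIMIT percent of the nominal target. A simplified standalone sketch (not libvpx code; the window is assumed to be at least 1):

// Returns the adjusted per-frame target given the accumulated VBR error.
static int vbr_corrected_target(int frame_target, long long bits_off_target,
                                int frame_window) {
  long long magnitude =
      bits_off_target > 0 ? bits_off_target : -bits_off_target;
  int delta = (int)(magnitude / frame_window);
  // Cap the adjustment at 50% of the nominal target (VBR_PCT_ADJUSTMENT_LIMIT).
  if (delta > frame_target / 2) delta = frame_target / 2;
  return bits_off_target > 0 ? frame_target + delta : frame_target - delta;
}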
+
+void vp9_set_target_rate(VP9_COMP *cpi) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ int target_rate = rc->base_frame_target;
+
+ if (cpi->common.frame_type == KEY_FRAME)
+ target_rate = vp9_rc_clamp_iframe_target_size(cpi, target_rate);
+ else
+ target_rate = vp9_rc_clamp_pframe_target_size(cpi, target_rate);
+
+ if (!cpi->oxcf.vbr_corpus_complexity) {
+ // Correction to rate target based on prior over or under shoot.
+ if (cpi->oxcf.rc_mode == VPX_VBR || cpi->oxcf.rc_mode == VPX_CQ)
+ vbr_rate_correction(cpi, &target_rate);
+ }
+ vp9_rc_set_frame_target(cpi, target_rate);
+}
+
+// Check if we should resize, based on the average QP over the past x frames.
+// Resizing is only allowed down to 1/2 of the original resolution for now,
+// in steps of 3/4 or 1/2.
+int vp9_resize_one_pass_cbr(VP9_COMP *cpi) {
+ const VP9_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ RESIZE_ACTION resize_action = NO_RESIZE;
+ int avg_qp_thr1 = 70;
+ int avg_qp_thr2 = 50;
+ // Don't allow for resized frame to go below 320x180, resize in steps of 3/4.
+ int min_width = (320 * 4) / 3;
+ int min_height = (180 * 4) / 3;
+ int down_size_on = 1;
+ int force_downsize_rate = 0;
+ cpi->resize_scale_num = 1;
+ cpi->resize_scale_den = 1;
+ // Don't resize on key frame; reset the counters on key frame.
+ if (cm->frame_type == KEY_FRAME) {
+ cpi->resize_avg_qp = 0;
+ cpi->resize_count = 0;
+ return 0;
+ }
+
+ // No resizing down if frame size is below some limit.
+ if ((cm->width * cm->height) < min_width * min_height) down_size_on = 0;
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+ // If denoiser is on, apply a smaller qp threshold.
+ if (cpi->oxcf.noise_sensitivity > 0) {
+ avg_qp_thr1 = 60;
+ avg_qp_thr2 = 40;
+ }
+#endif
+
+  // Force downsize based on per-frame-bandwidth, for the extreme case of
+  // HD input.
+ if (cpi->resize_state == ORIG && cm->width * cm->height >= 1280 * 720) {
+ if (rc->avg_frame_bandwidth < 300000 / 30) {
+ resize_action = DOWN_ONEHALF;
+ cpi->resize_state = ONE_HALF;
+ force_downsize_rate = 1;
+ } else if (rc->avg_frame_bandwidth < 400000 / 30) {
+ resize_action = ONEHALFONLY_RESIZE ? DOWN_ONEHALF : DOWN_THREEFOUR;
+ cpi->resize_state = ONEHALFONLY_RESIZE ? ONE_HALF : THREE_QUARTER;
+ force_downsize_rate = 1;
+ }
+ } else if (cpi->resize_state == THREE_QUARTER &&
+ cm->width * cm->height >= 960 * 540) {
+ if (rc->avg_frame_bandwidth < 300000 / 30) {
+ resize_action = DOWN_ONEHALF;
+ cpi->resize_state = ONE_HALF;
+ force_downsize_rate = 1;
+ }
+ }
+
+ // Resize based on average buffer underflow and QP over some window.
+ // Ignore samples close to key frame, since QP is usually high after key.
+ if (!force_downsize_rate && cpi->rc.frames_since_key > cpi->framerate) {
+ const int window = VPXMIN(30, (int)(2 * cpi->framerate));
+ cpi->resize_avg_qp += rc->last_q[INTER_FRAME];
+ if (cpi->rc.buffer_level < (int)(30 * rc->optimal_buffer_level / 100))
+ ++cpi->resize_buffer_underflow;
+ ++cpi->resize_count;
+ // Check for resize action every "window" frames.
+ if (cpi->resize_count >= window) {
+ int avg_qp = cpi->resize_avg_qp / cpi->resize_count;
+ // Resize down if buffer level has underflowed sufficient amount in past
+ // window, and we are at original or 3/4 of original resolution.
+ // Resize back up if average QP is low, and we are currently in a resized
+ // down state, i.e. 1/2 or 3/4 of original resolution.
+ // Currently, use a flag to turn 3/4 resizing feature on/off.
+ if (cpi->resize_buffer_underflow > (cpi->resize_count >> 2) &&
+ down_size_on) {
+ if (cpi->resize_state == THREE_QUARTER) {
+ resize_action = DOWN_ONEHALF;
+ cpi->resize_state = ONE_HALF;
+ } else if (cpi->resize_state == ORIG) {
+ resize_action = ONEHALFONLY_RESIZE ? DOWN_ONEHALF : DOWN_THREEFOUR;
+ cpi->resize_state = ONEHALFONLY_RESIZE ? ONE_HALF : THREE_QUARTER;
+ }
+ } else if (cpi->resize_state != ORIG &&
+ avg_qp < avg_qp_thr1 * cpi->rc.worst_quality / 100) {
+ if (cpi->resize_state == THREE_QUARTER ||
+ avg_qp < avg_qp_thr2 * cpi->rc.worst_quality / 100 ||
+ ONEHALFONLY_RESIZE) {
+ resize_action = UP_ORIG;
+ cpi->resize_state = ORIG;
+ } else if (cpi->resize_state == ONE_HALF) {
+ resize_action = UP_THREEFOUR;
+ cpi->resize_state = THREE_QUARTER;
+ }
+ }
+ // Reset for next window measurement.
+ cpi->resize_avg_qp = 0;
+ cpi->resize_count = 0;
+ cpi->resize_buffer_underflow = 0;
+ }
+ }
+  // If the decision is to resize, reset some quantities, and check if we
+  // should reduce the rate correction factor.
+ if (resize_action != NO_RESIZE) {
+ int target_bits_per_frame;
+ int active_worst_quality;
+ int qindex;
+ int tot_scale_change;
+ if (resize_action == DOWN_THREEFOUR || resize_action == UP_THREEFOUR) {
+ cpi->resize_scale_num = 3;
+ cpi->resize_scale_den = 4;
+ } else if (resize_action == DOWN_ONEHALF) {
+ cpi->resize_scale_num = 1;
+ cpi->resize_scale_den = 2;
+ } else { // UP_ORIG or anything else
+ cpi->resize_scale_num = 1;
+ cpi->resize_scale_den = 1;
+ }
+ tot_scale_change = (cpi->resize_scale_den * cpi->resize_scale_den) /
+ (cpi->resize_scale_num * cpi->resize_scale_num);
+ // Reset buffer level to optimal, update target size.
+ rc->buffer_level = rc->optimal_buffer_level;
+ rc->bits_off_target = rc->optimal_buffer_level;
+ rc->this_frame_target = vp9_calc_pframe_target_size_one_pass_cbr(cpi);
+ // Get the projected qindex, based on the scaled target frame size (scaled
+ // so target_bits_per_mb in vp9_rc_regulate_q will be correct target).
+ target_bits_per_frame = (resize_action >= 0)
+ ? rc->this_frame_target * tot_scale_change
+ : rc->this_frame_target / tot_scale_change;
+ active_worst_quality = calc_active_worst_quality_one_pass_cbr(cpi);
+ qindex = vp9_rc_regulate_q(cpi, target_bits_per_frame, rc->best_quality,
+ active_worst_quality);
+ // If resize is down, check if projected q index is close to worst_quality,
+ // and if so, reduce the rate correction factor (since likely can afford
+ // lower q for resized frame).
+ if (resize_action > 0 && qindex > 90 * cpi->rc.worst_quality / 100) {
+ rc->rate_correction_factors[INTER_NORMAL] *= 0.85;
+ }
+ // If resize is back up, check if projected q index is too much above the
+ // current base_qindex, and if so, reduce the rate correction factor
+ // (since prefer to keep q for resized frame at least close to previous q).
+ if (resize_action < 0 && qindex > 130 * cm->base_qindex / 100) {
+ rc->rate_correction_factors[INTER_NORMAL] *= 0.9;
+ }
+ }
+ return resize_action;
+}
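
The resize bookkeeping above tracks the new size as a num/den fraction of the source and multiplies the per-frame bit target by den^2/num^2 for a down-resize (dividing for an up-resize) so the bits-per-MB seen by the q regulator matches the new frame size. A standalone sketch of that arithmetic (not part of the patch):

// Scale the per-frame target for a resize step; `down` mirrors the
// resize_action >= 0 test above (positive actions are down-resizes).
static int scaled_target_bits(int frame_target, int num, int den, int down) {
  const int tot_scale_change = (den * den) / (num * num);
  return down ? frame_target * tot_scale_change
              : frame_target / tot_scale_change;
}
// Example: DOWN_ONEHALF uses num/den = 1/2, so tot_scale_change = 4;
// DOWN_THREEFOUR uses 3/4, so tot_scale_change = 16/9 = 1 in integer math.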
+
+static void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi,
+ uint64_t avg_sad_current) {
+ VP9_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ int target;
+ int found = 0;
+ int found2 = 0;
+ int frame;
+ int i;
+ uint64_t avg_source_sad_lag = avg_sad_current;
+ int high_source_sad_lagindex = -1;
+ int steady_sad_lagindex = -1;
+ uint32_t sad_thresh1 = 70000;
+ uint32_t sad_thresh2 = 120000;
+ int low_content = 0;
+ int high_content = 0;
+ double rate_err = 1.0;
+ // Get measure of complexity over the future frames, and get the first
+ // future frame with high_source_sad/scene-change.
+ int tot_frames = (int)vp9_lookahead_depth(cpi->lookahead) - 1;
+ for (frame = tot_frames; frame >= 1; --frame) {
+ const int lagframe_idx = tot_frames - frame + 1;
+ uint64_t reference_sad = rc->avg_source_sad[0];
+ for (i = 1; i < lagframe_idx; ++i) {
+ if (rc->avg_source_sad[i] > 0)
+ reference_sad = (3 * reference_sad + rc->avg_source_sad[i]) >> 2;
+ }
+ // Detect up-coming scene change.
+ if (!found &&
+ (rc->avg_source_sad[lagframe_idx] >
+ VPXMAX(sad_thresh1, (unsigned int)(reference_sad << 1)) ||
+ rc->avg_source_sad[lagframe_idx] >
+ VPXMAX(3 * sad_thresh1 >> 2,
+ (unsigned int)(reference_sad << 2)))) {
+ high_source_sad_lagindex = lagframe_idx;
+ found = 1;
+ }
+ // Detect change from motion to steady.
+ if (!found2 && lagframe_idx > 1 && lagframe_idx < tot_frames &&
+ rc->avg_source_sad[lagframe_idx - 1] > (sad_thresh1 >> 2)) {
+ found2 = 1;
+ for (i = lagframe_idx; i < tot_frames; ++i) {
+ if (!(rc->avg_source_sad[i] > 0 &&
+ rc->avg_source_sad[i] < (sad_thresh1 >> 2) &&
+ rc->avg_source_sad[i] <
+ (rc->avg_source_sad[lagframe_idx - 1] >> 1))) {
+ found2 = 0;
+ i = tot_frames;
+ }
+ }
+ if (found2) steady_sad_lagindex = lagframe_idx;
+ }
+ avg_source_sad_lag += rc->avg_source_sad[lagframe_idx];
+ }
+ if (tot_frames > 0) avg_source_sad_lag = avg_source_sad_lag / tot_frames;
+ // Constrain distance between detected scene cuts.
+ if (high_source_sad_lagindex != -1 &&
+ high_source_sad_lagindex != rc->high_source_sad_lagindex - 1 &&
+ abs(high_source_sad_lagindex - rc->high_source_sad_lagindex) < 4)
+ rc->high_source_sad_lagindex = -1;
+ else
+ rc->high_source_sad_lagindex = high_source_sad_lagindex;
+  // Adjust some factors for the next GF group: ignore the initial key frame,
+  // and only do this when lag_in_frames is not too small.
+ if (cpi->refresh_golden_frame == 1 && cm->current_video_frame > 30 &&
+ cpi->oxcf.lag_in_frames > 8) {
+ int frame_constraint;
+ if (rc->rolling_target_bits > 0)
+ rate_err =
+ (double)rc->rolling_actual_bits / (double)rc->rolling_target_bits;
+ high_content = high_source_sad_lagindex != -1 ||
+ avg_source_sad_lag > (rc->prev_avg_source_sad_lag << 1) ||
+ avg_source_sad_lag > sad_thresh2;
+ low_content = high_source_sad_lagindex == -1 &&
+ ((avg_source_sad_lag < (rc->prev_avg_source_sad_lag >> 1)) ||
+ (avg_source_sad_lag < sad_thresh1));
+ if (low_content) {
+ rc->gfu_boost = DEFAULT_GF_BOOST;
+ rc->baseline_gf_interval =
+ VPXMIN(15, (3 * rc->baseline_gf_interval) >> 1);
+ } else if (high_content) {
+ rc->gfu_boost = DEFAULT_GF_BOOST >> 1;
+ rc->baseline_gf_interval = (rate_err > 3.0)
+ ? VPXMAX(10, rc->baseline_gf_interval >> 1)
+ : VPXMAX(6, rc->baseline_gf_interval >> 1);
+ }
+ if (rc->baseline_gf_interval > cpi->oxcf.lag_in_frames - 1)
+ rc->baseline_gf_interval = cpi->oxcf.lag_in_frames - 1;
+ // Check for constraining gf_interval for up-coming scene/content changes,
+ // or for up-coming key frame, whichever is closer.
+ frame_constraint = rc->frames_to_key;
+ if (rc->high_source_sad_lagindex > 0 &&
+ frame_constraint > rc->high_source_sad_lagindex)
+ frame_constraint = rc->high_source_sad_lagindex;
+ if (steady_sad_lagindex > 3 && frame_constraint > steady_sad_lagindex)
+ frame_constraint = steady_sad_lagindex;
+ adjust_gfint_frame_constraint(cpi, frame_constraint);
+ rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+ // Adjust factors for active_worst setting & af_ratio for next gf interval.
+ rc->fac_active_worst_inter = 150; // corresponds to 3/2 (= 150 /100).
+ rc->fac_active_worst_gf = 100;
+ if (rate_err < 2.0 && !high_content) {
+ rc->fac_active_worst_inter = 120;
+ rc->fac_active_worst_gf = 90;
+ } else if (rate_err > 8.0 && rc->avg_frame_qindex[INTER_FRAME] < 16) {
+ // Increase active_worst faster at low Q if rate fluctuation is high.
+ rc->fac_active_worst_inter = 200;
+ if (rc->avg_frame_qindex[INTER_FRAME] < 8)
+ rc->fac_active_worst_inter = 400;
+ }
+ if (low_content && rc->avg_frame_low_motion > 80) {
+ rc->af_ratio_onepass_vbr = 15;
+ } else if (high_content || rc->avg_frame_low_motion < 30) {
+ rc->af_ratio_onepass_vbr = 5;
+ rc->gfu_boost = DEFAULT_GF_BOOST >> 2;
+ }
+ if (cpi->sf.use_altref_onepass && cpi->oxcf.enable_auto_arf) {
+ // Flag to disable usage of ARF based on past usage, only allow this
+ // disabling if current frame/group does not start with key frame or
+ // scene cut. Note perc_arf_usage is only computed for speed >= 5.
+ int arf_usage_low =
+ (cm->frame_type != KEY_FRAME && !rc->high_source_sad &&
+ cpi->rc.perc_arf_usage < 15 && cpi->oxcf.speed >= 5);
+ // Don't use alt-ref for this group under certain conditions.
+ if (arf_usage_low ||
+ (rc->high_source_sad_lagindex > 0 &&
+ rc->high_source_sad_lagindex <= rc->frames_till_gf_update_due) ||
+ (avg_source_sad_lag > 3 * sad_thresh1 >> 3)) {
+ rc->source_alt_ref_pending = 0;
+ rc->alt_ref_gf_group = 0;
+ } else {
+ rc->source_alt_ref_pending = 1;
+ rc->alt_ref_gf_group = 1;
+ // If alt-ref is used for this gf group, limit the interval.
+ if (rc->baseline_gf_interval > 12) {
+ rc->baseline_gf_interval = 12;
+ rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+ }
+ }
+ }
+ target = vp9_calc_pframe_target_size_one_pass_vbr(cpi);
+ vp9_rc_set_frame_target(cpi, target);
+ }
+ rc->prev_avg_source_sad_lag = avg_source_sad_lag;
+}
+
+// Compute average source sad (temporal sad: between current source and
+// previous source) over a subset of superblocks. Use this to detect big
+// changes in content and allow rate control to react.
+// This function also handles the special case of lag_in_frames, to measure
+// the content level in the future frames set by lag_in_frames.
+void vp9_scene_detection_onepass(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ YV12_BUFFER_CONFIG const *unscaled_src = cpi->un_scaled_source;
+ YV12_BUFFER_CONFIG const *unscaled_last_src = cpi->unscaled_last_source;
+ uint8_t *src_y;
+ int src_ystride;
+ int src_width;
+ int src_height;
+ uint8_t *last_src_y;
+ int last_src_ystride;
+ int last_src_width;
+ int last_src_height;
+ if (cpi->un_scaled_source == NULL || cpi->unscaled_last_source == NULL ||
+ (cpi->use_svc && cpi->svc.current_superframe == 0))
+ return;
+ src_y = unscaled_src->y_buffer;
+ src_ystride = unscaled_src->y_stride;
+ src_width = unscaled_src->y_width;
+ src_height = unscaled_src->y_height;
+ last_src_y = unscaled_last_src->y_buffer;
+ last_src_ystride = unscaled_last_src->y_stride;
+ last_src_width = unscaled_last_src->y_width;
+ last_src_height = unscaled_last_src->y_height;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cm->use_highbitdepth) return;
+#endif
+ rc->high_source_sad = 0;
+ rc->high_num_blocks_with_motion = 0;
+ // For SVC: scene detection is only checked on first spatial layer of
+ // the superframe using the original/unscaled resolutions.
+ if (cpi->svc.spatial_layer_id == cpi->svc.first_spatial_layer_to_encode &&
+ src_width == last_src_width && src_height == last_src_height) {
+ YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS] = { NULL };
+ int num_mi_cols = cm->mi_cols;
+ int num_mi_rows = cm->mi_rows;
+ int start_frame = 0;
+ int frames_to_buffer = 1;
+ int frame = 0;
+ int scene_cut_force_key_frame = 0;
+ int num_zero_temp_sad = 0;
+ uint64_t avg_sad_current = 0;
+ uint32_t min_thresh = 20000; // ~5 * 64 * 64
+ float thresh = 8.0f;
+ uint32_t thresh_key = 140000;
+ if (cpi->oxcf.speed <= 5) thresh_key = 240000;
+ if (cpi->oxcf.content != VP9E_CONTENT_SCREEN) min_thresh = 65000;
+ if (cpi->oxcf.rc_mode == VPX_VBR) thresh = 2.1f;
+ if (cpi->use_svc && cpi->svc.number_spatial_layers > 1) {
+ const int aligned_width = ALIGN_POWER_OF_TWO(src_width, MI_SIZE_LOG2);
+ const int aligned_height = ALIGN_POWER_OF_TWO(src_height, MI_SIZE_LOG2);
+ num_mi_cols = aligned_width >> MI_SIZE_LOG2;
+ num_mi_rows = aligned_height >> MI_SIZE_LOG2;
+ }
+ if (cpi->oxcf.lag_in_frames > 0) {
+ frames_to_buffer = (cm->current_video_frame == 1)
+ ? (int)vp9_lookahead_depth(cpi->lookahead) - 1
+ : 2;
+ start_frame = (int)vp9_lookahead_depth(cpi->lookahead) - 1;
+ for (frame = 0; frame < frames_to_buffer; ++frame) {
+ const int lagframe_idx = start_frame - frame;
+ if (lagframe_idx >= 0) {
+ struct lookahead_entry *buf =
+ vp9_lookahead_peek(cpi->lookahead, lagframe_idx);
+ frames[frame] = &buf->img;
+ }
+ }
+      // The avg_sad for the current frame is the value of frame #1
+      // (the first future frame) computed at the previous frame.
+ avg_sad_current = rc->avg_source_sad[1];
+ if (avg_sad_current >
+ VPXMAX(min_thresh,
+ (unsigned int)(rc->avg_source_sad[0] * thresh)) &&
+ cm->current_video_frame > (unsigned int)cpi->oxcf.lag_in_frames)
+ rc->high_source_sad = 1;
+ else
+ rc->high_source_sad = 0;
+ if (rc->high_source_sad && avg_sad_current > thresh_key)
+ scene_cut_force_key_frame = 1;
+ // Update recursive average for current frame.
+ if (avg_sad_current > 0)
+ rc->avg_source_sad[0] =
+ (3 * rc->avg_source_sad[0] + avg_sad_current) >> 2;
+ // Shift back data, starting at frame#1.
+ for (frame = 1; frame < cpi->oxcf.lag_in_frames - 1; ++frame)
+ rc->avg_source_sad[frame] = rc->avg_source_sad[frame + 1];
+ }
+ for (frame = 0; frame < frames_to_buffer; ++frame) {
+ if (cpi->oxcf.lag_in_frames == 0 ||
+ (frames[frame] != NULL && frames[frame + 1] != NULL &&
+ frames[frame]->y_width == frames[frame + 1]->y_width &&
+ frames[frame]->y_height == frames[frame + 1]->y_height)) {
+ int sbi_row, sbi_col;
+ const int lagframe_idx =
+ (cpi->oxcf.lag_in_frames == 0) ? 0 : start_frame - frame + 1;
+ const BLOCK_SIZE bsize = BLOCK_64X64;
+ // Loop over sub-sample of frame, compute average sad over 64x64 blocks.
+ uint64_t avg_sad = 0;
+ uint64_t tmp_sad = 0;
+ int num_samples = 0;
+ int sb_cols = (num_mi_cols + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE;
+ int sb_rows = (num_mi_rows + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE;
+ if (cpi->oxcf.lag_in_frames > 0) {
+ src_y = frames[frame]->y_buffer;
+ src_ystride = frames[frame]->y_stride;
+ last_src_y = frames[frame + 1]->y_buffer;
+ last_src_ystride = frames[frame + 1]->y_stride;
+ }
+ num_zero_temp_sad = 0;
+ for (sbi_row = 0; sbi_row < sb_rows; ++sbi_row) {
+ for (sbi_col = 0; sbi_col < sb_cols; ++sbi_col) {
+ // Checker-board pattern, ignore boundary.
+ if (((sbi_row > 0 && sbi_col > 0) &&
+ (sbi_row < sb_rows - 1 && sbi_col < sb_cols - 1) &&
+ ((sbi_row % 2 == 0 && sbi_col % 2 == 0) ||
+ (sbi_row % 2 != 0 && sbi_col % 2 != 0)))) {
+ tmp_sad = cpi->fn_ptr[bsize].sdf(src_y, src_ystride, last_src_y,
+ last_src_ystride);
+ avg_sad += tmp_sad;
+ num_samples++;
+ if (tmp_sad == 0) num_zero_temp_sad++;
+ }
+ src_y += 64;
+ last_src_y += 64;
+ }
+ src_y += (src_ystride << 6) - (sb_cols << 6);
+ last_src_y += (last_src_ystride << 6) - (sb_cols << 6);
+ }
+ if (num_samples > 0) avg_sad = avg_sad / num_samples;
+        // Set the high_source_sad flag if we detect a very high increase in
+        // avg_sad between the current and previous frame value(s). Use a
+        // minimum threshold for cases where there is only a small change from
+        // content that is completely static.
+ if (lagframe_idx == 0) {
+ if (avg_sad >
+ VPXMAX(min_thresh,
+ (unsigned int)(rc->avg_source_sad[0] * thresh)) &&
+ rc->frames_since_key > 1 + cpi->svc.number_spatial_layers &&
+ num_zero_temp_sad < 3 * (num_samples >> 2))
+ rc->high_source_sad = 1;
+ else
+ rc->high_source_sad = 0;
+ if (rc->high_source_sad && avg_sad > thresh_key)
+ scene_cut_force_key_frame = 1;
+ if (avg_sad > 0 || cpi->oxcf.rc_mode == VPX_CBR)
+ rc->avg_source_sad[0] = (3 * rc->avg_source_sad[0] + avg_sad) >> 2;
+ } else {
+ rc->avg_source_sad[lagframe_idx] = avg_sad;
+ }
+ if (num_zero_temp_sad < (3 * num_samples >> 2))
+ rc->high_num_blocks_with_motion = 1;
+ }
+ }
+ // For CBR non-screen content mode, check if we should reset the rate
+ // control. Reset is done if high_source_sad is detected and the rate
+ // control is at very low QP with rate correction factor at min level.
+ if (cpi->oxcf.rc_mode == VPX_CBR &&
+ cpi->oxcf.content != VP9E_CONTENT_SCREEN && !cpi->use_svc) {
+ if (rc->high_source_sad && rc->last_q[INTER_FRAME] == rc->best_quality &&
+ rc->avg_frame_qindex[INTER_FRAME] < (rc->best_quality << 1) &&
+ rc->rate_correction_factors[INTER_NORMAL] == MIN_BPB_FACTOR) {
+ rc->rate_correction_factors[INTER_NORMAL] = 0.5;
+ rc->avg_frame_qindex[INTER_FRAME] = rc->worst_quality;
+ rc->buffer_level = rc->optimal_buffer_level;
+ rc->bits_off_target = rc->optimal_buffer_level;
+ rc->reset_high_source_sad = 1;
+ }
+ if (cm->frame_type != KEY_FRAME && rc->reset_high_source_sad)
+ rc->this_frame_target = rc->avg_frame_bandwidth;
+ }
+ // For SVC the new (updated) avg_source_sad[0] for the current superframe
+ // updates the setting for all layers.
+ if (cpi->use_svc) {
+ int sl, tl;
+ SVC *const svc = &cpi->svc;
+ for (sl = 0; sl < svc->number_spatial_layers; ++sl)
+ for (tl = 0; tl < svc->number_temporal_layers; ++tl) {
+ int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
+ LAYER_CONTEXT *const lc = &svc->layer_context[layer];
+ RATE_CONTROL *const lrc = &lc->rc;
+ lrc->avg_source_sad[0] = rc->avg_source_sad[0];
+ }
+ }
+ // For VBR, under scene change/high content change, force golden refresh.
+ if (cpi->oxcf.rc_mode == VPX_VBR && cm->frame_type != KEY_FRAME &&
+ rc->high_source_sad && rc->frames_to_key > 3 &&
+ rc->count_last_scene_change > 4 &&
+ cpi->ext_refresh_frame_flags_pending == 0) {
+ int target;
+ cpi->refresh_golden_frame = 1;
+ if (scene_cut_force_key_frame) cm->frame_type = KEY_FRAME;
+ rc->source_alt_ref_pending = 0;
+ if (cpi->sf.use_altref_onepass && cpi->oxcf.enable_auto_arf)
+ rc->source_alt_ref_pending = 1;
+ rc->gfu_boost = DEFAULT_GF_BOOST >> 1;
+ rc->baseline_gf_interval =
+ VPXMIN(20, VPXMAX(10, rc->baseline_gf_interval));
+ adjust_gfint_frame_constraint(cpi, rc->frames_to_key);
+ rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+ target = vp9_calc_pframe_target_size_one_pass_vbr(cpi);
+ vp9_rc_set_frame_target(cpi, target);
+ rc->count_last_scene_change = 0;
+ } else {
+ rc->count_last_scene_change++;
+ }
+    // If lag_in_frames is used, set the gf boost and interval.
+ if (cpi->oxcf.lag_in_frames > 0)
+ adjust_gf_boost_lag_one_pass_vbr(cpi, avg_sad_current);
+ }
+}
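
The scene-detection logic above keeps a slowly updated running average of the per-frame SAD and flags a scene change when the current average is well above both that running average and an absolute floor. A standalone sketch of the core test and update (not libvpx code; thresholds are assumed example parameters):

// Returns 1 if the current average SAD looks like a scene change, and folds
// it into the running average with the same (3 * old + new) / 4 recursion
// used in the code above.
static int test_and_update_scene_change(unsigned long long *running_avg,
                                        unsigned long long avg_sad,
                                        unsigned long long min_thresh,
                                        double thresh) {
  const unsigned long long adaptive_thresh =
      (unsigned long long)((double)*running_avg * thresh);
  const int high_source_sad =
      avg_sad > min_thresh && avg_sad > adaptive_thresh;
  *running_avg = (3 * *running_avg + avg_sad) >> 2;
  return high_source_sad;
}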
+
+// Test if encoded frame will significantly overshoot the target bitrate, and
+// if so, set the QP, reset/adjust some rate control parameters, and return 1.
+// frame_size = -1 means frame has not been encoded.
+int vp9_encodedframe_overshoot(VP9_COMP *cpi, int frame_size, int *q) {
+ VP9_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ SPEED_FEATURES *const sf = &cpi->sf;
+ int thresh_qp = 7 * (rc->worst_quality >> 3);
+ int thresh_rate = rc->avg_frame_bandwidth << 3;
+  // Lower thresh_qp for non-screen (video) content, which tends to overshoot
+  // more at lower Q, to be more conservative.
+ if (cpi->oxcf.content != VP9E_CONTENT_SCREEN)
+ thresh_qp = 3 * (rc->worst_quality >> 2);
+  // If this decision is not based on an encoded frame size but just on
+  // scene/slide change detection (i.e., overshoot_detection_cbr_rt ==
+  // FAST_DETECTION_MAXQ), skip the (frame_size > thresh_rate) condition
+  // for now.
+  // TODO(marpan): Use a better size/rate condition for this case and
+  // adjust thresholds.
+ if ((sf->overshoot_detection_cbr_rt == FAST_DETECTION_MAXQ ||
+ frame_size > thresh_rate) &&
+ cm->base_qindex < thresh_qp) {
+ double rate_correction_factor =
+ cpi->rc.rate_correction_factors[INTER_NORMAL];
+ const int target_size = cpi->rc.avg_frame_bandwidth;
+ double new_correction_factor;
+ int target_bits_per_mb;
+ double q2;
+ int enumerator;
+ // Force a re-encode, and for now use max-QP.
+ *q = cpi->rc.worst_quality;
+ cpi->cyclic_refresh->counter_encode_maxq_scene_change = 0;
+ cpi->rc.re_encode_maxq_scene_change = 1;
+    // If the frame_size is much larger than the threshold (big content
+    // change) and the encoded frame used a lot of Intra modes, then force
+    // hybrid_intra encoding for the re-encode on this scene change.
+    // hybrid_intra will use rd-based intra mode selection for small blocks.
+ if (sf->overshoot_detection_cbr_rt == RE_ENCODE_MAXQ &&
+ frame_size > (thresh_rate << 1) && cpi->svc.spatial_layer_id == 0) {
+ MODE_INFO **mi = cm->mi_grid_visible;
+ int sum_intra_usage = 0;
+ int mi_row, mi_col;
+ for (mi_row = 0; mi_row < cm->mi_rows; mi_row++) {
+ for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) {
+ if (mi[0]->ref_frame[0] == INTRA_FRAME) sum_intra_usage++;
+ mi++;
+ }
+ mi += 8;
+ }
+ sum_intra_usage = 100 * sum_intra_usage / (cm->mi_rows * cm->mi_cols);
+ if (sum_intra_usage > 60) cpi->rc.hybrid_intra_scene_change = 1;
+ }
+    // Adjust avg_frame_qindex, buffer_level, and rate correction factors, as
+    // these parameters will affect QP selection for subsequent frames. If they
+    // have settled down to a very different (low QP) state, then not adjusting
+    // them may cause the next frame to select a low QP and overshoot again.
+ cpi->rc.avg_frame_qindex[INTER_FRAME] = *q;
+ rc->buffer_level = rc->optimal_buffer_level;
+ rc->bits_off_target = rc->optimal_buffer_level;
+ // Reset rate under/over-shoot flags.
+ cpi->rc.rc_1_frame = 0;
+ cpi->rc.rc_2_frame = 0;
+ // Adjust rate correction factor.
+ target_bits_per_mb =
+ (int)(((uint64_t)target_size << BPER_MB_NORMBITS) / cm->MBs);
+ // Rate correction factor based on target_bits_per_mb and qp (==max_QP).
+ // This comes from the inverse computation of vp9_rc_bits_per_mb().
+ q2 = vp9_convert_qindex_to_q(*q, cm->bit_depth);
+ enumerator = 1800000; // Factor for inter frame.
+ enumerator += (int)(enumerator * q2) >> 12;
+ new_correction_factor = (double)target_bits_per_mb * q2 / enumerator;
+ if (new_correction_factor > rate_correction_factor) {
+ rate_correction_factor =
+ VPXMIN(2.0 * rate_correction_factor, new_correction_factor);
+ if (rate_correction_factor > MAX_BPB_FACTOR)
+ rate_correction_factor = MAX_BPB_FACTOR;
+ cpi->rc.rate_correction_factors[INTER_NORMAL] = rate_correction_factor;
+ }
+    // For temporal layers, reset the rate control parameters across all
+    // temporal layers. If first_spatial_layer_to_encode > 0, then this
+    // superframe has skipped lower base layers. So in this case we should also
+    // reset and force max-q for spatial layers < first_spatial_layer_to_encode.
+ if (cpi->use_svc) {
+ int tl = 0;
+ int sl = 0;
+ SVC *svc = &cpi->svc;
+ for (sl = 0; sl < VPXMAX(1, svc->first_spatial_layer_to_encode); ++sl) {
+ for (tl = 0; tl < svc->number_temporal_layers; ++tl) {
+ const int layer =
+ LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
+ LAYER_CONTEXT *lc = &svc->layer_context[layer];
+ RATE_CONTROL *lrc = &lc->rc;
+ lrc->avg_frame_qindex[INTER_FRAME] = *q;
+ lrc->buffer_level = lrc->optimal_buffer_level;
+ lrc->bits_off_target = lrc->optimal_buffer_level;
+ lrc->rc_1_frame = 0;
+ lrc->rc_2_frame = 0;
+ lrc->rate_correction_factors[INTER_NORMAL] = rate_correction_factor;
+ lrc->force_max_q = 1;
+ }
+ }
+ }
+ return 1;
+ } else {
+ return 0;
+ }
+}
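
The correction-factor update above inverts the bits-per-MB model: given the per-MB target and the real q at the forced max QP, it solves for the rate correction factor that would have produced that rate. A standalone numeric sketch (assumed example values, not part of the patch):

#include <stdio.h>

int main(void) {
  const double q = 56.0;               // assumed real q at the forced max QP
  const int target_bits_per_mb = 150;  // assumed per-MB bit budget
  int enumerator = 1800000;            // inter-frame constant from the code
  double correction;
  enumerator += (int)(enumerator * q) >> 12;
  correction = (double)target_bits_per_mb * q / enumerator;
  printf("implied rate correction factor = %.4f\n", correction);
  return 0;
}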
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.h b/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.h
new file mode 100644
index 0000000000..96a8fd3f1d
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_ratectrl.h
@@ -0,0 +1,357 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_VP9_RATECTRL_H_
+#define VPX_VP9_ENCODER_VP9_RATECTRL_H_
+
+#include "vpx/vpx_codec.h"
+#include "vpx/vpx_integer.h"
+
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/encoder/vp9_lookahead.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Used to control aggressive VBR mode.
+// #define AGGRESSIVE_VBR 1
+
+// Bits Per MB at different Q (Multiplied by 512)
+#define BPER_MB_NORMBITS 9
+
+#define DEFAULT_KF_BOOST 2000
+#define DEFAULT_GF_BOOST 2000
+
+#define MIN_GF_INTERVAL 4
+#define MAX_GF_INTERVAL 16
+#define FIXED_GF_INTERVAL 8 // Used in some testing modes only
+#define ONEHALFONLY_RESIZE 0
+
+#define FRAME_OVERHEAD_BITS 200
+
+// Threshold used to define a KF group as static (e.g. a slide show).
+// Essentially this means that no frame in the group has more than 1% of MBs
+// that are not marked as coded with 0,0 motion in the first pass.
+#define STATIC_KF_GROUP_THRESH 99
+
+// The maximum duration of a GF group that is static (for example a slide show).
+#define MAX_STATIC_GF_GROUP_LENGTH 250
+
+typedef enum {
+ INTER_NORMAL = 0,
+ INTER_HIGH = 1,
+ GF_ARF_LOW = 2,
+ GF_ARF_STD = 3,
+ KF_STD = 4,
+ RATE_FACTOR_LEVELS = 5
+} RATE_FACTOR_LEVEL;
+
+// Internal frame scaling level.
+typedef enum {
+ UNSCALED = 0, // Frame is unscaled.
+ SCALE_STEP1 = 1, // First-level down-scaling.
+ FRAME_SCALE_STEPS
+} FRAME_SCALE_LEVEL;
+
+typedef enum {
+ NO_RESIZE = 0,
+ DOWN_THREEFOUR = 1, // From orig to 3/4.
+ DOWN_ONEHALF = 2, // From orig or 3/4 to 1/2.
+ UP_THREEFOUR = -1, // From 1/2 to 3/4.
+ UP_ORIG = -2, // From 1/2 or 3/4 to orig.
+} RESIZE_ACTION;
+
+typedef enum { ORIG = 0, THREE_QUARTER = 1, ONE_HALF = 2 } RESIZE_STATE;
+
+// Frame dimensions multiplier wrt the native frame size, in 1/16ths,
+// specified for the scale-up case.
+// e.g. 24 => 16/24 = 2/3 of native size. The restriction to 1/16th is
+// intended to match the capabilities of the normative scaling filters,
+// giving precedence to the up-scaling accuracy.
+static const int frame_scale_factor[FRAME_SCALE_STEPS] = { 16, 24 };
+
+// Multiplier of the target rate to be used as threshold for triggering scaling.
+static const double rate_thresh_mult[FRAME_SCALE_STEPS] = { 1.0, 2.0 };
+
+// Scale dependent Rate Correction Factor multipliers. Compensates for the
+// greater number of bits per pixel generated in down-scaled frames.
+static const double rcf_mult[FRAME_SCALE_STEPS] = { 1.0, 2.0 };
+
+typedef struct {
+ // Rate targeting variables
+ int base_frame_target; // A baseline frame target before adjustment
+ // for previous under or over shoot.
+ int this_frame_target; // Actual frame target after rc adjustment.
+ int projected_frame_size;
+ int sb64_target_rate;
+ int last_q[FRAME_TYPES]; // Separate values for Intra/Inter
+ int last_boosted_qindex; // Last boosted GF/KF/ARF q
+ int last_kf_qindex; // Q index of the last key frame coded.
+
+ int gfu_boost;
+ int last_boost;
+ int kf_boost;
+
+ double rate_correction_factors[RATE_FACTOR_LEVELS];
+
+ int frames_since_golden;
+ int frames_till_gf_update_due;
+ int min_gf_interval;
+ int max_gf_interval;
+ int static_scene_max_gf_interval;
+ int baseline_gf_interval;
+ int constrained_gf_group;
+ int frames_to_key;
+ int frames_since_key;
+ int this_key_frame_forced;
+ int next_key_frame_forced;
+ int source_alt_ref_pending;
+ int source_alt_ref_active;
+ int is_src_frame_alt_ref;
+
+ int avg_frame_bandwidth; // Average frame size target for clip
+ int min_frame_bandwidth; // Minimum allocation used for any frame
+ int max_frame_bandwidth; // Maximum burst rate allowed for a frame.
+
+ int ni_av_qi;
+ int ni_tot_qi;
+ int ni_frames;
+ int avg_frame_qindex[FRAME_TYPES];
+ double tot_q;
+ double avg_q;
+
+ int64_t buffer_level;
+ int64_t bits_off_target;
+ int64_t vbr_bits_off_target;
+ int64_t vbr_bits_off_target_fast;
+
+ int decimation_factor;
+ int decimation_count;
+
+ int rolling_target_bits;
+ int rolling_actual_bits;
+
+ int long_rolling_target_bits;
+ int long_rolling_actual_bits;
+
+ int rate_error_estimate;
+
+ int64_t total_actual_bits;
+ int64_t total_target_bits;
+ int64_t total_target_vs_actual;
+
+ int worst_quality;
+ int best_quality;
+
+ int64_t starting_buffer_level;
+ int64_t optimal_buffer_level;
+ int64_t maximum_buffer_size;
+
+  // Rate control history for the last frame (1) and the frame before (2).
+  // -1: undershoot
+  //  1: overshoot
+  //  0: not initialized.
+ int rc_1_frame;
+ int rc_2_frame;
+ int q_1_frame;
+ int q_2_frame;
+ // Keep track of the last target average frame bandwidth.
+ int last_avg_frame_bandwidth;
+
+ // Auto frame-scaling variables.
+ FRAME_SCALE_LEVEL frame_size_selector;
+ FRAME_SCALE_LEVEL next_frame_size_selector;
+ int frame_width[FRAME_SCALE_STEPS];
+ int frame_height[FRAME_SCALE_STEPS];
+ int rf_level_maxq[RATE_FACTOR_LEVELS];
+
+ int fac_active_worst_inter;
+ int fac_active_worst_gf;
+ uint64_t avg_source_sad[MAX_LAG_BUFFERS];
+ uint64_t prev_avg_source_sad_lag;
+ int high_source_sad_lagindex;
+ int high_num_blocks_with_motion;
+ int alt_ref_gf_group;
+ int last_frame_is_src_altref;
+ int high_source_sad;
+ int count_last_scene_change;
+ int hybrid_intra_scene_change;
+ int re_encode_maxq_scene_change;
+ int avg_frame_low_motion;
+ int af_ratio_onepass_vbr;
+ int force_qpmin;
+ int reset_high_source_sad;
+ double perc_arf_usage;
+ int force_max_q;
+ // Last frame was dropped post encode on scene change.
+ int last_post_encode_dropped_scene_change;
+ // Enable post encode frame dropping for screen content. Only enabled when
+ // ext_use_post_encode_drop is enabled by user.
+ int use_post_encode_drop;
+ // External flag to enable post encode frame dropping, controlled by user.
+ int ext_use_post_encode_drop;
+ // Flag to disable CBR feature to increase Q on overshoot detection.
+ int disable_overshoot_maxq_cbr;
+ int damped_adjustment[RATE_FACTOR_LEVELS];
+ double arf_active_best_quality_adjustment_factor;
+ int arf_increase_active_best_quality;
+
+ int preserve_arf_as_gld;
+ int preserve_next_arf_as_gld;
+ int show_arf_as_gld;
+
+ // Flag to constrain golden frame interval on key frame frequency for 1 pass
+ // VBR.
+ int constrain_gf_key_freq_onepass_vbr;
+
+ // The index of the current GOP. Start from zero.
+ // When a key frame is inserted, it resets to zero.
+ int gop_global_index;
+} RATE_CONTROL;
+
+struct VP9_COMP;
+struct VP9EncoderConfig;
+
+void vp9_rc_init(const struct VP9EncoderConfig *oxcf, int pass,
+ RATE_CONTROL *rc);
+
+int vp9_estimate_bits_at_q(FRAME_TYPE frame_type, int q, int mbs,
+ double correction_factor, vpx_bit_depth_t bit_depth);
+
+double vp9_convert_qindex_to_q(int qindex, vpx_bit_depth_t bit_depth);
+
+int vp9_convert_q_to_qindex(double q_val, vpx_bit_depth_t bit_depth);
+
+void vp9_rc_init_minq_luts(void);
+
+int vp9_rc_get_default_min_gf_interval(int width, int height, double framerate);
+// Note vp9_rc_get_default_max_gf_interval() requires the min_gf_interval to
+// be passed in to ensure that the max_gf_interval returned is at least as big
+// as that.
+int vp9_rc_get_default_max_gf_interval(double framerate, int min_gf_interval);
+
+// Generally at the high level, the following flow is expected
+// to be enforced for rate control:
+// First call per frame, one of:
+// vp9_rc_get_one_pass_vbr_params()
+// vp9_rc_get_one_pass_cbr_params()
+// vp9_rc_get_svc_params()
+// vp9_rc_get_first_pass_params()
+// vp9_rc_get_second_pass_params()
+// depending on the usage to set the rate control encode parameters desired.
+//
+// Then, call encode_frame_to_data_rate() to perform the
+// actual encode. This function will in turn call encode_frame()
+// one or more times, followed by one of:
+// vp9_rc_postencode_update()
+// vp9_rc_postencode_update_drop_frame()
+//
+// The majority of rate control parameters are only expected
+// to be set in the vp9_rc_get_..._params() functions and
+// updated during the vp9_rc_postencode_update...() functions.
+// The only exceptions are vp9_rc_drop_frame() and
+// vp9_rc_update_rate_correction_factors() functions.
+
+// Functions to set parameters for encoding before the actual
+// encode_frame_to_data_rate() function.
+void vp9_rc_get_one_pass_vbr_params(struct VP9_COMP *cpi);
+void vp9_rc_get_one_pass_cbr_params(struct VP9_COMP *cpi);
+int vp9_calc_pframe_target_size_one_pass_cbr(const struct VP9_COMP *cpi);
+int vp9_calc_iframe_target_size_one_pass_cbr(const struct VP9_COMP *cpi);
+int vp9_calc_pframe_target_size_one_pass_vbr(const struct VP9_COMP *cpi);
+int vp9_calc_iframe_target_size_one_pass_vbr(const struct VP9_COMP *cpi);
+void vp9_set_gf_update_one_pass_vbr(struct VP9_COMP *const cpi);
+void vp9_update_buffer_level_preencode(struct VP9_COMP *cpi);
+void vp9_rc_get_svc_params(struct VP9_COMP *cpi);
+
+// Post encode update of the rate control parameters based
+// on bytes used
+void vp9_rc_postencode_update(struct VP9_COMP *cpi, uint64_t bytes_used);
+// Post encode update of the rate control parameters for dropped frames
+void vp9_rc_postencode_update_drop_frame(struct VP9_COMP *cpi);
+
+// Updates rate correction factors
+// Changes only the rate correction factors in the rate control structure.
+void vp9_rc_update_rate_correction_factors(struct VP9_COMP *cpi);
+
+// Post encode drop for CBR mode.
+int post_encode_drop_cbr(struct VP9_COMP *cpi, size_t *size);
+
+int vp9_test_drop(struct VP9_COMP *cpi);
+
+// Decide if we should drop this frame: For 1-pass CBR.
+// Changes only the decimation count in the rate control structure
+int vp9_rc_drop_frame(struct VP9_COMP *cpi);
+
+// Computes frame size bounds.
+void vp9_rc_compute_frame_size_bounds(const struct VP9_COMP *cpi,
+ int frame_target,
+ int *frame_under_shoot_limit,
+ int *frame_over_shoot_limit);
+
+// Picks q and q bounds given the target for bits
+int vp9_rc_pick_q_and_bounds(const struct VP9_COMP *cpi, int *bottom_index,
+ int *top_index);
+
+// Estimates q to achieve a target bits per frame
+int vp9_rc_regulate_q(const struct VP9_COMP *cpi, int target_bits_per_frame,
+ int active_best_quality, int active_worst_quality);
+
+// Estimates bits per mb for a given qindex and correction factor.
+int vp9_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex,
+ double correction_factor, vpx_bit_depth_t bit_depth);
+
+// Clamping utilities for bitrate targets for iframes and pframes.
+int vp9_rc_clamp_iframe_target_size(const struct VP9_COMP *const cpi,
+ int target);
+int vp9_rc_clamp_pframe_target_size(const struct VP9_COMP *const cpi,
+ int target);
+// Utility to set frame_target into the RATE_CONTROL structure
+// This function is called only from the vp9_rc_get_..._params() functions.
+void vp9_rc_set_frame_target(struct VP9_COMP *cpi, int target);
+
+// Computes a q delta (in "q index" terms) to get from a starting q value
+// to a target q value
+int vp9_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget,
+ vpx_bit_depth_t bit_depth);
+
+// Computes a q delta (in "q index" terms) to get from a starting q value
+// to a value that should equate to the given rate ratio.
+int vp9_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type,
+ int qindex, double rate_target_ratio,
+ vpx_bit_depth_t bit_depth);
+
+int vp9_frame_type_qdelta(const struct VP9_COMP *cpi, int rf_level, int q);
+
+void vp9_rc_update_framerate(struct VP9_COMP *cpi);
+
+void vp9_rc_set_gf_interval_range(const struct VP9_COMP *const cpi,
+ RATE_CONTROL *const rc);
+
+void vp9_set_target_rate(struct VP9_COMP *cpi);
+
+int vp9_resize_one_pass_cbr(struct VP9_COMP *cpi);
+
+void vp9_scene_detection_onepass(struct VP9_COMP *cpi);
+
+int vp9_encodedframe_overshoot(struct VP9_COMP *cpi, int frame_size, int *q);
+
+void vp9_configure_buffer_updates(struct VP9_COMP *cpi, int gf_group_index);
+
+void vp9_estimate_qp_gop(struct VP9_COMP *cpi);
+
+void vp9_compute_frame_low_motion(struct VP9_COMP *const cpi);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP9_ENCODER_VP9_RATECTRL_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_rd.c b/media/libvpx/libvpx/vp9/encoder/vp9_rd.c
new file mode 100644
index 0000000000..95c95971c5
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_rd.c
@@ -0,0 +1,795 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "./vp9_rtcd.h"
+
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/bitops.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/system_state.h"
+
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_entropy.h"
+#include "vp9/common/vp9_entropymode.h"
+#include "vp9/common/vp9_mvref_common.h"
+#include "vp9/common/vp9_pred_common.h"
+#include "vp9/common/vp9_quant_common.h"
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/common/vp9_reconintra.h"
+#include "vp9/common/vp9_seg_common.h"
+
+#include "vp9/encoder/vp9_cost.h"
+#include "vp9/encoder/vp9_encodemb.h"
+#include "vp9/encoder/vp9_encodemv.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_mcomp.h"
+#include "vp9/encoder/vp9_quantize.h"
+#include "vp9/encoder/vp9_ratectrl.h"
+#include "vp9/encoder/vp9_rd.h"
+#include "vp9/encoder/vp9_tokenize.h"
+
+#define RD_THRESH_POW 1.25
+
+// Factor to weigh the rate for switchable interp filters.
+#define SWITCHABLE_INTERP_RATE_FACTOR 1
+
+void vp9_rd_cost_reset(RD_COST *rd_cost) {
+ rd_cost->rate = INT_MAX;
+ rd_cost->dist = INT64_MAX;
+ rd_cost->rdcost = INT64_MAX;
+}
+
+void vp9_rd_cost_init(RD_COST *rd_cost) {
+ rd_cost->rate = 0;
+ rd_cost->dist = 0;
+ rd_cost->rdcost = 0;
+}
+
+int64_t vp9_calculate_rd_cost(int mult, int div, int rate, int64_t dist) {
+ assert(mult >= 0);
+ assert(div > 0);
+ if (rate >= 0 && dist >= 0) {
+ return RDCOST(mult, div, rate, dist);
+ }
+ if (rate >= 0 && dist < 0) {
+ return RDCOST_NEG_D(mult, div, rate, -dist);
+ }
+ if (rate < 0 && dist >= 0) {
+ return RDCOST_NEG_R(mult, div, -rate, dist);
+ }
+ return -RDCOST(mult, div, -rate, -dist);
+}
+
+void vp9_rd_cost_update(int mult, int div, RD_COST *rd_cost) {
+ if (rd_cost->rate < INT_MAX && rd_cost->dist < INT64_MAX) {
+ rd_cost->rdcost =
+ vp9_calculate_rd_cost(mult, div, rd_cost->rate, rd_cost->dist);
+ } else {
+ vp9_rd_cost_reset(rd_cost);
+ }
+}
+
+// The baseline rd thresholds for breaking out of the rd loop for
+// certain modes are assumed to be based on 8x8 blocks.
+// This table is used to correct for block size.
+// The factors here are << 2 (2 = x0.5, 32 = x8 etc).
+static const uint8_t rd_thresh_block_size_factor[BLOCK_SIZES] = {
+ 2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32
+};
+
+static void fill_mode_costs(VP9_COMP *cpi) {
+ const FRAME_CONTEXT *const fc = cpi->common.fc;
+ int i, j;
+
+ for (i = 0; i < INTRA_MODES; ++i) {
+ for (j = 0; j < INTRA_MODES; ++j) {
+ vp9_cost_tokens(cpi->y_mode_costs[i][j], vp9_kf_y_mode_prob[i][j],
+ vp9_intra_mode_tree);
+ }
+ }
+
+ vp9_cost_tokens(cpi->mbmode_cost, fc->y_mode_prob[1], vp9_intra_mode_tree);
+ for (i = 0; i < INTRA_MODES; ++i) {
+ vp9_cost_tokens(cpi->intra_uv_mode_cost[KEY_FRAME][i],
+ vp9_kf_uv_mode_prob[i], vp9_intra_mode_tree);
+ vp9_cost_tokens(cpi->intra_uv_mode_cost[INTER_FRAME][i],
+ fc->uv_mode_prob[i], vp9_intra_mode_tree);
+ }
+
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) {
+ vp9_cost_tokens(cpi->switchable_interp_costs[i],
+ fc->switchable_interp_prob[i], vp9_switchable_interp_tree);
+ }
+
+ for (i = TX_8X8; i < TX_SIZES; ++i) {
+ for (j = 0; j < TX_SIZE_CONTEXTS; ++j) {
+ const vpx_prob *tx_probs = get_tx_probs(i, j, &fc->tx_probs);
+ int k;
+ for (k = 0; k <= i; ++k) {
+ int cost = 0;
+ int m;
+ for (m = 0; m <= k - (k == i); ++m) {
+ if (m == k)
+ cost += vp9_cost_zero(tx_probs[m]);
+ else
+ cost += vp9_cost_one(tx_probs[m]);
+ }
+ cpi->tx_size_cost[i - 1][j][k] = cost;
+ }
+ }
+ }
+}
+
+static void fill_token_costs(vp9_coeff_cost *c,
+ vp9_coeff_probs_model (*p)[PLANE_TYPES]) {
+ int i, j, k, l;
+ TX_SIZE t;
+ for (t = TX_4X4; t <= TX_32X32; ++t)
+ for (i = 0; i < PLANE_TYPES; ++i)
+ for (j = 0; j < REF_TYPES; ++j)
+ for (k = 0; k < COEF_BANDS; ++k)
+ for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
+ vpx_prob probs[ENTROPY_NODES];
+ vp9_model_to_full_probs(p[t][i][j][k][l], probs);
+ vp9_cost_tokens((int *)c[t][i][j][k][0][l], probs, vp9_coef_tree);
+ vp9_cost_tokens_skip((int *)c[t][i][j][k][1][l], probs,
+ vp9_coef_tree);
+ assert(c[t][i][j][k][0][l][EOB_TOKEN] ==
+ c[t][i][j][k][1][l][EOB_TOKEN]);
+ }
+}
+
+// Values are now correlated to quantizer.
+static int sad_per_bit16lut_8[QINDEX_RANGE];
+static int sad_per_bit4lut_8[QINDEX_RANGE];
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static int sad_per_bit16lut_10[QINDEX_RANGE];
+static int sad_per_bit4lut_10[QINDEX_RANGE];
+static int sad_per_bit16lut_12[QINDEX_RANGE];
+static int sad_per_bit4lut_12[QINDEX_RANGE];
+#endif
+
+static void init_me_luts_bd(int *bit16lut, int *bit4lut, int range,
+ vpx_bit_depth_t bit_depth) {
+ int i;
+ // Initialize the sad lut tables using a formulaic calculation for now.
+ // This is to make it easier to resolve the impact of experimental changes
+ // to the quantizer tables.
+ for (i = 0; i < range; i++) {
+ const double q = vp9_convert_qindex_to_q(i, bit_depth);
+ bit16lut[i] = (int)(0.0418 * q + 2.4107);
+ bit4lut[i] = (int)(0.063 * q + 2.742);
+ }
+}
+
+void vp9_init_me_luts(void) {
+ init_me_luts_bd(sad_per_bit16lut_8, sad_per_bit4lut_8, QINDEX_RANGE,
+ VPX_BITS_8);
+#if CONFIG_VP9_HIGHBITDEPTH
+ init_me_luts_bd(sad_per_bit16lut_10, sad_per_bit4lut_10, QINDEX_RANGE,
+ VPX_BITS_10);
+ init_me_luts_bd(sad_per_bit16lut_12, sad_per_bit4lut_12, QINDEX_RANGE,
+ VPX_BITS_12);
+#endif
+}
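
A worked example of the sad-per-bit LUT formulas above (standalone, not part of the patch; the real q of 60.0 is an assumed example):

#include <stdio.h>

int main(void) {
  const double q = 60.0;  // assumed real q value
  // Same linear fits as init_me_luts_bd(): prints sadperbit16=4 sadperbit4=6.
  printf("sadperbit16=%d sadperbit4=%d\n", (int)(0.0418 * q + 2.4107),
         (int)(0.063 * q + 2.742));
  return 0;
}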
+
+static const int rd_boost_factor[16] = { 64, 32, 32, 32, 24, 16, 12, 12,
+ 8, 8, 4, 4, 2, 2, 1, 0 };
+
+// Note that the element below for frame type "USE_BUF_FRAME", which indicates
+// that the show frame flag is set, should not be used as no real frame
+// is encoded so we should not reach here. However, a dummy value
+// is inserted here to make sure the data structure has the right number
+// of values assigned.
+static const int rd_frame_type_factor[FRAME_UPDATE_TYPES] = { 128, 144, 128,
+ 128, 144, 144 };
+
+// Configure Vizier RD parameters.
+// Later this function will use passed-in command line values.
+void vp9_init_rd_parameters(VP9_COMP *cpi) {
+ RD_CONTROL *const rdc = &cpi->rd_ctrl;
+
+  // When |use_vizier_rc_params| is 1, we expect the rd parameters to have been
+  // initialized by the passed-in values.
+  // Note that the parameters below only default to 1.0 when no values are
+  // passed in, so each parameter should be set explicitly when
+  // |use_vizier_rc_params| is used.
+ if (cpi->twopass.use_vizier_rc_params) return;
+
+ // Make sure this function is floating point safe.
+ vpx_clear_system_state();
+
+ rdc->rd_mult_inter_qp_fac = 1.0;
+ rdc->rd_mult_arf_qp_fac = 1.0;
+ rdc->rd_mult_key_qp_fac = 1.0;
+}
+
+// Returns the default rd multiplier for inter frames for a given qindex.
+// The function here is a first pass estimate based on data from
+// a previous Vizier run.
+static double def_inter_rd_multiplier(int qindex) {
+ return 4.15 + (0.001 * (double)qindex);
+}
+
+// Returns the default rd multiplier for ARF/Golden Frames for a given qindex.
+// The function here is a first pass estimate based on data from
+// a previous Vizier run.
+static double def_arf_rd_multiplier(int qindex) {
+ return 4.25 + (0.001 * (double)qindex);
+}
+
+// Returns the default rd multiplier for key frames for a given qindex.
+// The function here is a first pass estimate based on data from
+// a previous Vizier run.
+static double def_kf_rd_multiplier(int qindex) {
+ return 4.35 + (0.001 * (double)qindex);
+}
+
+int vp9_compute_rd_mult_based_on_qindex(const VP9_COMP *cpi, int qindex) {
+ const RD_CONTROL *rdc = &cpi->rd_ctrl;
+ const int q = vp9_dc_quant(qindex, 0, cpi->common.bit_depth);
+ // largest dc_quant is 21387, therefore rdmult should fit in int32_t
+ int rdmult = q * q;
+
+ if (cpi->ext_ratectrl.ready &&
+ (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_RDMULT) != 0 &&
+ cpi->ext_ratectrl.ext_rdmult != VPX_DEFAULT_RDMULT) {
+ return cpi->ext_ratectrl.ext_rdmult;
+ }
+
+ // Make sure this function is floating point safe.
+ vpx_clear_system_state();
+
+ if (cpi->common.frame_type == KEY_FRAME) {
+ double def_rd_q_mult = def_kf_rd_multiplier(qindex);
+ rdmult = (int)((double)rdmult * def_rd_q_mult * rdc->rd_mult_key_qp_fac);
+ } else if (!cpi->rc.is_src_frame_alt_ref &&
+ (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
+ double def_rd_q_mult = def_arf_rd_multiplier(qindex);
+ rdmult = (int)((double)rdmult * def_rd_q_mult * rdc->rd_mult_arf_qp_fac);
+ } else {
+ double def_rd_q_mult = def_inter_rd_multiplier(qindex);
+ rdmult = (int)((double)rdmult * def_rd_q_mult * rdc->rd_mult_inter_qp_fac);
+ }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ switch (cpi->common.bit_depth) {
+ case VPX_BITS_10: rdmult = ROUND_POWER_OF_TWO(rdmult, 4); break;
+ case VPX_BITS_12: rdmult = ROUND_POWER_OF_TWO(rdmult, 8); break;
+ default: break;
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ return rdmult > 0 ? rdmult : 1;
+}
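
The default rd multiplier above is the squared dc quantizer scaled by a small qindex-dependent factor (4.15/4.25/4.35 + 0.001 * qindex for inter, arf/golden and key frames respectively, before the per-frame Vizier factors). A standalone numeric sketch for an inter frame (the dc_quant value is an assumed example, not the vp9 table value):

#include <stdio.h>

int main(void) {
  const int qindex = 100;
  const int dc_quant = 200;  // assumed stand-in for vp9_dc_quant(qindex, 0, bd)
  const double mult = 4.15 + 0.001 * (double)qindex;  // def_inter_rd_multiplier
  const int rdmult = (int)((double)(dc_quant * dc_quant) * mult);
  printf("rdmult = %d\n", rdmult);  // 40000 * 4.25 = 170000
  return 0;
}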
+
+static int modulate_rdmult(const VP9_COMP *cpi, int rdmult) {
+ int64_t rdmult_64 = rdmult;
+ if (cpi->oxcf.pass == 2 && (cpi->common.frame_type != KEY_FRAME)) {
+ const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+ const FRAME_UPDATE_TYPE frame_type = gf_group->update_type[gf_group->index];
+ const int gfu_boost = cpi->multi_layer_arf
+ ? gf_group->gfu_boost[gf_group->index]
+ : cpi->rc.gfu_boost;
+ const int boost_index = VPXMIN(15, (gfu_boost / 100));
+
+ rdmult_64 = (rdmult_64 * rd_frame_type_factor[frame_type]) >> 7;
+ rdmult_64 += ((rdmult_64 * rd_boost_factor[boost_index]) >> 7);
+ }
+ return (int)rdmult_64;
+}
+
+int vp9_compute_rd_mult(const VP9_COMP *cpi, int qindex) {
+ int rdmult = vp9_compute_rd_mult_based_on_qindex(cpi, qindex);
+ if (cpi->ext_ratectrl.ready &&
+ (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_RDMULT) != 0 &&
+ cpi->ext_ratectrl.ext_rdmult != VPX_DEFAULT_RDMULT) {
+ return cpi->ext_ratectrl.ext_rdmult;
+ }
+ return modulate_rdmult(cpi, rdmult);
+}
+
+int vp9_get_adaptive_rdmult(const VP9_COMP *cpi, double beta) {
+ int rdmult =
+ vp9_compute_rd_mult_based_on_qindex(cpi, cpi->common.base_qindex);
+ rdmult = (int)((double)rdmult / beta);
+ rdmult = rdmult > 0 ? rdmult : 1;
+ return modulate_rdmult(cpi, rdmult);
+}
+
+static int compute_rd_thresh_factor(int qindex, vpx_bit_depth_t bit_depth) {
+ double q;
+#if CONFIG_VP9_HIGHBITDEPTH
+ switch (bit_depth) {
+ case VPX_BITS_8: q = vp9_dc_quant(qindex, 0, VPX_BITS_8) / 4.0; break;
+ case VPX_BITS_10: q = vp9_dc_quant(qindex, 0, VPX_BITS_10) / 16.0; break;
+ default:
+ assert(bit_depth == VPX_BITS_12);
+ q = vp9_dc_quant(qindex, 0, VPX_BITS_12) / 64.0;
+ break;
+ }
+#else
+ (void)bit_depth;
+ q = vp9_dc_quant(qindex, 0, VPX_BITS_8) / 4.0;
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ // TODO(debargha): Adjust the function below.
+ return VPXMAX((int)(pow(q, RD_THRESH_POW) * 5.12), 8);
+}
+
+void vp9_initialize_me_consts(VP9_COMP *cpi, MACROBLOCK *x, int qindex) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ switch (cpi->common.bit_depth) {
+ case VPX_BITS_8:
+ x->sadperbit16 = sad_per_bit16lut_8[qindex];
+ x->sadperbit4 = sad_per_bit4lut_8[qindex];
+ break;
+ case VPX_BITS_10:
+ x->sadperbit16 = sad_per_bit16lut_10[qindex];
+ x->sadperbit4 = sad_per_bit4lut_10[qindex];
+ break;
+ default:
+ assert(cpi->common.bit_depth == VPX_BITS_12);
+ x->sadperbit16 = sad_per_bit16lut_12[qindex];
+ x->sadperbit4 = sad_per_bit4lut_12[qindex];
+ break;
+ }
+#else
+ (void)cpi;
+ x->sadperbit16 = sad_per_bit16lut_8[qindex];
+ x->sadperbit4 = sad_per_bit4lut_8[qindex];
+#endif // CONFIG_VP9_HIGHBITDEPTH
+}
+
+static void set_block_thresholds(const VP9_COMMON *cm, RD_OPT *rd) {
+ int i, bsize, segment_id;
+
+ for (segment_id = 0; segment_id < MAX_SEGMENTS; ++segment_id) {
+ const int qindex =
+ clamp(vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex) +
+ cm->y_dc_delta_q,
+ 0, MAXQ);
+ const int q = compute_rd_thresh_factor(qindex, cm->bit_depth);
+
+ for (bsize = 0; bsize < BLOCK_SIZES; ++bsize) {
+ // The threshold here seems unnecessarily harsh but is fine given the
+ // actual range of values used for cpi->sf.thresh_mult[].
+ const int t = q * rd_thresh_block_size_factor[bsize];
+ const int thresh_max = INT_MAX / t;
+
+ if (bsize >= BLOCK_8X8) {
+ for (i = 0; i < MAX_MODES; ++i)
+ rd->threshes[segment_id][bsize][i] = rd->thresh_mult[i] < thresh_max
+ ? rd->thresh_mult[i] * t / 4
+ : INT_MAX;
+ } else {
+ for (i = 0; i < MAX_REFS; ++i)
+ rd->threshes[segment_id][bsize][i] =
+ rd->thresh_mult_sub8x8[i] < thresh_max
+ ? rd->thresh_mult_sub8x8[i] * t / 4
+ : INT_MAX;
+ }
+ }
+ }
+}
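+
+// Illustrative note: in set_block_thresholds() the guard thresh_max =
+// INT_MAX / t keeps the products above from overflowing; a finite threshold
+// (thresh_mult * t / 4) is only computed when thresh_mult < thresh_max,
+// otherwise the entry saturates to INT_MAX.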
+
+void vp9_build_inter_mode_cost(VP9_COMP *cpi) {
+ const VP9_COMMON *const cm = &cpi->common;
+ int i;
+ for (i = 0; i < INTER_MODE_CONTEXTS; ++i) {
+ vp9_cost_tokens((int *)cpi->inter_mode_cost[i], cm->fc->inter_mode_probs[i],
+ vp9_inter_mode_tree);
+ }
+}
+
+void vp9_initialize_rd_consts(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &cpi->td.mb;
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+ RD_OPT *const rd = &cpi->rd;
+ int i;
+
+ vpx_clear_system_state();
+
+ rd->RDDIV = RDDIV_BITS; // In bits (to multiply D by 128).
+ rd->RDMULT = vp9_compute_rd_mult(cpi, cm->base_qindex + cm->y_dc_delta_q);
+
+ set_error_per_bit(x, rd->RDMULT);
+
+ x->select_tx_size = (cpi->sf.tx_size_search_method == USE_LARGESTALL &&
+ cm->frame_type != KEY_FRAME)
+ ? 0
+ : 1;
+
+ set_block_thresholds(cm, rd);
+ set_partition_probs(cm, xd);
+
+ if (cpi->oxcf.pass == 1) {
+ if (!frame_is_intra_only(cm))
+ vp9_build_nmv_cost_table(
+ x->nmvjointcost,
+ cm->allow_high_precision_mv ? x->nmvcost_hp : x->nmvcost,
+ &cm->fc->nmvc, cm->allow_high_precision_mv);
+ } else {
+ if (!cpi->sf.use_nonrd_pick_mode || cm->frame_type == KEY_FRAME)
+ fill_token_costs(x->token_costs, cm->fc->coef_probs);
+
+ if (cpi->sf.partition_search_type != VAR_BASED_PARTITION ||
+ cm->frame_type == KEY_FRAME) {
+ for (i = 0; i < PARTITION_CONTEXTS; ++i)
+ vp9_cost_tokens(cpi->partition_cost[i], get_partition_probs(xd, i),
+ vp9_partition_tree);
+ }
+
+ if (!cpi->sf.use_nonrd_pick_mode || (cm->current_video_frame & 0x07) == 1 ||
+ cm->frame_type == KEY_FRAME) {
+ fill_mode_costs(cpi);
+
+ if (!frame_is_intra_only(cm)) {
+ vp9_build_nmv_cost_table(
+ x->nmvjointcost,
+ cm->allow_high_precision_mv ? x->nmvcost_hp : x->nmvcost,
+ &cm->fc->nmvc, cm->allow_high_precision_mv);
+ vp9_build_inter_mode_cost(cpi);
+ }
+ }
+ }
+}
+
+// NOTE: The tables below must be of the same size.
+
+// The functions described below are sampled at the four most significant
+// bits of (x^2 + 8 / 256).
+
+// Normalized rate:
+// This table models the rate for a Laplacian source with given variance
+// when quantized with a uniform quantizer with given stepsize. The
+// closed form expression is:
+// Rn(x) = H(sqrt(r)) + sqrt(r)*[1 + H(r)/(1 - r)],
+// where r = exp(-sqrt(2) * x) and x = qpstep / sqrt(variance),
+// and H(x) is the binary entropy function.
+static const int rate_tab_q10[] = {
+ 65536, 6086, 5574, 5275, 5063, 4899, 4764, 4651, 4553, 4389, 4255, 4142, 4044,
+ 3958, 3881, 3811, 3748, 3635, 3538, 3453, 3376, 3307, 3244, 3186, 3133, 3037,
+ 2952, 2877, 2809, 2747, 2690, 2638, 2589, 2501, 2423, 2353, 2290, 2232, 2179,
+ 2130, 2084, 2001, 1928, 1862, 1802, 1748, 1698, 1651, 1608, 1530, 1460, 1398,
+ 1342, 1290, 1243, 1199, 1159, 1086, 1021, 963, 911, 864, 821, 781, 745,
+ 680, 623, 574, 530, 490, 455, 424, 395, 345, 304, 269, 239, 213,
+ 190, 171, 154, 126, 104, 87, 73, 61, 52, 44, 38, 28, 21,
+ 16, 12, 10, 8, 6, 5, 3, 2, 1, 1, 1, 0, 0,
+};
+
+// Normalized distortion:
+// This table models the normalized distortion for a Laplacian source
+// with given variance when quantized with a uniform quantizer
+// with given stepsize. The closed form expression is:
+// Dn(x) = 1 - 1/sqrt(2) * x / sinh(x/sqrt(2))
+// where x = qpstep / sqrt(variance).
+// Note the actual distortion is Dn * variance.
+static const int dist_tab_q10[] = {
+ 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5, 5,
+ 6, 7, 7, 8, 9, 11, 12, 13, 15, 16, 17, 18, 21,
+ 24, 26, 29, 31, 34, 36, 39, 44, 49, 54, 59, 64, 69,
+ 73, 78, 88, 97, 106, 115, 124, 133, 142, 151, 167, 184, 200,
+ 215, 231, 245, 260, 274, 301, 327, 351, 375, 397, 418, 439, 458,
+ 495, 528, 559, 587, 613, 637, 659, 680, 717, 749, 777, 801, 823,
+ 842, 859, 874, 899, 919, 936, 949, 960, 969, 977, 983, 994, 1001,
+ 1006, 1010, 1013, 1015, 1017, 1018, 1020, 1022, 1022, 1023, 1023, 1023, 1024,
+};
+static const int xsq_iq_q10[] = {
+ 0, 4, 8, 12, 16, 20, 24, 28, 32,
+ 40, 48, 56, 64, 72, 80, 88, 96, 112,
+ 128, 144, 160, 176, 192, 208, 224, 256, 288,
+ 320, 352, 384, 416, 448, 480, 544, 608, 672,
+ 736, 800, 864, 928, 992, 1120, 1248, 1376, 1504,
+ 1632, 1760, 1888, 2016, 2272, 2528, 2784, 3040, 3296,
+ 3552, 3808, 4064, 4576, 5088, 5600, 6112, 6624, 7136,
+ 7648, 8160, 9184, 10208, 11232, 12256, 13280, 14304, 15328,
+ 16352, 18400, 20448, 22496, 24544, 26592, 28640, 30688, 32736,
+ 36832, 40928, 45024, 49120, 53216, 57312, 61408, 65504, 73696,
+ 81888, 90080, 98272, 106464, 114656, 122848, 131040, 147424, 163808,
+ 180192, 196576, 212960, 229344, 245728,
+};
+
+static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) {
+ const int tmp = (xsq_q10 >> 2) + 8;
+ const int k = get_msb(tmp) - 3;
+ const int xq = (k << 3) + ((tmp >> k) & 0x7);
+ const int one_q10 = 1 << 10;
+ const int a_q10 = ((xsq_q10 - xsq_iq_q10[xq]) << 10) >> (2 + k);
+ const int b_q10 = one_q10 - a_q10;
+ *r_q10 = (rate_tab_q10[xq] * b_q10 + rate_tab_q10[xq + 1] * a_q10) >> 10;
+ *d_q10 = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10;
+}
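+
+// Illustrative note: model_rd_norm() linearly interpolates between adjacent
+// table entries. The index xq is built from the position of tmp's most
+// significant bit (k) and the next three bits, xsq_iq_q10[] maps that index
+// back to its sample point, and a_q10/b_q10 are Q10 interpolation weights
+// summing to 1024; the shift by (2 + k) matches the 2^(k + 2) spacing
+// between consecutive sample points at that level.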
+
+static const uint32_t MAX_XSQ_Q10 = 245727;
+
+void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2,
+ unsigned int qstep, int *rate,
+ int64_t *dist) {
+ // This function models the rate and distortion for a Laplacian
+ // source with given variance when quantized with a uniform quantizer
+ // with given stepsize. The closed form expressions are in:
+ // Hang and Chen, "Source Model for transform video coder and its
+ // application - Part I: Fundamental Theory", IEEE Trans. Circ.
+ // Sys. for Video Tech., April 1997.
+ if (var == 0) {
+ *rate = 0;
+ *dist = 0;
+ } else {
+ int d_q10, r_q10;
+ const uint64_t xsq_q10_64 =
+ (((uint64_t)qstep * qstep << (n_log2 + 10)) + (var >> 1)) / var;
+ const int xsq_q10 = (int)VPXMIN(xsq_q10_64, MAX_XSQ_Q10);
+ model_rd_norm(xsq_q10, &r_q10, &d_q10);
+ *rate = ROUND_POWER_OF_TWO(r_q10 << n_log2, 10 - VP9_PROB_COST_SHIFT);
+ *dist = (var * (int64_t)d_q10 + 512) >> 10;
+ }
+}
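+
+// Illustrative note: xsq_q10 above is approximately qstep^2 * n * 1024 / var
+// (rounded) with n = 2^n_log2, i.e. x^2 in Q10 for a per-pixel variance of
+// var / n. The clamp to MAX_XSQ_Q10, one below the last xsq_iq_q10[] entry,
+// keeps the xq + 1 lookup in model_rd_norm() inside the tables.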
+
+// Disable gcc 12.2 false positive warning.
+// warning: writing 1 byte into a region of size 0 [-Wstringop-overflow=]
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wstringop-overflow"
+#endif
+void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size,
+ const struct macroblockd_plane *pd,
+ ENTROPY_CONTEXT t_above[16],
+ ENTROPY_CONTEXT t_left[16]) {
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
+ const ENTROPY_CONTEXT *const above = pd->above_context;
+ const ENTROPY_CONTEXT *const left = pd->left_context;
+
+ int i;
+ switch (tx_size) {
+ case TX_4X4:
+ memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w);
+ memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h);
+ break;
+ case TX_8X8:
+ for (i = 0; i < num_4x4_w; i += 2)
+ t_above[i] = !!*(const uint16_t *)&above[i];
+ for (i = 0; i < num_4x4_h; i += 2)
+ t_left[i] = !!*(const uint16_t *)&left[i];
+ break;
+ case TX_16X16:
+ for (i = 0; i < num_4x4_w; i += 4)
+ t_above[i] = !!*(const uint32_t *)&above[i];
+ for (i = 0; i < num_4x4_h; i += 4)
+ t_left[i] = !!*(const uint32_t *)&left[i];
+ break;
+ default:
+ assert(tx_size == TX_32X32);
+ for (i = 0; i < num_4x4_w; i += 8)
+ t_above[i] = !!*(const uint64_t *)&above[i];
+ for (i = 0; i < num_4x4_h; i += 8)
+ t_left[i] = !!*(const uint64_t *)&left[i];
+ break;
+ }
+}
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
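+
+// Illustrative note on vp9_get_entropy_contexts() above: for transform sizes
+// larger than 4x4 the per-4x4 contexts are collapsed, e.g. for TX_8X8 each
+// pair of above/left context bytes is read as a uint16_t and reduced with !!
+// to a single 0/1 flag, producing one entry per transform block.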
+
+void vp9_mv_pred(VP9_COMP *cpi, MACROBLOCK *x, uint8_t *ref_y_buffer,
+ int ref_y_stride, int ref_frame, BLOCK_SIZE block_size) {
+ int i;
+ int zero_seen = 0;
+ int best_index = 0;
+ int best_sad = INT_MAX;
+ int this_sad = INT_MAX;
+ int max_mv = 0;
+ int near_same_nearest;
+ uint8_t *src_y_ptr = x->plane[0].src.buf;
+ uint8_t *ref_y_ptr;
+ const int num_mv_refs =
+ MAX_MV_REF_CANDIDATES + (block_size < x->max_partition_size);
+
+ MV pred_mv[3];
+ pred_mv[0] = x->mbmi_ext->ref_mvs[ref_frame][0].as_mv;
+ pred_mv[1] = x->mbmi_ext->ref_mvs[ref_frame][1].as_mv;
+ pred_mv[2] = x->pred_mv[ref_frame];
+ assert(num_mv_refs <= (int)(sizeof(pred_mv) / sizeof(pred_mv[0])));
+
+ near_same_nearest = x->mbmi_ext->ref_mvs[ref_frame][0].as_int ==
+ x->mbmi_ext->ref_mvs[ref_frame][1].as_int;
+
+ // Get the sad for each candidate reference mv.
+ for (i = 0; i < num_mv_refs; ++i) {
+ const MV *this_mv = &pred_mv[i];
+ int fp_row, fp_col;
+ if (this_mv->row == INT16_MAX || this_mv->col == INT16_MAX) continue;
+ if (i == 1 && near_same_nearest) continue;
+ fp_row = (this_mv->row + 3 + (this_mv->row >= 0)) >> 3;
+ fp_col = (this_mv->col + 3 + (this_mv->col >= 0)) >> 3;
+ max_mv = VPXMAX(max_mv, VPXMAX(abs(this_mv->row), abs(this_mv->col)) >> 3);
+
+ if (fp_row == 0 && fp_col == 0 && zero_seen) continue;
+ zero_seen |= (fp_row == 0 && fp_col == 0);
+
+ ref_y_ptr = &ref_y_buffer[ref_y_stride * fp_row + fp_col];
+ // Find sad for current vector.
+ this_sad = cpi->fn_ptr[block_size].sdf(src_y_ptr, x->plane[0].src.stride,
+ ref_y_ptr, ref_y_stride);
+ // Note if it is the best so far.
+ if (this_sad < best_sad) {
+ best_sad = this_sad;
+ best_index = i;
+ }
+ }
+
+ // Note the index of the mv that worked best in the reference list.
+ x->mv_best_ref_index[ref_frame] = best_index;
+ x->max_mv_context[ref_frame] = max_mv;
+ x->pred_mv_sad[ref_frame] = best_sad;
+}
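+
+// Illustrative note: in vp9_mv_pred() above the candidate motion vectors are
+// in 1/8-pel units; fp_row/fp_col = (v + 3 + (v >= 0)) >> 3 rounds each
+// component to the nearest full-pel position before the SAD against the
+// source block is measured, and only the first zero candidate is evaluated.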
+
+void vp9_setup_pred_block(const MACROBLOCKD *xd,
+ struct buf_2d dst[MAX_MB_PLANE],
+ const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
+ const struct scale_factors *scale,
+ const struct scale_factors *scale_uv) {
+ int i;
+
+ dst[0].buf = src->y_buffer;
+ dst[0].stride = src->y_stride;
+ dst[1].buf = src->u_buffer;
+ dst[2].buf = src->v_buffer;
+ dst[1].stride = dst[2].stride = src->uv_stride;
+
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ setup_pred_plane(dst + i, dst[i].buf, dst[i].stride, mi_row, mi_col,
+ i ? scale_uv : scale, xd->plane[i].subsampling_x,
+ xd->plane[i].subsampling_y);
+ }
+}
+
+int vp9_raster_block_offset(BLOCK_SIZE plane_bsize, int raster_block,
+ int stride) {
+ const int bw = b_width_log2_lookup[plane_bsize];
+ const int y = 4 * (raster_block >> bw);
+ const int x = 4 * (raster_block & ((1 << bw) - 1));
+ return y * stride + x;
+}
+
+int16_t *vp9_raster_block_offset_int16(BLOCK_SIZE plane_bsize, int raster_block,
+ int16_t *base) {
+ const int stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+ return base + vp9_raster_block_offset(plane_bsize, raster_block, stride);
+}
+
+YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const VP9_COMP *cpi,
+ int ref_frame) {
+ const VP9_COMMON *const cm = &cpi->common;
+ const int scaled_idx = cpi->scaled_ref_idx[ref_frame - 1];
+ const int ref_idx = get_ref_frame_buf_idx(cpi, ref_frame);
+ assert(ref_frame >= LAST_FRAME && ref_frame <= ALTREF_FRAME);
+ return (scaled_idx != ref_idx && scaled_idx != INVALID_IDX)
+ ? &cm->buffer_pool->frame_bufs[scaled_idx].buf
+ : NULL;
+}
+
+int vp9_get_switchable_rate(const VP9_COMP *cpi, const MACROBLOCKD *const xd) {
+ const MODE_INFO *const mi = xd->mi[0];
+ const int ctx = get_pred_context_switchable_interp(xd);
+ return SWITCHABLE_INTERP_RATE_FACTOR *
+ cpi->switchable_interp_costs[ctx][mi->interp_filter];
+}
+
+void vp9_set_rd_speed_thresholds(VP9_COMP *cpi) {
+ int i;
+ RD_OPT *const rd = &cpi->rd;
+ SPEED_FEATURES *const sf = &cpi->sf;
+
+ // Set baseline threshold values.
+ for (i = 0; i < MAX_MODES; ++i)
+ rd->thresh_mult[i] = cpi->oxcf.mode == BEST ? -500 : 0;
+
+ if (sf->adaptive_rd_thresh) {
+ rd->thresh_mult[THR_NEARESTMV] = 300;
+ rd->thresh_mult[THR_NEARESTG] = 300;
+ rd->thresh_mult[THR_NEARESTA] = 300;
+ } else {
+ rd->thresh_mult[THR_NEARESTMV] = 0;
+ rd->thresh_mult[THR_NEARESTG] = 0;
+ rd->thresh_mult[THR_NEARESTA] = 0;
+ }
+
+ rd->thresh_mult[THR_DC] += 1000;
+
+ rd->thresh_mult[THR_NEWMV] += 1000;
+ rd->thresh_mult[THR_NEWA] += 1000;
+ rd->thresh_mult[THR_NEWG] += 1000;
+
+ rd->thresh_mult[THR_NEARMV] += 1000;
+ rd->thresh_mult[THR_NEARA] += 1000;
+ rd->thresh_mult[THR_COMP_NEARESTLA] += 1000;
+ rd->thresh_mult[THR_COMP_NEARESTGA] += 1000;
+
+ rd->thresh_mult[THR_TM] += 1000;
+
+ rd->thresh_mult[THR_COMP_NEARLA] += 1500;
+ rd->thresh_mult[THR_COMP_NEWLA] += 2000;
+ rd->thresh_mult[THR_NEARG] += 1000;
+ rd->thresh_mult[THR_COMP_NEARGA] += 1500;
+ rd->thresh_mult[THR_COMP_NEWGA] += 2000;
+
+ rd->thresh_mult[THR_ZEROMV] += 2000;
+ rd->thresh_mult[THR_ZEROG] += 2000;
+ rd->thresh_mult[THR_ZEROA] += 2000;
+ rd->thresh_mult[THR_COMP_ZEROLA] += 2500;
+ rd->thresh_mult[THR_COMP_ZEROGA] += 2500;
+
+ rd->thresh_mult[THR_H_PRED] += 2000;
+ rd->thresh_mult[THR_V_PRED] += 2000;
+ rd->thresh_mult[THR_D45_PRED] += 2500;
+ rd->thresh_mult[THR_D135_PRED] += 2500;
+ rd->thresh_mult[THR_D117_PRED] += 2500;
+ rd->thresh_mult[THR_D153_PRED] += 2500;
+ rd->thresh_mult[THR_D207_PRED] += 2500;
+ rd->thresh_mult[THR_D63_PRED] += 2500;
+}
+
+void vp9_set_rd_speed_thresholds_sub8x8(VP9_COMP *cpi) {
+ static const int thresh_mult[2][MAX_REFS] = {
+ { 2500, 2500, 2500, 4500, 4500, 2500 },
+ { 2000, 2000, 2000, 4000, 4000, 2000 }
+ };
+ RD_OPT *const rd = &cpi->rd;
+ const int idx = cpi->oxcf.mode == BEST;
+ memcpy(rd->thresh_mult_sub8x8, thresh_mult[idx], sizeof(thresh_mult[idx]));
+}
+
+void vp9_update_rd_thresh_fact(int (*factor_buf)[MAX_MODES], int rd_thresh,
+ int bsize, int best_mode_index) {
+ if (rd_thresh > 0) {
+ const int top_mode = bsize < BLOCK_8X8 ? MAX_REFS : MAX_MODES;
+ int mode;
+ for (mode = 0; mode < top_mode; ++mode) {
+ const BLOCK_SIZE min_size = VPXMAX(bsize - 1, BLOCK_4X4);
+ const BLOCK_SIZE max_size = VPXMIN(bsize + 2, BLOCK_64X64);
+ BLOCK_SIZE bs;
+ for (bs = min_size; bs <= max_size; ++bs) {
+ int *const fact = &factor_buf[bs][mode];
+ if (mode == best_mode_index) {
+ *fact -= (*fact >> 4);
+ } else {
+ *fact = VPXMIN(*fact + RD_THRESH_INC, rd_thresh * RD_THRESH_MAX_FACT);
+ }
+ }
+ }
+ }
+}
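+
+// Illustrative note: in vp9_update_rd_thresh_fact() above the factor of the
+// winning mode decays by 1/16 of its value (*fact >> 4) per update across the
+// neighboring block sizes, while every other mode's factor grows by
+// RD_THRESH_INC up to rd_thresh * RD_THRESH_MAX_FACT.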
+
+int vp9_get_intra_cost_penalty(const VP9_COMP *const cpi, BLOCK_SIZE bsize,
+ int qindex, int qdelta) {
+ // Reduce the intra cost penalty for small blocks (<=16x16).
+ int reduction_fac =
+ (bsize <= BLOCK_16X16) ? ((bsize <= BLOCK_8X8) ? 4 : 2) : 0;
+
+ if (cpi->noise_estimate.enabled && cpi->noise_estimate.level == kHigh)
+ // Don't reduce intra cost penalty if estimated noise level is high.
+ reduction_fac = 0;
+
+ // Always use VPX_BITS_8 as input here because the penalty is applied
+ // to rate not distortion so we want a consistent penalty for all bit
+ // depths. If the actual bit depth were passed in here then the value
+ // returned by vp9_dc_quant() would scale with the bit depth and we would
+ // then need to apply inverse scaling to correct back to a bit depth
+ // independent rate penalty.
+ return (20 * vp9_dc_quant(qindex, qdelta, VPX_BITS_8)) >> reduction_fac;
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_rd.h b/media/libvpx/libvpx/vp9/encoder/vp9_rd.h
new file mode 100644
index 0000000000..6c61ae514a
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_rd.h
@@ -0,0 +1,235 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_VP9_RD_H_
+#define VPX_VP9_ENCODER_VP9_RD_H_
+
+#include <limits.h>
+
+#include "vp9/common/vp9_blockd.h"
+
+#include "vp9/encoder/vp9_block.h"
+#include "vp9/encoder/vp9_context_tree.h"
+#include "vp9/encoder/vp9_cost.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define RDDIV_BITS 7
+#define RD_EPB_SHIFT 6
+
+#define RDCOST(RM, DM, R, D) \
+ ROUND_POWER_OF_TWO(((int64_t)(R)) * (RM), VP9_PROB_COST_SHIFT) + ((D) << (DM))
+#define RDCOST_NEG_R(RM, DM, R, D) \
+ ((D) << (DM)) - ROUND_POWER_OF_TWO(((int64_t)(R)) * (RM), VP9_PROB_COST_SHIFT)
+#define RDCOST_NEG_D(RM, DM, R, D) \
+ ROUND_POWER_OF_TWO(((int64_t)(R)) * (RM), VP9_PROB_COST_SHIFT) - ((D) << (DM))
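+
+// Illustrative usage note (a sketch of a typical call, not a new API):
+//   int64_t rd = RDCOST(x->rdmult, x->rddiv, rate, dist);
+// weights the prob-cost scaled rate by RM and rounds it down by
+// VP9_PROB_COST_SHIFT, then adds the distortion shifted up by DM bits
+// (RDDIV_BITS in practice, see vp9_initialize_rd_consts()).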
+
+#define QIDX_SKIP_THRESH 115
+
+#define MV_COST_WEIGHT 108
+#define MV_COST_WEIGHT_SUB 120
+
+#define MAX_MODES 30
+#define MAX_REFS 6
+
+#define RD_THRESH_INIT_FACT 32
+#define RD_THRESH_MAX_FACT 64
+#define RD_THRESH_INC 1
+
+#define VP9_DIST_SCALE_LOG2 4
+#define VP9_DIST_SCALE (1 << VP9_DIST_SCALE_LOG2)
+
+// This enumerator type needs to be kept aligned with the mode order in
+// const MODE_DEFINITION vp9_mode_order[MAX_MODES] used in the rd code.
+typedef enum {
+ THR_NEARESTMV,
+ THR_NEARESTA,
+ THR_NEARESTG,
+
+ THR_DC,
+
+ THR_NEWMV,
+ THR_NEWA,
+ THR_NEWG,
+
+ THR_NEARMV,
+ THR_NEARA,
+ THR_NEARG,
+
+ THR_ZEROMV,
+ THR_ZEROG,
+ THR_ZEROA,
+
+ THR_COMP_NEARESTLA,
+ THR_COMP_NEARESTGA,
+
+ THR_TM,
+
+ THR_COMP_NEARLA,
+ THR_COMP_NEWLA,
+ THR_COMP_NEARGA,
+ THR_COMP_NEWGA,
+
+ THR_COMP_ZEROLA,
+ THR_COMP_ZEROGA,
+
+ THR_H_PRED,
+ THR_V_PRED,
+ THR_D135_PRED,
+ THR_D207_PRED,
+ THR_D153_PRED,
+ THR_D63_PRED,
+ THR_D117_PRED,
+ THR_D45_PRED,
+} THR_MODES;
+
+typedef enum {
+ THR_LAST,
+ THR_GOLD,
+ THR_ALTR,
+ THR_COMP_LA,
+ THR_COMP_GA,
+ THR_INTRA,
+} THR_MODES_SUB8X8;
+
+typedef struct {
+ // RD multiplier control factors added for Vizier project.
+ double rd_mult_inter_qp_fac;
+ double rd_mult_arf_qp_fac;
+ double rd_mult_key_qp_fac;
+} RD_CONTROL;
+
+typedef struct RD_OPT {
+ // Thresh_mult is used to set a threshold for the rd score. A higher value
+ // means that we will accept the best mode so far more often. This number
+ // is used in combination with the current block size and thresh_freq_fact
+ // to pick a threshold.
+ int thresh_mult[MAX_MODES];
+ int thresh_mult_sub8x8[MAX_REFS];
+
+ int threshes[MAX_SEGMENTS][BLOCK_SIZES][MAX_MODES];
+
+ int64_t prediction_type_threshes[MAX_REF_FRAMES][REFERENCE_MODES];
+
+ int64_t filter_threshes[MAX_REF_FRAMES][SWITCHABLE_FILTER_CONTEXTS];
+ int64_t prediction_type_threshes_prev[MAX_REF_FRAMES][REFERENCE_MODES];
+
+ int64_t filter_threshes_prev[MAX_REF_FRAMES][SWITCHABLE_FILTER_CONTEXTS];
+ int RDMULT;
+ int RDDIV;
+ double r0;
+} RD_OPT;
+
+typedef struct RD_COST {
+ int rate;
+ int64_t dist;
+ int64_t rdcost;
+} RD_COST;
+
+// Reset the rate distortion cost values to maximum (invalid) value.
+void vp9_rd_cost_reset(RD_COST *rd_cost);
+// Initialize the rate distortion cost values to zero.
+void vp9_rd_cost_init(RD_COST *rd_cost);
+// Compute the rd cost. Unlike RDCOST(), it supports negative rate and dist.
+int64_t vp9_calculate_rd_cost(int mult, int div, int rate, int64_t dist);
+// Update the cost value based on its rate and distortion.
+void vp9_rd_cost_update(int mult, int div, RD_COST *rd_cost);
+
+struct TileInfo;
+struct TileDataEnc;
+struct VP9_COMP;
+struct macroblock;
+
+void vp9_init_rd_parameters(struct VP9_COMP *cpi);
+
+int vp9_compute_rd_mult_based_on_qindex(const struct VP9_COMP *cpi, int qindex);
+
+int vp9_compute_rd_mult(const struct VP9_COMP *cpi, int qindex);
+
+int vp9_get_adaptive_rdmult(const struct VP9_COMP *cpi, double beta);
+
+void vp9_initialize_rd_consts(struct VP9_COMP *cpi);
+
+void vp9_initialize_me_consts(struct VP9_COMP *cpi, MACROBLOCK *x, int qindex);
+
+void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2,
+ unsigned int qstep, int *rate, int64_t *dist);
+
+int vp9_get_switchable_rate(const struct VP9_COMP *cpi,
+ const MACROBLOCKD *const xd);
+
+int vp9_raster_block_offset(BLOCK_SIZE plane_bsize, int raster_block,
+ int stride);
+
+int16_t *vp9_raster_block_offset_int16(BLOCK_SIZE plane_bsize, int raster_block,
+ int16_t *base);
+
+YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const struct VP9_COMP *cpi,
+ int ref_frame);
+
+void vp9_init_me_luts(void);
+
+void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size,
+ const struct macroblockd_plane *pd,
+ ENTROPY_CONTEXT t_above[16],
+ ENTROPY_CONTEXT t_left[16]);
+
+void vp9_set_rd_speed_thresholds(struct VP9_COMP *cpi);
+
+void vp9_set_rd_speed_thresholds_sub8x8(struct VP9_COMP *cpi);
+
+void vp9_update_rd_thresh_fact(int (*factor_buf)[MAX_MODES], int rd_thresh,
+ int bsize, int best_mode_index);
+
+static INLINE int rd_less_than_thresh(int64_t best_rd, int thresh,
+ const int *const thresh_fact) {
+ return best_rd < ((int64_t)thresh * (*thresh_fact) >> 5) || thresh == INT_MAX;
+}
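+
+// Illustrative note: the comparison above scales the base threshold by
+// thresh_fact / 32 (the >> 5); thresh == INT_MAX makes the test
+// unconditionally true.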
+
+static INLINE void set_error_per_bit(MACROBLOCK *x, int rdmult) {
+ x->errorperbit = rdmult >> RD_EPB_SHIFT;
+ x->errorperbit += (x->errorperbit == 0);
+}
+
+void vp9_mv_pred(struct VP9_COMP *cpi, MACROBLOCK *x, uint8_t *ref_y_buffer,
+ int ref_y_stride, int ref_frame, BLOCK_SIZE block_size);
+
+void vp9_setup_pred_block(const MACROBLOCKD *xd,
+ struct buf_2d dst[MAX_MB_PLANE],
+ const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
+ const struct scale_factors *scale,
+ const struct scale_factors *scale_uv);
+
+int vp9_get_intra_cost_penalty(const struct VP9_COMP *const cpi,
+ BLOCK_SIZE bsize, int qindex, int qdelta);
+
+unsigned int vp9_get_sby_variance(struct VP9_COMP *cpi,
+ const struct buf_2d *ref, BLOCK_SIZE bs);
+unsigned int vp9_get_sby_perpixel_variance(struct VP9_COMP *cpi,
+ const struct buf_2d *ref,
+ BLOCK_SIZE bs);
+#if CONFIG_VP9_HIGHBITDEPTH
+unsigned int vp9_high_get_sby_variance(struct VP9_COMP *cpi,
+ const struct buf_2d *ref, BLOCK_SIZE bs,
+ int bd);
+unsigned int vp9_high_get_sby_perpixel_variance(struct VP9_COMP *cpi,
+ const struct buf_2d *ref,
+ BLOCK_SIZE bs, int bd);
+#endif
+
+void vp9_build_inter_mode_cost(struct VP9_COMP *cpi);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP9_ENCODER_VP9_RD_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.c b/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.c
new file mode 100644
index 0000000000..f051c62791
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.c
@@ -0,0 +1,4932 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <math.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/system_state.h"
+
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_entropy.h"
+#include "vp9/common/vp9_entropymode.h"
+#include "vp9/common/vp9_idct.h"
+#include "vp9/common/vp9_mvref_common.h"
+#include "vp9/common/vp9_pred_common.h"
+#include "vp9/common/vp9_quant_common.h"
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/common/vp9_reconintra.h"
+#include "vp9/common/vp9_scan.h"
+#include "vp9/common/vp9_seg_common.h"
+
+#if !CONFIG_REALTIME_ONLY
+#include "vp9/encoder/vp9_aq_variance.h"
+#endif
+#include "vp9/encoder/vp9_cost.h"
+#include "vp9/encoder/vp9_encodemb.h"
+#include "vp9/encoder/vp9_encodemv.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_mcomp.h"
+#include "vp9/encoder/vp9_quantize.h"
+#include "vp9/encoder/vp9_ratectrl.h"
+#include "vp9/encoder/vp9_rd.h"
+#include "vp9/encoder/vp9_rdopt.h"
+
+#define LAST_FRAME_MODE_MASK \
+ ((1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME) | (1 << INTRA_FRAME))
+#define GOLDEN_FRAME_MODE_MASK \
+ ((1 << LAST_FRAME) | (1 << ALTREF_FRAME) | (1 << INTRA_FRAME))
+#define ALT_REF_MODE_MASK \
+ ((1 << LAST_FRAME) | (1 << GOLDEN_FRAME) | (1 << INTRA_FRAME))
+
+#define SECOND_REF_FRAME_MASK ((1 << ALTREF_FRAME) | 0x01)
+
+#define MIN_EARLY_TERM_INDEX 3
+#define NEW_MV_DISCOUNT_FACTOR 8
+
+typedef struct {
+ PREDICTION_MODE mode;
+ MV_REFERENCE_FRAME ref_frame[2];
+} MODE_DEFINITION;
+
+typedef struct {
+ MV_REFERENCE_FRAME ref_frame[2];
+} REF_DEFINITION;
+
+struct rdcost_block_args {
+ const VP9_COMP *cpi;
+ MACROBLOCK *x;
+ ENTROPY_CONTEXT t_above[16];
+ ENTROPY_CONTEXT t_left[16];
+ int this_rate;
+ int64_t this_dist;
+ int64_t this_sse;
+ int64_t this_rd;
+ int64_t best_rd;
+ int exit_early;
+ int use_fast_coef_costing;
+ const ScanOrder *so;
+ uint8_t skippable;
+ struct buf_2d *this_recon;
+};
+
+#define LAST_NEW_MV_INDEX 6
+
+#if !CONFIG_REALTIME_ONLY
+static const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
+ { NEARESTMV, { LAST_FRAME, NONE } },
+ { NEARESTMV, { ALTREF_FRAME, NONE } },
+ { NEARESTMV, { GOLDEN_FRAME, NONE } },
+
+ { DC_PRED, { INTRA_FRAME, NONE } },
+
+ { NEWMV, { LAST_FRAME, NONE } },
+ { NEWMV, { ALTREF_FRAME, NONE } },
+ { NEWMV, { GOLDEN_FRAME, NONE } },
+
+ { NEARMV, { LAST_FRAME, NONE } },
+ { NEARMV, { ALTREF_FRAME, NONE } },
+ { NEARMV, { GOLDEN_FRAME, NONE } },
+
+ { ZEROMV, { LAST_FRAME, NONE } },
+ { ZEROMV, { GOLDEN_FRAME, NONE } },
+ { ZEROMV, { ALTREF_FRAME, NONE } },
+
+ { NEARESTMV, { LAST_FRAME, ALTREF_FRAME } },
+ { NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+
+ { TM_PRED, { INTRA_FRAME, NONE } },
+
+ { NEARMV, { LAST_FRAME, ALTREF_FRAME } },
+ { NEWMV, { LAST_FRAME, ALTREF_FRAME } },
+ { NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+
+ { ZEROMV, { LAST_FRAME, ALTREF_FRAME } },
+ { ZEROMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+
+ { H_PRED, { INTRA_FRAME, NONE } },
+ { V_PRED, { INTRA_FRAME, NONE } },
+ { D135_PRED, { INTRA_FRAME, NONE } },
+ { D207_PRED, { INTRA_FRAME, NONE } },
+ { D153_PRED, { INTRA_FRAME, NONE } },
+ { D63_PRED, { INTRA_FRAME, NONE } },
+ { D117_PRED, { INTRA_FRAME, NONE } },
+ { D45_PRED, { INTRA_FRAME, NONE } },
+};
+
+static const REF_DEFINITION vp9_ref_order[MAX_REFS] = {
+ { { LAST_FRAME, NONE } }, { { GOLDEN_FRAME, NONE } },
+ { { ALTREF_FRAME, NONE } }, { { LAST_FRAME, ALTREF_FRAME } },
+ { { GOLDEN_FRAME, ALTREF_FRAME } }, { { INTRA_FRAME, NONE } },
+};
+#endif // !CONFIG_REALTIME_ONLY
+
+static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int m, int n,
+ int min_plane, int max_plane) {
+ int i;
+
+ for (i = min_plane; i < max_plane; ++i) {
+ struct macroblock_plane *const p = &x->plane[i];
+ struct macroblockd_plane *const pd = &x->e_mbd.plane[i];
+
+ p->coeff = ctx->coeff_pbuf[i][m];
+ p->qcoeff = ctx->qcoeff_pbuf[i][m];
+ pd->dqcoeff = ctx->dqcoeff_pbuf[i][m];
+ p->eobs = ctx->eobs_pbuf[i][m];
+
+ ctx->coeff_pbuf[i][m] = ctx->coeff_pbuf[i][n];
+ ctx->qcoeff_pbuf[i][m] = ctx->qcoeff_pbuf[i][n];
+ ctx->dqcoeff_pbuf[i][m] = ctx->dqcoeff_pbuf[i][n];
+ ctx->eobs_pbuf[i][m] = ctx->eobs_pbuf[i][n];
+
+ ctx->coeff_pbuf[i][n] = p->coeff;
+ ctx->qcoeff_pbuf[i][n] = p->qcoeff;
+ ctx->dqcoeff_pbuf[i][n] = pd->dqcoeff;
+ ctx->eobs_pbuf[i][n] = p->eobs;
+ }
+}
+
+#if !CONFIG_REALTIME_ONLY
+// Build the inter prediction plane by plane and compute the rd cost, with an
+// optional early termination check.
+static int build_inter_pred_model_rd_earlyterm(
+ VP9_COMP *cpi, int mi_row, int mi_col, BLOCK_SIZE bsize, MACROBLOCK *x,
+ MACROBLOCKD *xd, int *out_rate_sum, int64_t *out_dist_sum,
+ int *skip_txfm_sb, int64_t *skip_sse_sb, int do_earlyterm,
+ int64_t best_rd) {
+ // Note that our transform coefficients are 8 times those of an orthogonal
+ // transform, so the quantizer step is scaled by 8 as well. To get the
+ // effective quantizer we need to divide by 8 before sending it to the
+ // modeling function.
+ int i;
+ int64_t rate_sum = 0;
+ int64_t dist_sum = 0;
+ const int ref = xd->mi[0]->ref_frame[0];
+ unsigned int sse;
+ unsigned int var = 0;
+ int64_t total_sse = 0;
+ int skip_flag = 1;
+ const int shift = 6;
+ const int dequant_shift =
+#if CONFIG_VP9_HIGHBITDEPTH
+ (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 :
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ 3;
+
+ x->pred_sse[ref] = 0;
+
+ // Build prediction signal, compute stats and RD cost on a per-plane basis
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ struct macroblock_plane *const p = &x->plane[i];
+ struct macroblockd_plane *const pd = &xd->plane[i];
+ const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
+ const TX_SIZE max_tx_size = max_txsize_lookup[bs];
+ const BLOCK_SIZE unit_size = txsize_to_bsize[max_tx_size];
+ const int64_t dc_thr = p->quant_thred[0] >> shift;
+ const int64_t ac_thr = p->quant_thred[1] >> shift;
+ unsigned int sum_sse = 0;
+ // The low thresholds are used to measure if the prediction errors are
+ // low enough so that we can skip the mode search.
+ const int64_t low_dc_thr = VPXMIN(50, dc_thr >> 2);
+ const int64_t low_ac_thr = VPXMIN(80, ac_thr >> 2);
+ int bw = 1 << (b_width_log2_lookup[bs] - b_width_log2_lookup[unit_size]);
+ int bh = 1 << (b_height_log2_lookup[bs] - b_width_log2_lookup[unit_size]);
+ int idx, idy;
+ int lw = b_width_log2_lookup[unit_size] + 2;
+ int lh = b_height_log2_lookup[unit_size] + 2;
+ unsigned int qstep;
+ unsigned int nlog2;
+ int64_t dist = 0;
+
+ // Build inter predictor
+ vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, i);
+
+ // Compute useful stats
+ for (idy = 0; idy < bh; ++idy) {
+ for (idx = 0; idx < bw; ++idx) {
+ uint8_t *src = p->src.buf + (idy * p->src.stride << lh) + (idx << lw);
+ uint8_t *dst = pd->dst.buf + (idy * pd->dst.stride << lh) + (idx << lh);
+ int block_idx = (idy << 1) + idx;
+ int low_err_skip = 0;
+
+ var = cpi->fn_ptr[unit_size].vf(src, p->src.stride, dst, pd->dst.stride,
+ &sse);
+ x->bsse[(i << 2) + block_idx] = sse;
+ sum_sse += sse;
+
+ x->skip_txfm[(i << 2) + block_idx] = SKIP_TXFM_NONE;
+ if (!x->select_tx_size) {
+ // Check if all ac coefficients can be quantized to zero.
+ if (var < ac_thr || var == 0) {
+ x->skip_txfm[(i << 2) + block_idx] = SKIP_TXFM_AC_ONLY;
+
+ // Check if dc coefficient can be quantized to zero.
+ if (sse - var < dc_thr || sse == var) {
+ x->skip_txfm[(i << 2) + block_idx] = SKIP_TXFM_AC_DC;
+
+ if (!sse || (var < low_ac_thr && sse - var < low_dc_thr))
+ low_err_skip = 1;
+ }
+ }
+ }
+
+ if (skip_flag && !low_err_skip) skip_flag = 0;
+
+ if (i == 0) x->pred_sse[ref] += sse;
+ }
+ }
+
+ total_sse += sum_sse;
+ qstep = pd->dequant[1] >> dequant_shift;
+ nlog2 = num_pels_log2_lookup[bs];
+
+ // Fast approximation of the modelling function.
+ if (cpi->sf.simple_model_rd_from_var) {
+ int64_t rate;
+ if (qstep < 120)
+ rate = ((int64_t)sum_sse * (280 - qstep)) >> (16 - VP9_PROB_COST_SHIFT);
+ else
+ rate = 0;
+ dist = ((int64_t)sum_sse * qstep) >> 8;
+ rate_sum += rate;
+ } else {
+ int rate;
+ vp9_model_rd_from_var_lapndz(sum_sse, nlog2, qstep, &rate, &dist);
+ rate_sum += rate;
+ }
+ dist_sum += dist;
+ if (do_earlyterm) {
+ if (RDCOST(x->rdmult, x->rddiv, rate_sum,
+ dist_sum << VP9_DIST_SCALE_LOG2) >= best_rd)
+ return 1;
+ }
+ }
+ *skip_txfm_sb = skip_flag;
+ *skip_sse_sb = total_sse << VP9_DIST_SCALE_LOG2;
+ *out_rate_sum = (int)rate_sum;
+ *out_dist_sum = dist_sum << VP9_DIST_SCALE_LOG2;
+
+ return 0;
+}
+#endif // !CONFIG_REALTIME_ONLY
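+
+// Illustrative note on build_inter_pred_model_rd_earlyterm() above: when
+// do_earlyterm is set, the running RDCOST(rate_sum, dist_sum <<
+// VP9_DIST_SCALE_LOG2) is checked against best_rd after each plane, and a
+// return value of 1 tells the caller the remaining planes were skipped
+// because this prediction can no longer beat the current best.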
+
+#if CONFIG_VP9_HIGHBITDEPTH
+int64_t vp9_highbd_block_error_c(const tran_low_t *coeff,
+ const tran_low_t *dqcoeff, intptr_t block_size,
+ int64_t *ssz, int bd) {
+ int i;
+ int64_t error = 0, sqcoeff = 0;
+ int shift = 2 * (bd - 8);
+ int rounding = shift > 0 ? 1 << (shift - 1) : 0;
+
+ for (i = 0; i < block_size; i++) {
+ const int64_t diff = coeff[i] - dqcoeff[i];
+ error += diff * diff;
+ sqcoeff += (int64_t)coeff[i] * (int64_t)coeff[i];
+ }
+ assert(error >= 0 && sqcoeff >= 0);
+ error = (error + rounding) >> shift;
+ sqcoeff = (sqcoeff + rounding) >> shift;
+
+ *ssz = sqcoeff;
+ return error;
+}
+
+static int64_t vp9_highbd_block_error_dispatch(const tran_low_t *coeff,
+ const tran_low_t *dqcoeff,
+ intptr_t block_size,
+ int64_t *ssz, int bd) {
+ if (bd == 8) {
+ return vp9_block_error(coeff, dqcoeff, block_size, ssz);
+ } else {
+ return vp9_highbd_block_error(coeff, dqcoeff, block_size, ssz, bd);
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff,
+ intptr_t block_size, int64_t *ssz) {
+ int i;
+ int64_t error = 0, sqcoeff = 0;
+
+ for (i = 0; i < block_size; i++) {
+ const int diff = coeff[i] - dqcoeff[i];
+ error += diff * diff;
+ sqcoeff += coeff[i] * coeff[i];
+ }
+
+ *ssz = sqcoeff;
+ return error;
+}
+
+int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff,
+ int block_size) {
+ int i;
+ int64_t error = 0;
+
+ for (i = 0; i < block_size; i++) {
+ const int diff = coeff[i] - dqcoeff[i];
+ error += diff * diff;
+ }
+
+ return error;
+}
+
+/* The trailing '0' is a terminator which is used inside cost_coeffs() to
+ * decide whether to include cost of a trailing EOB node or not (i.e. we
+ * can skip this if the last coefficient in this transform block, e.g. the
+ * 16th coefficient in a 4x4 block or the 64th coefficient in an 8x8 block,
+ * is non-zero). */
+static const int16_t band_counts[TX_SIZES][8] = {
+ { 1, 2, 3, 4, 3, 16 - 13, 0 },
+ { 1, 2, 3, 4, 11, 64 - 21, 0 },
+ { 1, 2, 3, 4, 11, 256 - 21, 0 },
+ { 1, 2, 3, 4, 11, 1024 - 21, 0 },
+};
+static int cost_coeffs(MACROBLOCK *x, int plane, int block, TX_SIZE tx_size,
+ int pt, const int16_t *scan, const int16_t *nb,
+ int use_fast_coef_costing) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MODE_INFO *mi = xd->mi[0];
+ const struct macroblock_plane *p = &x->plane[plane];
+ const PLANE_TYPE type = get_plane_type(plane);
+ const int16_t *band_count = &band_counts[tx_size][1];
+ const int eob = p->eobs[block];
+ const tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ unsigned int(*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
+ x->token_costs[tx_size][type][is_inter_block(mi)];
+ uint8_t token_cache[32 * 32];
+ int cost;
+#if CONFIG_VP9_HIGHBITDEPTH
+ const uint16_t *cat6_high_cost = vp9_get_high_cost_table(xd->bd);
+#else
+ const uint16_t *cat6_high_cost = vp9_get_high_cost_table(8);
+#endif
+
+ // Check for consistency of tx_size with mode info
+ assert(type == PLANE_TYPE_Y
+ ? mi->tx_size == tx_size
+ : get_uv_tx_size(mi, &xd->plane[plane]) == tx_size);
+
+ if (eob == 0) {
+ // single eob token
+ cost = token_costs[0][0][pt][EOB_TOKEN];
+ } else {
+ if (use_fast_coef_costing) {
+ int band_left = *band_count++;
+ int c;
+
+ // dc token
+ int v = qcoeff[0];
+ int16_t prev_t;
+ cost = vp9_get_token_cost(v, &prev_t, cat6_high_cost);
+ cost += (*token_costs)[0][pt][prev_t];
+
+ token_cache[0] = vp9_pt_energy_class[prev_t];
+ ++token_costs;
+
+ // ac tokens
+ for (c = 1; c < eob; c++) {
+ const int rc = scan[c];
+ int16_t t;
+
+ v = qcoeff[rc];
+ cost += vp9_get_token_cost(v, &t, cat6_high_cost);
+ cost += (*token_costs)[!prev_t][!prev_t][t];
+ prev_t = t;
+ if (!--band_left) {
+ band_left = *band_count++;
+ ++token_costs;
+ }
+ }
+
+ // eob token
+ if (band_left) cost += (*token_costs)[0][!prev_t][EOB_TOKEN];
+
+ } else { // !use_fast_coef_costing
+ int band_left = *band_count++;
+ int c;
+
+ // dc token
+ int v = qcoeff[0];
+ int16_t tok;
+ unsigned int(*tok_cost_ptr)[COEFF_CONTEXTS][ENTROPY_TOKENS];
+ cost = vp9_get_token_cost(v, &tok, cat6_high_cost);
+ cost += (*token_costs)[0][pt][tok];
+
+ token_cache[0] = vp9_pt_energy_class[tok];
+ ++token_costs;
+
+ tok_cost_ptr = &((*token_costs)[!tok]);
+
+ // ac tokens
+ for (c = 1; c < eob; c++) {
+ const int rc = scan[c];
+
+ v = qcoeff[rc];
+ cost += vp9_get_token_cost(v, &tok, cat6_high_cost);
+ pt = get_coef_context(nb, token_cache, c);
+ cost += (*tok_cost_ptr)[pt][tok];
+ token_cache[rc] = vp9_pt_energy_class[tok];
+ if (!--band_left) {
+ band_left = *band_count++;
+ ++token_costs;
+ }
+ tok_cost_ptr = &((*token_costs)[!tok]);
+ }
+
+ // eob token
+ if (band_left) {
+ pt = get_coef_context(nb, token_cache, c);
+ cost += (*token_costs)[0][pt][EOB_TOKEN];
+ }
+ }
+ }
+
+ return cost;
+}
+
+// Copy all visible 4x4s in the transform block.
+static void copy_block_visible(const MACROBLOCKD *xd,
+ const struct macroblockd_plane *const pd,
+ const uint8_t *src, const int src_stride,
+ uint8_t *dst, const int dst_stride, int blk_row,
+ int blk_col, const BLOCK_SIZE plane_bsize,
+ const BLOCK_SIZE tx_bsize) {
+ const int plane_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
+ const int plane_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
+ const int tx_4x4_w = num_4x4_blocks_wide_lookup[tx_bsize];
+ const int tx_4x4_h = num_4x4_blocks_high_lookup[tx_bsize];
+ int b4x4s_to_right_edge = num_4x4_to_edge(plane_4x4_w, xd->mb_to_right_edge,
+ pd->subsampling_x, blk_col);
+ int b4x4s_to_bottom_edge = num_4x4_to_edge(plane_4x4_h, xd->mb_to_bottom_edge,
+ pd->subsampling_y, blk_row);
+ const int is_highbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
+ if (tx_bsize == BLOCK_4X4 ||
+ (b4x4s_to_right_edge >= tx_4x4_w && b4x4s_to_bottom_edge >= tx_4x4_h)) {
+ const int w = tx_4x4_w << 2;
+ const int h = tx_4x4_h << 2;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (is_highbd) {
+ vpx_highbd_convolve_copy(CONVERT_TO_SHORTPTR(src), src_stride,
+ CONVERT_TO_SHORTPTR(dst), dst_stride, NULL, 0, 0,
+ 0, 0, w, h, xd->bd);
+ } else {
+#endif
+ vpx_convolve_copy(src, src_stride, dst, dst_stride, NULL, 0, 0, 0, 0, w,
+ h);
+#if CONFIG_VP9_HIGHBITDEPTH
+ }
+#endif
+ } else {
+ int r, c;
+ int max_r = VPXMIN(b4x4s_to_bottom_edge, tx_4x4_h);
+ int max_c = VPXMIN(b4x4s_to_right_edge, tx_4x4_w);
+ // The block extends into the unrestricted motion vector (UMV) border.
+ for (r = 0; r < max_r; ++r) {
+ // Only visit the visible sub blocks, skipping those wholly inside the UMV border.
+ for (c = 0; c < max_c; ++c) {
+ const uint8_t *src_ptr = src + r * src_stride * 4 + c * 4;
+ uint8_t *dst_ptr = dst + r * dst_stride * 4 + c * 4;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (is_highbd) {
+ vpx_highbd_convolve_copy(CONVERT_TO_SHORTPTR(src_ptr), src_stride,
+ CONVERT_TO_SHORTPTR(dst_ptr), dst_stride,
+ NULL, 0, 0, 0, 0, 4, 4, xd->bd);
+ } else {
+#endif
+ vpx_convolve_copy(src_ptr, src_stride, dst_ptr, dst_stride, NULL, 0,
+ 0, 0, 0, 4, 4);
+#if CONFIG_VP9_HIGHBITDEPTH
+ }
+#endif
+ }
+ }
+ }
+ (void)is_highbd;
+}
+
+// Compute the pixel domain sum square error on all visible 4x4s in the
+// transform block.
+static unsigned pixel_sse(const VP9_COMP *const cpi, const MACROBLOCKD *xd,
+ const struct macroblockd_plane *const pd,
+ const uint8_t *src, const int src_stride,
+ const uint8_t *dst, const int dst_stride, int blk_row,
+ int blk_col, const BLOCK_SIZE plane_bsize,
+ const BLOCK_SIZE tx_bsize) {
+ unsigned int sse = 0;
+ const int plane_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
+ const int plane_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
+ const int tx_4x4_w = num_4x4_blocks_wide_lookup[tx_bsize];
+ const int tx_4x4_h = num_4x4_blocks_high_lookup[tx_bsize];
+ int b4x4s_to_right_edge = num_4x4_to_edge(plane_4x4_w, xd->mb_to_right_edge,
+ pd->subsampling_x, blk_col);
+ int b4x4s_to_bottom_edge = num_4x4_to_edge(plane_4x4_h, xd->mb_to_bottom_edge,
+ pd->subsampling_y, blk_row);
+ if (tx_bsize == BLOCK_4X4 ||
+ (b4x4s_to_right_edge >= tx_4x4_w && b4x4s_to_bottom_edge >= tx_4x4_h)) {
+ cpi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse);
+ } else {
+ const vpx_variance_fn_t vf_4x4 = cpi->fn_ptr[BLOCK_4X4].vf;
+ int r, c;
+ unsigned this_sse = 0;
+ int max_r = VPXMIN(b4x4s_to_bottom_edge, tx_4x4_h);
+ int max_c = VPXMIN(b4x4s_to_right_edge, tx_4x4_w);
+ sse = 0;
+ // The block extends into the unrestricted motion vector (UMV) border.
+ for (r = 0; r < max_r; ++r) {
+ // Only visit the visible sub blocks, skipping those wholly inside the UMV border.
+ for (c = 0; c < max_c; ++c) {
+ vf_4x4(src + r * src_stride * 4 + c * 4, src_stride,
+ dst + r * dst_stride * 4 + c * 4, dst_stride, &this_sse);
+ sse += this_sse;
+ }
+ }
+ }
+ return sse;
+}
+
+static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane,
+ BLOCK_SIZE plane_bsize, int block, int blk_row,
+ int blk_col, TX_SIZE tx_size, int64_t *out_dist,
+ int64_t *out_sse, struct buf_2d *out_recon,
+ int sse_calc_done) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int eob = p->eobs[block];
+
+ if (!out_recon && x->block_tx_domain && eob) {
+ const int ss_txfrm_size = tx_size << 1;
+ int64_t this_sse;
+ const int shift = tx_size == TX_32X32 ? 0 : 2;
+ const tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+ const tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+#if CONFIG_VP9_HIGHBITDEPTH
+ const int bd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd : 8;
+ *out_dist = vp9_highbd_block_error_dispatch(
+ coeff, dqcoeff, 16 << ss_txfrm_size, &this_sse, bd) >>
+ shift;
+#else
+ *out_dist =
+ vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size, &this_sse) >>
+ shift;
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ *out_sse = this_sse >> shift;
+
+ if (x->skip_encode && !is_inter_block(xd->mi[0])) {
+ // TODO(jingning): tune the model to better capture the distortion.
+ const int64_t mean_quant_error =
+ (pd->dequant[1] * pd->dequant[1] * (1 << ss_txfrm_size)) >>
+#if CONFIG_VP9_HIGHBITDEPTH
+ (shift + 2 + (bd - 8) * 2);
+#else
+ (shift + 2);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ *out_dist += (mean_quant_error >> 4);
+ *out_sse += mean_quant_error;
+ }
+ } else {
+ const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
+ const int bs = 4 * num_4x4_blocks_wide_lookup[tx_bsize];
+ const int src_stride = p->src.stride;
+ const int dst_stride = pd->dst.stride;
+ const int src_idx = 4 * (blk_row * src_stride + blk_col);
+ const int dst_idx = 4 * (blk_row * dst_stride + blk_col);
+ const uint8_t *src = &p->src.buf[src_idx];
+ const uint8_t *dst = &pd->dst.buf[dst_idx];
+ uint8_t *out_recon_ptr = 0;
+
+ const tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ unsigned int tmp;
+
+ if (sse_calc_done) {
+ tmp = (unsigned int)(*out_sse);
+ } else {
+ tmp = pixel_sse(cpi, xd, pd, src, src_stride, dst, dst_stride, blk_row,
+ blk_col, plane_bsize, tx_bsize);
+ }
+ *out_sse = (int64_t)tmp * 16;
+ if (out_recon) {
+ const int out_recon_idx = 4 * (blk_row * out_recon->stride + blk_col);
+ out_recon_ptr = &out_recon->buf[out_recon_idx];
+ copy_block_visible(xd, pd, dst, dst_stride, out_recon_ptr,
+ out_recon->stride, blk_row, blk_col, plane_bsize,
+ tx_bsize);
+ }
+
+ if (eob) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ DECLARE_ALIGNED(16, uint16_t, recon16[1024]);
+ uint8_t *recon = (uint8_t *)recon16;
+#else
+ DECLARE_ALIGNED(16, uint8_t, recon[1024]);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ vpx_highbd_convolve_copy(CONVERT_TO_SHORTPTR(dst), dst_stride, recon16,
+ 32, NULL, 0, 0, 0, 0, bs, bs, xd->bd);
+ if (xd->lossless) {
+ vp9_highbd_iwht4x4_add(dqcoeff, recon16, 32, eob, xd->bd);
+ } else {
+ switch (tx_size) {
+ case TX_4X4:
+ vp9_highbd_idct4x4_add(dqcoeff, recon16, 32, eob, xd->bd);
+ break;
+ case TX_8X8:
+ vp9_highbd_idct8x8_add(dqcoeff, recon16, 32, eob, xd->bd);
+ break;
+ case TX_16X16:
+ vp9_highbd_idct16x16_add(dqcoeff, recon16, 32, eob, xd->bd);
+ break;
+ default:
+ assert(tx_size == TX_32X32);
+ vp9_highbd_idct32x32_add(dqcoeff, recon16, 32, eob, xd->bd);
+ break;
+ }
+ }
+ recon = CONVERT_TO_BYTEPTR(recon16);
+ } else {
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ vpx_convolve_copy(dst, dst_stride, recon, 32, NULL, 0, 0, 0, 0, bs, bs);
+ switch (tx_size) {
+ case TX_32X32: vp9_idct32x32_add(dqcoeff, recon, 32, eob); break;
+ case TX_16X16: vp9_idct16x16_add(dqcoeff, recon, 32, eob); break;
+ case TX_8X8: vp9_idct8x8_add(dqcoeff, recon, 32, eob); break;
+ default:
+ assert(tx_size == TX_4X4);
+ // this is like vp9_short_idct4x4 but has a special case around
+ // eob<=1, which is significant (not just an optimization) for
+ // the lossless case.
+ x->inv_txfm_add(dqcoeff, recon, 32, eob);
+ break;
+ }
+#if CONFIG_VP9_HIGHBITDEPTH
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ tmp = pixel_sse(cpi, xd, pd, src, src_stride, recon, 32, blk_row, blk_col,
+ plane_bsize, tx_bsize);
+ if (out_recon) {
+ copy_block_visible(xd, pd, recon, 32, out_recon_ptr, out_recon->stride,
+ blk_row, blk_col, plane_bsize, tx_bsize);
+ }
+ }
+
+ *out_dist = (int64_t)tmp * 16;
+ }
+}
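+
+// Illustrative note: dist_block() above has two paths. When x->block_tx_domain
+// is set (and no reconstruction buffer is requested) the distortion is
+// estimated in the transform domain via the block-error functions; otherwise
+// it is the pixel-domain SSE over the visible area, scaled by 16, measured
+// against the inverse-transform reconstruction when eob is non-zero.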
+
+static int rate_block(int plane, int block, TX_SIZE tx_size, int coeff_ctx,
+ struct rdcost_block_args *args) {
+ return cost_coeffs(args->x, plane, block, tx_size, coeff_ctx, args->so->scan,
+ args->so->neighbors, args->use_fast_coef_costing);
+}
+
+static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) {
+ struct rdcost_block_args *args = arg;
+ MACROBLOCK *const x = args->x;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MODE_INFO *const mi = xd->mi[0];
+ int64_t rd1, rd2, rd;
+ int rate;
+ int64_t dist = INT64_MAX;
+ int64_t sse = INT64_MAX;
+ const int coeff_ctx =
+ combine_entropy_contexts(args->t_left[blk_row], args->t_above[blk_col]);
+ struct buf_2d *recon = args->this_recon;
+ const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int dst_stride = pd->dst.stride;
+ const uint8_t *dst = &pd->dst.buf[4 * (blk_row * dst_stride + blk_col)];
+ const int enable_trellis_opt = args->cpi->sf.trellis_opt_tx_rd.method;
+ const double trellis_opt_thresh = args->cpi->sf.trellis_opt_tx_rd.thresh;
+ int sse_calc_done = 0;
+#if CONFIG_MISMATCH_DEBUG
+ struct encode_b_args encode_b_arg = {
+ x, enable_trellis_opt, trellis_opt_thresh, &sse_calc_done,
+ &sse, args->t_above, args->t_left, &mi->skip,
+ 0, // mi_row
+ 0, // mi_col
+ 0 // output_enabled
+ };
+#else
+ struct encode_b_args encode_b_arg = {
+ x, enable_trellis_opt, trellis_opt_thresh, &sse_calc_done,
+ &sse, args->t_above, args->t_left, &mi->skip
+ };
+#endif
+
+ if (args->exit_early) return;
+
+ if (!is_inter_block(mi)) {
+ vp9_encode_block_intra(plane, block, blk_row, blk_col, plane_bsize, tx_size,
+ &encode_b_arg);
+ if (recon) {
+ uint8_t *rec_ptr = &recon->buf[4 * (blk_row * recon->stride + blk_col)];
+ copy_block_visible(xd, pd, dst, dst_stride, rec_ptr, recon->stride,
+ blk_row, blk_col, plane_bsize, tx_bsize);
+ }
+ if (x->block_tx_domain) {
+ dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col,
+ tx_size, &dist, &sse, /*out_recon=*/NULL, sse_calc_done);
+ } else {
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const int src_stride = p->src.stride;
+ const uint8_t *src = &p->src.buf[4 * (blk_row * src_stride + blk_col)];
+ unsigned int tmp;
+ if (!sse_calc_done) {
+ const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+ const int16_t *diff =
+ &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
+ int visible_width, visible_height;
+ sse = sum_squares_visible(xd, pd, diff, diff_stride, blk_row, blk_col,
+ plane_bsize, tx_bsize, &visible_width,
+ &visible_height);
+ }
+#if CONFIG_VP9_HIGHBITDEPTH
+ if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) && (xd->bd > 8))
+ sse = ROUND64_POWER_OF_TWO(sse, (xd->bd - 8) * 2);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ sse = sse * 16;
+ tmp = pixel_sse(args->cpi, xd, pd, src, src_stride, dst, dst_stride,
+ blk_row, blk_col, plane_bsize, tx_bsize);
+ dist = (int64_t)tmp * 16;
+ }
+ } else {
+ int skip_txfm_flag = SKIP_TXFM_NONE;
+ if (max_txsize_lookup[plane_bsize] == tx_size)
+ skip_txfm_flag = x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))];
+
+ // Forcing the encoder to always perform transform and quantization here
+ // reduces the risk of bad perceptual quality due to poor prediction.
+ if (!args->cpi->sf.allow_skip_txfm_ac_dc &&
+ skip_txfm_flag == SKIP_TXFM_AC_DC) {
+ skip_txfm_flag = SKIP_TXFM_NONE;
+ }
+
+ if (skip_txfm_flag == SKIP_TXFM_NONE ||
+ (recon && skip_txfm_flag == SKIP_TXFM_AC_ONLY)) {
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+ const int16_t *const diff =
+ &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
+ const int use_trellis_opt =
+ do_trellis_opt(pd, diff, diff_stride, blk_row, blk_col, plane_bsize,
+ tx_size, &encode_b_arg);
+ // full forward transform and quantization
+ vp9_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, tx_size);
+ if (use_trellis_opt) vp9_optimize_b(x, plane, block, tx_size, coeff_ctx);
+ dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col,
+ tx_size, &dist, &sse, recon, sse_calc_done);
+ } else if (skip_txfm_flag == SKIP_TXFM_AC_ONLY) {
+ // compute DC coefficient
+ tran_low_t *const coeff = BLOCK_OFFSET(x->plane[plane].coeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block);
+ vp9_xform_quant_dc(x, plane, block, blk_row, blk_col, plane_bsize,
+ tx_size);
+ sse = x->bsse[(plane << 2) + (block >> (tx_size << 1))] << 4;
+ dist = sse;
+ if (x->plane[plane].eobs[block]) {
+ const int64_t orig_sse = (int64_t)coeff[0] * coeff[0];
+ const int64_t resd_sse = coeff[0] - dqcoeff[0];
+ int64_t dc_correct = orig_sse - resd_sse * resd_sse;
+#if CONFIG_VP9_HIGHBITDEPTH
+ dc_correct >>= ((xd->bd - 8) * 2);
+#endif
+ if (tx_size != TX_32X32) dc_correct >>= 2;
+
+ dist = VPXMAX(0, sse - dc_correct);
+ }
+ } else {
+ assert(0 && "allow_skip_txfm_ac_dc does not allow SKIP_TXFM_AC_DC.");
+ }
+ }
+
+ rd = RDCOST(x->rdmult, x->rddiv, 0, dist);
+ if (args->this_rd + rd > args->best_rd) {
+ args->exit_early = 1;
+ return;
+ }
+
+ rate = rate_block(plane, block, tx_size, coeff_ctx, args);
+ args->t_above[blk_col] = (x->plane[plane].eobs[block] > 0) ? 1 : 0;
+ args->t_left[blk_row] = (x->plane[plane].eobs[block] > 0) ? 1 : 0;
+ rd1 = RDCOST(x->rdmult, x->rddiv, rate, dist);
+ rd2 = RDCOST(x->rdmult, x->rddiv, 0, sse);
+
+ // TODO(jingning): temporarily enabled only for luma component
+ rd = VPXMIN(rd1, rd2);
+ if (plane == 0) {
+ x->zcoeff_blk[tx_size][block] =
+ !x->plane[plane].eobs[block] ||
+ (x->sharpness == 0 && rd1 > rd2 && !xd->lossless);
+ x->sum_y_eobs[tx_size] += x->plane[plane].eobs[block];
+ }
+
+ args->this_rate += rate;
+ args->this_dist += dist;
+ args->this_sse += sse;
+ args->this_rd += rd;
+
+ if (args->this_rd > args->best_rd) {
+ args->exit_early = 1;
+ return;
+ }
+
+ args->skippable &= !x->plane[plane].eobs[block];
+}
+
+static void txfm_rd_in_plane(const VP9_COMP *cpi, MACROBLOCK *x, int *rate,
+ int64_t *distortion, int *skippable, int64_t *sse,
+ int64_t ref_best_rd, int plane, BLOCK_SIZE bsize,
+ TX_SIZE tx_size, int use_fast_coef_costing,
+ struct buf_2d *recon) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ struct rdcost_block_args args;
+ vp9_zero(args);
+ args.cpi = cpi;
+ args.x = x;
+ args.best_rd = ref_best_rd;
+ args.use_fast_coef_costing = use_fast_coef_costing;
+ args.skippable = 1;
+ args.this_recon = recon;
+
+ if (plane == 0) xd->mi[0]->tx_size = tx_size;
+
+ vp9_get_entropy_contexts(bsize, tx_size, pd, args.t_above, args.t_left);
+
+ args.so = get_scan(xd, tx_size, get_plane_type(plane), 0);
+
+ vp9_foreach_transformed_block_in_plane(xd, bsize, plane, block_rd_txfm,
+ &args);
+ if (args.exit_early) {
+ *rate = INT_MAX;
+ *distortion = INT64_MAX;
+ *sse = INT64_MAX;
+ *skippable = 0;
+ } else {
+ *distortion = args.this_dist;
+ *rate = args.this_rate;
+ *sse = args.this_sse;
+ *skippable = args.skippable;
+ }
+}
+
+static void choose_largest_tx_size(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
+ int64_t *distortion, int *skip, int64_t *sse,
+ int64_t ref_best_rd, BLOCK_SIZE bs,
+ struct buf_2d *recon) {
+ const TX_SIZE max_tx_size = max_txsize_lookup[bs];
+ VP9_COMMON *const cm = &cpi->common;
+ const TX_SIZE largest_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode];
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MODE_INFO *const mi = xd->mi[0];
+
+ mi->tx_size = VPXMIN(max_tx_size, largest_tx_size);
+
+ txfm_rd_in_plane(cpi, x, rate, distortion, skip, sse, ref_best_rd, 0, bs,
+ mi->tx_size, cpi->sf.use_fast_coef_costing, recon);
+}
+
+static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
+ int64_t *distortion, int *skip,
+ int64_t *psse, int64_t ref_best_rd,
+ BLOCK_SIZE bs, struct buf_2d *recon) {
+ const TX_SIZE max_tx_size = max_txsize_lookup[bs];
+ VP9_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MODE_INFO *const mi = xd->mi[0];
+ vpx_prob skip_prob = vp9_get_skip_prob(cm, xd);
+ int r[TX_SIZES][2], s[TX_SIZES];
+ int64_t d[TX_SIZES], sse[TX_SIZES];
+ int64_t rd[TX_SIZES][2] = { { INT64_MAX, INT64_MAX },
+ { INT64_MAX, INT64_MAX },
+ { INT64_MAX, INT64_MAX },
+ { INT64_MAX, INT64_MAX } };
+ int n;
+ int s0, s1;
+ int64_t best_rd = ref_best_rd;
+ TX_SIZE best_tx = max_tx_size;
+ int start_tx, end_tx;
+ const int tx_size_ctx = get_tx_size_context(xd);
+#if CONFIG_VP9_HIGHBITDEPTH
+ DECLARE_ALIGNED(16, uint16_t, recon_buf16[TX_SIZES][64 * 64]);
+ uint8_t *recon_buf[TX_SIZES];
+ for (n = 0; n < TX_SIZES; ++n) {
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ recon_buf[n] = CONVERT_TO_BYTEPTR(recon_buf16[n]);
+ } else {
+ recon_buf[n] = (uint8_t *)recon_buf16[n];
+ }
+ }
+#else
+ DECLARE_ALIGNED(16, uint8_t, recon_buf[TX_SIZES][64 * 64]);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ assert(skip_prob > 0);
+ s0 = vp9_cost_bit(skip_prob, 0);
+ s1 = vp9_cost_bit(skip_prob, 1);
+
+ if (cm->tx_mode == TX_MODE_SELECT) {
+ start_tx = max_tx_size;
+ end_tx = VPXMAX(start_tx - cpi->sf.tx_size_search_depth, 0);
+ if (bs > BLOCK_32X32) end_tx = VPXMIN(end_tx + 1, start_tx);
+ } else {
+ TX_SIZE chosen_tx_size =
+ VPXMIN(max_tx_size, tx_mode_to_biggest_tx_size[cm->tx_mode]);
+ start_tx = chosen_tx_size;
+ end_tx = chosen_tx_size;
+ }
+
+ for (n = start_tx; n >= end_tx; n--) {
+ const int r_tx_size = cpi->tx_size_cost[max_tx_size - 1][tx_size_ctx][n];
+ if (recon) {
+ struct buf_2d this_recon;
+ this_recon.buf = recon_buf[n];
+ this_recon.stride = recon->stride;
+ txfm_rd_in_plane(cpi, x, &r[n][0], &d[n], &s[n], &sse[n], best_rd, 0, bs,
+ n, cpi->sf.use_fast_coef_costing, &this_recon);
+ } else {
+ txfm_rd_in_plane(cpi, x, &r[n][0], &d[n], &s[n], &sse[n], best_rd, 0, bs,
+ n, cpi->sf.use_fast_coef_costing, 0);
+ }
+ r[n][1] = r[n][0];
+ if (r[n][0] < INT_MAX) {
+ r[n][1] += r_tx_size;
+ }
+ if (d[n] == INT64_MAX || r[n][0] == INT_MAX) {
+ rd[n][0] = rd[n][1] = INT64_MAX;
+ } else if (s[n]) {
+ if (is_inter_block(mi)) {
+ rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, sse[n]);
+ r[n][1] -= r_tx_size;
+ } else {
+ rd[n][0] = RDCOST(x->rdmult, x->rddiv, s1, sse[n]);
+ rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1 + r_tx_size, sse[n]);
+ }
+ } else {
+ rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0] + s0, d[n]);
+ rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]);
+ }
+
+ if (is_inter_block(mi) && !xd->lossless && !s[n] && sse[n] != INT64_MAX) {
+ rd[n][0] = VPXMIN(rd[n][0], RDCOST(x->rdmult, x->rddiv, s1, sse[n]));
+ rd[n][1] = VPXMIN(rd[n][1], RDCOST(x->rdmult, x->rddiv, s1, sse[n]));
+ }
+
+ // Early termination in transform size search.
+ if (cpi->sf.tx_size_search_breakout &&
+ (rd[n][1] == INT64_MAX ||
+ (n < (int)max_tx_size && rd[n][1] > rd[n + 1][1]) || s[n] == 1))
+ break;
+
+ if (rd[n][1] < best_rd) {
+ best_tx = n;
+ best_rd = rd[n][1];
+ }
+ }
+ mi->tx_size = best_tx;
+
+ *distortion = d[mi->tx_size];
+ *rate = r[mi->tx_size][cm->tx_mode == TX_MODE_SELECT];
+ *skip = s[mi->tx_size];
+ *psse = sse[mi->tx_size];
+ if (recon) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ memcpy(CONVERT_TO_SHORTPTR(recon->buf),
+ CONVERT_TO_SHORTPTR(recon_buf[mi->tx_size]),
+ 64 * 64 * sizeof(uint16_t));
+ } else {
+#endif
+ memcpy(recon->buf, recon_buf[mi->tx_size], 64 * 64);
+#if CONFIG_VP9_HIGHBITDEPTH
+ }
+#endif
+ }
+}
+
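+// Compute luma rate/distortion for the whole block, either at the largest
+// allowed transform size or via a per-size RD search, depending on the
+// tx_size_search_method speed feature (lossless blocks take the former path).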
+static void super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
+ int64_t *distortion, int *skip, int64_t *psse,
+ BLOCK_SIZE bs, int64_t ref_best_rd,
+ struct buf_2d *recon) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ int64_t sse;
+ int64_t *ret_sse = psse ? psse : &sse;
+
+ assert(bs == xd->mi[0]->sb_type);
+
+ if (cpi->sf.tx_size_search_method == USE_LARGESTALL || xd->lossless) {
+ choose_largest_tx_size(cpi, x, rate, distortion, skip, ret_sse, ref_best_rd,
+ bs, recon);
+ } else {
+ choose_tx_size_from_rd(cpi, x, rate, distortion, skip, ret_sse, ref_best_rd,
+ bs, recon);
+ }
+}
+
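+// Return 1 if the given oblique intra mode can be skipped because the best
+// mode found so far is not one of its neighboring directional modes.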
+static int conditional_skipintra(PREDICTION_MODE mode,
+ PREDICTION_MODE best_intra_mode) {
+ if (mode == D117_PRED && best_intra_mode != V_PRED &&
+ best_intra_mode != D135_PRED)
+ return 1;
+ if (mode == D63_PRED && best_intra_mode != V_PRED &&
+ best_intra_mode != D45_PRED)
+ return 1;
+ if (mode == D207_PRED && best_intra_mode != H_PRED &&
+ best_intra_mode != D45_PRED)
+ return 1;
+ if (mode == D153_PRED && best_intra_mode != H_PRED &&
+ best_intra_mode != D135_PRED)
+ return 1;
+ return 0;
+}
+
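+// RD search over intra prediction modes for one 4x4 (or 4x8/8x4) sub-block:
+// each candidate mode is predicted, transformed, quantized and costed, and
+// the reconstruction of the winning mode is copied back into the destination
+// buffer.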
+static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row,
+ int col, PREDICTION_MODE *best_mode,
+ const int *bmode_costs, ENTROPY_CONTEXT *a,
+ ENTROPY_CONTEXT *l, int *bestrate,
+ int *bestratey, int64_t *bestdistortion,
+ BLOCK_SIZE bsize, int64_t rd_thresh) {
+ PREDICTION_MODE mode;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ int64_t best_rd = rd_thresh;
+ struct macroblock_plane *p = &x->plane[0];
+ struct macroblockd_plane *pd = &xd->plane[0];
+ const int src_stride = p->src.stride;
+ const int dst_stride = pd->dst.stride;
+ const uint8_t *src_init = &p->src.buf[row * 4 * src_stride + col * 4];
+ uint8_t *dst_init = &pd->dst.buf[row * 4 * src_stride + col * 4];
+ ENTROPY_CONTEXT ta[2], tempa[2];
+ ENTROPY_CONTEXT tl[2], templ[2];
+ const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
+ const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
+ int idx, idy;
+ uint8_t best_dst[8 * 8];
+#if CONFIG_VP9_HIGHBITDEPTH
+ uint16_t best_dst16[8 * 8];
+#endif
+ memcpy(ta, a, num_4x4_blocks_wide * sizeof(a[0]));
+ memcpy(tl, l, num_4x4_blocks_high * sizeof(l[0]));
+
+ xd->mi[0]->tx_size = TX_4X4;
+
+ assert(!x->skip_block);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
+ int64_t this_rd;
+ int ratey = 0;
+ int64_t distortion = 0;
+ int rate = bmode_costs[mode];
+
+ if (!(cpi->sf.intra_y_mode_mask[TX_4X4] & (1 << mode))) continue;
+
+ // Only do the oblique modes if the best so far is
+ // one of the neighboring directional modes
+ if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
+ if (conditional_skipintra(mode, *best_mode)) continue;
+ }
+
+ memcpy(tempa, ta, num_4x4_blocks_wide * sizeof(ta[0]));
+ memcpy(templ, tl, num_4x4_blocks_high * sizeof(tl[0]));
+
+ for (idy = 0; idy < num_4x4_blocks_high; ++idy) {
+ for (idx = 0; idx < num_4x4_blocks_wide; ++idx) {
+ const int block = (row + idy) * 2 + (col + idx);
+ const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride];
+ uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
+ uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst);
+ int16_t *const src_diff =
+ vp9_raster_block_offset_int16(BLOCK_8X8, block, p->src_diff);
+ tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+ tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ uint16_t *const eob = &p->eobs[block];
+ xd->mi[0]->bmi[block].as_mode = mode;
+ vp9_predict_intra_block(xd, 1, TX_4X4, mode,
+ x->skip_encode ? src : dst,
+ x->skip_encode ? src_stride : dst_stride, dst,
+ dst_stride, col + idx, row + idy, 0);
+ vpx_highbd_subtract_block(4, 4, src_diff, 8, src, src_stride, dst,
+ dst_stride, xd->bd);
+ if (xd->lossless) {
+ const ScanOrder *so = &vp9_default_scan_orders[TX_4X4];
+ const int coeff_ctx =
+ combine_entropy_contexts(tempa[idx], templ[idy]);
+ vp9_highbd_fwht4x4(src_diff, coeff, 8);
+ vpx_highbd_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant,
+ p->quant_shift, qcoeff, dqcoeff, pd->dequant,
+ eob, so->scan, so->iscan);
+ ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan,
+ so->neighbors, cpi->sf.use_fast_coef_costing);
+ tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0 ? 1 : 0);
+ if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
+ goto next_highbd;
+ vp9_highbd_iwht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block), dst16,
+ dst_stride, p->eobs[block], xd->bd);
+ } else {
+ int64_t unused;
+ const TX_TYPE tx_type = get_tx_type_4x4(PLANE_TYPE_Y, xd, block);
+ const ScanOrder *so = &vp9_scan_orders[TX_4X4][tx_type];
+ const int coeff_ctx =
+ combine_entropy_contexts(tempa[idx], templ[idy]);
+ if (tx_type == DCT_DCT)
+ vpx_highbd_fdct4x4(src_diff, coeff, 8);
+ else
+ vp9_highbd_fht4x4(src_diff, coeff, 8, tx_type);
+ vpx_highbd_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant,
+ p->quant_shift, qcoeff, dqcoeff, pd->dequant,
+ eob, so->scan, so->iscan);
+ ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan,
+ so->neighbors, cpi->sf.use_fast_coef_costing);
+ distortion += vp9_highbd_block_error_dispatch(
+ coeff, BLOCK_OFFSET(pd->dqcoeff, block), 16,
+ &unused, xd->bd) >>
+ 2;
+ tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0 ? 1 : 0);
+ if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
+ goto next_highbd;
+ vp9_highbd_iht4x4_add(tx_type, BLOCK_OFFSET(pd->dqcoeff, block),
+ dst16, dst_stride, p->eobs[block], xd->bd);
+ }
+ }
+ }
+
+ rate += ratey;
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+
+ if (this_rd < best_rd) {
+ *bestrate = rate;
+ *bestratey = ratey;
+ *bestdistortion = distortion;
+ best_rd = this_rd;
+ *best_mode = mode;
+ memcpy(a, tempa, num_4x4_blocks_wide * sizeof(tempa[0]));
+ memcpy(l, templ, num_4x4_blocks_high * sizeof(templ[0]));
+ for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy) {
+ memcpy(best_dst16 + idy * 8,
+ CONVERT_TO_SHORTPTR(dst_init + idy * dst_stride),
+ num_4x4_blocks_wide * 4 * sizeof(uint16_t));
+ }
+ }
+ next_highbd : {}
+ }
+ if (best_rd >= rd_thresh || x->skip_encode) return best_rd;
+
+ for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy) {
+ memcpy(CONVERT_TO_SHORTPTR(dst_init + idy * dst_stride),
+ best_dst16 + idy * 8, num_4x4_blocks_wide * 4 * sizeof(uint16_t));
+ }
+
+ return best_rd;
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
+ int64_t this_rd;
+ int ratey = 0;
+ int64_t distortion = 0;
+ int rate = bmode_costs[mode];
+
+ if (!(cpi->sf.intra_y_mode_mask[TX_4X4] & (1 << mode))) continue;
+
+ // Only do the oblique modes if the best so far is
+ // one of the neighboring directional modes
+ if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
+ if (conditional_skipintra(mode, *best_mode)) continue;
+ }
+
+ memcpy(tempa, ta, num_4x4_blocks_wide * sizeof(ta[0]));
+ memcpy(templ, tl, num_4x4_blocks_high * sizeof(tl[0]));
+
+ for (idy = 0; idy < num_4x4_blocks_high; ++idy) {
+ for (idx = 0; idx < num_4x4_blocks_wide; ++idx) {
+ const int block = (row + idy) * 2 + (col + idx);
+ const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride];
+ uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
+ int16_t *const src_diff =
+ vp9_raster_block_offset_int16(BLOCK_8X8, block, p->src_diff);
+ tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+ tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ uint16_t *const eob = &p->eobs[block];
+ xd->mi[0]->bmi[block].as_mode = mode;
+ vp9_predict_intra_block(xd, 1, TX_4X4, mode, x->skip_encode ? src : dst,
+ x->skip_encode ? src_stride : dst_stride, dst,
+ dst_stride, col + idx, row + idy, 0);
+ vpx_subtract_block(4, 4, src_diff, 8, src, src_stride, dst, dst_stride);
+
+ if (xd->lossless) {
+ const ScanOrder *so = &vp9_default_scan_orders[TX_4X4];
+ const int coeff_ctx =
+ combine_entropy_contexts(tempa[idx], templ[idy]);
+ vp9_fwht4x4(src_diff, coeff, 8);
+ vpx_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant,
+ p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
+ so->scan, so->iscan);
+ ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan,
+ so->neighbors, cpi->sf.use_fast_coef_costing);
+ tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0) ? 1 : 0;
+ if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
+ goto next;
+ vp9_iwht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block), dst, dst_stride,
+ p->eobs[block]);
+ } else {
+ int64_t unused;
+ const TX_TYPE tx_type = get_tx_type_4x4(PLANE_TYPE_Y, xd, block);
+ const ScanOrder *so = &vp9_scan_orders[TX_4X4][tx_type];
+ const int coeff_ctx =
+ combine_entropy_contexts(tempa[idx], templ[idy]);
+ vp9_fht4x4(src_diff, coeff, 8, tx_type);
+ vpx_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant,
+ p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
+ so->scan, so->iscan);
+ ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan,
+ so->neighbors, cpi->sf.use_fast_coef_costing);
+ tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0) ? 1 : 0;
+ distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block),
+ 16, &unused) >>
+ 2;
+ if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
+ goto next;
+ vp9_iht4x4_add(tx_type, BLOCK_OFFSET(pd->dqcoeff, block), dst,
+ dst_stride, p->eobs[block]);
+ }
+ }
+ }
+
+ rate += ratey;
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+
+ if (this_rd < best_rd) {
+ *bestrate = rate;
+ *bestratey = ratey;
+ *bestdistortion = distortion;
+ best_rd = this_rd;
+ *best_mode = mode;
+ memcpy(a, tempa, num_4x4_blocks_wide * sizeof(tempa[0]));
+ memcpy(l, templ, num_4x4_blocks_high * sizeof(templ[0]));
+ for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy)
+ memcpy(best_dst + idy * 8, dst_init + idy * dst_stride,
+ num_4x4_blocks_wide * 4);
+ }
+ next : {}
+ }
+
+ if (best_rd >= rd_thresh || x->skip_encode) return best_rd;
+
+ for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy)
+ memcpy(dst_init + idy * dst_stride, best_dst + idy * 8,
+ num_4x4_blocks_wide * 4);
+
+ return best_rd;
+}
+
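+// Pick the best intra prediction mode for each sub-block of an 8x8 block and
+// return the accumulated RD cost, or INT64_MAX on early termination.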
+static int64_t rd_pick_intra_sub_8x8_y_mode(VP9_COMP *cpi, MACROBLOCK *mb,
+ int *rate, int *rate_y,
+ int64_t *distortion,
+ int64_t best_rd) {
+ int i, j;
+ const MACROBLOCKD *const xd = &mb->e_mbd;
+ MODE_INFO *const mic = xd->mi[0];
+ const MODE_INFO *above_mi = xd->above_mi;
+ const MODE_INFO *left_mi = xd->left_mi;
+ const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+ const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
+ const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
+ int idx, idy;
+ int cost = 0;
+ int64_t total_distortion = 0;
+ int tot_rate_y = 0;
+ int64_t total_rd = 0;
+ const int *bmode_costs = cpi->mbmode_cost;
+
+ // Pick modes for each sub-block (of size 4x4, 4x8, or 8x4) in an 8x8 block.
+ for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
+ for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
+ PREDICTION_MODE best_mode = DC_PRED;
+ int r = INT_MAX, ry = INT_MAX;
+ int64_t d = INT64_MAX, this_rd = INT64_MAX;
+ i = idy * 2 + idx;
+ if (cpi->common.frame_type == KEY_FRAME) {
+ const PREDICTION_MODE A = vp9_above_block_mode(mic, above_mi, i);
+ const PREDICTION_MODE L = vp9_left_block_mode(mic, left_mi, i);
+
+ bmode_costs = cpi->y_mode_costs[A][L];
+ }
+
+ this_rd = rd_pick_intra4x4block(
+ cpi, mb, idy, idx, &best_mode, bmode_costs,
+ xd->plane[0].above_context + idx, xd->plane[0].left_context + idy, &r,
+ &ry, &d, bsize, best_rd - total_rd);
+
+ if (this_rd >= best_rd - total_rd) return INT64_MAX;
+
+ total_rd += this_rd;
+ cost += r;
+ total_distortion += d;
+ tot_rate_y += ry;
+
+ mic->bmi[i].as_mode = best_mode;
+ for (j = 1; j < num_4x4_blocks_high; ++j)
+ mic->bmi[i + j * 2].as_mode = best_mode;
+ for (j = 1; j < num_4x4_blocks_wide; ++j)
+ mic->bmi[i + j].as_mode = best_mode;
+
+ if (total_rd >= best_rd) return INT64_MAX;
+ }
+ }
+
+ *rate = cost;
+ *rate_y = tot_rate_y;
+ *distortion = total_distortion;
+ mic->mode = mic->bmi[3].as_mode;
+
+ return RDCOST(mb->rdmult, mb->rddiv, cost, total_distortion);
+}
+
+// This function is used only for intra_only frames
+static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
+ int *rate_tokenonly, int64_t *distortion,
+ int *skippable, BLOCK_SIZE bsize,
+ int64_t best_rd) {
+ PREDICTION_MODE mode;
+ PREDICTION_MODE mode_selected = DC_PRED;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MODE_INFO *const mic = xd->mi[0];
+ int this_rate, this_rate_tokenonly, s;
+ int64_t this_distortion, this_rd;
+ TX_SIZE best_tx = TX_4X4;
+ int *bmode_costs;
+ const MODE_INFO *above_mi = xd->above_mi;
+ const MODE_INFO *left_mi = xd->left_mi;
+ const PREDICTION_MODE A = vp9_above_block_mode(mic, above_mi, 0);
+ const PREDICTION_MODE L = vp9_left_block_mode(mic, left_mi, 0);
+ bmode_costs = cpi->y_mode_costs[A][L];
+
+ memset(x->skip_txfm, SKIP_TXFM_NONE, sizeof(x->skip_txfm));
+ /* Y Search for intra prediction mode */
+ for (mode = DC_PRED; mode <= TM_PRED; mode++) {
+ if (cpi->sf.use_nonrd_pick_mode) {
+      // These speed features are turned on in hybrid non-RD and RD mode
+      // for key frame coding in real-time settings.
+ if (conditional_skipintra(mode, mode_selected)) continue;
+ if (*skippable) break;
+ }
+
+ mic->mode = mode;
+
+ super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s, NULL,
+ bsize, best_rd, /*recon=*/NULL);
+
+ if (this_rate_tokenonly == INT_MAX) continue;
+
+ this_rate = this_rate_tokenonly + bmode_costs[mode];
+ this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
+
+ if (this_rd < best_rd) {
+ mode_selected = mode;
+ best_rd = this_rd;
+ best_tx = mic->tx_size;
+ *rate = this_rate;
+ *rate_tokenonly = this_rate_tokenonly;
+ *distortion = this_distortion;
+ *skippable = s;
+ }
+ }
+
+ mic->mode = mode_selected;
+ mic->tx_size = best_tx;
+
+ return best_rd;
+}
+
+// Return value 0: early termination triggered, no valid rd cost available;
+// 1: rd cost values are valid.
+static int super_block_uvrd(const VP9_COMP *cpi, MACROBLOCK *x, int *rate,
+ int64_t *distortion, int *skippable, int64_t *sse,
+ BLOCK_SIZE bsize, int64_t ref_best_rd) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MODE_INFO *const mi = xd->mi[0];
+ const TX_SIZE uv_tx_size = get_uv_tx_size(mi, &xd->plane[1]);
+ int plane;
+ int pnrate = 0, pnskip = 1;
+ int64_t pndist = 0, pnsse = 0;
+ int is_cost_valid = 1;
+
+ if (ref_best_rd < 0) is_cost_valid = 0;
+
+ if (is_inter_block(mi) && is_cost_valid) {
+ for (plane = 1; plane < MAX_MB_PLANE; ++plane)
+ vp9_subtract_plane(x, bsize, plane);
+ }
+
+ *rate = 0;
+ *distortion = 0;
+ *sse = 0;
+ *skippable = 1;
+
+ for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
+ txfm_rd_in_plane(cpi, x, &pnrate, &pndist, &pnskip, &pnsse, ref_best_rd,
+ plane, bsize, uv_tx_size, cpi->sf.use_fast_coef_costing,
+ /*recon=*/NULL);
+ if (pnrate == INT_MAX) {
+ is_cost_valid = 0;
+ break;
+ }
+ *rate += pnrate;
+ *distortion += pndist;
+ *sse += pnsse;
+ *skippable &= pnskip;
+ }
+
+ if (!is_cost_valid) {
+ // reset cost value
+ *rate = INT_MAX;
+ *distortion = INT64_MAX;
+ *sse = INT64_MAX;
+ *skippable = 0;
+ }
+
+ return is_cost_valid;
+}
+
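+// RD search over chroma intra prediction modes; returns the best RD cost and
+// leaves the selected mode in xd->mi[0]->uv_mode.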
+static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
+ PICK_MODE_CONTEXT *ctx, int *rate,
+ int *rate_tokenonly, int64_t *distortion,
+ int *skippable, BLOCK_SIZE bsize,
+ TX_SIZE max_tx_size) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ PREDICTION_MODE mode;
+ PREDICTION_MODE mode_selected = DC_PRED;
+ int64_t best_rd = INT64_MAX, this_rd;
+ int this_rate_tokenonly, this_rate, s;
+ int64_t this_distortion, this_sse;
+
+ memset(x->skip_txfm, SKIP_TXFM_NONE, sizeof(x->skip_txfm));
+ for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
+ if (!(cpi->sf.intra_uv_mode_mask[max_tx_size] & (1 << mode))) continue;
+#if CONFIG_BETTER_HW_COMPATIBILITY && CONFIG_VP9_HIGHBITDEPTH
+ if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) &&
+ (xd->above_mi == NULL || xd->left_mi == NULL) && need_top_left[mode])
+ continue;
+#endif // CONFIG_BETTER_HW_COMPATIBILITY && CONFIG_VP9_HIGHBITDEPTH
+
+ xd->mi[0]->uv_mode = mode;
+
+ if (!super_block_uvrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s,
+ &this_sse, bsize, best_rd))
+ continue;
+ this_rate =
+ this_rate_tokenonly +
+ cpi->intra_uv_mode_cost[cpi->common.frame_type][xd->mi[0]->mode][mode];
+ this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
+
+ if (this_rd < best_rd) {
+ mode_selected = mode;
+ best_rd = this_rd;
+ *rate = this_rate;
+ *rate_tokenonly = this_rate_tokenonly;
+ *distortion = this_distortion;
+ *skippable = s;
+ if (!x->select_tx_size) swap_block_ptr(x, ctx, 2, 0, 1, MAX_MB_PLANE);
+ }
+ }
+
+ xd->mi[0]->uv_mode = mode_selected;
+ return best_rd;
+}
+
+#if !CONFIG_REALTIME_ONLY
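+// Estimate the chroma RD cost assuming DC_PRED; used as a cheap substitute
+// for the full chroma intra mode search when the use_uv_intra_rd_estimate
+// speed feature is enabled.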
+static int64_t rd_sbuv_dcpred(const VP9_COMP *cpi, MACROBLOCK *x, int *rate,
+ int *rate_tokenonly, int64_t *distortion,
+ int *skippable, BLOCK_SIZE bsize) {
+ const VP9_COMMON *cm = &cpi->common;
+ int64_t unused;
+
+ x->e_mbd.mi[0]->uv_mode = DC_PRED;
+ memset(x->skip_txfm, SKIP_TXFM_NONE, sizeof(x->skip_txfm));
+ super_block_uvrd(cpi, x, rate_tokenonly, distortion, skippable, &unused,
+ bsize, INT64_MAX);
+ *rate =
+ *rate_tokenonly +
+ cpi->intra_uv_mode_cost[cm->frame_type][x->e_mbd.mi[0]->mode][DC_PRED];
+ return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
+}
+
+static void choose_intra_uv_mode(VP9_COMP *cpi, MACROBLOCK *const x,
+ PICK_MODE_CONTEXT *ctx, BLOCK_SIZE bsize,
+ TX_SIZE max_tx_size, int *rate_uv,
+ int *rate_uv_tokenonly, int64_t *dist_uv,
+ int *skip_uv, PREDICTION_MODE *mode_uv) {
+ // Use an estimated rd for uv_intra based on DC_PRED if the
+ // appropriate speed flag is set.
+ if (cpi->sf.use_uv_intra_rd_estimate) {
+ rd_sbuv_dcpred(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv, skip_uv,
+ bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize);
+ // Else do a proper rd search for each possible transform size that may
+ // be considered in the main rd loop.
+ } else {
+ rd_pick_intra_sbuv_mode(cpi, x, ctx, rate_uv, rate_uv_tokenonly, dist_uv,
+ skip_uv, bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize,
+ max_tx_size);
+ }
+ *mode_uv = x->e_mbd.mi[0]->uv_mode;
+}
+
+static int cost_mv_ref(const VP9_COMP *cpi, PREDICTION_MODE mode,
+ int mode_context) {
+ assert(is_inter_mode(mode));
+ return cpi->inter_mode_cost[mode_context][INTER_OFFSET(mode)];
+}
+
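+// Fill in the motion vector(s) of sub-block i for the given mode, propagate
+// them to the 4x4 units it covers, and return the mode + mv rate cost.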
+static int set_and_cost_bmi_mvs(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
+ int i, PREDICTION_MODE mode, int_mv this_mv[2],
+ int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
+ int_mv seg_mvs[MAX_REF_FRAMES],
+ int_mv *best_ref_mv[2], const int *mvjcost,
+ int *mvcost[2]) {
+ MODE_INFO *const mi = xd->mi[0];
+ const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+ int thismvcost = 0;
+ int idx, idy;
+ const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[mi->sb_type];
+ const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[mi->sb_type];
+ const int is_compound = has_second_ref(mi);
+
+ switch (mode) {
+ case NEWMV:
+ this_mv[0].as_int = seg_mvs[mi->ref_frame[0]].as_int;
+ thismvcost += vp9_mv_bit_cost(&this_mv[0].as_mv, &best_ref_mv[0]->as_mv,
+ mvjcost, mvcost, MV_COST_WEIGHT_SUB);
+ if (is_compound) {
+ this_mv[1].as_int = seg_mvs[mi->ref_frame[1]].as_int;
+ thismvcost += vp9_mv_bit_cost(&this_mv[1].as_mv, &best_ref_mv[1]->as_mv,
+ mvjcost, mvcost, MV_COST_WEIGHT_SUB);
+ }
+ break;
+ case NEARMV:
+ case NEARESTMV:
+ this_mv[0].as_int = frame_mv[mode][mi->ref_frame[0]].as_int;
+ if (is_compound)
+ this_mv[1].as_int = frame_mv[mode][mi->ref_frame[1]].as_int;
+ break;
+ default:
+ assert(mode == ZEROMV);
+ this_mv[0].as_int = 0;
+ if (is_compound) this_mv[1].as_int = 0;
+ break;
+ }
+
+ mi->bmi[i].as_mv[0].as_int = this_mv[0].as_int;
+ if (is_compound) mi->bmi[i].as_mv[1].as_int = this_mv[1].as_int;
+
+ mi->bmi[i].as_mode = mode;
+
+ for (idy = 0; idy < num_4x4_blocks_high; ++idy)
+ for (idx = 0; idx < num_4x4_blocks_wide; ++idx)
+ memmove(&mi->bmi[i + idy * 2 + idx], &mi->bmi[i], sizeof(mi->bmi[i]));
+
+ return cost_mv_ref(cpi, mode, mbmi_ext->mode_context[mi->ref_frame[0]]) +
+ thismvcost;
+}
+
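+// Build the inter predictor for one sub-8x8 segment, transform-code the
+// residual, and return the resulting RD cost (INT64_MAX once it exceeds
+// best_yrd).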
+static int64_t encode_inter_mb_segment(VP9_COMP *cpi, MACROBLOCK *x,
+ int64_t best_yrd, int i, int *labelyrate,
+ int64_t *distortion, int64_t *sse,
+ ENTROPY_CONTEXT *ta, ENTROPY_CONTEXT *tl,
+ int mi_row, int mi_col) {
+ int k;
+ MACROBLOCKD *xd = &x->e_mbd;
+ struct macroblockd_plane *const pd = &xd->plane[0];
+ struct macroblock_plane *const p = &x->plane[0];
+ MODE_INFO *const mi = xd->mi[0];
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(mi->sb_type, pd);
+ const int width = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+ const int height = 4 * num_4x4_blocks_high_lookup[plane_bsize];
+ int idx, idy;
+
+ const uint8_t *const src =
+ &p->src.buf[vp9_raster_block_offset(BLOCK_8X8, i, p->src.stride)];
+ uint8_t *const dst =
+ &pd->dst.buf[vp9_raster_block_offset(BLOCK_8X8, i, pd->dst.stride)];
+ int64_t thisdistortion = 0, thissse = 0;
+ int thisrate = 0, ref;
+ const ScanOrder *so = &vp9_default_scan_orders[TX_4X4];
+ const int is_compound = has_second_ref(mi);
+ const InterpKernel *kernel = vp9_filter_kernels[mi->interp_filter];
+
+ assert(!x->skip_block);
+
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
+ const int bw = b_width_log2_lookup[BLOCK_8X8];
+ const int h = 4 * (i >> bw);
+ const int w = 4 * (i & ((1 << bw) - 1));
+ const struct scale_factors *sf = &xd->block_refs[ref]->sf;
+ int y_stride = pd->pre[ref].stride;
+ uint8_t *pre = pd->pre[ref].buf + (h * pd->pre[ref].stride + w);
+
+ if (vp9_is_scaled(sf)) {
+ const int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x));
+ const int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y));
+
+ y_stride = xd->block_refs[ref]->buf->y_stride;
+ pre = xd->block_refs[ref]->buf->y_buffer;
+ pre += scaled_buffer_offset(x_start + w, y_start + h, y_stride, sf);
+ }
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ vp9_highbd_build_inter_predictor(
+ CONVERT_TO_SHORTPTR(pre), y_stride, CONVERT_TO_SHORTPTR(dst),
+ pd->dst.stride, &mi->bmi[i].as_mv[ref].as_mv,
+ &xd->block_refs[ref]->sf, width, height, ref, kernel, MV_PRECISION_Q3,
+ mi_col * MI_SIZE + 4 * (i % 2), mi_row * MI_SIZE + 4 * (i / 2),
+ xd->bd);
+ } else {
+ vp9_build_inter_predictor(
+ pre, y_stride, dst, pd->dst.stride, &mi->bmi[i].as_mv[ref].as_mv,
+ &xd->block_refs[ref]->sf, width, height, ref, kernel, MV_PRECISION_Q3,
+ mi_col * MI_SIZE + 4 * (i % 2), mi_row * MI_SIZE + 4 * (i / 2));
+ }
+#else
+ vp9_build_inter_predictor(
+ pre, y_stride, dst, pd->dst.stride, &mi->bmi[i].as_mv[ref].as_mv,
+ &xd->block_refs[ref]->sf, width, height, ref, kernel, MV_PRECISION_Q3,
+ mi_col * MI_SIZE + 4 * (i % 2), mi_row * MI_SIZE + 4 * (i / 2));
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ vpx_highbd_subtract_block(
+ height, width, vp9_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff),
+ 8, src, p->src.stride, dst, pd->dst.stride, xd->bd);
+ } else {
+ vpx_subtract_block(height, width,
+ vp9_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff),
+ 8, src, p->src.stride, dst, pd->dst.stride);
+ }
+#else
+ vpx_subtract_block(height, width,
+ vp9_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff),
+ 8, src, p->src.stride, dst, pd->dst.stride);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ k = i;
+ for (idy = 0; idy < height / 4; ++idy) {
+ for (idx = 0; idx < width / 4; ++idx) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const int bd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd : 8;
+#endif
+ int64_t ssz, rd, rd1, rd2;
+ tran_low_t *coeff, *qcoeff, *dqcoeff;
+ uint16_t *eob;
+ int coeff_ctx;
+ k += (idy * 2 + idx);
+ coeff_ctx = combine_entropy_contexts(ta[k & 1], tl[k >> 1]);
+ coeff = BLOCK_OFFSET(p->coeff, k);
+ qcoeff = BLOCK_OFFSET(p->qcoeff, k);
+ dqcoeff = BLOCK_OFFSET(pd->dqcoeff, k);
+ eob = &p->eobs[k];
+
+ x->fwd_txfm4x4(vp9_raster_block_offset_int16(BLOCK_8X8, k, p->src_diff),
+ coeff, 8);
+#if CONFIG_VP9_HIGHBITDEPTH
+ vpx_highbd_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant,
+ p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
+ so->scan, so->iscan);
+ thisdistortion += vp9_highbd_block_error_dispatch(
+ coeff, BLOCK_OFFSET(pd->dqcoeff, k), 16, &ssz, bd);
+#else
+ vpx_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant, p->quant_shift,
+ qcoeff, dqcoeff, pd->dequant, eob, so->scan, so->iscan);
+ thisdistortion +=
+ vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k), 16, &ssz);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ thissse += ssz;
+ thisrate += cost_coeffs(x, 0, k, TX_4X4, coeff_ctx, so->scan,
+ so->neighbors, cpi->sf.use_fast_coef_costing);
+ ta[k & 1] = tl[k >> 1] = (x->plane[0].eobs[k] > 0) ? 1 : 0;
+ rd1 = RDCOST(x->rdmult, x->rddiv, thisrate, thisdistortion >> 2);
+ rd2 = RDCOST(x->rdmult, x->rddiv, 0, thissse >> 2);
+ rd = VPXMIN(rd1, rd2);
+ if (rd >= best_yrd) return INT64_MAX;
+ }
+ }
+
+ *distortion = thisdistortion >> 2;
+ *labelyrate = thisrate;
+ *sse = thissse >> 2;
+
+ return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion);
+}
+#endif // !CONFIG_REALTIME_ONLY
+
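+// Per-segment RD statistics for one inter mode of a sub-8x8 block.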
+typedef struct {
+ int eobs;
+ int brate;
+ int byrate;
+ int64_t bdist;
+ int64_t bsse;
+ int64_t brdcost;
+ int_mv mvs[2];
+ ENTROPY_CONTEXT ta[2];
+ ENTROPY_CONTEXT tl[2];
+} SEG_RDSTAT;
+
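+// Best segmentation decision found so far for a sub-8x8 block, including the
+// per-segment statistics for every inter mode.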
+typedef struct {
+ int_mv *ref_mv[2];
+ int_mv mvp;
+
+ int64_t segment_rd;
+ int r;
+ int64_t d;
+ int64_t sse;
+ int segment_yrate;
+ PREDICTION_MODE modes[4];
+ SEG_RDSTAT rdstat[4][INTER_MODES];
+ int mvthresh;
+} BEST_SEG_INFO;
+
+#if !CONFIG_REALTIME_ONLY
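+// Return nonzero if the motion vector (in 1/8-pel units) falls outside the
+// full-pel search limits.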
+static INLINE int mv_check_bounds(const MvLimits *mv_limits, const MV *mv) {
+ return (mv->row >> 3) < mv_limits->row_min ||
+ (mv->row >> 3) > mv_limits->row_max ||
+ (mv->col >> 3) < mv_limits->col_min ||
+ (mv->col >> 3) > mv_limits->col_max;
+}
+
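+// Advance the source and prediction buffer pointers to sub-block i of the
+// current 8x8 block.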
+static INLINE void mi_buf_shift(MACROBLOCK *x, int i) {
+ MODE_INFO *const mi = x->e_mbd.mi[0];
+ struct macroblock_plane *const p = &x->plane[0];
+ struct macroblockd_plane *const pd = &x->e_mbd.plane[0];
+
+ p->src.buf =
+ &p->src.buf[vp9_raster_block_offset(BLOCK_8X8, i, p->src.stride)];
+ assert(((intptr_t)pd->pre[0].buf & 0x7) == 0);
+ pd->pre[0].buf =
+ &pd->pre[0].buf[vp9_raster_block_offset(BLOCK_8X8, i, pd->pre[0].stride)];
+ if (has_second_ref(mi))
+ pd->pre[1].buf =
+ &pd->pre[1]
+ .buf[vp9_raster_block_offset(BLOCK_8X8, i, pd->pre[1].stride)];
+}
+
+static INLINE void mi_buf_restore(MACROBLOCK *x, struct buf_2d orig_src,
+ struct buf_2d orig_pre[2]) {
+ MODE_INFO *mi = x->e_mbd.mi[0];
+ x->plane[0].src = orig_src;
+ x->e_mbd.plane[0].pre[0] = orig_pre[0];
+ if (has_second_ref(mi)) x->e_mbd.plane[0].pre[1] = orig_pre[1];
+}
+
+static INLINE int mv_has_subpel(const MV *mv) {
+ return (mv->row & 0x0F) || (mv->col & 0x0F);
+}
+
+// Check if NEARESTMV/NEARMV/ZEROMV is the cheapest way to encode zero motion.
+// TODO(aconverse): Find out if this is still productive then clean up or remove
+static int check_best_zero_mv(const VP9_COMP *cpi,
+ const uint8_t mode_context[MAX_REF_FRAMES],
+ int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
+ int this_mode,
+ const MV_REFERENCE_FRAME ref_frames[2]) {
+ if ((this_mode == NEARMV || this_mode == NEARESTMV || this_mode == ZEROMV) &&
+ frame_mv[this_mode][ref_frames[0]].as_int == 0 &&
+ (ref_frames[1] == NONE ||
+ frame_mv[this_mode][ref_frames[1]].as_int == 0)) {
+ int rfc = mode_context[ref_frames[0]];
+ int c1 = cost_mv_ref(cpi, NEARMV, rfc);
+ int c2 = cost_mv_ref(cpi, NEARESTMV, rfc);
+ int c3 = cost_mv_ref(cpi, ZEROMV, rfc);
+
+ if (this_mode == NEARMV) {
+ if (c1 > c3) return 0;
+ } else if (this_mode == NEARESTMV) {
+ if (c2 > c3) return 0;
+ } else {
+ assert(this_mode == ZEROMV);
+ if (ref_frames[1] == NONE) {
+ if ((c3 >= c2 && frame_mv[NEARESTMV][ref_frames[0]].as_int == 0) ||
+ (c3 >= c1 && frame_mv[NEARMV][ref_frames[0]].as_int == 0))
+ return 0;
+ } else {
+ if ((c3 >= c2 && frame_mv[NEARESTMV][ref_frames[0]].as_int == 0 &&
+ frame_mv[NEARESTMV][ref_frames[1]].as_int == 0) ||
+ (c3 >= c1 && frame_mv[NEARMV][ref_frames[0]].as_int == 0 &&
+ frame_mv[NEARMV][ref_frames[1]].as_int == 0))
+ return 0;
+ }
+ }
+ }
+ return 1;
+}
+
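+// Return 1 if this joint search iteration can be skipped: compared with two
+// iterations ago, the other ref frame's mv is unchanged and the full-pixel mv
+// of the ref frame being searched is unchanged.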
+static INLINE int skip_iters(const int_mv iter_mvs[][2], int ite, int id) {
+ if (ite >= 2 && iter_mvs[ite - 2][!id].as_int == iter_mvs[ite][!id].as_int) {
+ int_mv cur_fullpel_mv, prev_fullpel_mv;
+ cur_fullpel_mv.as_mv.row = iter_mvs[ite][id].as_mv.row >> 3;
+ cur_fullpel_mv.as_mv.col = iter_mvs[ite][id].as_mv.col >> 3;
+ prev_fullpel_mv.as_mv.row = iter_mvs[ite - 2][id].as_mv.row >> 3;
+ prev_fullpel_mv.as_mv.col = iter_mvs[ite - 2][id].as_mv.col >> 3;
+ if (cur_fullpel_mv.as_int == prev_fullpel_mv.as_int) return 1;
+ }
+ return 0;
+}
+
+// Compares motion vector and mode rate of current mode and given mode.
+static INLINE int compare_mv_mode_rate(MV this_mv, MV mode_mv,
+ int this_mode_rate, int mode_rate,
+ int mv_thresh) {
+ const int mv_diff =
+ abs(mode_mv.col - this_mv.col) + abs(mode_mv.row - this_mv.row);
+ if (mv_diff <= mv_thresh && mode_rate < this_mode_rate) return 1;
+ return 0;
+}
+
+// Skips single reference inter modes NEARMV and ZEROMV based on motion vector
+// difference and mode rate.
+static INLINE int skip_single_mode_based_on_mode_rate(
+ int_mv (*mode_mv)[MAX_REF_FRAMES], int *single_mode_rate, int this_mode,
+ int ref0, int this_mode_rate, int best_mode_index) {
+ MV this_mv = mode_mv[this_mode][ref0].as_mv;
+ const int mv_thresh = 3;
+
+ // Pruning is not applicable for NEARESTMV or NEWMV modes.
+ if (this_mode == NEARESTMV || this_mode == NEWMV) return 0;
+  // Pruning is not done when the reference frame of the mode is the same as
+  // the best reference so far.
+ if (best_mode_index > 0 &&
+ ref0 == vp9_mode_order[best_mode_index].ref_frame[0])
+ return 0;
+
+ // Check absolute mv difference and mode rate of current mode w.r.t NEARESTMV
+ if (compare_mv_mode_rate(
+ this_mv, mode_mv[NEARESTMV][ref0].as_mv, this_mode_rate,
+ single_mode_rate[INTER_OFFSET(NEARESTMV)], mv_thresh))
+ return 1;
+
+ // Check absolute mv difference and mode rate of current mode w.r.t NEWMV
+ if (compare_mv_mode_rate(this_mv, mode_mv[NEWMV][ref0].as_mv, this_mode_rate,
+ single_mode_rate[INTER_OFFSET(NEWMV)], mv_thresh))
+ return 1;
+
+ // Pruning w.r.t NEARMV is applicable only for ZEROMV mode
+ if (this_mode == NEARMV) return 0;
+ // Check absolute mv difference and mode rate of current mode w.r.t NEARMV
+ if (compare_mv_mode_rate(this_mv, mode_mv[NEARMV][ref0].as_mv, this_mode_rate,
+ single_mode_rate[INTER_OFFSET(NEARMV)], mv_thresh))
+ return 1;
+ return 0;
+}
+
+#define MAX_JOINT_MV_SEARCH_ITERS 4
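+// Return the number of joint motion search iterations to run, reduced for
+// higher speed-feature levels and for smaller block sizes.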
+static INLINE int get_joint_search_iters(int sf_level, BLOCK_SIZE bsize) {
+ int num_iters = MAX_JOINT_MV_SEARCH_ITERS; // sf_level = 0
+ if (sf_level >= 2)
+ num_iters = 0;
+ else if (sf_level >= 1)
+ num_iters = bsize < BLOCK_8X8
+ ? 0
+ : (bsize <= BLOCK_16X16 ? 2 : MAX_JOINT_MV_SEARCH_ITERS);
+ return num_iters;
+}
+
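+// Iteratively refine the two mvs of a compound prediction: each iteration
+// searches one reference frame while holding the prediction from the other
+// fixed, and the loop stops early once no better mv is found.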
+static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+ int_mv *frame_mv, int mi_row, int mi_col,
+ int_mv single_newmv[MAX_REF_FRAMES],
+ int *rate_mv, int num_iters) {
+ const VP9_COMMON *const cm = &cpi->common;
+ const int pw = 4 * num_4x4_blocks_wide_lookup[bsize];
+ const int ph = 4 * num_4x4_blocks_high_lookup[bsize];
+ MACROBLOCKD *xd = &x->e_mbd;
+ MODE_INFO *mi = xd->mi[0];
+ const int refs[2] = { mi->ref_frame[0],
+ mi->ref_frame[1] < 0 ? 0 : mi->ref_frame[1] };
+ int_mv ref_mv[2];
+ int_mv iter_mvs[MAX_JOINT_MV_SEARCH_ITERS][2];
+ int ite, ref;
+ const InterpKernel *kernel = vp9_filter_kernels[mi->interp_filter];
+ struct scale_factors sf;
+
+ // Do joint motion search in compound mode to get more accurate mv.
+ struct buf_2d backup_yv12[2][MAX_MB_PLANE];
+ uint32_t last_besterr[2] = { UINT_MAX, UINT_MAX };
+ const YV12_BUFFER_CONFIG *const scaled_ref_frame[2] = {
+ vp9_get_scaled_ref_frame(cpi, mi->ref_frame[0]),
+ vp9_get_scaled_ref_frame(cpi, mi->ref_frame[1])
+ };
+
+// Prediction buffer from second frame.
+#if CONFIG_VP9_HIGHBITDEPTH
+ DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[64 * 64]);
+ uint8_t *second_pred;
+#else
+ DECLARE_ALIGNED(16, uint8_t, second_pred[64 * 64]);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+  // Check that the number of iterations does not exceed the max.
+ assert(num_iters <= MAX_JOINT_MV_SEARCH_ITERS);
+
+ for (ref = 0; ref < 2; ++ref) {
+ ref_mv[ref] = x->mbmi_ext->ref_mvs[refs[ref]][0];
+
+ if (scaled_ref_frame[ref]) {
+ int i;
+ // Swap out the reference frame for a version that's been scaled to
+ // match the resolution of the current frame, allowing the existing
+ // motion search code to be used without additional modifications.
+ for (i = 0; i < MAX_MB_PLANE; i++)
+ backup_yv12[ref][i] = xd->plane[i].pre[ref];
+ vp9_setup_pre_planes(xd, ref, scaled_ref_frame[ref], mi_row, mi_col,
+ NULL);
+ }
+
+ frame_mv[refs[ref]].as_int = single_newmv[refs[ref]].as_int;
+ iter_mvs[0][ref].as_int = single_newmv[refs[ref]].as_int;
+ }
+
+// Since we have scaled the reference frames to match the size of the current
+// frame we must use a unit scaling factor during mode selection.
+#if CONFIG_VP9_HIGHBITDEPTH
+ vp9_setup_scale_factors_for_frame(&sf, cm->width, cm->height, cm->width,
+ cm->height, cm->use_highbitdepth);
+#else
+ vp9_setup_scale_factors_for_frame(&sf, cm->width, cm->height, cm->width,
+ cm->height);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ // Allow joint search multiple times iteratively for each reference frame
+ // and break out of the search loop if it couldn't find a better mv.
+ for (ite = 0; ite < num_iters; ite++) {
+ struct buf_2d ref_yv12[2];
+ uint32_t bestsme = UINT_MAX;
+ int sadpb = x->sadperbit16;
+ MV tmp_mv;
+ int search_range = 3;
+
+ const MvLimits tmp_mv_limits = x->mv_limits;
+ int id = ite % 2; // Even iterations search in the first reference frame,
+ // odd iterations search in the second. The predictor
+ // found for the 'other' reference frame is factored in.
+
+    // Skip further iterations of the search if, compared with two iterations
+    // ago, the other ref frame's mv is unchanged and the full-pixel mv of the
+    // ref frame being searched is unchanged.
+ if (skip_iters(iter_mvs, ite, id)) break;
+
+ // Initialized here because of compiler problem in Visual Studio.
+ ref_yv12[0] = xd->plane[0].pre[0];
+ ref_yv12[1] = xd->plane[0].pre[1];
+
+// Get the prediction block from the 'other' reference frame.
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc_16);
+ vp9_highbd_build_inter_predictor(
+ CONVERT_TO_SHORTPTR(ref_yv12[!id].buf), ref_yv12[!id].stride,
+ second_pred_alloc_16, pw, &frame_mv[refs[!id]].as_mv, &sf, pw, ph, 0,
+ kernel, MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE, xd->bd);
+ } else {
+ second_pred = (uint8_t *)second_pred_alloc_16;
+ vp9_build_inter_predictor(ref_yv12[!id].buf, ref_yv12[!id].stride,
+ second_pred, pw, &frame_mv[refs[!id]].as_mv,
+ &sf, pw, ph, 0, kernel, MV_PRECISION_Q3,
+ mi_col * MI_SIZE, mi_row * MI_SIZE);
+ }
+#else
+ vp9_build_inter_predictor(ref_yv12[!id].buf, ref_yv12[!id].stride,
+ second_pred, pw, &frame_mv[refs[!id]].as_mv, &sf,
+ pw, ph, 0, kernel, MV_PRECISION_Q3,
+ mi_col * MI_SIZE, mi_row * MI_SIZE);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ // Do compound motion search on the current reference frame.
+ if (id) xd->plane[0].pre[0] = ref_yv12[id];
+ vp9_set_mv_search_range(&x->mv_limits, &ref_mv[id].as_mv);
+
+ // Use the mv result from the single mode as mv predictor.
+ tmp_mv = frame_mv[refs[id]].as_mv;
+
+ tmp_mv.col >>= 3;
+ tmp_mv.row >>= 3;
+
+ // Small-range full-pixel motion search.
+ bestsme = vp9_refining_search_8p_c(x, &tmp_mv, sadpb, search_range,
+ &cpi->fn_ptr[bsize], &ref_mv[id].as_mv,
+ second_pred);
+ if (bestsme < UINT_MAX)
+ bestsme = vp9_get_mvpred_av_var(x, &tmp_mv, &ref_mv[id].as_mv,
+ second_pred, &cpi->fn_ptr[bsize], 1);
+
+ x->mv_limits = tmp_mv_limits;
+
+ if (bestsme < UINT_MAX) {
+ uint32_t dis; /* TODO: use dis in distortion calculation later. */
+ uint32_t sse;
+ bestsme = cpi->find_fractional_mv_step(
+ x, &tmp_mv, &ref_mv[id].as_mv, cpi->common.allow_high_precision_mv,
+ x->errorperbit, &cpi->fn_ptr[bsize], 0,
+ cpi->sf.mv.subpel_search_level, NULL, x->nmvjointcost, x->mvcost,
+ &dis, &sse, second_pred, pw, ph, cpi->sf.use_accurate_subpel_search);
+ }
+
+ // Restore the pointer to the first (possibly scaled) prediction buffer.
+ if (id) xd->plane[0].pre[0] = ref_yv12[0];
+
+ if (bestsme < last_besterr[id]) {
+ frame_mv[refs[id]].as_mv = tmp_mv;
+ last_besterr[id] = bestsme;
+ } else {
+ break;
+ }
+ if (ite < num_iters - 1) {
+ iter_mvs[ite + 1][0].as_int = frame_mv[refs[0]].as_int;
+ iter_mvs[ite + 1][1].as_int = frame_mv[refs[1]].as_int;
+ }
+ }
+
+ *rate_mv = 0;
+
+ for (ref = 0; ref < 2; ++ref) {
+ if (scaled_ref_frame[ref]) {
+ // Restore the prediction frame pointers to their unscaled versions.
+ int i;
+ for (i = 0; i < MAX_MB_PLANE; i++)
+ xd->plane[i].pre[ref] = backup_yv12[ref][i];
+ }
+
+ *rate_mv += vp9_mv_bit_cost(&frame_mv[refs[ref]].as_mv,
+ &x->mbmi_ext->ref_mvs[refs[ref]][0].as_mv,
+ x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+ }
+}
+
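+// RD mode search for sub-8x8 blocks: for each sub-block label, evaluate
+// NEARESTMV, NEARMV, ZEROMV and NEWMV (running a motion search for NEWMV),
+// accumulate the per-segment RD costs, and write the winning modes and mvs
+// back into the mode info. Returns INT64_MAX on early termination.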
+static int64_t rd_pick_best_sub8x8_mode(
+ VP9_COMP *cpi, MACROBLOCK *x, int_mv *best_ref_mv,
+ int_mv *second_best_ref_mv, int64_t best_rd_so_far, int *returntotrate,
+ int *returnyrate, int64_t *returndistortion, int *skippable, int64_t *psse,
+ int mvthresh, int_mv seg_mvs[4][MAX_REF_FRAMES], BEST_SEG_INFO *bsi_buf,
+ int filter_idx, int mi_row, int mi_col) {
+ int i;
+ BEST_SEG_INFO *bsi = bsi_buf + filter_idx;
+ MACROBLOCKD *xd = &x->e_mbd;
+ MODE_INFO *mi = xd->mi[0];
+ int mode_idx;
+ int k, br = 0, idx, idy;
+ int64_t bd = 0, block_sse = 0;
+ PREDICTION_MODE this_mode;
+ VP9_COMMON *cm = &cpi->common;
+ struct macroblock_plane *const p = &x->plane[0];
+ struct macroblockd_plane *const pd = &xd->plane[0];
+ const int label_count = 4;
+ int64_t this_segment_rd = 0;
+ int label_mv_thresh;
+ int segmentyrate = 0;
+ const BLOCK_SIZE bsize = mi->sb_type;
+ const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
+ const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
+ const int pw = num_4x4_blocks_wide << 2;
+ const int ph = num_4x4_blocks_high << 2;
+ ENTROPY_CONTEXT t_above[2], t_left[2];
+ int subpelmv = 1, have_ref = 0;
+ SPEED_FEATURES *const sf = &cpi->sf;
+ const int has_second_rf = has_second_ref(mi);
+ const int inter_mode_mask = sf->inter_mode_mask[bsize];
+ MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+
+ vp9_zero(*bsi);
+
+ bsi->segment_rd = best_rd_so_far;
+ bsi->ref_mv[0] = best_ref_mv;
+ bsi->ref_mv[1] = second_best_ref_mv;
+ bsi->mvp.as_int = best_ref_mv->as_int;
+ bsi->mvthresh = mvthresh;
+
+ for (i = 0; i < 4; i++) bsi->modes[i] = ZEROMV;
+
+ memcpy(t_above, pd->above_context, sizeof(t_above));
+ memcpy(t_left, pd->left_context, sizeof(t_left));
+
+  // A multiplier of 64 makes this threshold very large, so that mvs on
+  // segments are very rarely checked. Setting it to 1 makes the mv threshold
+  // roughly equal to what it is for macroblocks.
+ label_mv_thresh = 1 * bsi->mvthresh / label_count;
+
+ // Segmentation method overheads
+ for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
+ for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
+      // TODO(jingning,rbultje): rewrite the rate-distortion optimization
+      // loop for 4x4/4x8/8x4 block coding, to be replaced with a new rd loop.
+ int_mv mode_mv[MB_MODE_COUNT][2];
+ int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
+ PREDICTION_MODE mode_selected = ZEROMV;
+ int64_t best_rd = INT64_MAX;
+ const int block = idy * 2 + idx;
+ int ref;
+
+ for (ref = 0; ref < 1 + has_second_rf; ++ref) {
+ const MV_REFERENCE_FRAME frame = mi->ref_frame[ref];
+ frame_mv[ZEROMV][frame].as_int = 0;
+ vp9_append_sub8x8_mvs_for_idx(
+ cm, xd, block, ref, mi_row, mi_col, &frame_mv[NEARESTMV][frame],
+ &frame_mv[NEARMV][frame], mbmi_ext->mode_context);
+ }
+
+ // search for the best motion vector on this segment
+ for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
+ const struct buf_2d orig_src = x->plane[0].src;
+ struct buf_2d orig_pre[2];
+
+ mode_idx = INTER_OFFSET(this_mode);
+ bsi->rdstat[block][mode_idx].brdcost = INT64_MAX;
+ if (!(inter_mode_mask & (1 << this_mode))) continue;
+
+ if (!check_best_zero_mv(cpi, mbmi_ext->mode_context, frame_mv,
+ this_mode, mi->ref_frame))
+ continue;
+
+ memcpy(orig_pre, pd->pre, sizeof(orig_pre));
+ memcpy(bsi->rdstat[block][mode_idx].ta, t_above,
+ sizeof(bsi->rdstat[block][mode_idx].ta));
+ memcpy(bsi->rdstat[block][mode_idx].tl, t_left,
+ sizeof(bsi->rdstat[block][mode_idx].tl));
+
+ // motion search for newmv (single predictor case only)
+ if (!has_second_rf && this_mode == NEWMV &&
+ seg_mvs[block][mi->ref_frame[0]].as_int == INVALID_MV) {
+ MV *const new_mv = &mode_mv[NEWMV][0].as_mv;
+ int step_param = 0;
+ uint32_t bestsme = UINT_MAX;
+ int sadpb = x->sadperbit4;
+ MV mvp_full;
+ int max_mv;
+ int cost_list[5];
+ const MvLimits tmp_mv_limits = x->mv_limits;
+
+          /* Is the best so far sufficiently good that we can't justify doing
+           * a new motion search? */
+ if (best_rd < label_mv_thresh) break;
+
+ if (cpi->oxcf.mode != BEST) {
+ // use previous block's result as next block's MV predictor.
+ if (block > 0) {
+ bsi->mvp.as_int = mi->bmi[block - 1].as_mv[0].as_int;
+ if (block == 2)
+ bsi->mvp.as_int = mi->bmi[block - 2].as_mv[0].as_int;
+ }
+ }
+ if (block == 0)
+ max_mv = x->max_mv_context[mi->ref_frame[0]];
+ else
+ max_mv =
+ VPXMAX(abs(bsi->mvp.as_mv.row), abs(bsi->mvp.as_mv.col)) >> 3;
+
+ if (sf->mv.auto_mv_step_size && cm->show_frame) {
+ // Take wtd average of the step_params based on the last frame's
+ // max mv magnitude and the best ref mvs of the current block for
+ // the given reference.
+ step_param =
+ (vp9_init_search_range(max_mv) + cpi->mv_step_param) / 2;
+ } else {
+ step_param = cpi->mv_step_param;
+ }
+
+ mvp_full.row = bsi->mvp.as_mv.row >> 3;
+ mvp_full.col = bsi->mvp.as_mv.col >> 3;
+
+ if (sf->adaptive_motion_search) {
+ if (x->pred_mv[mi->ref_frame[0]].row != INT16_MAX &&
+ x->pred_mv[mi->ref_frame[0]].col != INT16_MAX) {
+ mvp_full.row = x->pred_mv[mi->ref_frame[0]].row >> 3;
+ mvp_full.col = x->pred_mv[mi->ref_frame[0]].col >> 3;
+ }
+ step_param = VPXMAX(step_param, 8);
+ }
+
+ // adjust src pointer for this block
+ mi_buf_shift(x, block);
+
+ vp9_set_mv_search_range(&x->mv_limits, &bsi->ref_mv[0]->as_mv);
+
+ bestsme = vp9_full_pixel_search(
+ cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method,
+ sadpb,
+ sf->mv.subpel_search_method != SUBPEL_TREE ? cost_list : NULL,
+ &bsi->ref_mv[0]->as_mv, new_mv, INT_MAX, 1);
+
+ x->mv_limits = tmp_mv_limits;
+
+ if (bestsme < UINT_MAX) {
+ uint32_t distortion;
+ cpi->find_fractional_mv_step(
+ x, new_mv, &bsi->ref_mv[0]->as_mv, cm->allow_high_precision_mv,
+ x->errorperbit, &cpi->fn_ptr[bsize], sf->mv.subpel_force_stop,
+ sf->mv.subpel_search_level, cond_cost_list(cpi, cost_list),
+ x->nmvjointcost, x->mvcost, &distortion,
+ &x->pred_sse[mi->ref_frame[0]], NULL, pw, ph,
+ cpi->sf.use_accurate_subpel_search);
+
+ // save motion search result for use in compound prediction
+ seg_mvs[block][mi->ref_frame[0]].as_mv = *new_mv;
+ }
+
+ x->pred_mv[mi->ref_frame[0]] = *new_mv;
+
+ // restore src pointers
+ mi_buf_restore(x, orig_src, orig_pre);
+ }
+
+ if (has_second_rf) {
+ if (seg_mvs[block][mi->ref_frame[1]].as_int == INVALID_MV ||
+ seg_mvs[block][mi->ref_frame[0]].as_int == INVALID_MV)
+ continue;
+ }
+
+ if (has_second_rf && this_mode == NEWMV &&
+ mi->interp_filter == EIGHTTAP) {
+ // Decide number of joint motion search iterations
+ const int num_joint_search_iters = get_joint_search_iters(
+ cpi->sf.comp_inter_joint_search_iter_level, bsize);
+ // adjust src pointers
+ mi_buf_shift(x, block);
+ if (num_joint_search_iters) {
+ int rate_mv;
+ joint_motion_search(cpi, x, bsize, frame_mv[this_mode], mi_row,
+ mi_col, seg_mvs[block], &rate_mv,
+ num_joint_search_iters);
+ seg_mvs[block][mi->ref_frame[0]].as_int =
+ frame_mv[this_mode][mi->ref_frame[0]].as_int;
+ seg_mvs[block][mi->ref_frame[1]].as_int =
+ frame_mv[this_mode][mi->ref_frame[1]].as_int;
+ }
+ // restore src pointers
+ mi_buf_restore(x, orig_src, orig_pre);
+ }
+
+ bsi->rdstat[block][mode_idx].brate = set_and_cost_bmi_mvs(
+ cpi, x, xd, block, this_mode, mode_mv[this_mode], frame_mv,
+ seg_mvs[block], bsi->ref_mv, x->nmvjointcost, x->mvcost);
+
+ for (ref = 0; ref < 1 + has_second_rf; ++ref) {
+ bsi->rdstat[block][mode_idx].mvs[ref].as_int =
+ mode_mv[this_mode][ref].as_int;
+ if (num_4x4_blocks_wide > 1)
+ bsi->rdstat[block + 1][mode_idx].mvs[ref].as_int =
+ mode_mv[this_mode][ref].as_int;
+ if (num_4x4_blocks_high > 1)
+ bsi->rdstat[block + 2][mode_idx].mvs[ref].as_int =
+ mode_mv[this_mode][ref].as_int;
+ }
+
+ // Trap vectors that reach beyond the UMV borders
+ if (mv_check_bounds(&x->mv_limits, &mode_mv[this_mode][0].as_mv) ||
+ (has_second_rf &&
+ mv_check_bounds(&x->mv_limits, &mode_mv[this_mode][1].as_mv)))
+ continue;
+
+ if (filter_idx > 0) {
+ BEST_SEG_INFO *ref_bsi = bsi_buf;
+ subpelmv = 0;
+ have_ref = 1;
+
+ for (ref = 0; ref < 1 + has_second_rf; ++ref) {
+ subpelmv |= mv_has_subpel(&mode_mv[this_mode][ref].as_mv);
+ have_ref &= mode_mv[this_mode][ref].as_int ==
+ ref_bsi->rdstat[block][mode_idx].mvs[ref].as_int;
+ }
+
+ if (filter_idx > 1 && !subpelmv && !have_ref) {
+ ref_bsi = bsi_buf + 1;
+ have_ref = 1;
+ for (ref = 0; ref < 1 + has_second_rf; ++ref)
+ have_ref &= mode_mv[this_mode][ref].as_int ==
+ ref_bsi->rdstat[block][mode_idx].mvs[ref].as_int;
+ }
+
+ if (!subpelmv && have_ref &&
+ ref_bsi->rdstat[block][mode_idx].brdcost < INT64_MAX) {
+ memcpy(&bsi->rdstat[block][mode_idx],
+ &ref_bsi->rdstat[block][mode_idx], sizeof(SEG_RDSTAT));
+ if (num_4x4_blocks_wide > 1)
+ bsi->rdstat[block + 1][mode_idx].eobs =
+ ref_bsi->rdstat[block + 1][mode_idx].eobs;
+ if (num_4x4_blocks_high > 1)
+ bsi->rdstat[block + 2][mode_idx].eobs =
+ ref_bsi->rdstat[block + 2][mode_idx].eobs;
+
+ if (bsi->rdstat[block][mode_idx].brdcost < best_rd) {
+ mode_selected = this_mode;
+ best_rd = bsi->rdstat[block][mode_idx].brdcost;
+ }
+ continue;
+ }
+ }
+
+ bsi->rdstat[block][mode_idx].brdcost = encode_inter_mb_segment(
+ cpi, x, bsi->segment_rd - this_segment_rd, block,
+ &bsi->rdstat[block][mode_idx].byrate,
+ &bsi->rdstat[block][mode_idx].bdist,
+ &bsi->rdstat[block][mode_idx].bsse, bsi->rdstat[block][mode_idx].ta,
+ bsi->rdstat[block][mode_idx].tl, mi_row, mi_col);
+ if (bsi->rdstat[block][mode_idx].brdcost < INT64_MAX) {
+ bsi->rdstat[block][mode_idx].brdcost += RDCOST(
+ x->rdmult, x->rddiv, bsi->rdstat[block][mode_idx].brate, 0);
+ bsi->rdstat[block][mode_idx].brate +=
+ bsi->rdstat[block][mode_idx].byrate;
+ bsi->rdstat[block][mode_idx].eobs = p->eobs[block];
+ if (num_4x4_blocks_wide > 1)
+ bsi->rdstat[block + 1][mode_idx].eobs = p->eobs[block + 1];
+ if (num_4x4_blocks_high > 1)
+ bsi->rdstat[block + 2][mode_idx].eobs = p->eobs[block + 2];
+ }
+
+ if (bsi->rdstat[block][mode_idx].brdcost < best_rd) {
+ mode_selected = this_mode;
+ best_rd = bsi->rdstat[block][mode_idx].brdcost;
+ }
+ } /*for each 4x4 mode*/
+
+ if (best_rd == INT64_MAX) {
+ int iy, midx;
+ for (iy = block + 1; iy < 4; ++iy)
+ for (midx = 0; midx < INTER_MODES; ++midx)
+ bsi->rdstat[iy][midx].brdcost = INT64_MAX;
+ bsi->segment_rd = INT64_MAX;
+ return INT64_MAX;
+ }
+
+ mode_idx = INTER_OFFSET(mode_selected);
+ memcpy(t_above, bsi->rdstat[block][mode_idx].ta, sizeof(t_above));
+ memcpy(t_left, bsi->rdstat[block][mode_idx].tl, sizeof(t_left));
+
+ set_and_cost_bmi_mvs(cpi, x, xd, block, mode_selected,
+ mode_mv[mode_selected], frame_mv, seg_mvs[block],
+ bsi->ref_mv, x->nmvjointcost, x->mvcost);
+
+ br += bsi->rdstat[block][mode_idx].brate;
+ bd += bsi->rdstat[block][mode_idx].bdist;
+ block_sse += bsi->rdstat[block][mode_idx].bsse;
+ segmentyrate += bsi->rdstat[block][mode_idx].byrate;
+ this_segment_rd += bsi->rdstat[block][mode_idx].brdcost;
+
+ if (this_segment_rd > bsi->segment_rd) {
+ int iy, midx;
+ for (iy = block + 1; iy < 4; ++iy)
+ for (midx = 0; midx < INTER_MODES; ++midx)
+ bsi->rdstat[iy][midx].brdcost = INT64_MAX;
+ bsi->segment_rd = INT64_MAX;
+ return INT64_MAX;
+ }
+ }
+ } /* for each label */
+
+ bsi->r = br;
+ bsi->d = bd;
+ bsi->segment_yrate = segmentyrate;
+ bsi->segment_rd = this_segment_rd;
+ bsi->sse = block_sse;
+
+ // update the coding decisions
+ for (k = 0; k < 4; ++k) bsi->modes[k] = mi->bmi[k].as_mode;
+
+ if (bsi->segment_rd > best_rd_so_far) return INT64_MAX;
+ /* set it to the best */
+ for (i = 0; i < 4; i++) {
+ mode_idx = INTER_OFFSET(bsi->modes[i]);
+ mi->bmi[i].as_mv[0].as_int = bsi->rdstat[i][mode_idx].mvs[0].as_int;
+ if (has_second_ref(mi))
+ mi->bmi[i].as_mv[1].as_int = bsi->rdstat[i][mode_idx].mvs[1].as_int;
+ x->plane[0].eobs[i] = bsi->rdstat[i][mode_idx].eobs;
+ mi->bmi[i].as_mode = bsi->modes[i];
+ }
+
+ /*
+ * used to set mbmi->mv.as_int
+ */
+ *returntotrate = bsi->r;
+ *returndistortion = bsi->d;
+ *returnyrate = bsi->segment_yrate;
+ *skippable = vp9_is_skippable_in_plane(x, BLOCK_8X8, 0);
+ *psse = bsi->sse;
+ mi->mode = bsi->modes[3];
+
+ return bsi->segment_rd;
+}
+
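+// Compute the entropy cost of signaling each single and compound reference
+// frame for this segment, plus the probability used to cost the
+// single-vs-compound prediction decision.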
+static void estimate_ref_frame_costs(const VP9_COMMON *cm,
+ const MACROBLOCKD *xd, int segment_id,
+ unsigned int *ref_costs_single,
+ unsigned int *ref_costs_comp,
+ vpx_prob *comp_mode_p) {
+ int seg_ref_active =
+ segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME);
+ if (seg_ref_active) {
+ memset(ref_costs_single, 0, MAX_REF_FRAMES * sizeof(*ref_costs_single));
+ memset(ref_costs_comp, 0, MAX_REF_FRAMES * sizeof(*ref_costs_comp));
+ *comp_mode_p = 128;
+ } else {
+ vpx_prob intra_inter_p = vp9_get_intra_inter_prob(cm, xd);
+ vpx_prob comp_inter_p = 128;
+
+ if (cm->reference_mode == REFERENCE_MODE_SELECT) {
+ comp_inter_p = vp9_get_reference_mode_prob(cm, xd);
+ *comp_mode_p = comp_inter_p;
+ } else {
+ *comp_mode_p = 128;
+ }
+
+ ref_costs_single[INTRA_FRAME] = vp9_cost_bit(intra_inter_p, 0);
+
+ if (cm->reference_mode != COMPOUND_REFERENCE) {
+ vpx_prob ref_single_p1 = vp9_get_pred_prob_single_ref_p1(cm, xd);
+ vpx_prob ref_single_p2 = vp9_get_pred_prob_single_ref_p2(cm, xd);
+ unsigned int base_cost = vp9_cost_bit(intra_inter_p, 1);
+
+ if (cm->reference_mode == REFERENCE_MODE_SELECT)
+ base_cost += vp9_cost_bit(comp_inter_p, 0);
+
+ ref_costs_single[LAST_FRAME] = ref_costs_single[GOLDEN_FRAME] =
+ ref_costs_single[ALTREF_FRAME] = base_cost;
+ ref_costs_single[LAST_FRAME] += vp9_cost_bit(ref_single_p1, 0);
+ ref_costs_single[GOLDEN_FRAME] += vp9_cost_bit(ref_single_p1, 1);
+ ref_costs_single[ALTREF_FRAME] += vp9_cost_bit(ref_single_p1, 1);
+ ref_costs_single[GOLDEN_FRAME] += vp9_cost_bit(ref_single_p2, 0);
+ ref_costs_single[ALTREF_FRAME] += vp9_cost_bit(ref_single_p2, 1);
+ } else {
+ ref_costs_single[LAST_FRAME] = 512;
+ ref_costs_single[GOLDEN_FRAME] = 512;
+ ref_costs_single[ALTREF_FRAME] = 512;
+ }
+ if (cm->reference_mode != SINGLE_REFERENCE) {
+ vpx_prob ref_comp_p = vp9_get_pred_prob_comp_ref_p(cm, xd);
+ unsigned int base_cost = vp9_cost_bit(intra_inter_p, 1);
+
+ if (cm->reference_mode == REFERENCE_MODE_SELECT)
+ base_cost += vp9_cost_bit(comp_inter_p, 1);
+
+ ref_costs_comp[LAST_FRAME] = base_cost + vp9_cost_bit(ref_comp_p, 0);
+ ref_costs_comp[GOLDEN_FRAME] = base_cost + vp9_cost_bit(ref_comp_p, 1);
+ } else {
+ ref_costs_comp[LAST_FRAME] = 512;
+ ref_costs_comp[GOLDEN_FRAME] = 512;
+ }
+ }
+}
+
+static void store_coding_context(
+ MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int mode_index,
+ int64_t comp_pred_diff[REFERENCE_MODES],
+ int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS], int skippable) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+
+ // Take a snapshot of the coding context so it can be
+ // restored if we decide to encode this way
+ ctx->skip = x->skip;
+ ctx->skippable = skippable;
+ ctx->best_mode_index = mode_index;
+ ctx->mic = *xd->mi[0];
+ ctx->mbmi_ext = *x->mbmi_ext;
+ ctx->single_pred_diff = (int)comp_pred_diff[SINGLE_REFERENCE];
+ ctx->comp_pred_diff = (int)comp_pred_diff[COMPOUND_REFERENCE];
+ ctx->hybrid_pred_diff = (int)comp_pred_diff[REFERENCE_MODE_SELECT];
+
+ memcpy(ctx->best_filter_diff, best_filter_diff,
+ sizeof(*best_filter_diff) * SWITCHABLE_FILTER_CONTEXTS);
+}
+
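+// Set up prediction buffers and the nearest/near candidate mvs for the given
+// reference frame ahead of the inter mode search.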
+static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
+ MV_REFERENCE_FRAME ref_frame,
+ BLOCK_SIZE block_size, int mi_row, int mi_col,
+ int_mv frame_nearest_mv[MAX_REF_FRAMES],
+ int_mv frame_near_mv[MAX_REF_FRAMES],
+ struct buf_2d yv12_mb[4][MAX_MB_PLANE]) {
+ const VP9_COMMON *cm = &cpi->common;
+ const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MODE_INFO *const mi = xd->mi[0];
+ int_mv *const candidates = x->mbmi_ext->ref_mvs[ref_frame];
+ const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf;
+ MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+
+ assert(yv12 != NULL);
+
+ // TODO(jkoleszar): Is the UV buffer ever used here? If so, need to make this
+ // use the UV scaling factors.
+ vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf);
+
+ // Gets an initial list of candidate vectors from neighbours and orders them
+ vp9_find_mv_refs(cm, xd, mi, ref_frame, candidates, mi_row, mi_col,
+ mbmi_ext->mode_context);
+
+ // Candidate refinement carried out at encoder and decoder
+ vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv, candidates,
+ &frame_nearest_mv[ref_frame],
+ &frame_near_mv[ref_frame]);
+
+ // Further refinement that is encode side only to test the top few candidates
+ // in full and choose the best as the centre point for subsequent searches.
+ // The current implementation doesn't support scaling.
+ if (!vp9_is_scaled(sf) && block_size >= BLOCK_8X8)
+ vp9_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride, ref_frame,
+ block_size);
+}
+
+#if CONFIG_NON_GREEDY_MV
+static int ref_frame_to_gf_rf_idx(int ref_frame) {
+ if (ref_frame == GOLDEN_FRAME) {
+ return 0;
+ }
+ if (ref_frame == LAST_FRAME) {
+ return 1;
+ }
+ if (ref_frame == ALTREF_FRAME) {
+ return 2;
+ }
+ assert(0);
+ return -1;
+}
+#endif
+
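+// Motion search for a single-reference NEWMV: set up scaled reference planes
+// if needed, run a full-pixel search from the best mv predictor, and
+// optionally re-run it from the remaining predictors when
+// enhanced_full_pixel_motion_search is enabled.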
+static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+ int mi_row, int mi_col, int_mv *tmp_mv,
+ int *rate_mv) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ const VP9_COMMON *cm = &cpi->common;
+ MODE_INFO *mi = xd->mi[0];
+ struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0 } };
+ int step_param;
+ MV mvp_full;
+ int ref = mi->ref_frame[0];
+ MV ref_mv = x->mbmi_ext->ref_mvs[ref][0].as_mv;
+ const MvLimits tmp_mv_limits = x->mv_limits;
+ int cost_list[5];
+ const int best_predmv_idx = x->mv_best_ref_index[ref];
+ const YV12_BUFFER_CONFIG *scaled_ref_frame =
+ vp9_get_scaled_ref_frame(cpi, ref);
+ const int pw = num_4x4_blocks_wide_lookup[bsize] << 2;
+ const int ph = num_4x4_blocks_high_lookup[bsize] << 2;
+ MV pred_mv[3];
+
+ int bestsme = INT_MAX;
+#if CONFIG_NON_GREEDY_MV
+ int gf_group_idx = cpi->twopass.gf_group.index;
+ int gf_rf_idx = ref_frame_to_gf_rf_idx(ref);
+ BLOCK_SIZE square_bsize = get_square_block_size(bsize);
+ int_mv nb_full_mvs[NB_MVS_NUM] = { 0 };
+ MotionField *motion_field = vp9_motion_field_info_get_motion_field(
+ &cpi->motion_field_info, gf_group_idx, gf_rf_idx, square_bsize);
+ const int nb_full_mv_num =
+ vp9_prepare_nb_full_mvs(motion_field, mi_row, mi_col, nb_full_mvs);
+ const int lambda = (pw * ph) / 4;
+ assert(pw * ph == lambda << 2);
+#else // CONFIG_NON_GREEDY_MV
+ int sadpb = x->sadperbit16;
+#endif // CONFIG_NON_GREEDY_MV
+
+ pred_mv[0] = x->mbmi_ext->ref_mvs[ref][0].as_mv;
+ pred_mv[1] = x->mbmi_ext->ref_mvs[ref][1].as_mv;
+ pred_mv[2] = x->pred_mv[ref];
+
+ if (scaled_ref_frame) {
+ int i;
+ // Swap out the reference frame for a version that's been scaled to
+ // match the resolution of the current frame, allowing the existing
+ // motion search code to be used without additional modifications.
+ for (i = 0; i < MAX_MB_PLANE; i++) backup_yv12[i] = xd->plane[i].pre[0];
+
+ vp9_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL);
+ }
+
+ // Work out the size of the first step in the mv step search.
+ // A step_param of 0 gives the maximum-length first step; 1 halves it, etc.
+ if (cpi->sf.mv.auto_mv_step_size && cm->show_frame) {
+ // Take a weighted average of the step_params based on the last frame's
+ // max mv magnitude and that based on the best ref mvs of the current
+ // block for the given reference.
+ step_param =
+ (vp9_init_search_range(x->max_mv_context[ref]) + cpi->mv_step_param) /
+ 2;
+ } else {
+ step_param = cpi->mv_step_param;
+ }
+
+ if (cpi->sf.adaptive_motion_search && bsize < BLOCK_64X64) {
+ const int boffset =
+ 2 * (b_width_log2_lookup[BLOCK_64X64] -
+ VPXMIN(b_height_log2_lookup[bsize], b_width_log2_lookup[bsize]));
+ step_param = VPXMAX(step_param, boffset);
+ }
+
+ if (cpi->sf.adaptive_motion_search) {
+ int bwl = b_width_log2_lookup[bsize];
+ int bhl = b_height_log2_lookup[bsize];
+ int tlevel = x->pred_mv_sad[ref] >> (bwl + bhl + 4);
+
+ if (tlevel < 5) step_param += 2;
+
+ // pred_mv_sad is not set up for dynamically scaled frames.
+ if (cpi->oxcf.resize_mode != RESIZE_DYNAMIC) {
+ int i;
+ for (i = LAST_FRAME; i <= ALTREF_FRAME && cm->show_frame; ++i) {
+ if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) {
+ x->pred_mv[ref].row = INT16_MAX;
+ x->pred_mv[ref].col = INT16_MAX;
+ tmp_mv->as_int = INVALID_MV;
+
+ if (scaled_ref_frame) {
+ int j;
+ for (j = 0; j < MAX_MB_PLANE; ++j)
+ xd->plane[j].pre[0] = backup_yv12[j];
+ }
+ return;
+ }
+ }
+ }
+ }
+
+ // Note: MV limits are modified here. Always restore the original values
+ // after full-pixel motion search.
+ vp9_set_mv_search_range(&x->mv_limits, &ref_mv);
+
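+ // Predicted MVs are in 1/8-pel units; shift down by 3 to get the full-pel
+ // start point for the full-pixel search.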
+ mvp_full = pred_mv[best_predmv_idx];
+ mvp_full.col >>= 3;
+ mvp_full.row >>= 3;
+
+#if CONFIG_NON_GREEDY_MV
+ bestsme = vp9_full_pixel_diamond_new(cpi, x, bsize, &mvp_full, step_param,
+ lambda, 1, nb_full_mvs, nb_full_mv_num,
+ &tmp_mv->as_mv);
+#else // CONFIG_NON_GREEDY_MV
+ bestsme = vp9_full_pixel_search(
+ cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, sadpb,
+ cond_cost_list(cpi, cost_list), &ref_mv, &tmp_mv->as_mv, INT_MAX, 1);
+#endif // CONFIG_NON_GREEDY_MV
+
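+ // Optionally re-run the full-pixel search seeded from the remaining MV
+ // predictors when they differ enough from the one already tried, keeping
+ // whichever result gives the lowest error.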
+ if (cpi->sf.enhanced_full_pixel_motion_search) {
+ int i;
+ for (i = 0; i < 3; ++i) {
+ int this_me;
+ MV this_mv;
+ int diff_row;
+ int diff_col;
+ int step;
+
+ if (pred_mv[i].row == INT16_MAX || pred_mv[i].col == INT16_MAX) continue;
+ if (i == best_predmv_idx) continue;
+
+ diff_row = ((int)pred_mv[i].row -
+ pred_mv[i > 0 ? (i - 1) : best_predmv_idx].row) >>
+ 3;
+ diff_col = ((int)pred_mv[i].col -
+ pred_mv[i > 0 ? (i - 1) : best_predmv_idx].col) >>
+ 3;
+ if (diff_row == 0 && diff_col == 0) continue;
+ if (diff_row < 0) diff_row = -diff_row;
+ if (diff_col < 0) diff_col = -diff_col;
+ step = get_msb((diff_row + diff_col + 1) >> 1);
+ if (step <= 0) continue;
+
+ mvp_full = pred_mv[i];
+ mvp_full.col >>= 3;
+ mvp_full.row >>= 3;
+#if CONFIG_NON_GREEDY_MV
+ this_me = vp9_full_pixel_diamond_new(
+ cpi, x, bsize, &mvp_full,
+ VPXMAX(step_param, MAX_MVSEARCH_STEPS - step), lambda, 1, nb_full_mvs,
+ nb_full_mv_num, &this_mv);
+#else // CONFIG_NON_GREEDY_MV
+ this_me = vp9_full_pixel_search(
+ cpi, x, bsize, &mvp_full,
+ VPXMAX(step_param, MAX_MVSEARCH_STEPS - step),
+ cpi->sf.mv.search_method, sadpb, cond_cost_list(cpi, cost_list),
+ &ref_mv, &this_mv, INT_MAX, 1);
+#endif // CONFIG_NON_GREEDY_MV
+ if (this_me < bestsme) {
+ tmp_mv->as_mv = this_mv;
+ bestsme = this_me;
+ }
+ }
+ }
+
+ x->mv_limits = tmp_mv_limits;
+
+ if (bestsme < INT_MAX) {
+ uint32_t dis; /* TODO: use dis in distortion calculation later. */
+ cpi->find_fractional_mv_step(
+ x, &tmp_mv->as_mv, &ref_mv, cm->allow_high_precision_mv, x->errorperbit,
+ &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
+ cpi->sf.mv.subpel_search_level, cond_cost_list(cpi, cost_list),
+ x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, pw, ph,
+ cpi->sf.use_accurate_subpel_search);
+ }
+ *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv, x->nmvjointcost,
+ x->mvcost, MV_COST_WEIGHT);
+
+ x->pred_mv[ref] = tmp_mv->as_mv;
+
+ if (scaled_ref_frame) {
+ int i;
+ for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i];
+ }
+}
+
+static INLINE void restore_dst_buf(MACROBLOCKD *xd,
+ uint8_t *orig_dst[MAX_MB_PLANE],
+ int orig_dst_stride[MAX_MB_PLANE]) {
+ int i;
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = orig_dst[i];
+ xd->plane[i].dst.stride = orig_dst_stride[i];
+ }
+}
+
+// In some situations we want to discount the apparent cost of a new motion
+// vector. Where there is a subtle motion field and especially where there is
+// low spatial complexity then it can be hard to cover the cost of a new motion
+// vector in a single block, even if that motion vector reduces distortion.
+// However, once established that vector may be usable through the nearest and
+// near mv modes to reduce distortion in subsequent blocks and also improve
+// visual quality.
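+// Returns 1 if the rate cost of this NEWMV candidate should be discounted,
+// 0 otherwise.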
+static int discount_newmv_test(VP9_COMP *cpi, int this_mode, int_mv this_mv,
+ int_mv (*mode_mv)[MAX_REF_FRAMES], int ref_frame,
+ int mi_row, int mi_col, BLOCK_SIZE bsize) {
+#if CONFIG_NON_GREEDY_MV
+ (void)mode_mv;
+ (void)this_mv;
+ if (this_mode == NEWMV && bsize >= BLOCK_8X8 && cpi->tpl_ready) {
+ const int gf_group_idx = cpi->twopass.gf_group.index;
+ const int gf_rf_idx = ref_frame_to_gf_rf_idx(ref_frame);
+ const TplDepFrame tpl_frame = cpi->tpl_stats[gf_group_idx];
+ const MotionField *motion_field = vp9_motion_field_info_get_motion_field(
+ &cpi->motion_field_info, gf_group_idx, gf_rf_idx, cpi->tpl_bsize);
+ const int tpl_block_mi_h = num_8x8_blocks_high_lookup[cpi->tpl_bsize];
+ const int tpl_block_mi_w = num_8x8_blocks_wide_lookup[cpi->tpl_bsize];
+ const int tpl_mi_row = mi_row - (mi_row % tpl_block_mi_h);
+ const int tpl_mi_col = mi_col - (mi_col % tpl_block_mi_w);
+ const int mv_mode =
+ tpl_frame
+ .mv_mode_arr[gf_rf_idx][tpl_mi_row * tpl_frame.stride + tpl_mi_col];
+ if (mv_mode == NEW_MV_MODE) {
+ int_mv tpl_new_mv =
+ vp9_motion_field_mi_get_mv(motion_field, tpl_mi_row, tpl_mi_col);
+ int row_diff = abs(tpl_new_mv.as_mv.row - this_mv.as_mv.row);
+ int col_diff = abs(tpl_new_mv.as_mv.col - this_mv.as_mv.col);
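+ // MVs are in 1/8-pel units, so a difference of 8 corresponds to one full
+ // pel from the tpl motion estimate.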
+ if (VPXMAX(row_diff, col_diff) <= 8) {
+ return 1;
+ } else {
+ return 0;
+ }
+ } else {
+ return 0;
+ }
+ } else {
+ return 0;
+ }
+#else
+ (void)mi_row;
+ (void)mi_col;
+ (void)bsize;
+ return (!cpi->rc.is_src_frame_alt_ref && (this_mode == NEWMV) &&
+ (this_mv.as_int != 0) &&
+ ((mode_mv[NEARESTMV][ref_frame].as_int == 0) ||
+ (mode_mv[NEARESTMV][ref_frame].as_int == INVALID_MV)) &&
+ ((mode_mv[NEARMV][ref_frame].as_int == 0) ||
+ (mode_mv[NEARMV][ref_frame].as_int == INVALID_MV)));
+#endif
+}
+
+static int64_t handle_inter_mode(
+ VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int *rate2,
+ int64_t *distortion, int *skippable, int *rate_y, int *rate_uv,
+ struct buf_2d *recon, int *disable_skip, int_mv (*mode_mv)[MAX_REF_FRAMES],
+ int mi_row, int mi_col, int_mv single_newmv[MAX_REF_FRAMES],
+ INTERP_FILTER (*single_filter)[MAX_REF_FRAMES],
+ int (*single_skippable)[MAX_REF_FRAMES], int *single_mode_rate,
+ int64_t *psse, const int64_t ref_best_rd, int64_t *mask_filter,
+ int64_t filter_cache[], int best_mode_index) {
+ VP9_COMMON *cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ MODE_INFO *mi = xd->mi[0];
+ MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+ const int is_comp_pred = has_second_ref(mi);
+ const int this_mode = mi->mode;
+ int_mv *frame_mv = mode_mv[this_mode];
+ int i;
+ int refs[2] = { mi->ref_frame[0],
+ (mi->ref_frame[1] < 0 ? 0 : mi->ref_frame[1]) };
+ int_mv cur_mv[2];
+#if CONFIG_VP9_HIGHBITDEPTH
+ DECLARE_ALIGNED(16, uint16_t, tmp_buf16[MAX_MB_PLANE * 64 * 64]);
+ uint8_t *tmp_buf;
+#else
+ DECLARE_ALIGNED(16, uint8_t, tmp_buf[MAX_MB_PLANE * 64 * 64]);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ int pred_exists = 0;
+ int intpel_mv;
+ int64_t rd, tmp_rd, best_rd = INT64_MAX;
+ int best_needs_copy = 0;
+ uint8_t *orig_dst[MAX_MB_PLANE];
+ int orig_dst_stride[MAX_MB_PLANE];
+ int rs = 0;
+ INTERP_FILTER best_filter = SWITCHABLE;
+ uint8_t skip_txfm[MAX_MB_PLANE << 2] = { 0 };
+ int64_t bsse[MAX_MB_PLANE << 2] = { 0 };
+
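+ // Chessboard parity of this block's position combined with a per-frame
+ // index; used to alternate the prediction filter search in a checkerboard
+ // pattern across blocks and frames.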
+ const int bsl = mi_width_log2_lookup[bsize];
+ const int blk_parity = (((mi_row + mi_col) >> bsl) +
+ get_chessboard_index(cm->current_video_frame)) &
+ 0x1;
+ const int pred_filter_search =
+ (cpi->sf.cb_pred_filter_search >= 2) && blk_parity;
+
+ int skip_txfm_sb = 0;
+ int64_t skip_sse_sb = INT64_MAX;
+ int64_t distortion_y = 0, distortion_uv = 0;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ tmp_buf = CONVERT_TO_BYTEPTR(tmp_buf16);
+ } else {
+ tmp_buf = (uint8_t *)tmp_buf16;
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ if (pred_filter_search) {
+ INTERP_FILTER af = SWITCHABLE, lf = SWITCHABLE;
+ if (xd->above_mi && is_inter_block(xd->above_mi))
+ af = xd->above_mi->interp_filter;
+ if (xd->left_mi && is_inter_block(xd->left_mi))
+ lf = xd->left_mi->interp_filter;
+
+ if ((this_mode != NEWMV) || (af == lf)) best_filter = af;
+ }
+
+ if (is_comp_pred) {
+ if (frame_mv[refs[0]].as_int == INVALID_MV ||
+ frame_mv[refs[1]].as_int == INVALID_MV)
+ return INT64_MAX;
+
+ if (cpi->sf.adaptive_mode_search) {
+ if (single_filter[this_mode][refs[0]] ==
+ single_filter[this_mode][refs[1]])
+ best_filter = single_filter[this_mode][refs[0]];
+ }
+ }
+
+ if (this_mode == NEWMV) {
+ int rate_mv;
+ if (is_comp_pred) {
+ // Decide number of joint motion search iterations
+ const int num_joint_search_iters = get_joint_search_iters(
+ cpi->sf.comp_inter_joint_search_iter_level, bsize);
+
+ // Initialize mv using single prediction mode result.
+ frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
+ frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
+
+ if (num_joint_search_iters) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, joint_motion_search_time);
+#endif
+ joint_motion_search(cpi, x, bsize, frame_mv, mi_row, mi_col,
+ single_newmv, &rate_mv, num_joint_search_iters);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, joint_motion_search_time);
+#endif
+ } else {
+ rate_mv = vp9_mv_bit_cost(&frame_mv[refs[0]].as_mv,
+ &x->mbmi_ext->ref_mvs[refs[0]][0].as_mv,
+ x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+ rate_mv += vp9_mv_bit_cost(&frame_mv[refs[1]].as_mv,
+ &x->mbmi_ext->ref_mvs[refs[1]][0].as_mv,
+ x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+ }
+ *rate2 += rate_mv;
+ } else {
+ int_mv tmp_mv;
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, single_motion_search_time);
+#endif
+ single_motion_search(cpi, x, bsize, mi_row, mi_col, &tmp_mv, &rate_mv);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, single_motion_search_time);
+#endif
+ if (tmp_mv.as_int == INVALID_MV) return INT64_MAX;
+
+ frame_mv[refs[0]].as_int = xd->mi[0]->bmi[0].as_mv[0].as_int =
+ tmp_mv.as_int;
+ single_newmv[refs[0]].as_int = tmp_mv.as_int;
+
+ // Estimate the rate implications of a new mv but discount this
+ // under certain circumstances where we want to help initiate a weak
+ // motion field, where the distortion gain for a single block may not
+ // be enough to overcome the cost of a new mv.
+ if (discount_newmv_test(cpi, this_mode, tmp_mv, mode_mv, refs[0], mi_row,
+ mi_col, bsize)) {
+ *rate2 += VPXMAX((rate_mv / NEW_MV_DISCOUNT_FACTOR), 1);
+ } else {
+ *rate2 += rate_mv;
+ }
+ }
+ }
+
+ for (i = 0; i < is_comp_pred + 1; ++i) {
+ cur_mv[i] = frame_mv[refs[i]];
+ // Clip "next_nearest" so that it does not extend too far out of the image.
+ if (this_mode != NEWMV) clamp_mv2(&cur_mv[i].as_mv, xd);
+
+ if (mv_check_bounds(&x->mv_limits, &cur_mv[i].as_mv)) return INT64_MAX;
+ mi->mv[i].as_int = cur_mv[i].as_int;
+ }
+
+ // Do the first prediction into the destination buffer. Do the next
+ // prediction into a temporary buffer. Then keep track of which one
+ // of these currently holds the best predictor, and use the other
+ // one for future predictions. In the end, copy from tmp_buf to
+ // dst if necessary.
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ orig_dst[i] = xd->plane[i].dst.buf;
+ orig_dst_stride[i] = xd->plane[i].dst.stride;
+ }
+
+ // We don't include the cost of the second reference here, because there
+ // are only two options: Last/ARF or Golden/ARF; the second one is always
+ // known, which is ARF.
+ //
+ // Under some circumstances we discount the cost of new mv mode to encourage
+ // initiation of a motion field.
+ if (discount_newmv_test(cpi, this_mode, frame_mv[refs[0]], mode_mv, refs[0],
+ mi_row, mi_col, bsize)) {
+ *rate2 +=
+ VPXMIN(cost_mv_ref(cpi, this_mode, mbmi_ext->mode_context[refs[0]]),
+ cost_mv_ref(cpi, NEARESTMV, mbmi_ext->mode_context[refs[0]]));
+ } else {
+ *rate2 += cost_mv_ref(cpi, this_mode, mbmi_ext->mode_context[refs[0]]);
+ }
+
+ if (!is_comp_pred && cpi->sf.prune_single_mode_based_on_mv_diff_mode_rate) {
+ single_mode_rate[INTER_OFFSET(this_mode)] = *rate2;
+ // Prune NEARMV and ZEROMV modes based on motion vector difference and mode
+ // rate.
+ if (skip_single_mode_based_on_mode_rate(mode_mv, single_mode_rate,
+ this_mode, refs[0], *rate2,
+ best_mode_index)) {
+ // When the single inter mode is pruned here, check that NEARESTMV or NEWMV
+ // has not been early terminated. This ensures that not all single modes are
+ // skipped when the speed feature is enabled.
+ assert(single_mode_rate[INTER_OFFSET(NEARESTMV)] != INT_MAX ||
+ single_mode_rate[INTER_OFFSET(NEWMV)] != INT_MAX);
+ return INT64_MAX;
+ }
+ }
+ if (RDCOST(x->rdmult, x->rddiv, *rate2, 0) > ref_best_rd &&
+ mi->mode != NEARESTMV)
+ return INT64_MAX;
+
+ pred_exists = 0;
+ // Are all MVs integer pel for Y and UV
+ intpel_mv = !mv_has_subpel(&mi->mv[0].as_mv);
+ if (is_comp_pred) intpel_mv &= !mv_has_subpel(&mi->mv[1].as_mv);
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, interp_filter_time);
+#endif
+ // Search for best switchable filter by checking the variance of
+ // pred error irrespective of whether the filter will be used
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) filter_cache[i] = INT64_MAX;
+
+ if (cm->interp_filter != BILINEAR) {
+ // Use cb pattern for filter eval when filter is not switchable
+ const int enable_interp_search =
+ (cpi->sf.cb_pred_filter_search && cm->interp_filter != SWITCHABLE)
+ ? blk_parity
+ : 1;
+ if (x->source_variance < cpi->sf.disable_filter_search_var_thresh) {
+ best_filter = EIGHTTAP;
+ } else if (best_filter == SWITCHABLE && enable_interp_search) {
+ int newbest;
+ int tmp_rate_sum = 0;
+ int64_t tmp_dist_sum = 0;
+
+ for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
+ int j;
+ int64_t rs_rd;
+ int tmp_skip_sb = 0;
+ int64_t tmp_skip_sse = INT64_MAX;
+ const int enable_earlyterm =
+ cpi->sf.early_term_interp_search_plane_rd && cm->interp_filter != i;
+ int64_t filt_best_rd;
+
+ mi->interp_filter = i;
+ rs = vp9_get_switchable_rate(cpi, xd);
+ rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
+
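+ // With full-pel MVs every interpolation filter produces the same
+ // prediction, so reuse the rate/distortion measured for the first filter.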
+ if (i > 0 && intpel_mv) {
+ rd = RDCOST(x->rdmult, x->rddiv, tmp_rate_sum, tmp_dist_sum);
+ filter_cache[i] = rd;
+ filter_cache[SWITCHABLE_FILTERS] =
+ VPXMIN(filter_cache[SWITCHABLE_FILTERS], rd + rs_rd);
+ if (cm->interp_filter == SWITCHABLE) rd += rs_rd;
+ *mask_filter = VPXMAX(*mask_filter, rd);
+ } else {
+ int rate_sum = 0;
+ int64_t dist_sum = 0;
+ if (i > 0 && cpi->sf.adaptive_interp_filter_search &&
+ (cpi->sf.interp_filter_search_mask & (1 << i))) {
+ rate_sum = INT_MAX;
+ dist_sum = INT64_MAX;
+ continue;
+ }
+
+ if ((cm->interp_filter == SWITCHABLE && (!i || best_needs_copy)) ||
+ (cm->interp_filter != SWITCHABLE &&
+ (cm->interp_filter == mi->interp_filter ||
+ (i == 0 && intpel_mv)))) {
+ restore_dst_buf(xd, orig_dst, orig_dst_stride);
+ } else {
+ for (j = 0; j < MAX_MB_PLANE; j++) {
+ xd->plane[j].dst.buf = tmp_buf + j * 64 * 64;
+ xd->plane[j].dst.stride = 64;
+ }
+ }
+
+ filt_best_rd =
+ cm->interp_filter == SWITCHABLE ? (best_rd - rs_rd) : best_rd;
+ if (build_inter_pred_model_rd_earlyterm(
+ cpi, mi_row, mi_col, bsize, x, xd, &rate_sum, &dist_sum,
+ &tmp_skip_sb, &tmp_skip_sse, enable_earlyterm,
+ filt_best_rd)) {
+ filter_cache[i] = INT64_MAX;
+ continue;
+ }
+
+ rd = RDCOST(x->rdmult, x->rddiv, rate_sum, dist_sum);
+ filter_cache[i] = rd;
+ filter_cache[SWITCHABLE_FILTERS] =
+ VPXMIN(filter_cache[SWITCHABLE_FILTERS], rd + rs_rd);
+ if (cm->interp_filter == SWITCHABLE) rd += rs_rd;
+ *mask_filter = VPXMAX(*mask_filter, rd);
+
+ if (i == 0 && intpel_mv) {
+ tmp_rate_sum = rate_sum;
+ tmp_dist_sum = dist_sum;
+ }
+ }
+
+ if (i == 0 && cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
+ if (rd / 2 > ref_best_rd) {
+ restore_dst_buf(xd, orig_dst, orig_dst_stride);
+ return INT64_MAX;
+ }
+ }
+ newbest = i == 0 || rd < best_rd;
+
+ if (newbest) {
+ best_rd = rd;
+ best_filter = mi->interp_filter;
+ if (cm->interp_filter == SWITCHABLE && i && !intpel_mv)
+ best_needs_copy = !best_needs_copy;
+ }
+
+ if ((cm->interp_filter == SWITCHABLE && newbest) ||
+ (cm->interp_filter != SWITCHABLE &&
+ cm->interp_filter == mi->interp_filter)) {
+ pred_exists = 1;
+ tmp_rd = best_rd;
+
+ skip_txfm_sb = tmp_skip_sb;
+ skip_sse_sb = tmp_skip_sse;
+ memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm));
+ memcpy(bsse, x->bsse, sizeof(bsse));
+ }
+ }
+ restore_dst_buf(xd, orig_dst, orig_dst_stride);
+ }
+ }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, interp_filter_time);
+#endif
+ // Set the appropriate filter
+ mi->interp_filter =
+ cm->interp_filter != SWITCHABLE ? cm->interp_filter : best_filter;
+ rs = cm->interp_filter == SWITCHABLE ? vp9_get_switchable_rate(cpi, xd) : 0;
+
+ if (pred_exists) {
+ if (best_needs_copy) {
+ // Again temporarily set the buffers to local memory to prevent a memcpy.
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = tmp_buf + i * 64 * 64;
+ xd->plane[i].dst.stride = 64;
+ }
+ }
+ rd = tmp_rd + RDCOST(x->rdmult, x->rddiv, rs, 0);
+ } else {
+ int tmp_rate;
+ int64_t tmp_dist;
+ // Handles the special case when a filter that is not in the
+ // switchable list (e.g. bilinear) is indicated at the frame level, or
+ // the skip condition holds.
+ build_inter_pred_model_rd_earlyterm(
+ cpi, mi_row, mi_col, bsize, x, xd, &tmp_rate, &tmp_dist, &skip_txfm_sb,
+ &skip_sse_sb, 0 /*do_earlyterm*/, INT64_MAX);
+ rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist);
+ memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm));
+ memcpy(bsse, x->bsse, sizeof(bsse));
+ }
+
+ if (!is_comp_pred) single_filter[this_mode][refs[0]] = mi->interp_filter;
+
+ if (cpi->sf.adaptive_mode_search)
+ if (is_comp_pred)
+ if (single_skippable[this_mode][refs[0]] &&
+ single_skippable[this_mode][refs[1]])
+ memset(skip_txfm, SKIP_TXFM_AC_DC, sizeof(skip_txfm));
+
+ if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
+ // If current pred_error modeled rd is substantially more than the best
+ // so far, do not bother doing full rd.
+ if (rd / 2 > ref_best_rd) {
+ restore_dst_buf(xd, orig_dst, orig_dst_stride);
+ return INT64_MAX;
+ }
+ }
+
+ if (cm->interp_filter == SWITCHABLE) *rate2 += rs;
+
+ memcpy(x->skip_txfm, skip_txfm, sizeof(skip_txfm));
+ memcpy(x->bsse, bsse, sizeof(bsse));
+
+ if (!skip_txfm_sb || xd->lossless) {
+ int skippable_y, skippable_uv;
+ int64_t sseuv = INT64_MAX;
+ int64_t rdcosty = INT64_MAX;
+
+ // Y cost and distortion
+ vp9_subtract_plane(x, bsize, 0);
+ super_block_yrd(cpi, x, rate_y, &distortion_y, &skippable_y, psse, bsize,
+ ref_best_rd, recon);
+
+ if (*rate_y == INT_MAX) {
+ *rate2 = INT_MAX;
+ *distortion = INT64_MAX;
+ restore_dst_buf(xd, orig_dst, orig_dst_stride);
+ return INT64_MAX;
+ }
+
+ *rate2 += *rate_y;
+ *distortion += distortion_y;
+
+ rdcosty = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion);
+ rdcosty = VPXMIN(rdcosty, RDCOST(x->rdmult, x->rddiv, 0, *psse));
+
+ if (!super_block_uvrd(cpi, x, rate_uv, &distortion_uv, &skippable_uv,
+ &sseuv, bsize, ref_best_rd - rdcosty)) {
+ *rate2 = INT_MAX;
+ *distortion = INT64_MAX;
+ restore_dst_buf(xd, orig_dst, orig_dst_stride);
+ return INT64_MAX;
+ }
+
+ *psse += sseuv;
+ *rate2 += *rate_uv;
+ *distortion += distortion_uv;
+ *skippable = skippable_y && skippable_uv;
+ } else {
+ x->skip = 1;
+ *disable_skip = 1;
+
+ // The cost of the skip bit needs to be added.
+ *rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
+
+ *distortion = skip_sse_sb;
+ }
+
+ if (!is_comp_pred) single_skippable[this_mode][refs[0]] = *skippable;
+
+ restore_dst_buf(xd, orig_dst, orig_dst_stride);
+ return 0; // The rate-distortion cost will be re-calculated by caller.
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *rd_cost,
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+ int64_t best_rd) {
+ VP9_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblockd_plane *const pd = xd->plane;
+ int rate_y = 0, rate_uv = 0, rate_y_tokenonly = 0, rate_uv_tokenonly = 0;
+ int y_skip = 0, uv_skip = 0;
+ int64_t dist_y = 0, dist_uv = 0;
+ TX_SIZE max_uv_tx_size;
+ x->skip_encode = 0;
+ ctx->skip = 0;
+ xd->mi[0]->ref_frame[0] = INTRA_FRAME;
+ xd->mi[0]->ref_frame[1] = NONE;
+ // Initialize interp_filter here so we do not have to check for inter block
+ // modes in get_pred_context_switchable_interp()
+ xd->mi[0]->interp_filter = SWITCHABLE_FILTERS;
+
+ if (bsize >= BLOCK_8X8) {
+ if (rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly, &dist_y,
+ &y_skip, bsize, best_rd) >= best_rd) {
+ rd_cost->rate = INT_MAX;
+ return;
+ }
+ } else {
+ y_skip = 0;
+ if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate_y, &rate_y_tokenonly,
+ &dist_y, best_rd) >= best_rd) {
+ rd_cost->rate = INT_MAX;
+ return;
+ }
+ }
+ max_uv_tx_size = uv_txsize_lookup[bsize][xd->mi[0]->tx_size]
+ [pd[1].subsampling_x][pd[1].subsampling_y];
+ rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv, &rate_uv_tokenonly, &dist_uv,
+ &uv_skip, VPXMAX(BLOCK_8X8, bsize), max_uv_tx_size);
+
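+ // When both planes can be skipped, drop the coefficient (token-only) costs
+ // and charge the skip flag instead.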
+ if (y_skip && uv_skip) {
+ rd_cost->rate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly +
+ vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
+ rd_cost->dist = dist_y + dist_uv;
+ } else {
+ rd_cost->rate =
+ rate_y + rate_uv + vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
+ rd_cost->dist = dist_y + dist_uv;
+ }
+
+ ctx->mic = *xd->mi[0];
+ ctx->mbmi_ext = *x->mbmi_ext;
+ rd_cost->rdcost = RDCOST(x->rdmult, x->rddiv, rd_cost->rate, rd_cost->dist);
+}
+
+#if !CONFIG_REALTIME_ONLY
+// This function is designed to apply a bias or adjustment to an rd value based
+// on the relative variance of the source and reconstruction.
+#define LOW_VAR_THRESH 250
+#define VAR_MULT 250
+static unsigned int max_var_adjust[VP9E_CONTENT_INVALID] = { 16, 16, 250 };
+
+static void rd_variance_adjustment(VP9_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int64_t *this_rd,
+ struct buf_2d *recon,
+ MV_REFERENCE_FRAME ref_frame,
+ MV_REFERENCE_FRAME second_ref_frame,
+ PREDICTION_MODE this_mode) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ unsigned int rec_variance;
+ unsigned int src_variance;
+ unsigned int src_rec_min;
+ unsigned int var_diff = 0;
+ unsigned int var_factor = 0;
+ unsigned int adj_max;
+ unsigned int low_var_thresh = LOW_VAR_THRESH;
+ const int bw = num_8x8_blocks_wide_lookup[bsize];
+ const int bh = num_8x8_blocks_high_lookup[bsize];
+ vp9e_tune_content content_type = cpi->oxcf.content;
+
+ if (*this_rd == INT64_MAX) return;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ rec_variance = vp9_high_get_sby_variance(cpi, recon, bsize, xd->bd);
+ src_variance =
+ vp9_high_get_sby_variance(cpi, &x->plane[0].src, bsize, xd->bd);
+ } else {
+ rec_variance = vp9_get_sby_variance(cpi, recon, bsize);
+ src_variance = vp9_get_sby_variance(cpi, &x->plane[0].src, bsize);
+ }
+#else
+ rec_variance = vp9_get_sby_variance(cpi, recon, bsize);
+ src_variance = vp9_get_sby_variance(cpi, &x->plane[0].src, bsize);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ // Scale based on area in 8x8 blocks
+ rec_variance /= (bw * bh);
+ src_variance /= (bw * bh);
+
+ if (content_type == VP9E_CONTENT_FILM) {
+ if (cpi->oxcf.pass == 2) {
+ // Adjust the low variance threshold based on estimated group noise energy.
+ double noise_factor =
+ (double)cpi->twopass.gf_group.group_noise_energy / SECTION_NOISE_DEF;
+ low_var_thresh = (unsigned int)(low_var_thresh * noise_factor);
+
+ if (ref_frame == INTRA_FRAME) {
+ low_var_thresh *= 2;
+ if (this_mode == DC_PRED) low_var_thresh *= 5;
+ } else if (second_ref_frame > INTRA_FRAME) {
+ low_var_thresh *= 2;
+ }
+ }
+ } else {
+ low_var_thresh = LOW_VAR_THRESH / 2;
+ }
+
+ // Lower of source (raw per pixel value) and recon variance. Note that
+ // if the source per pixel is 0 then the recon value here will not be per
+ // pixel (see above) so will likely be much larger.
+ src_rec_min = VPXMIN(src_variance, rec_variance);
+
+ if (src_rec_min > low_var_thresh) return;
+
+ // We care more when the reconstruction has lower variance so give this case
+ // a stronger weighting.
+ var_diff = (src_variance > rec_variance) ? (src_variance - rec_variance) * 2
+ : (rec_variance - src_variance) / 2;
+
+ adj_max = max_var_adjust[content_type];
+
+ var_factor =
+ (unsigned int)((int64_t)VAR_MULT * var_diff) / VPXMAX(1, src_variance);
+ var_factor = VPXMIN(adj_max, var_factor);
+
+ if ((content_type == VP9E_CONTENT_FILM) &&
+ ((ref_frame == INTRA_FRAME) || (second_ref_frame > INTRA_FRAME))) {
+ var_factor *= 2;
+ }
+
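+ // Scale the rd cost up by var_factor percent (e.g. var_factor == 50 raises
+ // it by half).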
+ *this_rd += (*this_rd * var_factor) / 100;
+
+ (void)xd;
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+// Do we have an internal image edge (e.g. formatting bars).
+int vp9_internal_image_edge(VP9_COMP *cpi) {
+ return (cpi->oxcf.pass == 2) &&
+ ((cpi->twopass.this_frame_stats.inactive_zone_rows > 0) ||
+ (cpi->twopass.this_frame_stats.inactive_zone_cols > 0));
+}
+
+// Checks to see if a super block is on a horizontal image edge.
+// In most cases this is the "real" edge unless there are formatting
+// bars embedded in the stream.
+int vp9_active_h_edge(VP9_COMP *cpi, int mi_row, int mi_step) {
+ int top_edge = 0;
+ int bottom_edge = cpi->common.mi_rows;
+ int is_active_h_edge = 0;
+
+ // For two pass account for any formatting bars detected.
+ if (cpi->oxcf.pass == 2) {
+ TWO_PASS *twopass = &cpi->twopass;
+ vpx_clear_system_state();
+
+ // The inactive region is specified in MBs not mi units.
+ // The image edge is in the following MB row.
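+ // One MB spans two mi units, hence the factor of 2 below.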
+ top_edge += (int)(twopass->this_frame_stats.inactive_zone_rows * 2);
+
+ bottom_edge -= (int)(twopass->this_frame_stats.inactive_zone_rows * 2);
+ bottom_edge = VPXMAX(top_edge, bottom_edge);
+ }
+
+ if (((top_edge >= mi_row) && (top_edge < (mi_row + mi_step))) ||
+ ((bottom_edge >= mi_row) && (bottom_edge < (mi_row + mi_step)))) {
+ is_active_h_edge = 1;
+ }
+ return is_active_h_edge;
+}
+
+// Checks to see if a super block is on a vertical image edge.
+// In most cases this is the "real" edge unless there are formatting
+// bars embedded in the stream.
+int vp9_active_v_edge(VP9_COMP *cpi, int mi_col, int mi_step) {
+ int left_edge = 0;
+ int right_edge = cpi->common.mi_cols;
+ int is_active_v_edge = 0;
+
+ // For two pass account for any formatting bars detected.
+ if (cpi->oxcf.pass == 2) {
+ TWO_PASS *twopass = &cpi->twopass;
+ vpx_clear_system_state();
+
+ // The inactive region is specified in MBs not mi units.
+ // The image edge is in the following MB column.
+ left_edge += (int)(twopass->this_frame_stats.inactive_zone_cols * 2);
+
+ right_edge -= (int)(twopass->this_frame_stats.inactive_zone_cols * 2);
+ right_edge = VPXMAX(left_edge, right_edge);
+ }
+
+ if (((left_edge >= mi_col) && (left_edge < (mi_col + mi_step))) ||
+ ((right_edge >= mi_col) && (right_edge < (mi_col + mi_step)))) {
+ is_active_v_edge = 1;
+ }
+ return is_active_v_edge;
+}
+
+// Checks to see if a super block is at the edge of the active image.
+// In most cases this is the "real" edge unless there are formatting
+// bars embedded in the stream.
+int vp9_active_edge_sb(VP9_COMP *cpi, int mi_row, int mi_col) {
+ return vp9_active_h_edge(cpi, mi_row, MI_BLOCK_SIZE) ||
+ vp9_active_v_edge(cpi, mi_col, MI_BLOCK_SIZE);
+}
+
+#if !CONFIG_REALTIME_ONLY
+void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
+ MACROBLOCK *x, int mi_row, int mi_col,
+ RD_COST *rd_cost, BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far) {
+ VP9_COMMON *const cm = &cpi->common;
+ TileInfo *const tile_info = &tile_data->tile_info;
+ RD_OPT *const rd_opt = &cpi->rd;
+ SPEED_FEATURES *const sf = &cpi->sf;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MODE_INFO *const mi = xd->mi[0];
+ MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+ const struct segmentation *const seg = &cm->seg;
+ PREDICTION_MODE this_mode;
+ MV_REFERENCE_FRAME ref_frame, second_ref_frame;
+ unsigned char segment_id = mi->segment_id;
+ int comp_pred, i, k;
+ int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
+ struct buf_2d yv12_mb[4][MAX_MB_PLANE];
+ int_mv single_newmv[MAX_REF_FRAMES] = { { 0 } };
+ INTERP_FILTER single_inter_filter[MB_MODE_COUNT][MAX_REF_FRAMES];
+ int single_skippable[MB_MODE_COUNT][MAX_REF_FRAMES];
+ int single_mode_rate[MAX_REF_FRAMES][INTER_MODES];
+ int64_t best_rd = best_rd_so_far;
+ int64_t best_pred_diff[REFERENCE_MODES];
+ int64_t best_pred_rd[REFERENCE_MODES];
+ int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS];
+ int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
+ MODE_INFO best_mbmode;
+ int best_mode_skippable = 0;
+ int midx, best_mode_index = -1;
+ unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
+ vpx_prob comp_mode_p;
+ int64_t best_intra_rd = INT64_MAX;
+ unsigned int best_pred_sse = UINT_MAX;
+ PREDICTION_MODE best_intra_mode = DC_PRED;
+ int rate_uv_intra[TX_SIZES], rate_uv_tokenonly[TX_SIZES];
+ int64_t dist_uv[TX_SIZES];
+ int skip_uv[TX_SIZES];
+ PREDICTION_MODE mode_uv[TX_SIZES];
+ const int intra_cost_penalty =
+ vp9_get_intra_cost_penalty(cpi, bsize, cm->base_qindex, cm->y_dc_delta_q);
+ int best_skip2 = 0;
+ uint8_t ref_frame_skip_mask[2] = { 0, 1 };
+ uint16_t mode_skip_mask[MAX_REF_FRAMES] = { 0 };
+ int mode_skip_start = sf->mode_skip_start + 1;
+ const int *const rd_threshes = rd_opt->threshes[segment_id][bsize];
+ const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize];
+ int64_t mode_threshold[MAX_MODES];
+ int8_t *tile_mode_map = tile_data->mode_map[bsize];
+ int8_t mode_map[MAX_MODES]; // Maintain mode_map information locally to avoid
+ // lock mechanism involved with reads from
+ // tile_mode_map
+ const int mode_search_skip_flags = sf->mode_search_skip_flags;
+ const int is_rect_partition =
+ num_4x4_blocks_wide_lookup[bsize] != num_4x4_blocks_high_lookup[bsize];
+ int64_t mask_filter = 0;
+ int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS];
+
+ struct buf_2d *recon;
+ struct buf_2d recon_buf;
+#if CONFIG_VP9_HIGHBITDEPTH
+ DECLARE_ALIGNED(16, uint16_t, recon16[64 * 64]);
+ recon_buf.buf = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH
+ ? CONVERT_TO_BYTEPTR(recon16)
+ : (uint8_t *)recon16;
+#else
+ DECLARE_ALIGNED(16, uint8_t, recon8[64 * 64]);
+ recon_buf.buf = recon8;
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ recon_buf.stride = 64;
+ recon = cpi->oxcf.content == VP9E_CONTENT_FILM ? &recon_buf : 0;
+
+ vp9_zero(best_mbmode);
+
+ x->skip_encode = sf->skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
+
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) filter_cache[i] = INT64_MAX;
+
+ estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp,
+ &comp_mode_p);
+
+ for (i = 0; i < REFERENCE_MODES; ++i) best_pred_rd[i] = INT64_MAX;
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
+ best_filter_rd[i] = INT64_MAX;
+ for (i = 0; i < TX_SIZES; i++) rate_uv_intra[i] = INT_MAX;
+ for (i = 0; i < MAX_REF_FRAMES; ++i) x->pred_sse[i] = INT_MAX;
+ for (i = 0; i < MB_MODE_COUNT; ++i) {
+ for (k = 0; k < MAX_REF_FRAMES; ++k) {
+ single_inter_filter[i][k] = SWITCHABLE;
+ single_skippable[i][k] = 0;
+ }
+ }
+
+ rd_cost->rate = INT_MAX;
+
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ x->pred_mv_sad[ref_frame] = INT_MAX;
+ if ((cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) &&
+ !(is_rect_partition && (ctx->skip_ref_frame_mask & (1 << ref_frame)))) {
+ assert(get_ref_frame_buffer(cpi, ref_frame) != NULL);
+ setup_buffer_inter(cpi, x, ref_frame, bsize, mi_row, mi_col,
+ frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb);
+ }
+ frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
+ frame_mv[ZEROMV][ref_frame].as_int = 0;
+ }
+
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ if (!(cpi->ref_frame_flags & ref_frame_to_flag(ref_frame))) {
+ // Skip checking missing references in both single and compound reference
+ // modes. Note that a mode will be skipped if both reference frames
+ // are masked out.
+ ref_frame_skip_mask[0] |= (1 << ref_frame);
+ ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+ } else if (sf->reference_masking) {
+ for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
+ // Skip fixed mv modes for poor references
+ if ((x->pred_mv_sad[ref_frame] >> 2) > x->pred_mv_sad[i]) {
+ mode_skip_mask[ref_frame] |= INTER_NEAREST_NEAR_ZERO;
+ break;
+ }
+ }
+ }
+ // If the segment reference frame feature is enabled, then do nothing if
+ // the current ref frame is not allowed.
+ if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
+ get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) {
+ ref_frame_skip_mask[0] |= (1 << ref_frame);
+ ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+ }
+ }
+
+ // Disable this drop out case if the ref frame
+ // segment level feature is enabled for this segment. This is to
+ // prevent the possibility that we end up unable to pick any mode.
+ if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
+ // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
+ // unless ARNR filtering is enabled in which case we want
+ // an unfiltered alternative. We allow near/nearest as well
+ // because they may result in zero-zero MVs but be cheaper.
+ if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
+ ref_frame_skip_mask[0] = (1 << LAST_FRAME) | (1 << GOLDEN_FRAME);
+ ref_frame_skip_mask[1] = SECOND_REF_FRAME_MASK;
+ mode_skip_mask[ALTREF_FRAME] = ~INTER_NEAREST_NEAR_ZERO;
+ if (frame_mv[NEARMV][ALTREF_FRAME].as_int != 0)
+ mode_skip_mask[ALTREF_FRAME] |= (1 << NEARMV);
+ if (frame_mv[NEARESTMV][ALTREF_FRAME].as_int != 0)
+ mode_skip_mask[ALTREF_FRAME] |= (1 << NEARESTMV);
+ }
+ }
+
+ if (cpi->rc.is_src_frame_alt_ref) {
+ if (sf->alt_ref_search_fp) {
+ mode_skip_mask[ALTREF_FRAME] = 0;
+ ref_frame_skip_mask[0] = ~(1 << ALTREF_FRAME) & 0xff;
+ ref_frame_skip_mask[1] = SECOND_REF_FRAME_MASK;
+ }
+ }
+
+ if (sf->alt_ref_search_fp)
+ if (!cm->show_frame && x->pred_mv_sad[GOLDEN_FRAME] < INT_MAX)
+ if (x->pred_mv_sad[ALTREF_FRAME] > (x->pred_mv_sad[GOLDEN_FRAME] << 1))
+ mode_skip_mask[ALTREF_FRAME] |= INTER_ALL;
+
+ if (sf->adaptive_mode_search) {
+ if (cm->show_frame && !cpi->rc.is_src_frame_alt_ref &&
+ cpi->rc.frames_since_golden >= 3)
+ if (x->pred_mv_sad[GOLDEN_FRAME] > (x->pred_mv_sad[LAST_FRAME] << 1))
+ mode_skip_mask[GOLDEN_FRAME] |= INTER_ALL;
+ }
+
+ if (bsize > sf->max_intra_bsize) {
+ ref_frame_skip_mask[0] |= (1 << INTRA_FRAME);
+ ref_frame_skip_mask[1] |= (1 << INTRA_FRAME);
+ }
+
+ mode_skip_mask[INTRA_FRAME] |=
+ (uint16_t) ~(sf->intra_y_mode_mask[max_txsize_lookup[bsize]]);
+
+ for (i = 0; i <= LAST_NEW_MV_INDEX; ++i) mode_threshold[i] = 0;
+
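+ // Per-mode rd skip thresholds, scaled by the per-mode frequency factor and
+ // divided by 32 (the shift by 5).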
+ for (i = LAST_NEW_MV_INDEX + 1; i < MAX_MODES; ++i)
+ mode_threshold[i] = ((int64_t)rd_threshes[i] * rd_thresh_freq_fact[i]) >> 5;
+
+ midx = sf->schedule_mode_search ? mode_skip_start : 0;
+
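+ // Sort the tile's shared mode map entries (after the first few fixed modes)
+ // in ascending order of rd threshold via repeated bubble passes.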
+ while (midx > 4) {
+ uint8_t end_pos = 0;
+ for (i = 5; i < midx; ++i) {
+ if (mode_threshold[tile_mode_map[i - 1]] >
+ mode_threshold[tile_mode_map[i]]) {
+ uint8_t tmp = tile_mode_map[i];
+ tile_mode_map[i] = tile_mode_map[i - 1];
+ tile_mode_map[i - 1] = tmp;
+ end_pos = i;
+ }
+ }
+ midx = end_pos;
+ }
+
+ memcpy(mode_map, tile_mode_map, sizeof(mode_map));
+
+ for (midx = 0; midx < MAX_MODES; ++midx) {
+ int mode_index = mode_map[midx];
+ int mode_excluded = 0;
+ int64_t this_rd = INT64_MAX;
+ int disable_skip = 0;
+ int compmode_cost = 0;
+ int rate2 = 0, rate_y = 0, rate_uv = 0;
+ int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0;
+ int skippable = 0;
+ int this_skip2 = 0;
+ int64_t total_sse = INT64_MAX;
+ int early_term = 0;
+
+ this_mode = vp9_mode_order[mode_index].mode;
+ ref_frame = vp9_mode_order[mode_index].ref_frame[0];
+ second_ref_frame = vp9_mode_order[mode_index].ref_frame[1];
+
+ vp9_zero(x->sum_y_eobs);
+ comp_pred = second_ref_frame > INTRA_FRAME;
+ if (!comp_pred && ref_frame != INTRA_FRAME &&
+ sf->prune_single_mode_based_on_mv_diff_mode_rate)
+ single_mode_rate[ref_frame][INTER_OFFSET(this_mode)] = INT_MAX;
+
+ if (is_rect_partition) {
+ if (ctx->skip_ref_frame_mask & (1 << ref_frame)) continue;
+ if (second_ref_frame > 0 &&
+ (ctx->skip_ref_frame_mask & (1 << second_ref_frame)))
+ continue;
+ }
+
+ // Look at the reference frame of the best mode so far and set the
+ // skip mask to look at a subset of the remaining modes.
+ if (midx == mode_skip_start && best_mode_index >= 0) {
+ switch (best_mbmode.ref_frame[0]) {
+ case INTRA_FRAME: break;
+ case LAST_FRAME: ref_frame_skip_mask[0] |= LAST_FRAME_MODE_MASK; break;
+ case GOLDEN_FRAME:
+ ref_frame_skip_mask[0] |= GOLDEN_FRAME_MODE_MASK;
+ break;
+ case ALTREF_FRAME: ref_frame_skip_mask[0] |= ALT_REF_MODE_MASK; break;
+ case NONE:
+ case MAX_REF_FRAMES: assert(0 && "Invalid Reference frame"); break;
+ }
+ }
+
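+ // second_ref_frame is NONE (-1) for single-reference modes, so clamp to 0
+ // when indexing the second skip mask.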
+ if ((ref_frame_skip_mask[0] & (1 << ref_frame)) &&
+ (ref_frame_skip_mask[1] & (1 << VPXMAX(0, second_ref_frame))))
+ continue;
+
+ if (mode_skip_mask[ref_frame] & (1 << this_mode)) continue;
+
+ // Test best rd so far against threshold for trying this mode.
+ if (best_mode_skippable && sf->schedule_mode_search)
+ mode_threshold[mode_index] <<= 1;
+
+ if (best_rd < mode_threshold[mode_index]) continue;
+
+ // This is only used in the motion vector unit test.
+ if (cpi->oxcf.motion_vector_unit_test && ref_frame == INTRA_FRAME) continue;
+
+ if (sf->motion_field_mode_search) {
+ const int mi_width = VPXMIN(num_8x8_blocks_wide_lookup[bsize],
+ tile_info->mi_col_end - mi_col);
+ const int mi_height = VPXMIN(num_8x8_blocks_high_lookup[bsize],
+ tile_info->mi_row_end - mi_row);
+ const int bsl = mi_width_log2_lookup[bsize];
+ int cb_partition_search_ctrl =
+ (((mi_row + mi_col) >> bsl) +
+ get_chessboard_index(cm->current_video_frame)) &
+ 0x1;
+ MODE_INFO *ref_mi;
+ int const_motion = 1;
+ int skip_ref_frame = !cb_partition_search_ctrl;
+ MV_REFERENCE_FRAME rf = NONE;
+ int_mv ref_mv;
+ ref_mv.as_int = INVALID_MV;
+
+ if ((mi_row - 1) >= tile_info->mi_row_start) {
+ ref_mv = xd->mi[-xd->mi_stride]->mv[0];
+ rf = xd->mi[-xd->mi_stride]->ref_frame[0];
+ for (i = 0; i < mi_width; ++i) {
+ ref_mi = xd->mi[-xd->mi_stride + i];
+ const_motion &= (ref_mv.as_int == ref_mi->mv[0].as_int) &&
+ (ref_frame == ref_mi->ref_frame[0]);
+ skip_ref_frame &= (rf == ref_mi->ref_frame[0]);
+ }
+ }
+
+ if ((mi_col - 1) >= tile_info->mi_col_start) {
+ if (ref_mv.as_int == INVALID_MV) ref_mv = xd->mi[-1]->mv[0];
+ if (rf == NONE) rf = xd->mi[-1]->ref_frame[0];
+ for (i = 0; i < mi_height; ++i) {
+ ref_mi = xd->mi[i * xd->mi_stride - 1];
+ const_motion &= (ref_mv.as_int == ref_mi->mv[0].as_int) &&
+ (ref_frame == ref_mi->ref_frame[0]);
+ skip_ref_frame &= (rf == ref_mi->ref_frame[0]);
+ }
+ }
+
+ if (skip_ref_frame && this_mode != NEARESTMV && this_mode != NEWMV)
+ if (rf > INTRA_FRAME)
+ if (ref_frame != rf) continue;
+
+ if (const_motion)
+ if (this_mode == NEARMV || this_mode == ZEROMV) continue;
+ }
+
+ if (comp_pred) {
+ if (!cpi->allow_comp_inter_inter) continue;
+
+ if (cm->ref_frame_sign_bias[ref_frame] ==
+ cm->ref_frame_sign_bias[second_ref_frame])
+ continue;
+
+ // Skip compound inter modes if ARF is not available.
+ if (!(cpi->ref_frame_flags & ref_frame_to_flag(second_ref_frame)))
+ continue;
+
+ // Do not allow compound prediction if the segment level reference frame
+ // feature is in use as in this case there can only be one reference.
+ if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) continue;
+
+ if ((mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) &&
+ best_mode_index >= 0 && best_mbmode.ref_frame[0] == INTRA_FRAME)
+ continue;
+
+ mode_excluded = cm->reference_mode == SINGLE_REFERENCE;
+ } else {
+ if (ref_frame != INTRA_FRAME)
+ mode_excluded = cm->reference_mode == COMPOUND_REFERENCE;
+ }
+
+ if (ref_frame == INTRA_FRAME) {
+ if (sf->adaptive_mode_search)
+ if ((x->source_variance << num_pels_log2_lookup[bsize]) > best_pred_sse)
+ continue;
+
+ if (this_mode != DC_PRED) {
+ // Disable intra modes other than DC_PRED for blocks with low variance
+ // Threshold for intra skipping based on source variance
+ // TODO(debargha): Specialize the threshold for super block sizes
+ const unsigned int skip_intra_var_thresh =
+ (cpi->oxcf.content == VP9E_CONTENT_FILM) ? 0 : 64;
+ if ((mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) &&
+ x->source_variance < skip_intra_var_thresh)
+ continue;
+ // Only search the oblique modes if the best so far is
+ // one of the neighboring directional modes
+ if ((mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) &&
+ (this_mode >= D45_PRED && this_mode <= TM_PRED)) {
+ if (best_mode_index >= 0 && best_mbmode.ref_frame[0] > INTRA_FRAME)
+ continue;
+ }
+ if (mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
+ if (conditional_skipintra(this_mode, best_intra_mode)) continue;
+ }
+ }
+ } else {
+ const MV_REFERENCE_FRAME ref_frames[2] = { ref_frame, second_ref_frame };
+ if (!check_best_zero_mv(cpi, mbmi_ext->mode_context, frame_mv, this_mode,
+ ref_frames))
+ continue;
+ }
+
+ mi->mode = this_mode;
+ mi->uv_mode = DC_PRED;
+ mi->ref_frame[0] = ref_frame;
+ mi->ref_frame[1] = second_ref_frame;
+ // Evaluate all sub-pel filters irrespective of whether we can use
+ // them for this frame.
+ mi->interp_filter =
+ cm->interp_filter == SWITCHABLE ? EIGHTTAP : cm->interp_filter;
+ mi->mv[0].as_int = mi->mv[1].as_int = 0;
+
+ x->skip = 0;
+ set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
+
+ // Select prediction reference frames.
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
+ if (comp_pred) xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
+ }
+
+ if (ref_frame == INTRA_FRAME) {
+ TX_SIZE uv_tx;
+ struct macroblockd_plane *const pd = &xd->plane[1];
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, intra_mode_search_time);
+#endif
+ memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
+ super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL, bsize,
+ best_rd, recon);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, intra_mode_search_time);
+#endif
+ if (rate_y == INT_MAX) continue;
+
+ uv_tx = uv_txsize_lookup[bsize][mi->tx_size][pd->subsampling_x]
+ [pd->subsampling_y];
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, intra_mode_search_time);
+#endif
+ if (rate_uv_intra[uv_tx] == INT_MAX) {
+ choose_intra_uv_mode(cpi, x, ctx, bsize, uv_tx, &rate_uv_intra[uv_tx],
+ &rate_uv_tokenonly[uv_tx], &dist_uv[uv_tx],
+ &skip_uv[uv_tx], &mode_uv[uv_tx]);
+ }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, intra_mode_search_time);
+#endif
+ rate_uv = rate_uv_tokenonly[uv_tx];
+ distortion_uv = dist_uv[uv_tx];
+ skippable = skippable && skip_uv[uv_tx];
+ mi->uv_mode = mode_uv[uv_tx];
+
+ rate2 = rate_y + cpi->mbmode_cost[mi->mode] + rate_uv_intra[uv_tx];
+ if (this_mode != DC_PRED && this_mode != TM_PRED)
+ rate2 += intra_cost_penalty;
+ distortion2 = distortion_y + distortion_uv;
+ } else {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, handle_inter_mode_time);
+#endif
+ this_rd = handle_inter_mode(
+ cpi, x, bsize, &rate2, &distortion2, &skippable, &rate_y, &rate_uv,
+ recon, &disable_skip, frame_mv, mi_row, mi_col, single_newmv,
+ single_inter_filter, single_skippable,
+ &single_mode_rate[ref_frame][0], &total_sse, best_rd, &mask_filter,
+ filter_cache, best_mode_index);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, handle_inter_mode_time);
+#endif
+ if (this_rd == INT64_MAX) continue;
+
+ compmode_cost = vp9_cost_bit(comp_mode_p, comp_pred);
+
+ if (cm->reference_mode == REFERENCE_MODE_SELECT) rate2 += compmode_cost;
+ }
+
+ // Estimate the reference frame signaling cost and add it
+ // to the rolling cost variable.
+ if (comp_pred) {
+ rate2 += ref_costs_comp[ref_frame];
+ } else {
+ rate2 += ref_costs_single[ref_frame];
+ }
+
+ if (!disable_skip) {
+ const vpx_prob skip_prob = vp9_get_skip_prob(cm, xd);
+ const int skip_cost0 = vp9_cost_bit(skip_prob, 0);
+ const int skip_cost1 = vp9_cost_bit(skip_prob, 1);
+
+ if (skippable) {
+ // Back out the coefficient coding costs
+ rate2 -= (rate_y + rate_uv);
+
+ // Cost the skip mb case
+ rate2 += skip_cost1;
+ } else if (ref_frame != INTRA_FRAME && !xd->lossless &&
+ !cpi->oxcf.sharpness) {
+ if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv + skip_cost0,
+ distortion2) <
+ RDCOST(x->rdmult, x->rddiv, skip_cost1, total_sse)) {
+ // Add in the cost of the no skip flag.
+ rate2 += skip_cost0;
+ } else {
+ // FIXME(rbultje) make this work for splitmv also
+ assert(total_sse >= 0);
+
+ rate2 += skip_cost1;
+ distortion2 = total_sse;
+ rate2 -= (rate_y + rate_uv);
+ this_skip2 = 1;
+ }
+ } else {
+ // Add in the cost of the no skip flag.
+ rate2 += skip_cost0;
+ }
+
+ // Calculate the final RD estimate for this mode.
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+ }
+
+ if (recon) {
+ // In film mode bias against DC pred and other intra if there is a
+ // significant difference between the variance of the sub blocks in the
+ // the source. Also apply some bias against compound modes which also
+ // tend to blur fine texture such as film grain over time.
+ //
+ // The sub block test here acts in the case where one or more sub
+ // blocks have relatively high variance but others relatively low
+ // variance. Here the high variance sub blocks may push the
+ // total variance for the current block size over the thresholds
+ // used in rd_variance_adjustment() below.
+ if (cpi->oxcf.content == VP9E_CONTENT_FILM) {
+ if (bsize >= BLOCK_16X16) {
+ int min_energy, max_energy;
+ vp9_get_sub_block_energy(cpi, x, mi_row, mi_col, bsize, &min_energy,
+ &max_energy);
+ if (max_energy > min_energy) {
+ if (ref_frame == INTRA_FRAME) {
+ if (this_mode == DC_PRED)
+ this_rd += (this_rd * (max_energy - min_energy));
+ else
+ this_rd += (this_rd * (max_energy - min_energy)) / 4;
+ } else if (second_ref_frame > INTRA_FRAME) {
+ this_rd += this_rd / 4;
+ }
+ }
+ }
+ }
+ // Apply an adjustment to the rd value based on the similarity of the
+ // source variance and reconstructed variance.
+ rd_variance_adjustment(cpi, x, bsize, &this_rd, recon, ref_frame,
+ second_ref_frame, this_mode);
+ }
+
+ if (ref_frame == INTRA_FRAME) {
+ // Keep record of best intra rd
+ if (this_rd < best_intra_rd) {
+ best_intra_rd = this_rd;
+ best_intra_mode = mi->mode;
+ }
+ }
+
+ if (!disable_skip && ref_frame == INTRA_FRAME) {
+ for (i = 0; i < REFERENCE_MODES; ++i)
+ best_pred_rd[i] = VPXMIN(best_pred_rd[i], this_rd);
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
+ best_filter_rd[i] = VPXMIN(best_filter_rd[i], this_rd);
+ }
+
+ // Did this mode help, i.e. is it the new best mode so far?
+ if (this_rd < best_rd || x->skip) {
+ int max_plane = MAX_MB_PLANE;
+ if (!mode_excluded) {
+ // Note index of best mode so far
+ best_mode_index = mode_index;
+
+ if (ref_frame == INTRA_FRAME) {
+ /* required for left and above block mv */
+ mi->mv[0].as_int = 0;
+ max_plane = 1;
+ // Initialize interp_filter here so we do not have to check for
+ // inter block modes in get_pred_context_switchable_interp()
+ mi->interp_filter = SWITCHABLE_FILTERS;
+ } else {
+ best_pred_sse = x->pred_sse[ref_frame];
+ }
+
+ rd_cost->rate = rate2;
+ rd_cost->dist = distortion2;
+ rd_cost->rdcost = this_rd;
+ best_rd = this_rd;
+ best_mbmode = *mi;
+ best_skip2 = this_skip2;
+ best_mode_skippable = skippable;
+
+ if (!x->select_tx_size) swap_block_ptr(x, ctx, 1, 0, 0, max_plane);
+ memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mi->tx_size],
+ sizeof(ctx->zcoeff_blk[0]) * ctx->num_4x4_blk);
+ ctx->sum_y_eobs = x->sum_y_eobs[mi->tx_size];
+
+ // TODO(debargha): enhance this test with a better distortion prediction
+ // based on qp, activity mask and history
+ if ((mode_search_skip_flags & FLAG_EARLY_TERMINATE) &&
+ (mode_index > MIN_EARLY_TERM_INDEX)) {
+ int qstep = xd->plane[0].dequant[1];
+ // TODO(debargha): Enhance this by specializing for each mode_index
+ int scale = 4;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ qstep >>= (xd->bd - 8);
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ if (x->source_variance < UINT_MAX) {
+ const int var_adjust = (x->source_variance < 16);
+ scale -= var_adjust;
+ }
+ if (ref_frame > INTRA_FRAME && distortion2 * scale < qstep * qstep) {
+ early_term = 1;
+ }
+ }
+ }
+ }
+
+ /* keep record of best compound/single-only prediction */
+ if (!disable_skip && ref_frame != INTRA_FRAME) {
+ int64_t single_rd, hybrid_rd, single_rate, hybrid_rate;
+
+ if (cm->reference_mode == REFERENCE_MODE_SELECT) {
+ single_rate = rate2 - compmode_cost;
+ hybrid_rate = rate2;
+ } else {
+ single_rate = rate2;
+ hybrid_rate = rate2 + compmode_cost;
+ }
+
+ single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
+ hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
+
+ if (!comp_pred) {
+ if (single_rd < best_pred_rd[SINGLE_REFERENCE])
+ best_pred_rd[SINGLE_REFERENCE] = single_rd;
+ } else {
+ if (single_rd < best_pred_rd[COMPOUND_REFERENCE])
+ best_pred_rd[COMPOUND_REFERENCE] = single_rd;
+ }
+ if (hybrid_rd < best_pred_rd[REFERENCE_MODE_SELECT])
+ best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
+
+ /* keep record of best filter type */
+ if (!mode_excluded && cm->interp_filter != BILINEAR) {
+ int64_t ref =
+ filter_cache[cm->interp_filter == SWITCHABLE ? SWITCHABLE_FILTERS
+ : cm->interp_filter];
+
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
+ int64_t adj_rd;
+ if (ref == INT64_MAX)
+ adj_rd = 0;
+ else if (filter_cache[i] == INT64_MAX)
+ // When early termination is triggered, the encoder does not have
+ // access to the rate-distortion cost. It only knows that the cost
+ // should be above the maximum valid value. Hence it takes the known
+ // maximum plus an arbitrary constant as the rate-distortion cost.
+ adj_rd = mask_filter - ref + 10;
+ else
+ adj_rd = filter_cache[i] - ref;
+
+ adj_rd += this_rd;
+ best_filter_rd[i] = VPXMIN(best_filter_rd[i], adj_rd);
+ }
+ }
+ }
+
+ if (early_term) break;
+
+ if (x->skip && !comp_pred) break;
+ }
+
+ // The inter modes' rate costs are not calculated precisely in some cases.
+ // Therefore, sometimes, NEWMV is chosen instead of NEARESTMV, NEARMV, and
+ // ZEROMV. Here, checks are added for those cases, and the mode decisions
+ // are corrected.
+ if (best_mbmode.mode == NEWMV) {
+ const MV_REFERENCE_FRAME refs[2] = { best_mbmode.ref_frame[0],
+ best_mbmode.ref_frame[1] };
+ int comp_pred_mode = refs[1] > INTRA_FRAME;
+
+ if (frame_mv[NEARESTMV][refs[0]].as_int == best_mbmode.mv[0].as_int &&
+ ((comp_pred_mode &&
+ frame_mv[NEARESTMV][refs[1]].as_int == best_mbmode.mv[1].as_int) ||
+ !comp_pred_mode))
+ best_mbmode.mode = NEARESTMV;
+ else if (frame_mv[NEARMV][refs[0]].as_int == best_mbmode.mv[0].as_int &&
+ ((comp_pred_mode &&
+ frame_mv[NEARMV][refs[1]].as_int == best_mbmode.mv[1].as_int) ||
+ !comp_pred_mode))
+ best_mbmode.mode = NEARMV;
+ else if (best_mbmode.mv[0].as_int == 0 &&
+ ((comp_pred_mode && best_mbmode.mv[1].as_int == 0) ||
+ !comp_pred_mode))
+ best_mbmode.mode = ZEROMV;
+ }
+
+ if (best_mode_index < 0 || best_rd >= best_rd_so_far) {
+ // If adaptive interp filter is enabled, then the current leaf node of 8x8
+ // data is needed for sub8x8. Hence preserve the context.
+ if (bsize == BLOCK_8X8) ctx->mic = *xd->mi[0];
+ rd_cost->rate = INT_MAX;
+ rd_cost->rdcost = INT64_MAX;
+ return;
+ }
+
+ // If we used an estimate for the uv intra rd in the loop above...
+ if (sf->use_uv_intra_rd_estimate) {
+ // Do Intra UV best rd mode selection if best mode choice above was intra.
+ if (best_mbmode.ref_frame[0] == INTRA_FRAME) {
+ TX_SIZE uv_tx_size;
+ *mi = best_mbmode;
+ uv_tx_size = get_uv_tx_size(mi, &xd->plane[1]);
+ rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv_intra[uv_tx_size],
+ &rate_uv_tokenonly[uv_tx_size],
+ &dist_uv[uv_tx_size], &skip_uv[uv_tx_size],
+ bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize,
+ uv_tx_size);
+ }
+ }
+
+ assert((cm->interp_filter == SWITCHABLE) ||
+ (cm->interp_filter == best_mbmode.interp_filter) ||
+ !is_inter_block(&best_mbmode));
+
+ if (!cpi->rc.is_src_frame_alt_ref)
+ vp9_update_rd_thresh_fact(tile_data->thresh_freq_fact,
+ sf->adaptive_rd_thresh, bsize, best_mode_index);
+
+ // macroblock modes
+ *mi = best_mbmode;
+ x->skip |= best_skip2;
+
+ for (i = 0; i < REFERENCE_MODES; ++i) {
+ if (best_pred_rd[i] == INT64_MAX)
+ best_pred_diff[i] = INT_MIN;
+ else
+ best_pred_diff[i] = best_rd - best_pred_rd[i];
+ }
+
+ if (!x->skip) {
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
+ if (best_filter_rd[i] == INT64_MAX)
+ best_filter_diff[i] = 0;
+ else
+ best_filter_diff[i] = best_rd - best_filter_rd[i];
+ }
+ if (cm->interp_filter == SWITCHABLE)
+ assert(best_filter_diff[SWITCHABLE_FILTERS] == 0);
+ } else {
+ vp9_zero(best_filter_diff);
+ }
+
+ // TODO(yunqingwang): Moving this line in front of the above best_filter_diff
+ // updating code causes PSNR loss. Need to figure out the conflict.
+ x->skip |= best_mode_skippable;
+
+ if (!x->skip && !x->select_tx_size) {
+ int has_high_freq_coeff = 0;
+ int plane;
+ int max_plane = is_inter_block(xd->mi[0]) ? MAX_MB_PLANE : 1;
+ for (plane = 0; plane < max_plane; ++plane) {
+ x->plane[plane].eobs = ctx->eobs_pbuf[plane][1];
+ has_high_freq_coeff |= vp9_has_high_freq_in_plane(x, bsize, plane);
+ }
+
+ for (plane = max_plane; plane < MAX_MB_PLANE; ++plane) {
+ x->plane[plane].eobs = ctx->eobs_pbuf[plane][2];
+ has_high_freq_coeff |= vp9_has_high_freq_in_plane(x, bsize, plane);
+ }
+
+ best_mode_skippable |= !has_high_freq_coeff;
+ }
+
+ assert(best_mode_index >= 0);
+
+ store_coding_context(x, ctx, best_mode_index, best_pred_diff,
+ best_filter_diff, best_mode_skippable);
+}
+
+void vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi, TileDataEnc *tile_data,
+ MACROBLOCK *x, RD_COST *rd_cost,
+ BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx,
+ int64_t best_rd_so_far) {
+ VP9_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MODE_INFO *const mi = xd->mi[0];
+ unsigned char segment_id = mi->segment_id;
+ const int comp_pred = 0;
+ int i;
+ int64_t best_pred_diff[REFERENCE_MODES];
+ int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
+ unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
+ vpx_prob comp_mode_p;
+ INTERP_FILTER best_filter = SWITCHABLE;
+ int64_t this_rd = INT64_MAX;
+ int rate2 = 0;
+ const int64_t distortion2 = 0;
+
+ x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
+
+ estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp,
+ &comp_mode_p);
+
+ for (i = 0; i < MAX_REF_FRAMES; ++i) x->pred_sse[i] = INT_MAX;
+ for (i = LAST_FRAME; i < MAX_REF_FRAMES; ++i) x->pred_mv_sad[i] = INT_MAX;
+
+ rd_cost->rate = INT_MAX;
+
+ assert(segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP));
+
+ mi->mode = ZEROMV;
+ mi->uv_mode = DC_PRED;
+ mi->ref_frame[0] = LAST_FRAME;
+ mi->ref_frame[1] = NONE;
+ mi->mv[0].as_int = 0;
+ x->skip = 1;
+
+ ctx->sum_y_eobs = 0;
+
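+ // The block is coded as skipped, but a filter must still be signalled when
+ // the frame-level filter is switchable; pick the one cheapest to code.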
+ if (cm->interp_filter != BILINEAR) {
+ best_filter = EIGHTTAP;
+ if (cm->interp_filter == SWITCHABLE &&
+ x->source_variance >= cpi->sf.disable_filter_search_var_thresh) {
+ int rs;
+ int best_rs = INT_MAX;
+ for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
+ mi->interp_filter = i;
+ rs = vp9_get_switchable_rate(cpi, xd);
+ if (rs < best_rs) {
+ best_rs = rs;
+ best_filter = mi->interp_filter;
+ }
+ }
+ }
+ }
+ // Set the appropriate filter
+ if (cm->interp_filter == SWITCHABLE) {
+ mi->interp_filter = best_filter;
+ rate2 += vp9_get_switchable_rate(cpi, xd);
+ } else {
+ mi->interp_filter = cm->interp_filter;
+ }
+
+ if (cm->reference_mode == REFERENCE_MODE_SELECT)
+ rate2 += vp9_cost_bit(comp_mode_p, comp_pred);
+
+ // Estimate the reference frame signaling cost and add it
+ // to the rolling cost variable.
+ rate2 += ref_costs_single[LAST_FRAME];
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+
+ rd_cost->rate = rate2;
+ rd_cost->dist = distortion2;
+ rd_cost->rdcost = this_rd;
+
+ if (this_rd >= best_rd_so_far) {
+ rd_cost->rate = INT_MAX;
+ rd_cost->rdcost = INT64_MAX;
+ return;
+ }
+
+ assert((cm->interp_filter == SWITCHABLE) ||
+ (cm->interp_filter == mi->interp_filter));
+
+ vp9_update_rd_thresh_fact(tile_data->thresh_freq_fact,
+ cpi->sf.adaptive_rd_thresh, bsize, THR_ZEROMV);
+
+ vp9_zero(best_pred_diff);
+ vp9_zero(best_filter_diff);
+
+ if (!x->select_tx_size) swap_block_ptr(x, ctx, 1, 0, 0, MAX_MB_PLANE);
+ store_coding_context(x, ctx, THR_ZEROMV, best_pred_diff, best_filter_diff, 0);
+}
+
+void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data,
+ MACROBLOCK *x, int mi_row, int mi_col,
+ RD_COST *rd_cost, BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx,
+ int64_t best_rd_so_far) {
+ VP9_COMMON *const cm = &cpi->common;
+ RD_OPT *const rd_opt = &cpi->rd;
+ SPEED_FEATURES *const sf = &cpi->sf;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MODE_INFO *const mi = xd->mi[0];
+ const struct segmentation *const seg = &cm->seg;
+ MV_REFERENCE_FRAME ref_frame, second_ref_frame;
+ unsigned char segment_id = mi->segment_id;
+ int comp_pred, i;
+ int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
+ struct buf_2d yv12_mb[4][MAX_MB_PLANE];
+ int64_t best_rd = best_rd_so_far;
+ int64_t best_yrd = best_rd_so_far; // FIXME(rbultje) more precise
+ int64_t best_pred_diff[REFERENCE_MODES];
+ int64_t best_pred_rd[REFERENCE_MODES];
+ int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS];
+ int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
+ MODE_INFO best_mbmode;
+ int ref_index, best_ref_index = 0;
+ unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
+ vpx_prob comp_mode_p;
+ INTERP_FILTER tmp_best_filter = SWITCHABLE;
+ int rate_uv_intra, rate_uv_tokenonly;
+ int64_t dist_uv;
+ int skip_uv;
+ PREDICTION_MODE mode_uv = DC_PRED;
+ const int intra_cost_penalty =
+ vp9_get_intra_cost_penalty(cpi, bsize, cm->base_qindex, cm->y_dc_delta_q);
+ int_mv seg_mvs[4][MAX_REF_FRAMES];
+ b_mode_info best_bmodes[4];
+ int best_skip2 = 0;
+ int ref_frame_skip_mask[2] = { 0 };
+ int64_t mask_filter = 0;
+ int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS];
+ int internal_active_edge =
+ vp9_active_edge_sb(cpi, mi_row, mi_col) && vp9_internal_image_edge(cpi);
+ const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize];
+
+ x->skip_encode = sf->skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
+ memset(x->zcoeff_blk[TX_4X4], 0, 4);
+ vp9_zero(best_mbmode);
+
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) filter_cache[i] = INT64_MAX;
+
+ for (i = 0; i < 4; i++) {
+ int j;
+ for (j = 0; j < MAX_REF_FRAMES; j++) seg_mvs[i][j].as_int = INVALID_MV;
+ }
+
+ estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp,
+ &comp_mode_p);
+
+ for (i = 0; i < REFERENCE_MODES; ++i) best_pred_rd[i] = INT64_MAX;
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
+ best_filter_rd[i] = INT64_MAX;
+ rate_uv_intra = INT_MAX;
+
+ rd_cost->rate = INT_MAX;
+
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
+ if (cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) {
+ setup_buffer_inter(cpi, x, ref_frame, bsize, mi_row, mi_col,
+ frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb);
+ } else {
+ ref_frame_skip_mask[0] |= (1 << ref_frame);
+ ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+ }
+ frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
+ frame_mv[ZEROMV][ref_frame].as_int = 0;
+ }
+
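+  // Each ref_index selects a (ref_frame, second_ref_frame) pair from
+  // vp9_ref_order, including the intra entry.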
+ for (ref_index = 0; ref_index < MAX_REFS; ++ref_index) {
+ int mode_excluded = 0;
+ int64_t this_rd = INT64_MAX;
+ int disable_skip = 0;
+ int compmode_cost = 0;
+ int rate2 = 0, rate_y = 0, rate_uv = 0;
+ int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0;
+ int skippable = 0;
+ int this_skip2 = 0;
+ int64_t total_sse = INT_MAX;
+ int early_term = 0;
+ struct buf_2d backup_yv12[2][MAX_MB_PLANE];
+
+ ref_frame = vp9_ref_order[ref_index].ref_frame[0];
+ second_ref_frame = vp9_ref_order[ref_index].ref_frame[1];
+
+ vp9_zero(x->sum_y_eobs);
+
+#if CONFIG_BETTER_HW_COMPATIBILITY
+ // forbid 8X4 and 4X8 partitions if any reference frame is scaled.
+ if (bsize == BLOCK_8X4 || bsize == BLOCK_4X8) {
+ int ref_scaled = ref_frame > INTRA_FRAME &&
+ vp9_is_scaled(&cm->frame_refs[ref_frame - 1].sf);
+ if (second_ref_frame > INTRA_FRAME)
+ ref_scaled += vp9_is_scaled(&cm->frame_refs[second_ref_frame - 1].sf);
+ if (ref_scaled) continue;
+ }
+#endif
+ // Look at the reference frame of the best mode so far and set the
+ // skip mask to look at a subset of the remaining modes.
+ if (ref_index > 2 && sf->mode_skip_start < MAX_MODES) {
+ if (ref_index == 3) {
+ switch (best_mbmode.ref_frame[0]) {
+ case INTRA_FRAME: break;
+ case LAST_FRAME:
+ ref_frame_skip_mask[0] |= (1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME);
+ ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+ break;
+ case GOLDEN_FRAME:
+ ref_frame_skip_mask[0] |= (1 << LAST_FRAME) | (1 << ALTREF_FRAME);
+ ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+ break;
+ case ALTREF_FRAME:
+ ref_frame_skip_mask[0] |= (1 << GOLDEN_FRAME) | (1 << LAST_FRAME);
+ break;
+ case NONE:
+ case MAX_REF_FRAMES: assert(0 && "Invalid Reference frame"); break;
+ }
+ }
+ }
+
+ if ((ref_frame_skip_mask[0] & (1 << ref_frame)) &&
+ (ref_frame_skip_mask[1] & (1 << VPXMAX(0, second_ref_frame))))
+ continue;
+
+ // Test best rd so far against threshold for trying this mode.
+ if (!internal_active_edge &&
+ rd_less_than_thresh(best_rd,
+ rd_opt->threshes[segment_id][bsize][ref_index],
+ &rd_thresh_freq_fact[ref_index]))
+ continue;
+
+ // This is only used in motion vector unit test.
+ if (cpi->oxcf.motion_vector_unit_test && ref_frame == INTRA_FRAME) continue;
+
+ comp_pred = second_ref_frame > INTRA_FRAME;
+ if (comp_pred) {
+ if (!cpi->allow_comp_inter_inter) continue;
+
+ if (cm->ref_frame_sign_bias[ref_frame] ==
+ cm->ref_frame_sign_bias[second_ref_frame])
+ continue;
+
+ if (!(cpi->ref_frame_flags & ref_frame_to_flag(second_ref_frame)))
+ continue;
+      // Do not allow compound prediction if the segment level reference frame
+      // feature is in use, as in this case there can only be one reference.
+ if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) continue;
+
+ if ((sf->mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) &&
+ best_mbmode.ref_frame[0] == INTRA_FRAME)
+ continue;
+ }
+
+ if (comp_pred)
+ mode_excluded = cm->reference_mode == SINGLE_REFERENCE;
+ else if (ref_frame != INTRA_FRAME)
+ mode_excluded = cm->reference_mode == COMPOUND_REFERENCE;
+
+    // If the segment level reference frame feature is enabled, skip any mode
+    // whose reference frame is not the one selected for this segment.
+ if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
+ get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) {
+ continue;
+ // Disable this drop out case if the ref frame
+ // segment level feature is enabled for this segment. This is to
+ // prevent the possibility that we end up unable to pick any mode.
+ } else if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
+ // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
+ // unless ARNR filtering is enabled in which case we want
+ // an unfiltered alternative. We allow near/nearest as well
+ // because they may result in zero-zero MVs but be cheaper.
+ if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0))
+ continue;
+ }
+
+ mi->tx_size = TX_4X4;
+ mi->uv_mode = DC_PRED;
+ mi->ref_frame[0] = ref_frame;
+ mi->ref_frame[1] = second_ref_frame;
+ // Evaluate all sub-pel filters irrespective of whether we can use
+ // them for this frame.
+ mi->interp_filter =
+ cm->interp_filter == SWITCHABLE ? EIGHTTAP : cm->interp_filter;
+ x->skip = 0;
+ set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
+
+ // Select prediction reference frames.
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
+ if (comp_pred) xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
+ }
+
+ if (ref_frame == INTRA_FRAME) {
+ int rate;
+ if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate, &rate_y, &distortion_y,
+ best_rd) >= best_rd)
+ continue;
+ rate2 += rate;
+ rate2 += intra_cost_penalty;
+ distortion2 += distortion_y;
+
+ if (rate_uv_intra == INT_MAX) {
+ choose_intra_uv_mode(cpi, x, ctx, bsize, TX_4X4, &rate_uv_intra,
+ &rate_uv_tokenonly, &dist_uv, &skip_uv, &mode_uv);
+ }
+ rate2 += rate_uv_intra;
+ rate_uv = rate_uv_tokenonly;
+ distortion2 += dist_uv;
+ distortion_uv = dist_uv;
+ mi->uv_mode = mode_uv;
+ } else {
+ int rate;
+ int64_t distortion;
+ int64_t this_rd_thresh;
+ int64_t tmp_rd, tmp_best_rd = INT64_MAX, tmp_best_rdu = INT64_MAX;
+ int tmp_best_rate = INT_MAX, tmp_best_ratey = INT_MAX;
+ int64_t tmp_best_distortion = INT_MAX, tmp_best_sse, uv_sse;
+ int tmp_best_skippable = 0;
+ int switchable_filter_index;
+ int_mv *second_ref =
+ comp_pred ? &x->mbmi_ext->ref_mvs[second_ref_frame][0] : NULL;
+ b_mode_info tmp_best_bmodes[16];
+ MODE_INFO tmp_best_mbmode;
+ BEST_SEG_INFO bsi[SWITCHABLE_FILTERS];
+ int pred_exists = 0;
+ int uv_skippable;
+
+ YV12_BUFFER_CONFIG *scaled_ref_frame[2] = { NULL, NULL };
+ int ref;
+
+ for (ref = 0; ref < 2; ++ref) {
+ scaled_ref_frame[ref] =
+ mi->ref_frame[ref] > INTRA_FRAME
+ ? vp9_get_scaled_ref_frame(cpi, mi->ref_frame[ref])
+ : NULL;
+
+ if (scaled_ref_frame[ref]) {
+ // Swap out the reference frame for a version that's been scaled to
+ // match the resolution of the current frame, allowing the existing
+ // motion search code to be used without additional modifications.
+ for (i = 0; i < MAX_MB_PLANE; i++)
+ backup_yv12[ref][i] = xd->plane[i].pre[ref];
+ vp9_setup_pre_planes(xd, ref, scaled_ref_frame[ref], mi_row, mi_col,
+ NULL);
+ }
+ }
+
+ this_rd_thresh = (ref_frame == LAST_FRAME)
+ ? rd_opt->threshes[segment_id][bsize][THR_LAST]
+ : rd_opt->threshes[segment_id][bsize][THR_ALTR];
+ this_rd_thresh = (ref_frame == GOLDEN_FRAME)
+ ? rd_opt->threshes[segment_id][bsize][THR_GOLD]
+ : this_rd_thresh;
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
+ filter_cache[i] = INT64_MAX;
+
+ if (cm->interp_filter != BILINEAR) {
+ tmp_best_filter = EIGHTTAP;
+ if (x->source_variance < sf->disable_filter_search_var_thresh) {
+ tmp_best_filter = EIGHTTAP;
+ } else if (sf->adaptive_pred_interp_filter == 1 &&
+ ctx->pred_interp_filter < SWITCHABLE) {
+ tmp_best_filter = ctx->pred_interp_filter;
+ } else if (sf->adaptive_pred_interp_filter == 2) {
+ tmp_best_filter = ctx->pred_interp_filter < SWITCHABLE
+ ? ctx->pred_interp_filter
+ : 0;
+ } else {
+ for (switchable_filter_index = 0;
+ switchable_filter_index < SWITCHABLE_FILTERS;
+ ++switchable_filter_index) {
+ int newbest, rs;
+ int64_t rs_rd;
+ MB_MODE_INFO_EXT *mbmi_ext = x->mbmi_ext;
+ mi->interp_filter = switchable_filter_index;
+ tmp_rd = rd_pick_best_sub8x8_mode(
+ cpi, x, &mbmi_ext->ref_mvs[ref_frame][0], second_ref, best_yrd,
+ &rate, &rate_y, &distortion, &skippable, &total_sse,
+ (int)this_rd_thresh, seg_mvs, bsi, switchable_filter_index,
+ mi_row, mi_col);
+
+ if (tmp_rd == INT64_MAX) continue;
+ rs = vp9_get_switchable_rate(cpi, xd);
+ rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
+ filter_cache[switchable_filter_index] = tmp_rd;
+ filter_cache[SWITCHABLE_FILTERS] =
+ VPXMIN(filter_cache[SWITCHABLE_FILTERS], tmp_rd + rs_rd);
+ if (cm->interp_filter == SWITCHABLE) tmp_rd += rs_rd;
+
+ mask_filter = VPXMAX(mask_filter, tmp_rd);
+
+ newbest = (tmp_rd < tmp_best_rd);
+ if (newbest) {
+ tmp_best_filter = mi->interp_filter;
+ tmp_best_rd = tmp_rd;
+ }
+ if ((newbest && cm->interp_filter == SWITCHABLE) ||
+ (mi->interp_filter == cm->interp_filter &&
+ cm->interp_filter != SWITCHABLE)) {
+ tmp_best_rdu = tmp_rd;
+ tmp_best_rate = rate;
+ tmp_best_ratey = rate_y;
+ tmp_best_distortion = distortion;
+ tmp_best_sse = total_sse;
+ tmp_best_skippable = skippable;
+ tmp_best_mbmode = *mi;
+ x->sum_y_eobs[TX_4X4] = 0;
+ for (i = 0; i < 4; i++) {
+ tmp_best_bmodes[i] = xd->mi[0]->bmi[i];
+ x->zcoeff_blk[TX_4X4][i] = !x->plane[0].eobs[i];
+ x->sum_y_eobs[TX_4X4] += x->plane[0].eobs[i];
+ }
+ pred_exists = 1;
+ if (switchable_filter_index == 0 && sf->use_rd_breakout &&
+ best_rd < INT64_MAX) {
+ if (tmp_best_rdu / 2 > best_rd) {
+ // skip searching the other filters if the first is
+ // already substantially larger than the best so far
+ tmp_best_filter = mi->interp_filter;
+ tmp_best_rdu = INT64_MAX;
+ break;
+ }
+ }
+ }
+ } // switchable_filter_index loop
+ }
+ }
+
+ if (tmp_best_rdu == INT64_MAX && pred_exists) continue;
+
+ mi->interp_filter = (cm->interp_filter == SWITCHABLE ? tmp_best_filter
+ : cm->interp_filter);
+ if (!pred_exists) {
+ // Handles the special case when a filter that is not in the
+ // switchable list (bilinear, 6-tap) is indicated at the frame level
+ tmp_rd = rd_pick_best_sub8x8_mode(
+ cpi, x, &x->mbmi_ext->ref_mvs[ref_frame][0], second_ref, best_yrd,
+ &rate, &rate_y, &distortion, &skippable, &total_sse,
+ (int)this_rd_thresh, seg_mvs, bsi, 0, mi_row, mi_col);
+ if (tmp_rd == INT64_MAX) continue;
+ x->sum_y_eobs[TX_4X4] = 0;
+ for (i = 0; i < 4; i++) {
+ x->zcoeff_blk[TX_4X4][i] = !x->plane[0].eobs[i];
+ x->sum_y_eobs[TX_4X4] += x->plane[0].eobs[i];
+ }
+ } else {
+ total_sse = tmp_best_sse;
+ rate = tmp_best_rate;
+ rate_y = tmp_best_ratey;
+ distortion = tmp_best_distortion;
+ skippable = tmp_best_skippable;
+ *mi = tmp_best_mbmode;
+ for (i = 0; i < 4; i++) xd->mi[0]->bmi[i] = tmp_best_bmodes[i];
+ }
+
+ rate2 += rate;
+ distortion2 += distortion;
+
+ if (cm->interp_filter == SWITCHABLE)
+ rate2 += vp9_get_switchable_rate(cpi, xd);
+
+ if (!mode_excluded)
+ mode_excluded = comp_pred ? cm->reference_mode == SINGLE_REFERENCE
+ : cm->reference_mode == COMPOUND_REFERENCE;
+
+ compmode_cost = vp9_cost_bit(comp_mode_p, comp_pred);
+
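+      // RD budget left for the chroma planes: the best RD so far minus the
+      // cheaper of this mode's luma cost and its skip (sse only) cost.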
+ tmp_best_rdu =
+ best_rd - VPXMIN(RDCOST(x->rdmult, x->rddiv, rate2, distortion2),
+ RDCOST(x->rdmult, x->rddiv, 0, total_sse));
+
+ if (tmp_best_rdu > 0) {
+        // If even the 'Y' rd value of split is higher than the best so far,
+        // don't bother looking at UV.
+ vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col, BLOCK_8X8);
+ memset(x->skip_txfm, SKIP_TXFM_NONE, sizeof(x->skip_txfm));
+ if (!super_block_uvrd(cpi, x, &rate_uv, &distortion_uv, &uv_skippable,
+ &uv_sse, BLOCK_8X8, tmp_best_rdu)) {
+ for (ref = 0; ref < 2; ++ref) {
+ if (scaled_ref_frame[ref]) {
+ for (i = 0; i < MAX_MB_PLANE; ++i)
+ xd->plane[i].pre[ref] = backup_yv12[ref][i];
+ }
+ }
+ continue;
+ }
+
+ rate2 += rate_uv;
+ distortion2 += distortion_uv;
+ skippable = skippable && uv_skippable;
+ total_sse += uv_sse;
+ }
+
+ for (ref = 0; ref < 2; ++ref) {
+ if (scaled_ref_frame[ref]) {
+ // Restore the prediction frame pointers to their unscaled versions.
+ for (i = 0; i < MAX_MB_PLANE; ++i)
+ xd->plane[i].pre[ref] = backup_yv12[ref][i];
+ }
+ }
+ }
+
+ if (cm->reference_mode == REFERENCE_MODE_SELECT) rate2 += compmode_cost;
+
+ // Estimate the reference frame signaling cost and add it
+ // to the rolling cost variable.
+ if (second_ref_frame > INTRA_FRAME) {
+ rate2 += ref_costs_comp[ref_frame];
+ } else {
+ rate2 += ref_costs_single[ref_frame];
+ }
+
+ if (!disable_skip) {
+ const vpx_prob skip_prob = vp9_get_skip_prob(cm, xd);
+ const int skip_cost0 = vp9_cost_bit(skip_prob, 0);
+ const int skip_cost1 = vp9_cost_bit(skip_prob, 1);
+
+      // Skip is never coded at the segment level for sub8x8 blocks and is
+      // instead always coded in the bitstream at the mode info level.
+ if (ref_frame != INTRA_FRAME && !xd->lossless) {
+ if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv + skip_cost0,
+ distortion2) <
+ RDCOST(x->rdmult, x->rddiv, skip_cost1, total_sse)) {
+ // Add in the cost of the no skip flag.
+ rate2 += skip_cost0;
+ } else {
+ // FIXME(rbultje) make this work for splitmv also
+ rate2 += skip_cost1;
+ distortion2 = total_sse;
+ assert(total_sse >= 0);
+ rate2 -= (rate_y + rate_uv);
+ rate_y = 0;
+ rate_uv = 0;
+ this_skip2 = 1;
+ }
+ } else {
+ // Add in the cost of the no skip flag.
+ rate2 += skip_cost0;
+ }
+
+ // Calculate the final RD estimate for this mode.
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+ }
+
+ if (!disable_skip && ref_frame == INTRA_FRAME) {
+ for (i = 0; i < REFERENCE_MODES; ++i)
+ best_pred_rd[i] = VPXMIN(best_pred_rd[i], this_rd);
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
+ best_filter_rd[i] = VPXMIN(best_filter_rd[i], this_rd);
+ }
+
+    // Did this mode help, i.e. is it the new best mode so far?
+ if (this_rd < best_rd || x->skip) {
+ if (!mode_excluded) {
+ int max_plane = MAX_MB_PLANE;
+ // Note index of best mode so far
+ best_ref_index = ref_index;
+
+ if (ref_frame == INTRA_FRAME) {
+ /* required for left and above block mv */
+ mi->mv[0].as_int = 0;
+ max_plane = 1;
+ // Initialize interp_filter here so we do not have to check for
+ // inter block modes in get_pred_context_switchable_interp()
+ mi->interp_filter = SWITCHABLE_FILTERS;
+ }
+
+ rd_cost->rate = rate2;
+ rd_cost->dist = distortion2;
+ rd_cost->rdcost = this_rd;
+ best_rd = this_rd;
+ best_yrd =
+ best_rd - RDCOST(x->rdmult, x->rddiv, rate_uv, distortion_uv);
+ best_mbmode = *mi;
+ best_skip2 = this_skip2;
+ if (!x->select_tx_size) swap_block_ptr(x, ctx, 1, 0, 0, max_plane);
+ memcpy(ctx->zcoeff_blk, x->zcoeff_blk[TX_4X4],
+ sizeof(ctx->zcoeff_blk[0]) * ctx->num_4x4_blk);
+ ctx->sum_y_eobs = x->sum_y_eobs[TX_4X4];
+
+ for (i = 0; i < 4; i++) best_bmodes[i] = xd->mi[0]->bmi[i];
+
+ // TODO(debargha): enhance this test with a better distortion prediction
+ // based on qp, activity mask and history
+ if ((sf->mode_search_skip_flags & FLAG_EARLY_TERMINATE) &&
+ (ref_index > MIN_EARLY_TERM_INDEX)) {
+ int qstep = xd->plane[0].dequant[1];
+ // TODO(debargha): Enhance this by specializing for each mode_index
+ int scale = 4;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ qstep >>= (xd->bd - 8);
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ if (x->source_variance < UINT_MAX) {
+ const int var_adjust = (x->source_variance < 16);
+ scale -= var_adjust;
+ }
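+          // Terminate the search early if the scaled distortion is already
+          // below the squared quantizer step, i.e. within quantization noise.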
+ if (ref_frame > INTRA_FRAME && distortion2 * scale < qstep * qstep) {
+ early_term = 1;
+ }
+ }
+ }
+ }
+
+ /* keep record of best compound/single-only prediction */
+ if (!disable_skip && ref_frame != INTRA_FRAME) {
+ int64_t single_rd, hybrid_rd, single_rate, hybrid_rate;
+
+ if (cm->reference_mode == REFERENCE_MODE_SELECT) {
+ single_rate = rate2 - compmode_cost;
+ hybrid_rate = rate2;
+ } else {
+ single_rate = rate2;
+ hybrid_rate = rate2 + compmode_cost;
+ }
+
+ single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
+ hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
+
+ if (!comp_pred && single_rd < best_pred_rd[SINGLE_REFERENCE])
+ best_pred_rd[SINGLE_REFERENCE] = single_rd;
+ else if (comp_pred && single_rd < best_pred_rd[COMPOUND_REFERENCE])
+ best_pred_rd[COMPOUND_REFERENCE] = single_rd;
+
+ if (hybrid_rd < best_pred_rd[REFERENCE_MODE_SELECT])
+ best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
+ }
+
+ /* keep record of best filter type */
+ if (!mode_excluded && !disable_skip && ref_frame != INTRA_FRAME &&
+ cm->interp_filter != BILINEAR) {
+ int64_t ref =
+ filter_cache[cm->interp_filter == SWITCHABLE ? SWITCHABLE_FILTERS
+ : cm->interp_filter];
+ int64_t adj_rd;
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
+ if (ref == INT64_MAX)
+ adj_rd = 0;
+ else if (filter_cache[i] == INT64_MAX)
+ // when early termination is triggered, the encoder does not have
+ // access to the rate-distortion cost. it only knows that the cost
+ // should be above the maximum valid value. hence it takes the known
+ // maximum plus an arbitrary constant as the rate-distortion cost.
+ adj_rd = mask_filter - ref + 10;
+ else
+ adj_rd = filter_cache[i] - ref;
+
+ adj_rd += this_rd;
+ best_filter_rd[i] = VPXMIN(best_filter_rd[i], adj_rd);
+ }
+ }
+
+ if (early_term) break;
+
+ if (x->skip && !comp_pred) break;
+ }
+
+ if (best_rd >= best_rd_so_far) {
+ rd_cost->rate = INT_MAX;
+ rd_cost->rdcost = INT64_MAX;
+ return;
+ }
+
+ // If we used an estimate for the uv intra rd in the loop above...
+ if (sf->use_uv_intra_rd_estimate) {
+ // Do Intra UV best rd mode selection if best mode choice above was intra.
+ if (best_mbmode.ref_frame[0] == INTRA_FRAME) {
+ *mi = best_mbmode;
+ rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv_intra, &rate_uv_tokenonly,
+ &dist_uv, &skip_uv, BLOCK_8X8, TX_4X4);
+ }
+ }
+
+ if (best_rd == INT64_MAX) {
+ rd_cost->rate = INT_MAX;
+ rd_cost->dist = INT64_MAX;
+ rd_cost->rdcost = INT64_MAX;
+ return;
+ }
+
+ assert((cm->interp_filter == SWITCHABLE) ||
+ (cm->interp_filter == best_mbmode.interp_filter) ||
+ !is_inter_block(&best_mbmode));
+
+ vp9_update_rd_thresh_fact(tile_data->thresh_freq_fact, sf->adaptive_rd_thresh,
+ bsize, best_ref_index);
+
+ // macroblock modes
+ *mi = best_mbmode;
+ x->skip |= best_skip2;
+ if (!is_inter_block(&best_mbmode)) {
+ for (i = 0; i < 4; i++) xd->mi[0]->bmi[i].as_mode = best_bmodes[i].as_mode;
+ } else {
+ for (i = 0; i < 4; ++i)
+ memcpy(&xd->mi[0]->bmi[i], &best_bmodes[i], sizeof(b_mode_info));
+
+ mi->mv[0].as_int = xd->mi[0]->bmi[3].as_mv[0].as_int;
+ mi->mv[1].as_int = xd->mi[0]->bmi[3].as_mv[1].as_int;
+ }
+ // If the second reference does not exist, set the corresponding mv to zero.
+ if (mi->ref_frame[1] == NONE) {
+ mi->mv[1].as_int = 0;
+ for (i = 0; i < 4; ++i) {
+ mi->bmi[i].as_mv[1].as_int = 0;
+ }
+ }
+
+ for (i = 0; i < REFERENCE_MODES; ++i) {
+ if (best_pred_rd[i] == INT64_MAX)
+ best_pred_diff[i] = INT_MIN;
+ else
+ best_pred_diff[i] = best_rd - best_pred_rd[i];
+ }
+
+ if (!x->skip) {
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
+ if (best_filter_rd[i] == INT64_MAX)
+ best_filter_diff[i] = 0;
+ else
+ best_filter_diff[i] = best_rd - best_filter_rd[i];
+ }
+ if (cm->interp_filter == SWITCHABLE)
+ assert(best_filter_diff[SWITCHABLE_FILTERS] == 0);
+ } else {
+ vp9_zero(best_filter_diff);
+ }
+
+ store_coding_context(x, ctx, best_ref_index, best_pred_diff, best_filter_diff,
+ 0);
+}
+#endif // !CONFIG_REALTIME_ONLY
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.h b/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.h
new file mode 100644
index 0000000000..e1147ff943
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_VP9_RDOPT_H_
+#define VPX_VP9_ENCODER_VP9_RDOPT_H_
+
+#include "vp9/common/vp9_blockd.h"
+
+#include "vp9/encoder/vp9_block.h"
+#include "vp9/encoder/vp9_context_tree.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct TileInfo;
+struct VP9_COMP;
+struct macroblock;
+struct RD_COST;
+
+void vp9_rd_pick_intra_mode_sb(struct VP9_COMP *cpi, struct macroblock *x,
+ struct RD_COST *rd_cost, BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx, int64_t best_rd);
+
+#if !CONFIG_REALTIME_ONLY
+void vp9_rd_pick_inter_mode_sb(struct VP9_COMP *cpi,
+ struct TileDataEnc *tile_data,
+ struct macroblock *x, int mi_row, int mi_col,
+ struct RD_COST *rd_cost, BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far);
+
+void vp9_rd_pick_inter_mode_sb_seg_skip(
+ struct VP9_COMP *cpi, struct TileDataEnc *tile_data, struct macroblock *x,
+ struct RD_COST *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+ int64_t best_rd_so_far);
+#endif
+
+int vp9_internal_image_edge(struct VP9_COMP *cpi);
+int vp9_active_h_edge(struct VP9_COMP *cpi, int mi_row, int mi_step);
+int vp9_active_v_edge(struct VP9_COMP *cpi, int mi_col, int mi_step);
+int vp9_active_edge_sb(struct VP9_COMP *cpi, int mi_row, int mi_col);
+
+#if !CONFIG_REALTIME_ONLY
+void vp9_rd_pick_inter_mode_sub8x8(struct VP9_COMP *cpi,
+ struct TileDataEnc *tile_data,
+ struct macroblock *x, int mi_row, int mi_col,
+ struct RD_COST *rd_cost, BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx,
+ int64_t best_rd_so_far);
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP9_ENCODER_VP9_RDOPT_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_resize.c b/media/libvpx/libvpx/vp9/encoder/vp9_resize.c
new file mode 100644
index 0000000000..7486dee25b
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_resize.c
@@ -0,0 +1,826 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "./vpx_config.h"
+#if CONFIG_VP9_HIGHBITDEPTH
+#include "vpx_dsp/vpx_dsp_common.h"
+#endif // CONFIG_VP9_HIGHBITDEPTH
+#include "vpx_ports/mem.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/encoder/vp9_resize.h"
+
+#define FILTER_BITS 7
+
+#define INTERP_TAPS 8
+#define SUBPEL_BITS 5
+#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
+#define INTERP_PRECISION_BITS 32
+
+typedef int16_t interp_kernel[INTERP_TAPS];
+
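+// Each 8-tap kernel below sums to 128 (1 << FILTER_BITS), so filtering does
+// not change the overall signal level.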
+// Filters for interpolation (0.5-band) - note this also filters integer pels.
+static const interp_kernel filteredinterp_filters500[(1 << SUBPEL_BITS)] = {
+ { -3, 0, 35, 64, 35, 0, -3, 0 }, { -3, -1, 34, 64, 36, 1, -3, 0 },
+ { -3, -1, 32, 64, 38, 1, -3, 0 }, { -2, -2, 31, 63, 39, 2, -3, 0 },
+ { -2, -2, 29, 63, 41, 2, -3, 0 }, { -2, -2, 28, 63, 42, 3, -4, 0 },
+ { -2, -3, 27, 63, 43, 4, -4, 0 }, { -2, -3, 25, 62, 45, 5, -4, 0 },
+ { -2, -3, 24, 62, 46, 5, -4, 0 }, { -2, -3, 23, 61, 47, 6, -4, 0 },
+ { -2, -3, 21, 60, 49, 7, -4, 0 }, { -1, -4, 20, 60, 50, 8, -4, -1 },
+ { -1, -4, 19, 59, 51, 9, -4, -1 }, { -1, -4, 17, 58, 52, 10, -4, 0 },
+ { -1, -4, 16, 57, 53, 12, -4, -1 }, { -1, -4, 15, 56, 54, 13, -4, -1 },
+ { -1, -4, 14, 55, 55, 14, -4, -1 }, { -1, -4, 13, 54, 56, 15, -4, -1 },
+ { -1, -4, 12, 53, 57, 16, -4, -1 }, { 0, -4, 10, 52, 58, 17, -4, -1 },
+ { -1, -4, 9, 51, 59, 19, -4, -1 }, { -1, -4, 8, 50, 60, 20, -4, -1 },
+ { 0, -4, 7, 49, 60, 21, -3, -2 }, { 0, -4, 6, 47, 61, 23, -3, -2 },
+ { 0, -4, 5, 46, 62, 24, -3, -2 }, { 0, -4, 5, 45, 62, 25, -3, -2 },
+ { 0, -4, 4, 43, 63, 27, -3, -2 }, { 0, -4, 3, 42, 63, 28, -2, -2 },
+ { 0, -3, 2, 41, 63, 29, -2, -2 }, { 0, -3, 2, 39, 63, 31, -2, -2 },
+ { 0, -3, 1, 38, 64, 32, -1, -3 }, { 0, -3, 1, 36, 64, 34, -1, -3 }
+};
+
+// Filters for interpolation (0.625-band) - note this also filters integer pels.
+static const interp_kernel filteredinterp_filters625[(1 << SUBPEL_BITS)] = {
+ { -1, -8, 33, 80, 33, -8, -1, 0 }, { -1, -8, 30, 80, 35, -8, -1, 1 },
+ { -1, -8, 28, 80, 37, -7, -2, 1 }, { 0, -8, 26, 79, 39, -7, -2, 1 },
+ { 0, -8, 24, 79, 41, -7, -2, 1 }, { 0, -8, 22, 78, 43, -6, -2, 1 },
+ { 0, -8, 20, 78, 45, -5, -3, 1 }, { 0, -8, 18, 77, 48, -5, -3, 1 },
+ { 0, -8, 16, 76, 50, -4, -3, 1 }, { 0, -8, 15, 75, 52, -3, -4, 1 },
+ { 0, -7, 13, 74, 54, -3, -4, 1 }, { 0, -7, 11, 73, 56, -2, -4, 1 },
+ { 0, -7, 10, 71, 58, -1, -4, 1 }, { 1, -7, 8, 70, 60, 0, -5, 1 },
+ { 1, -6, 6, 68, 62, 1, -5, 1 }, { 1, -6, 5, 67, 63, 2, -5, 1 },
+ { 1, -6, 4, 65, 65, 4, -6, 1 }, { 1, -5, 2, 63, 67, 5, -6, 1 },
+ { 1, -5, 1, 62, 68, 6, -6, 1 }, { 1, -5, 0, 60, 70, 8, -7, 1 },
+ { 1, -4, -1, 58, 71, 10, -7, 0 }, { 1, -4, -2, 56, 73, 11, -7, 0 },
+ { 1, -4, -3, 54, 74, 13, -7, 0 }, { 1, -4, -3, 52, 75, 15, -8, 0 },
+ { 1, -3, -4, 50, 76, 16, -8, 0 }, { 1, -3, -5, 48, 77, 18, -8, 0 },
+ { 1, -3, -5, 45, 78, 20, -8, 0 }, { 1, -2, -6, 43, 78, 22, -8, 0 },
+ { 1, -2, -7, 41, 79, 24, -8, 0 }, { 1, -2, -7, 39, 79, 26, -8, 0 },
+ { 1, -2, -7, 37, 80, 28, -8, -1 }, { 1, -1, -8, 35, 80, 30, -8, -1 },
+};
+
+// Filters for interpolation (0.75-band) - note this also filters integer pels.
+static const interp_kernel filteredinterp_filters750[(1 << SUBPEL_BITS)] = {
+ { 2, -11, 25, 96, 25, -11, 2, 0 }, { 2, -11, 22, 96, 28, -11, 2, 0 },
+ { 2, -10, 19, 95, 31, -11, 2, 0 }, { 2, -10, 17, 95, 34, -12, 2, 0 },
+ { 2, -9, 14, 94, 37, -12, 2, 0 }, { 2, -8, 12, 93, 40, -12, 1, 0 },
+ { 2, -8, 9, 92, 43, -12, 1, 1 }, { 2, -7, 7, 91, 46, -12, 1, 0 },
+ { 2, -7, 5, 90, 49, -12, 1, 0 }, { 2, -6, 3, 88, 52, -12, 0, 1 },
+ { 2, -5, 1, 86, 55, -12, 0, 1 }, { 2, -5, -1, 84, 58, -11, 0, 1 },
+ { 2, -4, -2, 82, 61, -11, -1, 1 }, { 2, -4, -4, 80, 64, -10, -1, 1 },
+ { 1, -3, -5, 77, 67, -9, -1, 1 }, { 1, -3, -6, 75, 70, -8, -2, 1 },
+ { 1, -2, -7, 72, 72, -7, -2, 1 }, { 1, -2, -8, 70, 75, -6, -3, 1 },
+ { 1, -1, -9, 67, 77, -5, -3, 1 }, { 1, -1, -10, 64, 80, -4, -4, 2 },
+ { 1, -1, -11, 61, 82, -2, -4, 2 }, { 1, 0, -11, 58, 84, -1, -5, 2 },
+ { 1, 0, -12, 55, 86, 1, -5, 2 }, { 1, 0, -12, 52, 88, 3, -6, 2 },
+ { 0, 1, -12, 49, 90, 5, -7, 2 }, { 0, 1, -12, 46, 91, 7, -7, 2 },
+ { 1, 1, -12, 43, 92, 9, -8, 2 }, { 0, 1, -12, 40, 93, 12, -8, 2 },
+ { 0, 2, -12, 37, 94, 14, -9, 2 }, { 0, 2, -12, 34, 95, 17, -10, 2 },
+ { 0, 2, -11, 31, 95, 19, -10, 2 }, { 0, 2, -11, 28, 96, 22, -11, 2 }
+};
+
+// Filters for interpolation (0.875-band) - note this also filters integer pels.
+static const interp_kernel filteredinterp_filters875[(1 << SUBPEL_BITS)] = {
+ { 3, -8, 13, 112, 13, -8, 3, 0 }, { 3, -7, 10, 112, 17, -9, 3, -1 },
+ { 2, -6, 7, 111, 21, -9, 3, -1 }, { 2, -5, 4, 111, 24, -10, 3, -1 },
+ { 2, -4, 1, 110, 28, -11, 3, -1 }, { 1, -3, -1, 108, 32, -12, 4, -1 },
+ { 1, -2, -3, 106, 36, -13, 4, -1 }, { 1, -1, -6, 105, 40, -14, 4, -1 },
+ { 1, -1, -7, 102, 44, -14, 4, -1 }, { 1, 0, -9, 100, 48, -15, 4, -1 },
+ { 1, 1, -11, 97, 53, -16, 4, -1 }, { 0, 1, -12, 95, 57, -16, 4, -1 },
+ { 0, 2, -13, 91, 61, -16, 4, -1 }, { 0, 2, -14, 88, 65, -16, 4, -1 },
+ { 0, 3, -15, 84, 69, -17, 4, 0 }, { 0, 3, -16, 81, 73, -16, 3, 0 },
+ { 0, 3, -16, 77, 77, -16, 3, 0 }, { 0, 3, -16, 73, 81, -16, 3, 0 },
+ { 0, 4, -17, 69, 84, -15, 3, 0 }, { -1, 4, -16, 65, 88, -14, 2, 0 },
+ { -1, 4, -16, 61, 91, -13, 2, 0 }, { -1, 4, -16, 57, 95, -12, 1, 0 },
+ { -1, 4, -16, 53, 97, -11, 1, 1 }, { -1, 4, -15, 48, 100, -9, 0, 1 },
+ { -1, 4, -14, 44, 102, -7, -1, 1 }, { -1, 4, -14, 40, 105, -6, -1, 1 },
+ { -1, 4, -13, 36, 106, -3, -2, 1 }, { -1, 4, -12, 32, 108, -1, -3, 1 },
+ { -1, 3, -11, 28, 110, 1, -4, 2 }, { -1, 3, -10, 24, 111, 4, -5, 2 },
+ { -1, 3, -9, 21, 111, 7, -6, 2 }, { -1, 3, -9, 17, 112, 10, -7, 3 }
+};
+
+// Filters for interpolation (full-band) - no filtering for integer pixels
+static const interp_kernel filteredinterp_filters1000[(1 << SUBPEL_BITS)] = {
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 1, -3, 128, 3, -1, 0, 0 },
+ { -1, 2, -6, 127, 7, -2, 1, 0 }, { -1, 3, -9, 126, 12, -4, 1, 0 },
+ { -1, 4, -12, 125, 16, -5, 1, 0 }, { -1, 4, -14, 123, 20, -6, 2, 0 },
+ { -1, 5, -15, 120, 25, -8, 2, 0 }, { -1, 5, -17, 118, 30, -9, 3, -1 },
+ { -1, 6, -18, 114, 35, -10, 3, -1 }, { -1, 6, -19, 111, 41, -12, 3, -1 },
+ { -1, 6, -20, 107, 46, -13, 4, -1 }, { -1, 6, -21, 103, 52, -14, 4, -1 },
+ { -1, 6, -21, 99, 57, -16, 5, -1 }, { -1, 6, -21, 94, 63, -17, 5, -1 },
+ { -1, 6, -20, 89, 68, -18, 5, -1 }, { -1, 6, -20, 84, 73, -19, 6, -1 },
+ { -1, 6, -20, 79, 79, -20, 6, -1 }, { -1, 6, -19, 73, 84, -20, 6, -1 },
+ { -1, 5, -18, 68, 89, -20, 6, -1 }, { -1, 5, -17, 63, 94, -21, 6, -1 },
+ { -1, 5, -16, 57, 99, -21, 6, -1 }, { -1, 4, -14, 52, 103, -21, 6, -1 },
+ { -1, 4, -13, 46, 107, -20, 6, -1 }, { -1, 3, -12, 41, 111, -19, 6, -1 },
+ { -1, 3, -10, 35, 114, -18, 6, -1 }, { -1, 3, -9, 30, 118, -17, 5, -1 },
+ { 0, 2, -8, 25, 120, -15, 5, -1 }, { 0, 2, -6, 20, 123, -14, 4, -1 },
+ { 0, 1, -5, 16, 125, -12, 4, -1 }, { 0, 1, -4, 12, 126, -9, 3, -1 },
+ { 0, 1, -2, 7, 127, -6, 2, -1 }, { 0, 0, -1, 3, 128, -3, 1, 0 }
+};
+
+// Filters for factor of 2 downsampling.
+static const int16_t vp9_down2_symeven_half_filter[] = { 56, 12, -3, -1 };
+static const int16_t vp9_down2_symodd_half_filter[] = { 64, 35, 0, -3 };
+
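+// Pick the anti-aliasing kernel from the output/input length ratio, in
+// sixteenths: >= 16/16 full band, >= 13/16 the 0.875 band, >= 11/16 the
+// 0.75 band, >= 9/16 the 0.625 band, otherwise the 0.5 band. For example,
+// resizing 1920 to 1280 (ratio 2/3) selects the 0.625-band kernel.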
+static const interp_kernel *choose_interp_filter(int inlength, int outlength) {
+ int outlength16 = outlength * 16;
+ if (outlength16 >= inlength * 16)
+ return filteredinterp_filters1000;
+ else if (outlength16 >= inlength * 13)
+ return filteredinterp_filters875;
+ else if (outlength16 >= inlength * 11)
+ return filteredinterp_filters750;
+ else if (outlength16 >= inlength * 9)
+ return filteredinterp_filters625;
+ else
+ return filteredinterp_filters500;
+}
+
+static void interpolate(const uint8_t *const input, int inlength,
+ uint8_t *output, int outlength) {
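+  // delta is the input step per output sample in Q(INTERP_PRECISION_BITS).
+  // offset shifts the sampling grid so that output pixel centers map to
+  // input pixel centers, i.e. output x samples the input at
+  // x * delta + (delta - 1) / 2.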
+ const int64_t delta =
+ (((uint64_t)inlength << 32) + outlength / 2) / outlength;
+ const int64_t offset =
+ inlength > outlength
+ ? (((int64_t)(inlength - outlength) << 31) + outlength / 2) /
+ outlength
+ : -(((int64_t)(outlength - inlength) << 31) + outlength / 2) /
+ outlength;
+ uint8_t *optr = output;
+ int x, x1, x2, sum, k, int_pel, sub_pel;
+ int64_t y;
+
+ const interp_kernel *interp_filters =
+ choose_interp_filter(inlength, outlength);
+
+ x = 0;
+ y = offset;
+ while ((y >> INTERP_PRECISION_BITS) < (INTERP_TAPS / 2 - 1)) {
+ x++;
+ y += delta;
+ }
+ x1 = x;
+ x = outlength - 1;
+ y = delta * x + offset;
+ while ((y >> INTERP_PRECISION_BITS) + (int64_t)(INTERP_TAPS / 2) >=
+ inlength) {
+ x--;
+ y -= delta;
+ }
+ x2 = x;
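+  // Outputs in [x1, x2] have all INTERP_TAPS source samples inside the input,
+  // so only the ranges before x1 and after x2 need edge clamping.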
+ if (x1 > x2) {
+ for (x = 0, y = offset; x < outlength; ++x, y += delta) {
+ const int16_t *filter;
+ int_pel = y >> INTERP_PRECISION_BITS;
+ sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
+ filter = interp_filters[sub_pel];
+ sum = 0;
+ for (k = 0; k < INTERP_TAPS; ++k) {
+ const int pk = int_pel - INTERP_TAPS / 2 + 1 + k;
+ sum += filter[k] *
+ input[(pk < 0 ? 0 : (pk >= inlength ? inlength - 1 : pk))];
+ }
+ *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+ }
+ } else {
+ // Initial part.
+ for (x = 0, y = offset; x < x1; ++x, y += delta) {
+ const int16_t *filter;
+ int_pel = y >> INTERP_PRECISION_BITS;
+ sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
+ filter = interp_filters[sub_pel];
+ sum = 0;
+ for (k = 0; k < INTERP_TAPS; ++k)
+ sum += filter[k] * input[(int_pel - INTERP_TAPS / 2 + 1 + k < 0
+ ? 0
+ : int_pel - INTERP_TAPS / 2 + 1 + k)];
+ *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+ }
+ // Middle part.
+ for (; x <= x2; ++x, y += delta) {
+ const int16_t *filter;
+ int_pel = y >> INTERP_PRECISION_BITS;
+ sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
+ filter = interp_filters[sub_pel];
+ sum = 0;
+ for (k = 0; k < INTERP_TAPS; ++k)
+ sum += filter[k] * input[int_pel - INTERP_TAPS / 2 + 1 + k];
+ *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+ }
+ // End part.
+ for (; x < outlength; ++x, y += delta) {
+ const int16_t *filter;
+ int_pel = y >> INTERP_PRECISION_BITS;
+ sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
+ filter = interp_filters[sub_pel];
+ sum = 0;
+ for (k = 0; k < INTERP_TAPS; ++k)
+ sum += filter[k] * input[(int_pel - INTERP_TAPS / 2 + 1 + k >= inlength
+ ? inlength - 1
+ : int_pel - INTERP_TAPS / 2 + 1 + k)];
+ *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+ }
+ }
+}
+
+static void down2_symeven(const uint8_t *const input, int length,
+ uint8_t *output) {
+ // Actual filter len = 2 * filter_len_half.
+ const int16_t *filter = vp9_down2_symeven_half_filter;
+ const int filter_len_half = sizeof(vp9_down2_symeven_half_filter) / 2;
+ int i, j;
+ uint8_t *optr = output;
+ int l1 = filter_len_half;
+ int l2 = (length - filter_len_half);
+ l1 += (l1 & 1);
+ l2 += (l2 & 1);
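+  // Outputs before l1 may need left-edge clamping and outputs from l2 onwards
+  // may need right-edge clamping; the middle loop can index input directly.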
+ if (l1 > l2) {
+ // Short input length.
+ for (i = 0; i < length; i += 2) {
+ int sum = (1 << (FILTER_BITS - 1));
+ for (j = 0; j < filter_len_half; ++j) {
+ sum += (input[(i - j < 0 ? 0 : i - j)] +
+ input[(i + 1 + j >= length ? length - 1 : i + 1 + j)]) *
+ filter[j];
+ }
+ sum >>= FILTER_BITS;
+ *optr++ = clip_pixel(sum);
+ }
+ } else {
+ // Initial part.
+ for (i = 0; i < l1; i += 2) {
+ int sum = (1 << (FILTER_BITS - 1));
+ for (j = 0; j < filter_len_half; ++j) {
+ sum += (input[(i - j < 0 ? 0 : i - j)] + input[i + 1 + j]) * filter[j];
+ }
+ sum >>= FILTER_BITS;
+ *optr++ = clip_pixel(sum);
+ }
+ // Middle part.
+ for (; i < l2; i += 2) {
+ int sum = (1 << (FILTER_BITS - 1));
+ for (j = 0; j < filter_len_half; ++j) {
+ sum += (input[i - j] + input[i + 1 + j]) * filter[j];
+ }
+ sum >>= FILTER_BITS;
+ *optr++ = clip_pixel(sum);
+ }
+ // End part.
+ for (; i < length; i += 2) {
+ int sum = (1 << (FILTER_BITS - 1));
+ for (j = 0; j < filter_len_half; ++j) {
+ sum += (input[i - j] +
+ input[(i + 1 + j >= length ? length - 1 : i + 1 + j)]) *
+ filter[j];
+ }
+ sum >>= FILTER_BITS;
+ *optr++ = clip_pixel(sum);
+ }
+ }
+}
+
+static void down2_symodd(const uint8_t *const input, int length,
+ uint8_t *output) {
+ // Actual filter len = 2 * filter_len_half - 1.
+ const int16_t *filter = vp9_down2_symodd_half_filter;
+ const int filter_len_half = sizeof(vp9_down2_symodd_half_filter) / 2;
+ int i, j;
+ uint8_t *optr = output;
+ int l1 = filter_len_half - 1;
+ int l2 = (length - filter_len_half + 1);
+ l1 += (l1 & 1);
+ l2 += (l2 & 1);
+ if (l1 > l2) {
+ // Short input length.
+ for (i = 0; i < length; i += 2) {
+ int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+ for (j = 1; j < filter_len_half; ++j) {
+ sum += (input[(i - j < 0 ? 0 : i - j)] +
+ input[(i + j >= length ? length - 1 : i + j)]) *
+ filter[j];
+ }
+ sum >>= FILTER_BITS;
+ *optr++ = clip_pixel(sum);
+ }
+ } else {
+ // Initial part.
+ for (i = 0; i < l1; i += 2) {
+ int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+ for (j = 1; j < filter_len_half; ++j) {
+ sum += (input[(i - j < 0 ? 0 : i - j)] + input[i + j]) * filter[j];
+ }
+ sum >>= FILTER_BITS;
+ *optr++ = clip_pixel(sum);
+ }
+ // Middle part.
+ for (; i < l2; i += 2) {
+ int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+ for (j = 1; j < filter_len_half; ++j) {
+ sum += (input[i - j] + input[i + j]) * filter[j];
+ }
+ sum >>= FILTER_BITS;
+ *optr++ = clip_pixel(sum);
+ }
+ // End part.
+ for (; i < length; i += 2) {
+ int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+ for (j = 1; j < filter_len_half; ++j) {
+ sum += (input[i - j] + input[(i + j >= length ? length - 1 : i + j)]) *
+ filter[j];
+ }
+ sum >>= FILTER_BITS;
+ *optr++ = clip_pixel(sum);
+ }
+ }
+}
+
+static int get_down2_length(int length, int steps) {
+ int s;
+ for (s = 0; s < steps; ++s) length = (length + 1) >> 1;
+ return length;
+}
+
+static int get_down2_steps(int in_length, int out_length) {
+ int steps = 0;
+ int proj_in_length;
+ while ((proj_in_length = get_down2_length(in_length, 1)) >= out_length) {
+ ++steps;
+ in_length = proj_in_length;
+ }
+ return steps;
+}
+
+static void resize_multistep(const uint8_t *const input, int length,
+ uint8_t *output, int olength, uint8_t *otmp) {
+ int steps;
+ if (length == olength) {
+ memcpy(output, input, sizeof(output[0]) * length);
+ return;
+ }
+ steps = get_down2_steps(length, olength);
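+  // Downscale with repeated factor-of-2 passes while the halved length is
+  // still at least olength, then finish with one interpolate() pass if the
+  // result does not match olength exactly.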
+
+ if (steps > 0) {
+ int s;
+ uint8_t *out = NULL;
+ uint8_t *otmp2;
+ int filteredlength = length;
+
+ assert(otmp != NULL);
+ otmp2 = otmp + get_down2_length(length, 1);
+ for (s = 0; s < steps; ++s) {
+ const int proj_filteredlength = get_down2_length(filteredlength, 1);
+ const uint8_t *const in = (s == 0 ? input : out);
+ if (s == steps - 1 && proj_filteredlength == olength)
+ out = output;
+ else
+ out = (s & 1 ? otmp2 : otmp);
+ if (filteredlength & 1)
+ down2_symodd(in, filteredlength, out);
+ else
+ down2_symeven(in, filteredlength, out);
+ filteredlength = proj_filteredlength;
+ }
+ if (filteredlength != olength) {
+ interpolate(out, filteredlength, output, olength);
+ }
+ } else {
+ interpolate(input, length, output, olength);
+ }
+}
+
+static void fill_col_to_arr(uint8_t *img, int stride, int len, uint8_t *arr) {
+ int i;
+ uint8_t *iptr = img;
+ uint8_t *aptr = arr;
+ for (i = 0; i < len; ++i, iptr += stride) {
+ *aptr++ = *iptr;
+ }
+}
+
+static void fill_arr_to_col(uint8_t *img, int stride, int len, uint8_t *arr) {
+ int i;
+ uint8_t *iptr = img;
+ uint8_t *aptr = arr;
+ for (i = 0; i < len; ++i, iptr += stride) {
+ *iptr = *aptr++;
+ }
+}
+
+void vp9_resize_plane(const uint8_t *const input, int height, int width,
+ int in_stride, uint8_t *output, int height2, int width2,
+ int out_stride) {
+ int i;
+ uint8_t *intbuf = (uint8_t *)calloc(width2 * height, sizeof(*intbuf));
+ uint8_t *tmpbuf =
+ (uint8_t *)calloc(width < height ? height : width, sizeof(*tmpbuf));
+ uint8_t *arrbuf = (uint8_t *)calloc(height, sizeof(*arrbuf));
+ uint8_t *arrbuf2 = (uint8_t *)calloc(height2, sizeof(*arrbuf2));
+ if (intbuf == NULL || tmpbuf == NULL || arrbuf == NULL || arrbuf2 == NULL)
+ goto Error;
+ assert(width > 0);
+ assert(height > 0);
+ assert(width2 > 0);
+ assert(height2 > 0);
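+  // Separable resize: scale each row into intbuf (width2 x height), then
+  // scale each column of intbuf into the output, staging through arrbuf.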
+ for (i = 0; i < height; ++i)
+ resize_multistep(input + in_stride * i, width, intbuf + width2 * i, width2,
+ tmpbuf);
+ for (i = 0; i < width2; ++i) {
+ fill_col_to_arr(intbuf + i, width2, height, arrbuf);
+ resize_multistep(arrbuf, height, arrbuf2, height2, tmpbuf);
+ fill_arr_to_col(output + i, out_stride, height2, arrbuf2);
+ }
+
+Error:
+ free(intbuf);
+ free(tmpbuf);
+ free(arrbuf);
+ free(arrbuf2);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static void highbd_interpolate(const uint16_t *const input, int inlength,
+ uint16_t *output, int outlength, int bd) {
+ const int64_t delta =
+ (((uint64_t)inlength << 32) + outlength / 2) / outlength;
+ const int64_t offset =
+ inlength > outlength
+ ? (((int64_t)(inlength - outlength) << 31) + outlength / 2) /
+ outlength
+ : -(((int64_t)(outlength - inlength) << 31) + outlength / 2) /
+ outlength;
+ uint16_t *optr = output;
+ int x, x1, x2, sum, k, int_pel, sub_pel;
+ int64_t y;
+
+ const interp_kernel *interp_filters =
+ choose_interp_filter(inlength, outlength);
+
+ x = 0;
+ y = offset;
+ while ((y >> INTERP_PRECISION_BITS) < (INTERP_TAPS / 2 - 1)) {
+ x++;
+ y += delta;
+ }
+ x1 = x;
+ x = outlength - 1;
+ y = delta * x + offset;
+ while ((y >> INTERP_PRECISION_BITS) + (int64_t)(INTERP_TAPS / 2) >=
+ inlength) {
+ x--;
+ y -= delta;
+ }
+ x2 = x;
+ if (x1 > x2) {
+ for (x = 0, y = offset; x < outlength; ++x, y += delta) {
+ const int16_t *filter;
+ int_pel = y >> INTERP_PRECISION_BITS;
+ sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
+ filter = interp_filters[sub_pel];
+ sum = 0;
+ for (k = 0; k < INTERP_TAPS; ++k) {
+ const int pk = int_pel - INTERP_TAPS / 2 + 1 + k;
+ sum += filter[k] *
+ input[(pk < 0 ? 0 : (pk >= inlength ? inlength - 1 : pk))];
+ }
+ *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+ }
+ } else {
+ // Initial part.
+ for (x = 0, y = offset; x < x1; ++x, y += delta) {
+ const int16_t *filter;
+ int_pel = y >> INTERP_PRECISION_BITS;
+ sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
+ filter = interp_filters[sub_pel];
+ sum = 0;
+ for (k = 0; k < INTERP_TAPS; ++k) {
+ assert(int_pel - INTERP_TAPS / 2 + 1 + k < inlength);
+ sum += filter[k] * input[(int_pel - INTERP_TAPS / 2 + 1 + k < 0
+ ? 0
+ : int_pel - INTERP_TAPS / 2 + 1 + k)];
+ }
+ *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+ }
+ // Middle part.
+ for (; x <= x2; ++x, y += delta) {
+ const int16_t *filter;
+ int_pel = y >> INTERP_PRECISION_BITS;
+ sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
+ filter = interp_filters[sub_pel];
+ sum = 0;
+ for (k = 0; k < INTERP_TAPS; ++k)
+ sum += filter[k] * input[int_pel - INTERP_TAPS / 2 + 1 + k];
+ *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+ }
+ // End part.
+ for (; x < outlength; ++x, y += delta) {
+ const int16_t *filter;
+ int_pel = y >> INTERP_PRECISION_BITS;
+ sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK;
+ filter = interp_filters[sub_pel];
+ sum = 0;
+ for (k = 0; k < INTERP_TAPS; ++k)
+ sum += filter[k] * input[(int_pel - INTERP_TAPS / 2 + 1 + k >= inlength
+ ? inlength - 1
+ : int_pel - INTERP_TAPS / 2 + 1 + k)];
+ *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+ }
+ }
+}
+
+static void highbd_down2_symeven(const uint16_t *const input, int length,
+ uint16_t *output, int bd) {
+ // Actual filter len = 2 * filter_len_half.
+ static const int16_t *filter = vp9_down2_symeven_half_filter;
+ const int filter_len_half = sizeof(vp9_down2_symeven_half_filter) / 2;
+ int i, j;
+ uint16_t *optr = output;
+ int l1 = filter_len_half;
+ int l2 = (length - filter_len_half);
+ l1 += (l1 & 1);
+ l2 += (l2 & 1);
+ if (l1 > l2) {
+ // Short input length.
+ for (i = 0; i < length; i += 2) {
+ int sum = (1 << (FILTER_BITS - 1));
+ for (j = 0; j < filter_len_half; ++j) {
+ sum += (input[(i - j < 0 ? 0 : i - j)] +
+ input[(i + 1 + j >= length ? length - 1 : i + 1 + j)]) *
+ filter[j];
+ }
+ sum >>= FILTER_BITS;
+ *optr++ = clip_pixel_highbd(sum, bd);
+ }
+ } else {
+ // Initial part.
+ for (i = 0; i < l1; i += 2) {
+ int sum = (1 << (FILTER_BITS - 1));
+ for (j = 0; j < filter_len_half; ++j) {
+ sum += (input[(i - j < 0 ? 0 : i - j)] + input[i + 1 + j]) * filter[j];
+ }
+ sum >>= FILTER_BITS;
+ *optr++ = clip_pixel_highbd(sum, bd);
+ }
+ // Middle part.
+ for (; i < l2; i += 2) {
+ int sum = (1 << (FILTER_BITS - 1));
+ for (j = 0; j < filter_len_half; ++j) {
+ sum += (input[i - j] + input[i + 1 + j]) * filter[j];
+ }
+ sum >>= FILTER_BITS;
+ *optr++ = clip_pixel_highbd(sum, bd);
+ }
+ // End part.
+ for (; i < length; i += 2) {
+ int sum = (1 << (FILTER_BITS - 1));
+ for (j = 0; j < filter_len_half; ++j) {
+ sum += (input[i - j] +
+ input[(i + 1 + j >= length ? length - 1 : i + 1 + j)]) *
+ filter[j];
+ }
+ sum >>= FILTER_BITS;
+ *optr++ = clip_pixel_highbd(sum, bd);
+ }
+ }
+}
+
+static void highbd_down2_symodd(const uint16_t *const input, int length,
+ uint16_t *output, int bd) {
+ // Actual filter len = 2 * filter_len_half - 1.
+ static const int16_t *filter = vp9_down2_symodd_half_filter;
+ const int filter_len_half = sizeof(vp9_down2_symodd_half_filter) / 2;
+ int i, j;
+ uint16_t *optr = output;
+ int l1 = filter_len_half - 1;
+ int l2 = (length - filter_len_half + 1);
+ l1 += (l1 & 1);
+ l2 += (l2 & 1);
+ if (l1 > l2) {
+ // Short input length.
+ for (i = 0; i < length; i += 2) {
+ int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+ for (j = 1; j < filter_len_half; ++j) {
+ sum += (input[(i - j < 0 ? 0 : i - j)] +
+ input[(i + j >= length ? length - 1 : i + j)]) *
+ filter[j];
+ }
+ sum >>= FILTER_BITS;
+ *optr++ = clip_pixel_highbd(sum, bd);
+ }
+ } else {
+ // Initial part.
+ for (i = 0; i < l1; i += 2) {
+ int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+ for (j = 1; j < filter_len_half; ++j) {
+ sum += (input[(i - j < 0 ? 0 : i - j)] + input[i + j]) * filter[j];
+ }
+ sum >>= FILTER_BITS;
+ *optr++ = clip_pixel_highbd(sum, bd);
+ }
+ // Middle part.
+ for (; i < l2; i += 2) {
+ int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+ for (j = 1; j < filter_len_half; ++j) {
+ sum += (input[i - j] + input[i + j]) * filter[j];
+ }
+ sum >>= FILTER_BITS;
+ *optr++ = clip_pixel_highbd(sum, bd);
+ }
+ // End part.
+ for (; i < length; i += 2) {
+ int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0];
+ for (j = 1; j < filter_len_half; ++j) {
+ sum += (input[i - j] + input[(i + j >= length ? length - 1 : i + j)]) *
+ filter[j];
+ }
+ sum >>= FILTER_BITS;
+ *optr++ = clip_pixel_highbd(sum, bd);
+ }
+ }
+}
+
+static void highbd_resize_multistep(const uint16_t *const input, int length,
+ uint16_t *output, int olength,
+ uint16_t *otmp, int bd) {
+ int steps;
+ if (length == olength) {
+ memcpy(output, input, sizeof(output[0]) * length);
+ return;
+ }
+ steps = get_down2_steps(length, olength);
+
+ if (steps > 0) {
+ int s;
+ uint16_t *out = NULL;
+ uint16_t *otmp2;
+ int filteredlength = length;
+
+ assert(otmp != NULL);
+ otmp2 = otmp + get_down2_length(length, 1);
+ for (s = 0; s < steps; ++s) {
+ const int proj_filteredlength = get_down2_length(filteredlength, 1);
+ const uint16_t *const in = (s == 0 ? input : out);
+ if (s == steps - 1 && proj_filteredlength == olength)
+ out = output;
+ else
+ out = (s & 1 ? otmp2 : otmp);
+ if (filteredlength & 1)
+ highbd_down2_symodd(in, filteredlength, out, bd);
+ else
+ highbd_down2_symeven(in, filteredlength, out, bd);
+ filteredlength = proj_filteredlength;
+ }
+ if (filteredlength != olength) {
+ highbd_interpolate(out, filteredlength, output, olength, bd);
+ }
+ } else {
+ highbd_interpolate(input, length, output, olength, bd);
+ }
+}
+
+static void highbd_fill_col_to_arr(uint16_t *img, int stride, int len,
+ uint16_t *arr) {
+ int i;
+ uint16_t *iptr = img;
+ uint16_t *aptr = arr;
+ for (i = 0; i < len; ++i, iptr += stride) {
+ *aptr++ = *iptr;
+ }
+}
+
+static void highbd_fill_arr_to_col(uint16_t *img, int stride, int len,
+ uint16_t *arr) {
+ int i;
+ uint16_t *iptr = img;
+ uint16_t *aptr = arr;
+ for (i = 0; i < len; ++i, iptr += stride) {
+ *iptr = *aptr++;
+ }
+}
+
+void vp9_highbd_resize_plane(const uint8_t *const input, int height, int width,
+ int in_stride, uint8_t *output, int height2,
+ int width2, int out_stride, int bd) {
+ int i;
+ uint16_t *intbuf = (uint16_t *)malloc(sizeof(uint16_t) * width2 * height);
+ uint16_t *tmpbuf =
+ (uint16_t *)malloc(sizeof(uint16_t) * (width < height ? height : width));
+ uint16_t *arrbuf = (uint16_t *)malloc(sizeof(uint16_t) * height);
+ uint16_t *arrbuf2 = (uint16_t *)malloc(sizeof(uint16_t) * height2);
+ if (intbuf == NULL || tmpbuf == NULL || arrbuf == NULL || arrbuf2 == NULL)
+ goto Error;
+ assert(width > 0);
+ assert(height > 0);
+ assert(width2 > 0);
+ assert(height2 > 0);
+ for (i = 0; i < height; ++i) {
+ highbd_resize_multistep(CONVERT_TO_SHORTPTR(input + in_stride * i), width,
+ intbuf + width2 * i, width2, tmpbuf, bd);
+ }
+ for (i = 0; i < width2; ++i) {
+ highbd_fill_col_to_arr(intbuf + i, width2, height, arrbuf);
+ highbd_resize_multistep(arrbuf, height, arrbuf2, height2, tmpbuf, bd);
+ highbd_fill_arr_to_col(CONVERT_TO_SHORTPTR(output + i), out_stride, height2,
+ arrbuf2);
+ }
+
+Error:
+ free(intbuf);
+ free(tmpbuf);
+ free(arrbuf);
+ free(arrbuf2);
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+void vp9_resize_frame420(const uint8_t *const y, int y_stride,
+ const uint8_t *const u, const uint8_t *const v,
+ int uv_stride, int height, int width, uint8_t *oy,
+ int oy_stride, uint8_t *ou, uint8_t *ov,
+ int ouv_stride, int oheight, int owidth) {
+ vp9_resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride);
+ vp9_resize_plane(u, height / 2, width / 2, uv_stride, ou, oheight / 2,
+ owidth / 2, ouv_stride);
+ vp9_resize_plane(v, height / 2, width / 2, uv_stride, ov, oheight / 2,
+ owidth / 2, ouv_stride);
+}
+
+void vp9_resize_frame422(const uint8_t *const y, int y_stride,
+ const uint8_t *const u, const uint8_t *const v,
+ int uv_stride, int height, int width, uint8_t *oy,
+ int oy_stride, uint8_t *ou, uint8_t *ov,
+ int ouv_stride, int oheight, int owidth) {
+ vp9_resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride);
+ vp9_resize_plane(u, height, width / 2, uv_stride, ou, oheight, owidth / 2,
+ ouv_stride);
+ vp9_resize_plane(v, height, width / 2, uv_stride, ov, oheight, owidth / 2,
+ ouv_stride);
+}
+
+void vp9_resize_frame444(const uint8_t *const y, int y_stride,
+ const uint8_t *const u, const uint8_t *const v,
+ int uv_stride, int height, int width, uint8_t *oy,
+ int oy_stride, uint8_t *ou, uint8_t *ov,
+ int ouv_stride, int oheight, int owidth) {
+ vp9_resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride);
+ vp9_resize_plane(u, height, width, uv_stride, ou, oheight, owidth,
+ ouv_stride);
+ vp9_resize_plane(v, height, width, uv_stride, ov, oheight, owidth,
+ ouv_stride);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_resize_frame420(const uint8_t *const y, int y_stride,
+ const uint8_t *const u, const uint8_t *const v,
+ int uv_stride, int height, int width,
+ uint8_t *oy, int oy_stride, uint8_t *ou,
+ uint8_t *ov, int ouv_stride, int oheight,
+ int owidth, int bd) {
+ vp9_highbd_resize_plane(y, height, width, y_stride, oy, oheight, owidth,
+ oy_stride, bd);
+ vp9_highbd_resize_plane(u, height / 2, width / 2, uv_stride, ou, oheight / 2,
+ owidth / 2, ouv_stride, bd);
+ vp9_highbd_resize_plane(v, height / 2, width / 2, uv_stride, ov, oheight / 2,
+ owidth / 2, ouv_stride, bd);
+}
+
+void vp9_highbd_resize_frame422(const uint8_t *const y, int y_stride,
+ const uint8_t *const u, const uint8_t *const v,
+ int uv_stride, int height, int width,
+ uint8_t *oy, int oy_stride, uint8_t *ou,
+ uint8_t *ov, int ouv_stride, int oheight,
+ int owidth, int bd) {
+ vp9_highbd_resize_plane(y, height, width, y_stride, oy, oheight, owidth,
+ oy_stride, bd);
+ vp9_highbd_resize_plane(u, height, width / 2, uv_stride, ou, oheight,
+ owidth / 2, ouv_stride, bd);
+ vp9_highbd_resize_plane(v, height, width / 2, uv_stride, ov, oheight,
+ owidth / 2, ouv_stride, bd);
+}
+
+void vp9_highbd_resize_frame444(const uint8_t *const y, int y_stride,
+ const uint8_t *const u, const uint8_t *const v,
+ int uv_stride, int height, int width,
+ uint8_t *oy, int oy_stride, uint8_t *ou,
+ uint8_t *ov, int ouv_stride, int oheight,
+ int owidth, int bd) {
+ vp9_highbd_resize_plane(y, height, width, y_stride, oy, oheight, owidth,
+ oy_stride, bd);
+ vp9_highbd_resize_plane(u, height, width, uv_stride, ou, oheight, owidth,
+ ouv_stride, bd);
+ vp9_highbd_resize_plane(v, height, width, uv_stride, ov, oheight, owidth,
+ ouv_stride, bd);
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_resize.h b/media/libvpx/libvpx/vp9/encoder/vp9_resize.h
new file mode 100644
index 0000000000..5d4ce97eba
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_resize.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_VP9_RESIZE_H_
+#define VPX_VP9_ENCODER_VP9_RESIZE_H_
+
+#include <stdio.h>
+#include "vpx/vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp9_resize_plane(const uint8_t *const input, int height, int width,
+ int in_stride, uint8_t *output, int height2, int width2,
+ int out_stride);
+void vp9_resize_frame420(const uint8_t *const y, int y_stride,
+ const uint8_t *const u, const uint8_t *const v,
+ int uv_stride, int height, int width, uint8_t *oy,
+ int oy_stride, uint8_t *ou, uint8_t *ov,
+ int ouv_stride, int oheight, int owidth);
+void vp9_resize_frame422(const uint8_t *const y, int y_stride,
+ const uint8_t *const u, const uint8_t *const v,
+ int uv_stride, int height, int width, uint8_t *oy,
+ int oy_stride, uint8_t *ou, uint8_t *ov,
+ int ouv_stride, int oheight, int owidth);
+void vp9_resize_frame444(const uint8_t *const y, int y_stride,
+ const uint8_t *const u, const uint8_t *const v,
+ int uv_stride, int height, int width, uint8_t *oy,
+ int oy_stride, uint8_t *ou, uint8_t *ov,
+ int ouv_stride, int oheight, int owidth);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_resize_plane(const uint8_t *const input, int height, int width,
+ int in_stride, uint8_t *output, int height2,
+ int width2, int out_stride, int bd);
+void vp9_highbd_resize_frame420(const uint8_t *const y, int y_stride,
+ const uint8_t *const u, const uint8_t *const v,
+ int uv_stride, int height, int width,
+ uint8_t *oy, int oy_stride, uint8_t *ou,
+ uint8_t *ov, int ouv_stride, int oheight,
+ int owidth, int bd);
+void vp9_highbd_resize_frame422(const uint8_t *const y, int y_stride,
+ const uint8_t *const u, const uint8_t *const v,
+ int uv_stride, int height, int width,
+ uint8_t *oy, int oy_stride, uint8_t *ou,
+ uint8_t *ov, int ouv_stride, int oheight,
+ int owidth, int bd);
+void vp9_highbd_resize_frame444(const uint8_t *const y, int y_stride,
+ const uint8_t *const u, const uint8_t *const v,
+ int uv_stride, int height, int width,
+ uint8_t *oy, int oy_stride, uint8_t *ou,
+ uint8_t *ov, int ouv_stride, int oheight,
+ int owidth, int bd);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP9_ENCODER_VP9_RESIZE_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_segmentation.c b/media/libvpx/libvpx/vp9/encoder/vp9_segmentation.c
new file mode 100644
index 0000000000..d75488a8e6
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_segmentation.c
@@ -0,0 +1,325 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <limits.h>
+#include <math.h>
+
+#include "vpx_mem/vpx_mem.h"
+
+#include "vp9/common/vp9_pred_common.h"
+#include "vp9/common/vp9_tile_common.h"
+
+#include "vp9/encoder/vp9_cost.h"
+#include "vp9/encoder/vp9_segmentation.h"
+
+void vp9_enable_segmentation(struct segmentation *seg) {
+ seg->enabled = 1;
+ seg->update_map = 1;
+ seg->update_data = 1;
+}
+
+void vp9_disable_segmentation(struct segmentation *seg) {
+ seg->enabled = 0;
+ seg->update_map = 0;
+ seg->update_data = 0;
+}
+
+void vp9_set_segment_data(struct segmentation *seg, signed char *feature_data,
+ unsigned char abs_delta) {
+ seg->abs_delta = abs_delta;
+
+ memcpy(seg->feature_data, feature_data, sizeof(seg->feature_data));
+}
+void vp9_disable_segfeature(struct segmentation *seg, int segment_id,
+ SEG_LVL_FEATURES feature_id) {
+ seg->feature_mask[segment_id] &= ~(1u << feature_id);
+}
+
+void vp9_clear_segdata(struct segmentation *seg, int segment_id,
+ SEG_LVL_FEATURES feature_id) {
+ seg->feature_data[segment_id][feature_id] = 0;
+}
+
+void vp9_psnr_aq_mode_setup(struct segmentation *seg) {
+ int i;
+
+ vp9_enable_segmentation(seg);
+ vp9_clearall_segfeatures(seg);
+ seg->abs_delta = SEGMENT_DELTADATA;
+
+ for (i = 0; i < MAX_SEGMENTS; ++i) {
+ vp9_set_segdata(seg, i, SEG_LVL_ALT_Q, 2 * (i - (MAX_SEGMENTS / 2)));
+ vp9_enable_segfeature(seg, i, SEG_LVL_ALT_Q);
+ }
+}
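
A quick illustration of the loop above (assuming MAX_SEGMENTS is 8, its value in VP9): segments 0..7 receive SEG_LVL_ALT_Q deltas of -8, -6, -4, -2, 0, 2, 4, 6.

#include <stdio.h>

/* Illustration only: reproduce the per-segment delta-Q values assigned by
 * vp9_psnr_aq_mode_setup(), with MAX_SEGMENTS assumed to be 8. */
int main(void) {
  const int max_segments = 8; /* assumed value of MAX_SEGMENTS */
  int i;
  for (i = 0; i < max_segments; ++i)
    printf("segment %d: SEG_LVL_ALT_Q delta %d\n", i,
           2 * (i - max_segments / 2));
  return 0;
}
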
+
+void vp9_perceptual_aq_mode_setup(struct VP9_COMP *cpi,
+ struct segmentation *seg) {
+ const VP9_COMMON *cm = &cpi->common;
+ const int seg_counts = cpi->kmeans_ctr_num;
+ const int base_qindex = cm->base_qindex;
+ const double base_qstep = vp9_convert_qindex_to_q(base_qindex, cm->bit_depth);
+ const double mid_ctr = cpi->kmeans_ctr_ls[seg_counts / 2];
+ const double var_diff_scale = 4.0;
+ int i;
+
+ assert(seg_counts <= MAX_SEGMENTS);
+
+ vp9_enable_segmentation(seg);
+ vp9_clearall_segfeatures(seg);
+ seg->abs_delta = SEGMENT_DELTADATA;
+
+ for (i = 0; i < seg_counts / 2; ++i) {
+ double wiener_var_diff = mid_ctr - cpi->kmeans_ctr_ls[i];
+ double target_qstep = base_qstep / (1.0 + wiener_var_diff / var_diff_scale);
+ int target_qindex = vp9_convert_q_to_qindex(target_qstep, cm->bit_depth);
+ assert(wiener_var_diff >= 0.0);
+
+ vp9_set_segdata(seg, i, SEG_LVL_ALT_Q, target_qindex - base_qindex);
+ vp9_enable_segfeature(seg, i, SEG_LVL_ALT_Q);
+ }
+
+ vp9_set_segdata(seg, i, SEG_LVL_ALT_Q, 0);
+ vp9_enable_segfeature(seg, i, SEG_LVL_ALT_Q);
+
+ for (; i < seg_counts; ++i) {
+ double wiener_var_diff = cpi->kmeans_ctr_ls[i] - mid_ctr;
+ double target_qstep = base_qstep * (1.0 + wiener_var_diff / var_diff_scale);
+ int target_qindex = vp9_convert_q_to_qindex(target_qstep, cm->bit_depth);
+ assert(wiener_var_diff >= 0.0);
+
+ vp9_set_segdata(seg, i, SEG_LVL_ALT_Q, target_qindex - base_qindex);
+ vp9_enable_segfeature(seg, i, SEG_LVL_ALT_Q);
+ }
+}
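
A standalone sketch of the Q-step mapping used above: segments whose k-means center lies below the median center get a proportionally smaller Q step (finer quantization), and segments above it get a larger one; the conversion back to a Q index via vp9_convert_q_to_qindex() is left out here.

#include <stdio.h>

/* Illustration only: scale a base Q step by the distance of a segment's
 * k-means center from the median center, mirroring the two loops above. */
static double target_qstep(double base_qstep, double ctr, double mid_ctr) {
  const double var_diff_scale = 4.0;
  if (ctr <= mid_ctr)
    return base_qstep / (1.0 + (mid_ctr - ctr) / var_diff_scale);
  return base_qstep * (1.0 + (ctr - mid_ctr) / var_diff_scale);
}

int main(void) {
  printf("%f\n", target_qstep(32.0, 2.0, 6.0));  /* below the median: smaller */
  printf("%f\n", target_qstep(32.0, 10.0, 6.0)); /* above the median: larger */
  return 0;
}
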
+
+// Based on a set of segment counts, calculate a probability tree.
+static void calc_segtree_probs(int *segcounts, vpx_prob *segment_tree_probs) {
+ // Work out probabilities of each segment
+ const int c01 = segcounts[0] + segcounts[1];
+ const int c23 = segcounts[2] + segcounts[3];
+ const int c45 = segcounts[4] + segcounts[5];
+ const int c67 = segcounts[6] + segcounts[7];
+
+ segment_tree_probs[0] = get_binary_prob(c01 + c23, c45 + c67);
+ segment_tree_probs[1] = get_binary_prob(c01, c23);
+ segment_tree_probs[2] = get_binary_prob(c45, c67);
+ segment_tree_probs[3] = get_binary_prob(segcounts[0], segcounts[1]);
+ segment_tree_probs[4] = get_binary_prob(segcounts[2], segcounts[3]);
+ segment_tree_probs[5] = get_binary_prob(segcounts[4], segcounts[5]);
+ segment_tree_probs[6] = get_binary_prob(segcounts[6], segcounts[7]);
+}
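
For context, a rough stand-in for get_binary_prob() (this sketch only approximates the real helper): it maps two branch counts to an 8-bit probability of taking the 0-branch, clamped to [1, 255].

#include <stdio.h>

/* Illustrative approximation of get_binary_prob(): probability, in 1/256
 * units, that the 0-branch is taken, given counts for each branch. */
static unsigned char binary_prob(unsigned int n0, unsigned int n1) {
  const unsigned int den = n0 + n1;
  unsigned int p;
  if (den == 0) return 128; /* no data: assume equiprobable */
  p = (unsigned int)(((unsigned long long)n0 * 256 + den / 2) / den);
  if (p < 1) p = 1;
  if (p > 255) p = 255;
  return (unsigned char)p;
}

int main(void) {
  /* e.g. 30 blocks in segments 0-3 vs 10 in segments 4-7 -> prob ~192/256 */
  printf("%u\n", binary_prob(30, 10));
  return 0;
}
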
+
+// Based on a set of segment counts and probabilities, calculate a cost
+// estimate.
+static int cost_segmap(int *segcounts, vpx_prob *probs) {
+ const int c01 = segcounts[0] + segcounts[1];
+ const int c23 = segcounts[2] + segcounts[3];
+ const int c45 = segcounts[4] + segcounts[5];
+ const int c67 = segcounts[6] + segcounts[7];
+ const int c0123 = c01 + c23;
+ const int c4567 = c45 + c67;
+
+ // Cost the top node of the tree
+ int cost = c0123 * vp9_cost_zero(probs[0]) + c4567 * vp9_cost_one(probs[0]);
+
+ // Cost subsequent levels
+ if (c0123 > 0) {
+ cost += c01 * vp9_cost_zero(probs[1]) + c23 * vp9_cost_one(probs[1]);
+
+ if (c01 > 0)
+ cost += segcounts[0] * vp9_cost_zero(probs[3]) +
+ segcounts[1] * vp9_cost_one(probs[3]);
+ if (c23 > 0)
+ cost += segcounts[2] * vp9_cost_zero(probs[4]) +
+ segcounts[3] * vp9_cost_one(probs[4]);
+ }
+
+ if (c4567 > 0) {
+ cost += c45 * vp9_cost_zero(probs[2]) + c67 * vp9_cost_one(probs[2]);
+
+ if (c45 > 0)
+ cost += segcounts[4] * vp9_cost_zero(probs[5]) +
+ segcounts[5] * vp9_cost_one(probs[5]);
+ if (c67 > 0)
+ cost += segcounts[6] * vp9_cost_zero(probs[6]) +
+ segcounts[7] * vp9_cost_one(probs[6]);
+ }
+
+ return cost;
+}
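
To make the accumulation above concrete, a sketch in plain fractional bits (the real vp9_cost_zero()/vp9_cost_one() helpers return the same kind of quantity in a fixed-point scale, which is not reproduced here): each node contributes count-times-bit-cost for both of its branches.

#include <math.h>
#include <stdio.h>

/* Illustration only: cost, in bits, of coding a 0 or a 1 when the 0-branch
 * has probability p/256. */
static double cost_zero_bits(int p) { return -log2(p / 256.0); }
static double cost_one_bits(int p) { return -log2(1.0 - p / 256.0); }

int main(void) {
  /* Top node of the segment tree: 30 blocks take the 0-branch, 10 take the
   * 1-branch, with p = 192 (i.e. 192/256 = 0.75). */
  const int p = 192;
  const double bits = 30 * cost_zero_bits(p) + 10 * cost_one_bits(p);
  printf("top-node cost ~= %.1f bits\n", bits); /* link with -lm */
  return 0;
}
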
+
+static void count_segs(const VP9_COMMON *cm, MACROBLOCKD *xd,
+ const TileInfo *tile, MODE_INFO **mi,
+ int *no_pred_segcounts,
+ int (*temporal_predictor_count)[2],
+ int *t_unpred_seg_counts, int bw, int bh, int mi_row,
+ int mi_col) {
+ int segment_id;
+
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
+ xd->mi = mi;
+ segment_id = xd->mi[0]->segment_id;
+
+ set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
+
+ // Count the number of hits on each segment with no prediction
+ no_pred_segcounts[segment_id]++;
+
+ // Temporal prediction not allowed on key frames
+ if (cm->frame_type != KEY_FRAME) {
+ const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+ // Test to see if the segment id matches the predicted value.
+ const int pred_segment_id =
+ get_segment_id(cm, cm->last_frame_seg_map, bsize, mi_row, mi_col);
+ const int pred_flag = pred_segment_id == segment_id;
+ const int pred_context = vp9_get_pred_context_seg_id(xd);
+
+ // Store the prediction status for this mb and update counts
+ // as appropriate
+ xd->mi[0]->seg_id_predicted = pred_flag;
+ temporal_predictor_count[pred_context][pred_flag]++;
+
+ // Update the "unpredicted" segment count
+ if (!pred_flag) t_unpred_seg_counts[segment_id]++;
+ }
+}
+
+static void count_segs_sb(const VP9_COMMON *cm, MACROBLOCKD *xd,
+ const TileInfo *tile, MODE_INFO **mi,
+ int *no_pred_segcounts,
+ int (*temporal_predictor_count)[2],
+ int *t_unpred_seg_counts, int mi_row, int mi_col,
+ BLOCK_SIZE bsize) {
+ const int mis = cm->mi_stride;
+ int bw, bh;
+ const int bs = num_8x8_blocks_wide_lookup[bsize], hbs = bs / 2;
+
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
+ bw = num_8x8_blocks_wide_lookup[mi[0]->sb_type];
+ bh = num_8x8_blocks_high_lookup[mi[0]->sb_type];
+
+ if (bw == bs && bh == bs) {
+ count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
+ t_unpred_seg_counts, bs, bs, mi_row, mi_col);
+ } else if (bw == bs && bh < bs) {
+ count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
+ t_unpred_seg_counts, bs, hbs, mi_row, mi_col);
+ count_segs(cm, xd, tile, mi + hbs * mis, no_pred_segcounts,
+ temporal_predictor_count, t_unpred_seg_counts, bs, hbs,
+ mi_row + hbs, mi_col);
+ } else if (bw < bs && bh == bs) {
+ count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
+ t_unpred_seg_counts, hbs, bs, mi_row, mi_col);
+ count_segs(cm, xd, tile, mi + hbs, no_pred_segcounts,
+ temporal_predictor_count, t_unpred_seg_counts, hbs, bs, mi_row,
+ mi_col + hbs);
+ } else {
+ const BLOCK_SIZE subsize = subsize_lookup[PARTITION_SPLIT][bsize];
+ int n;
+
+ assert(bw < bs && bh < bs);
+
+ for (n = 0; n < 4; n++) {
+ const int mi_dc = hbs * (n & 1);
+ const int mi_dr = hbs * (n >> 1);
+
+ count_segs_sb(cm, xd, tile, &mi[mi_dr * mis + mi_dc], no_pred_segcounts,
+ temporal_predictor_count, t_unpred_seg_counts,
+ mi_row + mi_dr, mi_col + mi_dc, subsize);
+ }
+ }
+}
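
The PARTITION_SPLIT branch at the end recurses into the four quadrants in raster order; a tiny standalone sketch of the (n & 1, n >> 1) offset mapping it relies on:

#include <stdio.h>

/* Illustration only: for n = 0..3 the recursion above visits the top-left,
 * top-right, bottom-left and bottom-right quadrants, offset by half the
 * parent block size (hbs) in mi units. */
int main(void) {
  const int hbs = 4; /* e.g. half of a 64x64 block, in 8x8 mi units */
  int n;
  for (n = 0; n < 4; ++n)
    printf("n=%d -> col offset %d, row offset %d\n", n, hbs * (n & 1),
           hbs * (n >> 1));
  return 0;
}
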
+
+void vp9_choose_segmap_coding_method(VP9_COMMON *cm, MACROBLOCKD *xd) {
+ struct segmentation *seg = &cm->seg;
+
+ int no_pred_cost;
+ int t_pred_cost = INT_MAX;
+
+ int i, tile_col, mi_row, mi_col;
+
+ int temporal_predictor_count[PREDICTION_PROBS][2] = { { 0 } };
+ int no_pred_segcounts[MAX_SEGMENTS] = { 0 };
+ int t_unpred_seg_counts[MAX_SEGMENTS] = { 0 };
+
+ vpx_prob no_pred_tree[SEG_TREE_PROBS];
+ vpx_prob t_pred_tree[SEG_TREE_PROBS];
+ vpx_prob t_nopred_prob[PREDICTION_PROBS];
+
+ // Set default state for the segment tree probabilities and the
+ // temporal coding probabilities
+ memset(seg->tree_probs, 255, sizeof(seg->tree_probs));
+ memset(seg->pred_probs, 255, sizeof(seg->pred_probs));
+
+ // First of all generate stats regarding how well the last segment map
+ // predicts this one
+ for (tile_col = 0; tile_col < 1 << cm->log2_tile_cols; tile_col++) {
+ TileInfo tile;
+ MODE_INFO **mi_ptr;
+ vp9_tile_init(&tile, cm, 0, tile_col);
+
+ mi_ptr = cm->mi_grid_visible + tile.mi_col_start;
+ for (mi_row = 0; mi_row < cm->mi_rows;
+ mi_row += 8, mi_ptr += 8 * cm->mi_stride) {
+ MODE_INFO **mi = mi_ptr;
+ for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end;
+ mi_col += 8, mi += 8)
+ count_segs_sb(cm, xd, &tile, mi, no_pred_segcounts,
+ temporal_predictor_count, t_unpred_seg_counts, mi_row,
+ mi_col, BLOCK_64X64);
+ }
+ }
+
+ // Work out probability tree for coding segments without prediction
+ // and the cost.
+ calc_segtree_probs(no_pred_segcounts, no_pred_tree);
+ no_pred_cost = cost_segmap(no_pred_segcounts, no_pred_tree);
+
+ // Key frames cannot use temporal prediction
+ if (!frame_is_intra_only(cm)) {
+ // Work out probability tree for coding those segments not
+ // predicted using the temporal method and the cost.
+ calc_segtree_probs(t_unpred_seg_counts, t_pred_tree);
+ t_pred_cost = cost_segmap(t_unpred_seg_counts, t_pred_tree);
+
+ // Add in the cost of the signaling for each prediction context.
+ for (i = 0; i < PREDICTION_PROBS; i++) {
+ const int count0 = temporal_predictor_count[i][0];
+ const int count1 = temporal_predictor_count[i][1];
+
+ t_nopred_prob[i] = get_binary_prob(count0, count1);
+
+ // Add in the predictor signaling cost
+ t_pred_cost += count0 * vp9_cost_zero(t_nopred_prob[i]) +
+ count1 * vp9_cost_one(t_nopred_prob[i]);
+ }
+ }
+
+ // Now choose which coding method to use.
+ if (t_pred_cost < no_pred_cost) {
+ seg->temporal_update = 1;
+ memcpy(seg->tree_probs, t_pred_tree, sizeof(t_pred_tree));
+ memcpy(seg->pred_probs, t_nopred_prob, sizeof(t_nopred_prob));
+ } else {
+ seg->temporal_update = 0;
+ memcpy(seg->tree_probs, no_pred_tree, sizeof(no_pred_tree));
+ }
+}
+
+void vp9_reset_segment_features(struct segmentation *seg) {
+ // Set up default state for MB feature flags
+ seg->enabled = 0;
+ seg->update_map = 0;
+ seg->update_data = 0;
+ memset(seg->tree_probs, 255, sizeof(seg->tree_probs));
+ vp9_clearall_segfeatures(seg);
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_segmentation.h b/media/libvpx/libvpx/vp9/encoder/vp9_segmentation.h
new file mode 100644
index 0000000000..9404c38bc8
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_segmentation.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_VP9_SEGMENTATION_H_
+#define VPX_VP9_ENCODER_VP9_SEGMENTATION_H_
+
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/encoder/vp9_encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp9_enable_segmentation(struct segmentation *seg);
+void vp9_disable_segmentation(struct segmentation *seg);
+
+void vp9_disable_segfeature(struct segmentation *seg, int segment_id,
+ SEG_LVL_FEATURES feature_id);
+void vp9_clear_segdata(struct segmentation *seg, int segment_id,
+ SEG_LVL_FEATURES feature_id);
+
+void vp9_psnr_aq_mode_setup(struct segmentation *seg);
+
+void vp9_perceptual_aq_mode_setup(struct VP9_COMP *cpi,
+ struct segmentation *seg);
+
+// The values given for each segment can be either deltas (from the default
+// value chosen for the frame) or absolute values.
+//
+// Valid range for abs values is (0-127 for MB_LVL_ALT_Q), (0-63 for
+// SEGMENT_ALT_LF).
+// Valid range for delta values is (+/-127 for MB_LVL_ALT_Q), (+/-63 for
+// SEGMENT_ALT_LF).
+//
+// abs_delta = SEGMENT_DELTADATA (deltas) abs_delta = SEGMENT_ABSDATA (use
+// the absolute values given).
+void vp9_set_segment_data(struct segmentation *seg, signed char *feature_data,
+ unsigned char abs_delta);
+
+void vp9_choose_segmap_coding_method(VP9_COMMON *cm, MACROBLOCKD *xd);
+
+void vp9_reset_segment_features(struct segmentation *seg);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP9_ENCODER_VP9_SEGMENTATION_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_skin_detection.c b/media/libvpx/libvpx/vp9/encoder/vp9_skin_detection.c
new file mode 100644
index 0000000000..cc6c967767
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_skin_detection.c
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <limits.h>
+#include <math.h>
+
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_skin_detection.h"
+
+int vp9_compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v,
+ int stride, int strideuv, int bsize,
+ int consec_zeromv, int curr_motion_magn) {
+ // No skin if block has been zero/small motion for long consecutive time.
+ if (consec_zeromv > 60 && curr_motion_magn == 0) {
+ return 0;
+ } else {
+ int motion = 1;
+ // Take center pixel in block to determine is_skin.
+ const int y_width_shift = (4 << b_width_log2_lookup[bsize]) >> 1;
+ const int y_height_shift = (4 << b_height_log2_lookup[bsize]) >> 1;
+ const int uv_width_shift = y_width_shift >> 1;
+ const int uv_height_shift = y_height_shift >> 1;
+ const uint8_t ysource = y[y_height_shift * stride + y_width_shift];
+ const uint8_t usource = u[uv_height_shift * strideuv + uv_width_shift];
+ const uint8_t vsource = v[uv_height_shift * strideuv + uv_width_shift];
+
+ if (consec_zeromv > 25 && curr_motion_magn == 0) motion = 0;
+ return vpx_skin_pixel(ysource, usource, vsource, motion);
+ }
+}
+
+void vp9_compute_skin_sb(VP9_COMP *const cpi, BLOCK_SIZE bsize, int mi_row,
+ int mi_col) {
+ int i, j, num_bl;
+ VP9_COMMON *const cm = &cpi->common;
+ const uint8_t *src_y = cpi->Source->y_buffer;
+ const uint8_t *src_u = cpi->Source->u_buffer;
+ const uint8_t *src_v = cpi->Source->v_buffer;
+ const int src_ystride = cpi->Source->y_stride;
+ const int src_uvstride = cpi->Source->uv_stride;
+ const int y_bsize = 4 << b_width_log2_lookup[bsize];
+ const int uv_bsize = y_bsize >> 1;
+ const int shy = (y_bsize == 8) ? 3 : 4;
+ const int shuv = shy - 1;
+ const int fac = y_bsize / 8;
+ const int y_shift = src_ystride * (mi_row << 3) + (mi_col << 3);
+ const int uv_shift = src_uvstride * (mi_row << 2) + (mi_col << 2);
+ const int mi_row_limit = VPXMIN(mi_row + 8, cm->mi_rows - 2);
+ const int mi_col_limit = VPXMIN(mi_col + 8, cm->mi_cols - 2);
+ src_y += y_shift;
+ src_u += uv_shift;
+ src_v += uv_shift;
+
+ for (i = mi_row; i < mi_row_limit; i += fac) {
+ num_bl = 0;
+ for (j = mi_col; j < mi_col_limit; j += fac) {
+ int consec_zeromv = 0;
+ int bl_index = i * cm->mi_cols + j;
+ int bl_index1 = bl_index + 1;
+ int bl_index2 = bl_index + cm->mi_cols;
+ int bl_index3 = bl_index2 + 1;
+ // Don't detect skin on the boundary.
+ if (i == 0 || j == 0) continue;
+ if (bsize == BLOCK_8X8)
+ consec_zeromv = cpi->consec_zero_mv[bl_index];
+ else
+ consec_zeromv = VPXMIN(cpi->consec_zero_mv[bl_index],
+ VPXMIN(cpi->consec_zero_mv[bl_index1],
+ VPXMIN(cpi->consec_zero_mv[bl_index2],
+ cpi->consec_zero_mv[bl_index3])));
+ cpi->skin_map[bl_index] =
+ vp9_compute_skin_block(src_y, src_u, src_v, src_ystride, src_uvstride,
+ bsize, consec_zeromv, 0);
+ num_bl++;
+ src_y += y_bsize;
+ src_u += uv_bsize;
+ src_v += uv_bsize;
+ }
+ src_y += (src_ystride << shy) - (num_bl << shy);
+ src_u += (src_uvstride << shuv) - (num_bl << shuv);
+ src_v += (src_uvstride << shuv) - (num_bl << shuv);
+ }
+
+  // Remove isolated skin blocks (none of their neighbors are skin) and
+  // isolated non-skin blocks (all of their neighbors are skin).
+  // Skip the 4 corner blocks, which have only 3 neighbors, when removing
+  // isolated skin blocks. Skip superblock borders to remove isolated non-skin
+  // blocks.
+ for (i = mi_row; i < mi_row_limit; i += fac) {
+ for (j = mi_col; j < mi_col_limit; j += fac) {
+ int bl_index = i * cm->mi_cols + j;
+ int num_neighbor = 0;
+ int mi, mj;
+ int non_skin_threshold = 8;
+ // Skip 4 corners.
+ if ((i == mi_row && (j == mi_col || j == mi_col_limit - fac)) ||
+ (i == mi_row_limit - fac && (j == mi_col || j == mi_col_limit - fac)))
+ continue;
+ // There are only 5 neighbors for non-skin blocks on the border.
+ if (i == mi_row || i == mi_row_limit - fac || j == mi_col ||
+ j == mi_col_limit - fac)
+ non_skin_threshold = 5;
+
+ for (mi = -fac; mi <= fac; mi += fac) {
+ for (mj = -fac; mj <= fac; mj += fac) {
+ if (i + mi >= mi_row && i + mi < mi_row_limit && j + mj >= mi_col &&
+ j + mj < mi_col_limit) {
+ int bl_neighbor_index = (i + mi) * cm->mi_cols + j + mj;
+ if (cpi->skin_map[bl_neighbor_index]) num_neighbor++;
+ }
+ }
+ }
+
+ if (cpi->skin_map[bl_index] && num_neighbor < 2)
+ cpi->skin_map[bl_index] = 0;
+ if (!cpi->skin_map[bl_index] && num_neighbor == non_skin_threshold)
+ cpi->skin_map[bl_index] = 1;
+ }
+ }
+}
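
A standalone sketch of the clean-up rule in the second pass above: count skin blocks in each 3x3 neighborhood (including the block itself, as the code does), clear isolated skin blocks and fill isolated non-skin holes. The corner and border special cases handled above are omitted for brevity.

#include <stdio.h>

#define MAP_W 6
#define MAP_H 6

/* Illustration only: in-place smoothing of a binary skin map, mirroring the
 * interior-block thresholds used above (2 and 8). */
static void smooth_skin_map(unsigned char map[MAP_H][MAP_W]) {
  int i, j, mi, mj;
  for (i = 1; i < MAP_H - 1; ++i) {
    for (j = 1; j < MAP_W - 1; ++j) {
      int num_neighbor = 0;
      for (mi = -1; mi <= 1; ++mi)
        for (mj = -1; mj <= 1; ++mj)
          if (map[i + mi][j + mj]) ++num_neighbor;
      if (map[i][j] && num_neighbor < 2) map[i][j] = 0;   /* isolated skin */
      if (!map[i][j] && num_neighbor == 8) map[i][j] = 1; /* isolated hole */
    }
  }
}

int main(void) {
  unsigned char map[MAP_H][MAP_W] = { { 0 } };
  map[2][2] = 1; /* an isolated skin block: gets cleared */
  smooth_skin_map(map);
  printf("%d\n", map[2][2]);
  return 0;
}
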
+
+#ifdef OUTPUT_YUV_SKINMAP
+// For viewing skin map on input source.
+void vp9_output_skin_map(VP9_COMP *const cpi, FILE *yuv_skinmap_file) {
+ int i, j, mi_row, mi_col, num_bl;
+ VP9_COMMON *const cm = &cpi->common;
+ uint8_t *y;
+ const uint8_t *src_y = cpi->Source->y_buffer;
+ const int src_ystride = cpi->Source->y_stride;
+ const int y_bsize = 16; // Use 8x8 or 16x16.
+ const int shy = (y_bsize == 8) ? 3 : 4;
+ const int fac = y_bsize / 8;
+
+ YV12_BUFFER_CONFIG skinmap;
+ memset(&skinmap, 0, sizeof(YV12_BUFFER_CONFIG));
+ if (vpx_alloc_frame_buffer(&skinmap, cm->width, cm->height, cm->subsampling_x,
+ cm->subsampling_y, VP9_ENC_BORDER_IN_PIXELS,
+ cm->byte_alignment)) {
+ vpx_free_frame_buffer(&skinmap);
+ return;
+ }
+ memset(skinmap.buffer_alloc, 128, skinmap.frame_size);
+ y = skinmap.y_buffer;
+ // Loop through blocks and set skin map based on center pixel of block.
+  // Set y to white for skin blocks; otherwise copy the source luma so the
+  // block appears as grayscale source (chroma was set to 128 above).
+ // Ignore rightmost/bottom boundary blocks.
+ for (mi_row = 0; mi_row < cm->mi_rows - 1; mi_row += fac) {
+ num_bl = 0;
+ for (mi_col = 0; mi_col < cm->mi_cols - 1; mi_col += fac) {
+ const int block_index = mi_row * cm->mi_cols + mi_col;
+ const int is_skin = cpi->skin_map[block_index];
+ for (i = 0; i < y_bsize; i++) {
+ for (j = 0; j < y_bsize; j++) {
+ y[i * src_ystride + j] = is_skin ? 255 : src_y[i * src_ystride + j];
+ }
+ }
+ num_bl++;
+ y += y_bsize;
+ src_y += y_bsize;
+ }
+ y += (src_ystride << shy) - (num_bl << shy);
+ src_y += (src_ystride << shy) - (num_bl << shy);
+ }
+ vpx_write_yuv_frame(yuv_skinmap_file, &skinmap);
+ vpx_free_frame_buffer(&skinmap);
+}
+#endif
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_skin_detection.h b/media/libvpx/libvpx/vp9/encoder/vp9_skin_detection.h
new file mode 100644
index 0000000000..46a722af9b
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_skin_detection.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_VP9_SKIN_DETECTION_H_
+#define VPX_VP9_ENCODER_VP9_SKIN_DETECTION_H_
+
+#include "vp9/common/vp9_blockd.h"
+#include "vpx_dsp/skin_detection.h"
+#include "vpx_util/vpx_write_yuv_frame.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VP9_COMP;
+
+int vp9_compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v,
+ int stride, int strideuv, int bsize,
+ int consec_zeromv, int curr_motion_magn);
+
+void vp9_compute_skin_sb(struct VP9_COMP *const cpi, BLOCK_SIZE bsize,
+ int mi_row, int mi_col);
+
+#ifdef OUTPUT_YUV_SKINMAP
+// For viewing skin map on input source.
+void vp9_output_skin_map(struct VP9_COMP *const cpi, FILE *yuv_skinmap_file);
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP9_ENCODER_VP9_SKIN_DETECTION_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_speed_features.c b/media/libvpx/libvpx/vp9/encoder/vp9_speed_features.c
new file mode 100644
index 0000000000..48c21c581e
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_speed_features.c
@@ -0,0 +1,1088 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <limits.h>
+
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_speed_features.h"
+#include "vp9/encoder/vp9_rdopt.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+// Mesh search patterns for various speed settings.
+// Define 2 mesh density levels: one for FC_GRAPHICS_ANIMATION content and one
+// for non-FC_GRAPHICS_ANIMATION content.
+static MESH_PATTERN best_quality_mesh_pattern[2][MAX_MESH_STEP] = {
+ { { 64, 4 }, { 28, 2 }, { 15, 1 }, { 7, 1 } },
+ { { 64, 8 }, { 28, 4 }, { 15, 1 }, { 7, 1 } },
+};
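
The {range, interval} pairs above drive an exhaustive mesh search; a simplified standalone sketch of how one stage of such a search visits candidates (this illustrates the idea only, not the encoder's actual exhaustive search routine):

#include <stdio.h>

/* Illustration only: visit candidate offsets spaced 'interval' pixels apart
 * within +/- 'range' of the current best position. Successive stages use a
 * smaller range and interval, e.g. {64, 8} then {28, 4} then {15, 1} ... */
static void mesh_stage(int center_r, int center_c, int range, int interval) {
  int r, c;
  for (r = -range; r <= range; r += interval)
    for (c = -range; c <= range; c += interval)
      printf("candidate (%d, %d)\n", center_r + r, center_c + c);
}

int main(void) {
  mesh_stage(0, 0, 8, 4); /* small values so the output stays short */
  return 0;
}
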
+
+#if !CONFIG_REALTIME_ONLY
+// Define 3 mesh density levels to control the number of searches.
+#define MESH_DENSITY_LEVELS 3
+static MESH_PATTERN
+ good_quality_mesh_patterns[MESH_DENSITY_LEVELS][MAX_MESH_STEP] = {
+ { { 64, 8 }, { 28, 4 }, { 15, 1 }, { 7, 1 } },
+ { { 64, 8 }, { 14, 2 }, { 7, 1 }, { 7, 1 } },
+ { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } },
+ };
+
+// Intra only frames, golden frames (except alt ref overlays) and
+// alt ref frames tend to be coded at a higher than ambient quality
+static int frame_is_boosted(const VP9_COMP *cpi) {
+ return frame_is_kf_gf_arf(cpi);
+}
+
+// Sets a partition size down to which the auto partition code will always
+// search (it can go lower), based on the image dimensions. The logic here
+// is that how offensive ringing artefacts are depends partly on the screen
+// area over which they propagate. Propagation is limited by transform block
+// size, but the screen area taken up by a given block size will be larger
+// for a small image format stretched to full screen.
+static BLOCK_SIZE set_partition_min_limit(VP9_COMMON *const cm) {
+ unsigned int screen_area = (cm->width * cm->height);
+
+ // Select block size based on image format size.
+ if (screen_area < 1280 * 720) {
+ // Formats smaller in area than 720P
+ return BLOCK_4X4;
+ } else if (screen_area < 1920 * 1080) {
+ // Format >= 720P and < 1080P
+ return BLOCK_8X8;
+ } else {
+ // Formats 1080P and up
+ return BLOCK_16X16;
+ }
+}
+
+static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi,
+ SPEED_FEATURES *sf,
+ int speed) {
+ VP9_COMMON *const cm = &cpi->common;
+ const int min_frame_size = VPXMIN(cm->width, cm->height);
+ const int is_480p_or_larger = min_frame_size >= 480;
+ const int is_720p_or_larger = min_frame_size >= 720;
+ const int is_1080p_or_larger = min_frame_size >= 1080;
+ const int is_2160p_or_larger = min_frame_size >= 2160;
+ const int boosted = frame_is_boosted(cpi);
+
+ // speed 0 features
+ sf->partition_search_breakout_thr.dist = (1 << 20);
+ sf->partition_search_breakout_thr.rate = 80;
+ sf->use_square_only_thresh_high = BLOCK_SIZES;
+ sf->use_square_only_thresh_low = BLOCK_4X4;
+
+ if (is_480p_or_larger) {
+ // Currently, the machine-learning based partition search early termination
+    // is only used when VPXMIN(cm->width, cm->height) >= 480 and speed == 0.
+ sf->rd_ml_partition.search_early_termination = 1;
+ sf->recode_tolerance_high = 45;
+ } else {
+ sf->use_square_only_thresh_high = BLOCK_32X32;
+ }
+ if (is_720p_or_larger) {
+ sf->alt_ref_search_fp = 1;
+ }
+
+ if (!is_1080p_or_larger) {
+ sf->rd_ml_partition.search_breakout = 1;
+ if (is_720p_or_larger) {
+ sf->rd_ml_partition.search_breakout_thresh[0] = 0.0f;
+ sf->rd_ml_partition.search_breakout_thresh[1] = 0.0f;
+ sf->rd_ml_partition.search_breakout_thresh[2] = 0.0f;
+ } else {
+ sf->rd_ml_partition.search_breakout_thresh[0] = 2.5f;
+ sf->rd_ml_partition.search_breakout_thresh[1] = 1.5f;
+ sf->rd_ml_partition.search_breakout_thresh[2] = 1.5f;
+ }
+ }
+
+ if (!is_720p_or_larger) {
+ if (is_480p_or_larger)
+ sf->prune_single_mode_based_on_mv_diff_mode_rate = boosted ? 0 : 1;
+ else
+ sf->prune_single_mode_based_on_mv_diff_mode_rate = 1;
+ }
+
+ if (speed >= 1) {
+ sf->rd_ml_partition.search_early_termination = 0;
+ sf->rd_ml_partition.search_breakout = 1;
+ if (is_480p_or_larger)
+ sf->use_square_only_thresh_high = BLOCK_64X64;
+ else
+ sf->use_square_only_thresh_high = BLOCK_32X32;
+ sf->use_square_only_thresh_low = BLOCK_16X16;
+ if (is_720p_or_larger) {
+ sf->disable_split_mask =
+ cm->show_frame ? DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT;
+ sf->partition_search_breakout_thr.dist = (1 << 22);
+ sf->rd_ml_partition.search_breakout_thresh[0] = -5.0f;
+ sf->rd_ml_partition.search_breakout_thresh[1] = -5.0f;
+ sf->rd_ml_partition.search_breakout_thresh[2] = -9.0f;
+ } else {
+ sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
+ sf->partition_search_breakout_thr.dist = (1 << 21);
+ sf->rd_ml_partition.search_breakout_thresh[0] = -1.0f;
+ sf->rd_ml_partition.search_breakout_thresh[1] = -1.0f;
+ sf->rd_ml_partition.search_breakout_thresh[2] = -1.0f;
+ }
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cpi->Source->flags & YV12_FLAG_HIGHBITDEPTH) {
+ sf->rd_ml_partition.search_breakout_thresh[0] -= 1.0f;
+ sf->rd_ml_partition.search_breakout_thresh[1] -= 1.0f;
+ sf->rd_ml_partition.search_breakout_thresh[2] -= 1.0f;
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ }
+
+ if (speed >= 2) {
+ sf->use_square_only_thresh_high = BLOCK_4X4;
+ sf->use_square_only_thresh_low = BLOCK_SIZES;
+ if (is_720p_or_larger) {
+ sf->disable_split_mask =
+ cm->show_frame ? DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT;
+ sf->adaptive_pred_interp_filter = 0;
+ sf->partition_search_breakout_thr.dist = (1 << 24);
+ sf->partition_search_breakout_thr.rate = 120;
+ sf->rd_ml_partition.search_breakout = 0;
+ } else {
+ sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY;
+ sf->partition_search_breakout_thr.dist = (1 << 22);
+ sf->partition_search_breakout_thr.rate = 100;
+ sf->rd_ml_partition.search_breakout_thresh[0] = 0.0f;
+ sf->rd_ml_partition.search_breakout_thresh[1] = -1.0f;
+ sf->rd_ml_partition.search_breakout_thresh[2] = -4.0f;
+ }
+ sf->rd_auto_partition_min_limit = set_partition_min_limit(cm);
+
+ // Use a set of speed features for 4k videos.
+ if (is_2160p_or_larger) {
+ sf->use_square_partition_only = 1;
+ sf->intra_y_mode_mask[TX_32X32] = INTRA_DC;
+ sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC;
+ sf->alt_ref_search_fp = 1;
+ sf->cb_pred_filter_search = 2;
+ sf->adaptive_interp_filter_search = 1;
+ sf->disable_split_mask = DISABLE_ALL_SPLIT;
+ }
+ }
+
+ if (speed >= 3) {
+ sf->rd_ml_partition.search_breakout = 0;
+ if (is_720p_or_larger) {
+ sf->disable_split_mask = DISABLE_ALL_SPLIT;
+ sf->schedule_mode_search = cm->base_qindex < 220 ? 1 : 0;
+ sf->partition_search_breakout_thr.dist = (1 << 25);
+ sf->partition_search_breakout_thr.rate = 200;
+ } else {
+ sf->max_intra_bsize = BLOCK_32X32;
+ sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT;
+ sf->schedule_mode_search = cm->base_qindex < 175 ? 1 : 0;
+ sf->partition_search_breakout_thr.dist = (1 << 23);
+ sf->partition_search_breakout_thr.rate = 120;
+ }
+ }
+
+ // If this is a two pass clip that fits the criteria for animated or
+ // graphics content then reset disable_split_mask for speeds 1-4.
+ // Also if the image edge is internal to the coded area.
+ if ((speed >= 1) && (cpi->oxcf.pass == 2) &&
+ ((cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ||
+ (vp9_internal_image_edge(cpi)))) {
+ sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
+ }
+
+ if (speed >= 4) {
+ sf->partition_search_breakout_thr.rate = 300;
+ if (is_720p_or_larger) {
+ sf->partition_search_breakout_thr.dist = (1 << 26);
+ } else {
+ sf->partition_search_breakout_thr.dist = (1 << 24);
+ }
+ sf->disable_split_mask = DISABLE_ALL_SPLIT;
+ }
+
+ if (speed >= 5) {
+ sf->partition_search_breakout_thr.rate = 500;
+ }
+}
+
+static double tx_dom_thresholds[6] = { 99.0, 14.0, 12.0, 8.0, 4.0, 0.0 };
+static double qopt_thresholds[6] = { 99.0, 12.0, 10.0, 4.0, 2.0, 0.0 };
+
+static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi,
+ VP9_COMMON *cm,
+ SPEED_FEATURES *sf,
+ int speed) {
+ const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+ const int boosted = frame_is_boosted(cpi);
+ int i;
+
+ sf->adaptive_interp_filter_search = 1;
+ sf->adaptive_pred_interp_filter = 1;
+ sf->adaptive_rd_thresh = 1;
+ sf->adaptive_rd_thresh_row_mt = 0;
+ sf->allow_skip_recode = 1;
+ sf->less_rectangular_check = 1;
+ sf->mv.auto_mv_step_size = 1;
+ sf->mv.use_downsampled_sad = 1;
+ sf->prune_ref_frame_for_rect_partitions = 1;
+ sf->temporal_filter_search_method = NSTEP;
+ sf->tx_size_search_breakout = 1;
+ sf->use_square_partition_only = !boosted;
+ sf->early_term_interp_search_plane_rd = 1;
+ sf->cb_pred_filter_search = 1;
+ sf->trellis_opt_tx_rd.method = sf->optimize_coefficients
+ ? ENABLE_TRELLIS_OPT_TX_RD_RESIDUAL_MSE
+ : DISABLE_TRELLIS_OPT;
+ sf->trellis_opt_tx_rd.thresh = boosted ? 4.0 : 3.0;
+
+ sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
+ sf->comp_inter_joint_search_iter_level = 1;
+
+ // Reference masking is not supported in dynamic scaling mode.
+ sf->reference_masking = oxcf->resize_mode != RESIZE_DYNAMIC;
+
+ sf->rd_ml_partition.var_pruning = 1;
+ sf->rd_ml_partition.prune_rect_thresh[0] = -1;
+ sf->rd_ml_partition.prune_rect_thresh[1] = 350;
+ sf->rd_ml_partition.prune_rect_thresh[2] = 325;
+ sf->rd_ml_partition.prune_rect_thresh[3] = 250;
+
+ if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) {
+ sf->exhaustive_searches_thresh = (1 << 22);
+ } else {
+ sf->exhaustive_searches_thresh = INT_MAX;
+ }
+
+ for (i = 0; i < MAX_MESH_STEP; ++i) {
+ const int mesh_density_level = 0;
+ sf->mesh_patterns[i].range =
+ good_quality_mesh_patterns[mesh_density_level][i].range;
+ sf->mesh_patterns[i].interval =
+ good_quality_mesh_patterns[mesh_density_level][i].interval;
+ }
+
+ if (speed >= 1) {
+ sf->rd_ml_partition.var_pruning = !boosted;
+ sf->rd_ml_partition.prune_rect_thresh[1] = 225;
+ sf->rd_ml_partition.prune_rect_thresh[2] = 225;
+ sf->rd_ml_partition.prune_rect_thresh[3] = 225;
+
+ if (oxcf->pass == 2) {
+ TWO_PASS *const twopass = &cpi->twopass;
+ if ((twopass->fr_content_type == FC_GRAPHICS_ANIMATION) ||
+ vp9_internal_image_edge(cpi)) {
+ sf->use_square_partition_only = !boosted;
+ } else {
+ sf->use_square_partition_only = !frame_is_intra_only(cm);
+ }
+ } else {
+ sf->use_square_partition_only = !frame_is_intra_only(cm);
+ }
+
+ sf->allow_txfm_domain_distortion = 1;
+ sf->tx_domain_thresh = tx_dom_thresholds[(speed < 6) ? speed : 5];
+ sf->trellis_opt_tx_rd.method = sf->optimize_coefficients
+ ? ENABLE_TRELLIS_OPT_TX_RD_SRC_VAR
+ : DISABLE_TRELLIS_OPT;
+ sf->trellis_opt_tx_rd.thresh = qopt_thresholds[(speed < 6) ? speed : 5];
+ sf->less_rectangular_check = 1;
+ sf->use_rd_breakout = 1;
+ sf->adaptive_motion_search = 1;
+ sf->adaptive_rd_thresh = 2;
+ sf->mv.subpel_search_level = 1;
+ if (cpi->oxcf.content != VP9E_CONTENT_FILM) sf->mode_skip_start = 10;
+ sf->allow_acl = 0;
+
+ sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
+ if (cpi->oxcf.content != VP9E_CONTENT_FILM) {
+ sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
+ sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
+ }
+
+ sf->recode_tolerance_low = 15;
+ sf->recode_tolerance_high = 30;
+
+ sf->exhaustive_searches_thresh =
+ (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ? (1 << 23)
+ : INT_MAX;
+ sf->use_accurate_subpel_search = USE_4_TAPS;
+ }
+
+ if (speed >= 2) {
+ sf->rd_ml_partition.var_pruning = 0;
+ if (oxcf->vbr_corpus_complexity)
+ sf->recode_loop = ALLOW_RECODE_FIRST;
+ else
+ sf->recode_loop = ALLOW_RECODE_KFARFGF;
+
+ sf->tx_size_search_method =
+ frame_is_boosted(cpi) ? USE_FULL_RD : USE_LARGESTALL;
+
+ sf->mode_search_skip_flags =
+ (cm->frame_type == KEY_FRAME)
+ ? 0
+ : FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER |
+ FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR;
+ sf->disable_filter_search_var_thresh = 100;
+ sf->comp_inter_joint_search_iter_level = 2;
+ sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
+ sf->recode_tolerance_high = 45;
+ sf->enhanced_full_pixel_motion_search = 0;
+ sf->prune_ref_frame_for_rect_partitions = 0;
+ sf->rd_ml_partition.prune_rect_thresh[1] = -1;
+ sf->rd_ml_partition.prune_rect_thresh[2] = -1;
+ sf->rd_ml_partition.prune_rect_thresh[3] = -1;
+ sf->mv.subpel_search_level = 0;
+
+ if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) {
+ for (i = 0; i < MAX_MESH_STEP; ++i) {
+ int mesh_density_level = 1;
+ sf->mesh_patterns[i].range =
+ good_quality_mesh_patterns[mesh_density_level][i].range;
+ sf->mesh_patterns[i].interval =
+ good_quality_mesh_patterns[mesh_density_level][i].interval;
+ }
+ }
+
+ sf->use_accurate_subpel_search = USE_2_TAPS;
+ }
+
+ if (speed >= 3) {
+ sf->use_square_partition_only = !frame_is_intra_only(cm);
+ sf->tx_size_search_method =
+ frame_is_intra_only(cm) ? USE_FULL_RD : USE_LARGESTALL;
+ sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED;
+ sf->adaptive_pred_interp_filter = 0;
+ sf->adaptive_mode_search = 1;
+ sf->cb_partition_search = !boosted;
+ sf->cb_pred_filter_search = 2;
+ sf->alt_ref_search_fp = 1;
+ sf->recode_loop = ALLOW_RECODE_KFMAXBW;
+ sf->adaptive_rd_thresh = 3;
+ sf->mode_skip_start = 6;
+ sf->intra_y_mode_mask[TX_32X32] = INTRA_DC;
+ sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC;
+
+ if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) {
+ for (i = 0; i < MAX_MESH_STEP; ++i) {
+ int mesh_density_level = 2;
+ sf->mesh_patterns[i].range =
+ good_quality_mesh_patterns[mesh_density_level][i].range;
+ sf->mesh_patterns[i].interval =
+ good_quality_mesh_patterns[mesh_density_level][i].interval;
+ }
+ }
+ }
+
+ if (speed >= 4) {
+ sf->use_square_partition_only = 1;
+ sf->tx_size_search_method = USE_LARGESTALL;
+ sf->mv.search_method = BIGDIA;
+ sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
+ sf->adaptive_rd_thresh = 4;
+ if (cm->frame_type != KEY_FRAME)
+ sf->mode_search_skip_flags |= FLAG_EARLY_TERMINATE;
+ sf->disable_filter_search_var_thresh = 200;
+ sf->use_lp32x32fdct = 1;
+ sf->use_fast_coef_updates = ONE_LOOP_REDUCED;
+ sf->use_fast_coef_costing = 1;
+ sf->motion_field_mode_search = !boosted;
+ }
+
+ if (speed >= 5) {
+ sf->optimize_coefficients = 0;
+ sf->mv.search_method = HEX;
+ sf->disable_filter_search_var_thresh = 500;
+ for (i = 0; i < TX_SIZES; ++i) {
+ sf->intra_y_mode_mask[i] = INTRA_DC;
+ sf->intra_uv_mode_mask[i] = INTRA_DC;
+ }
+ sf->mv.reduce_first_step_size = 1;
+ sf->simple_model_rd_from_var = 1;
+ }
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+static void set_rt_speed_feature_framesize_dependent(VP9_COMP *cpi,
+ SPEED_FEATURES *sf,
+ int speed) {
+ VP9_COMMON *const cm = &cpi->common;
+
+ if (speed >= 1) {
+ if (VPXMIN(cm->width, cm->height) >= 720) {
+ sf->disable_split_mask =
+ cm->show_frame ? DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT;
+ } else {
+ sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
+ }
+ }
+
+ if (speed >= 2) {
+ if (VPXMIN(cm->width, cm->height) >= 720) {
+ sf->disable_split_mask =
+ cm->show_frame ? DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT;
+ } else {
+ sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY;
+ }
+ }
+
+ if (speed >= 5) {
+ sf->partition_search_breakout_thr.rate = 200;
+ if (VPXMIN(cm->width, cm->height) >= 720) {
+ sf->partition_search_breakout_thr.dist = (1 << 25);
+ } else {
+ sf->partition_search_breakout_thr.dist = (1 << 23);
+ }
+ }
+
+ if (speed >= 7) {
+ sf->encode_breakout_thresh =
+ (VPXMIN(cm->width, cm->height) >= 720) ? 800 : 300;
+ }
+}
+
+static void set_rt_speed_feature_framesize_independent(
+ VP9_COMP *cpi, SPEED_FEATURES *sf, int speed, vp9e_tune_content content) {
+ VP9_COMMON *const cm = &cpi->common;
+ SVC *const svc = &cpi->svc;
+ const int is_keyframe = cm->frame_type == KEY_FRAME;
+ const int frames_since_key = is_keyframe ? 0 : cpi->rc.frames_since_key;
+ sf->static_segmentation = 0;
+ sf->adaptive_rd_thresh = 1;
+ sf->adaptive_rd_thresh_row_mt = 0;
+ sf->use_fast_coef_costing = 1;
+ sf->exhaustive_searches_thresh = INT_MAX;
+ sf->allow_acl = 0;
+ sf->copy_partition_flag = 0;
+ sf->use_source_sad = 0;
+ sf->use_simple_block_yrd = 0;
+ sf->adapt_partition_source_sad = 0;
+ sf->use_altref_onepass = 0;
+ sf->use_compound_nonrd_pickmode = 0;
+ sf->nonrd_keyframe = 0;
+ sf->svc_use_lowres_part = 0;
+ sf->overshoot_detection_cbr_rt = NO_DETECTION;
+ sf->disable_16x16part_nonkey = 0;
+ sf->disable_golden_ref = 0;
+ sf->enable_tpl_model = 0;
+ sf->enhanced_full_pixel_motion_search = 0;
+ sf->use_accurate_subpel_search = USE_2_TAPS;
+ sf->nonrd_use_ml_partition = 0;
+ sf->variance_part_thresh_mult = 1;
+ sf->cb_pred_filter_search = 0;
+ sf->force_smooth_interpol = 0;
+ sf->rt_intra_dc_only_low_content = 0;
+ sf->mv.enable_adaptive_subpel_force_stop = 0;
+
+ if (speed >= 1) {
+ sf->allow_txfm_domain_distortion = 1;
+ sf->tx_domain_thresh = 0.0;
+ sf->trellis_opt_tx_rd.method = DISABLE_TRELLIS_OPT;
+ sf->trellis_opt_tx_rd.thresh = 0.0;
+ sf->use_square_partition_only = !frame_is_intra_only(cm);
+ sf->less_rectangular_check = 1;
+ sf->tx_size_search_method =
+ frame_is_intra_only(cm) ? USE_FULL_RD : USE_LARGESTALL;
+
+ sf->use_rd_breakout = 1;
+
+ sf->adaptive_motion_search = 1;
+ sf->adaptive_pred_interp_filter = 1;
+ sf->mv.auto_mv_step_size = 1;
+ sf->adaptive_rd_thresh = 2;
+ sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
+ sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V;
+ sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
+ }
+
+ if (speed >= 2) {
+ sf->mode_search_skip_flags =
+ (cm->frame_type == KEY_FRAME)
+ ? 0
+ : FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER |
+ FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR;
+ sf->adaptive_pred_interp_filter = 2;
+
+ // Reference masking only enabled for 1 spatial layer, and if none of the
+ // references have been scaled. The latter condition needs to be checked
+ // for external or internal dynamic resize.
+ sf->reference_masking = (svc->number_spatial_layers == 1);
+ if (sf->reference_masking == 1 &&
+ (cpi->external_resize == 1 ||
+ cpi->oxcf.resize_mode == RESIZE_DYNAMIC)) {
+ MV_REFERENCE_FRAME ref_frame;
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
+ if (yv12 != NULL &&
+ (cpi->ref_frame_flags & ref_frame_to_flag(ref_frame))) {
+ const struct scale_factors *const scale_fac =
+ &cm->frame_refs[ref_frame - 1].sf;
+ if (vp9_is_scaled(scale_fac)) sf->reference_masking = 0;
+ }
+ }
+ }
+
+ sf->disable_filter_search_var_thresh = 50;
+ sf->comp_inter_joint_search_iter_level = 2;
+ sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
+ sf->lf_motion_threshold = LOW_MOTION_THRESHOLD;
+ sf->adjust_partitioning_from_last_frame = 1;
+ sf->last_partitioning_redo_frequency = 3;
+ sf->use_lp32x32fdct = 1;
+ sf->mode_skip_start = 11;
+ sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
+ }
+
+ if (speed >= 3) {
+ sf->use_square_partition_only = 1;
+ sf->disable_filter_search_var_thresh = 100;
+ sf->use_uv_intra_rd_estimate = 1;
+ sf->skip_encode_sb = 1;
+ sf->mv.subpel_search_level = 0;
+ sf->adaptive_rd_thresh = 4;
+ sf->mode_skip_start = 6;
+ sf->allow_skip_recode = 0;
+ sf->optimize_coefficients = 0;
+ sf->disable_split_mask = DISABLE_ALL_SPLIT;
+ sf->lpf_pick = LPF_PICK_FROM_Q;
+ }
+
+ if (speed >= 4) {
+ int i;
+ if (cpi->oxcf.rc_mode == VPX_VBR && cpi->oxcf.lag_in_frames > 0)
+ sf->use_altref_onepass = 1;
+ sf->mv.subpel_force_stop = QUARTER_PEL;
+ for (i = 0; i < TX_SIZES; i++) {
+ sf->intra_y_mode_mask[i] = INTRA_DC_H_V;
+ sf->intra_uv_mode_mask[i] = INTRA_DC;
+ }
+ sf->intra_y_mode_mask[TX_32X32] = INTRA_DC;
+ sf->frame_parameter_update = 0;
+ sf->mv.search_method = FAST_HEX;
+ sf->allow_skip_recode = 0;
+ sf->max_intra_bsize = BLOCK_32X32;
+ sf->use_fast_coef_costing = 0;
+ sf->use_quant_fp = !is_keyframe;
+ sf->inter_mode_mask[BLOCK_32X32] = INTER_NEAREST_NEW_ZERO;
+ sf->inter_mode_mask[BLOCK_32X64] = INTER_NEAREST_NEW_ZERO;
+ sf->inter_mode_mask[BLOCK_64X32] = INTER_NEAREST_NEW_ZERO;
+ sf->inter_mode_mask[BLOCK_64X64] = INTER_NEAREST_NEW_ZERO;
+ sf->adaptive_rd_thresh = 2;
+ sf->use_fast_coef_updates = is_keyframe ? TWO_LOOP : ONE_LOOP_REDUCED;
+ sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH;
+ sf->tx_size_search_method = is_keyframe ? USE_LARGESTALL : USE_TX_8X8;
+ sf->partition_search_type = VAR_BASED_PARTITION;
+ }
+
+ if (speed >= 5) {
+ sf->use_altref_onepass = 0;
+ sf->use_quant_fp = !is_keyframe;
+ sf->auto_min_max_partition_size =
+ is_keyframe ? RELAXED_NEIGHBORING_MIN_MAX : STRICT_NEIGHBORING_MIN_MAX;
+ sf->default_max_partition_size = BLOCK_32X32;
+ sf->default_min_partition_size = BLOCK_8X8;
+ sf->force_frame_boost =
+ is_keyframe ||
+ (frames_since_key % (sf->last_partitioning_redo_frequency << 1) == 1);
+ sf->max_delta_qindex = is_keyframe ? 20 : 15;
+ sf->partition_search_type = REFERENCE_PARTITION;
+ if (cpi->oxcf.rc_mode == VPX_VBR && cpi->oxcf.lag_in_frames > 0 &&
+ cpi->rc.is_src_frame_alt_ref) {
+ sf->partition_search_type = VAR_BASED_PARTITION;
+ }
+ sf->use_nonrd_pick_mode = 1;
+ sf->allow_skip_recode = 0;
+ sf->inter_mode_mask[BLOCK_32X32] = INTER_NEAREST_NEW_ZERO;
+ sf->inter_mode_mask[BLOCK_32X64] = INTER_NEAREST_NEW_ZERO;
+ sf->inter_mode_mask[BLOCK_64X32] = INTER_NEAREST_NEW_ZERO;
+ sf->inter_mode_mask[BLOCK_64X64] = INTER_NEAREST_NEW_ZERO;
+ sf->adaptive_rd_thresh = 2;
+ // This feature is only enabled when partition search is disabled.
+ sf->reuse_inter_pred_sby = 1;
+ sf->coeff_prob_appx_step = 4;
+ sf->use_fast_coef_updates = is_keyframe ? TWO_LOOP : ONE_LOOP_REDUCED;
+ sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH;
+ sf->tx_size_search_method = is_keyframe ? USE_LARGESTALL : USE_TX_8X8;
+ sf->simple_model_rd_from_var = 1;
+ if (cpi->oxcf.rc_mode == VPX_VBR) sf->mv.search_method = NSTEP;
+
+ if (!is_keyframe) {
+ int i;
+ if (content == VP9E_CONTENT_SCREEN) {
+ for (i = 0; i < BLOCK_SIZES; ++i)
+ if (i >= BLOCK_32X32)
+ sf->intra_y_mode_bsize_mask[i] = INTRA_DC_H_V;
+ else
+ sf->intra_y_mode_bsize_mask[i] = INTRA_DC_TM_H_V;
+ } else {
+ for (i = 0; i < BLOCK_SIZES; ++i)
+ if (i > BLOCK_16X16)
+ sf->intra_y_mode_bsize_mask[i] = INTRA_DC;
+ else
+ // Use H and V intra mode for block sizes <= 16X16.
+ sf->intra_y_mode_bsize_mask[i] = INTRA_DC_H_V;
+ }
+ }
+ if (content == VP9E_CONTENT_SCREEN) {
+ sf->short_circuit_flat_blocks = 1;
+ }
+ if (cpi->oxcf.rc_mode == VPX_CBR &&
+ cpi->oxcf.content != VP9E_CONTENT_SCREEN) {
+ sf->limit_newmv_early_exit = 1;
+ if (!cpi->use_svc) sf->bias_golden = 1;
+ }
+ // Keep nonrd_keyframe = 1 for non-base spatial layers to prevent
+      // an increase in encoding time.
+ if (cpi->use_svc && svc->spatial_layer_id > 0) sf->nonrd_keyframe = 1;
+ if (cm->frame_type != KEY_FRAME && cpi->resize_state == ORIG &&
+ cpi->oxcf.rc_mode == VPX_CBR && !cpi->rc.disable_overshoot_maxq_cbr) {
+ if (cm->width * cm->height <= 352 * 288 && !cpi->use_svc &&
+ cpi->oxcf.content != VP9E_CONTENT_SCREEN)
+ sf->overshoot_detection_cbr_rt = RE_ENCODE_MAXQ;
+ else
+ sf->overshoot_detection_cbr_rt = FAST_DETECTION_MAXQ;
+ }
+ if (cpi->oxcf.rc_mode == VPX_VBR && cpi->oxcf.lag_in_frames > 0 &&
+ cm->width <= 1280 && cm->height <= 720) {
+ sf->use_altref_onepass = 1;
+ sf->use_compound_nonrd_pickmode = 1;
+ }
+ if (cm->width * cm->height > 1280 * 720) sf->cb_pred_filter_search = 2;
+ if (!cpi->external_resize) sf->use_source_sad = 1;
+ }
+
+ if (speed >= 6) {
+ if (cpi->oxcf.rc_mode == VPX_VBR && cpi->oxcf.lag_in_frames > 0) {
+ sf->use_altref_onepass = 1;
+ sf->use_compound_nonrd_pickmode = 1;
+ }
+ sf->partition_search_type = VAR_BASED_PARTITION;
+ sf->mv.search_method = NSTEP;
+ sf->mv.reduce_first_step_size = 1;
+ sf->skip_encode_sb = 0;
+
+ if (sf->use_source_sad) {
+ sf->adapt_partition_source_sad = 1;
+ sf->adapt_partition_thresh =
+ (cm->width * cm->height <= 640 * 360) ? 40000 : 60000;
+ if (cpi->content_state_sb_fd == NULL &&
+ (!cpi->use_svc ||
+ svc->spatial_layer_id == svc->number_spatial_layers - 1)) {
+ CHECK_MEM_ERROR(&cm->error, cpi->content_state_sb_fd,
+ (uint8_t *)vpx_calloc(
+ (cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1),
+ sizeof(uint8_t)));
+ }
+ }
+ if (cpi->oxcf.rc_mode == VPX_CBR && content != VP9E_CONTENT_SCREEN) {
+ // Enable short circuit for low temporal variance.
+ sf->short_circuit_low_temp_var = 1;
+ }
+ if (svc->temporal_layer_id > 0) {
+ sf->adaptive_rd_thresh = 4;
+ sf->limit_newmv_early_exit = 0;
+ sf->base_mv_aggressive = 1;
+ }
+ if (cm->frame_type != KEY_FRAME && cpi->resize_state == ORIG &&
+ cpi->oxcf.rc_mode == VPX_CBR && !cpi->rc.disable_overshoot_maxq_cbr)
+ sf->overshoot_detection_cbr_rt = FAST_DETECTION_MAXQ;
+ }
+
+ if (speed >= 7) {
+ sf->adapt_partition_source_sad = 0;
+ sf->adaptive_rd_thresh = 3;
+ sf->mv.search_method = FAST_DIAMOND;
+ sf->mv.fullpel_search_step_param = 10;
+ // For SVC: use better mv search on base temporal layer, and only
+ // on base spatial layer if highest resolution is above 640x360.
+ if (svc->number_temporal_layers > 2 && svc->temporal_layer_id == 0 &&
+ (svc->spatial_layer_id == 0 ||
+ cpi->oxcf.width * cpi->oxcf.height <= 640 * 360)) {
+ sf->mv.search_method = NSTEP;
+ sf->mv.fullpel_search_step_param = 6;
+ }
+ if (svc->temporal_layer_id > 0 || svc->spatial_layer_id > 1) {
+ sf->use_simple_block_yrd = 1;
+ if (svc->non_reference_frame)
+ sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED_EVENMORE;
+ }
+ if (cpi->use_svc && cpi->row_mt && cpi->oxcf.max_threads > 1)
+ sf->adaptive_rd_thresh_row_mt = 1;
+ // Enable partition copy. For SVC only enabled for top spatial resolution
+ // layer.
+ cpi->max_copied_frame = 0;
+ if (!cpi->last_frame_dropped && cpi->resize_state == ORIG &&
+ !cpi->external_resize &&
+ (!cpi->use_svc ||
+ (svc->spatial_layer_id == svc->number_spatial_layers - 1 &&
+ !svc->last_layer_dropped[svc->number_spatial_layers - 1]))) {
+ sf->copy_partition_flag = 1;
+ cpi->max_copied_frame = 2;
+      // Frames in the top temporal enhancement layer (when the number of
+      // temporal layers is > 1) are non-reference frames, so use a large/max
+      // value for max_copied_frame.
+ if (svc->number_temporal_layers > 1 &&
+ svc->temporal_layer_id == svc->number_temporal_layers - 1)
+ cpi->max_copied_frame = 255;
+ }
+ // For SVC: enable use of lower resolution partition for higher resolution,
+ // only for 3 spatial layers and when config/top resolution is above VGA.
+ // Enable only for non-base temporal layer frames.
+ if (cpi->use_svc && svc->use_partition_reuse &&
+ svc->number_spatial_layers == 3 && svc->temporal_layer_id > 0 &&
+ cpi->oxcf.width * cpi->oxcf.height > 640 * 480)
+ sf->svc_use_lowres_part = 1;
+    // For SVC, when golden is used as a second temporal reference: to avoid
+    // an increase in encode time, only use this feature on the base temporal
+    // layer (i.e. remove the golden flag from frame_flags for
+    // temporal_layer_id > 0).
+ if (cpi->use_svc && svc->use_gf_temporal_ref_current_layer &&
+ svc->temporal_layer_id > 0)
+ cpi->ref_frame_flags &= (~VP9_GOLD_FLAG);
+ if (cm->width * cm->height > 640 * 480) sf->cb_pred_filter_search = 2;
+ }
+
+ if (speed >= 8) {
+ sf->adaptive_rd_thresh = 4;
+ sf->skip_encode_sb = 1;
+ if (cpi->svc.number_spatial_layers > 1 && !cpi->svc.simulcast_mode)
+ sf->nonrd_keyframe = 0;
+ else
+ sf->nonrd_keyframe = 1;
+ if (!cpi->use_svc) cpi->max_copied_frame = 4;
+ if (cpi->row_mt && cpi->oxcf.max_threads > 1)
+ sf->adaptive_rd_thresh_row_mt = 1;
+ // Enable ML based partition for low res.
+ if (!frame_is_intra_only(cm) && cm->width * cm->height <= 352 * 288) {
+ sf->nonrd_use_ml_partition = 1;
+ }
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cpi->Source->flags & YV12_FLAG_HIGHBITDEPTH)
+ sf->nonrd_use_ml_partition = 0;
+#endif
+ if (content == VP9E_CONTENT_SCREEN) sf->mv.subpel_force_stop = HALF_PEL;
+ sf->rt_intra_dc_only_low_content = 1;
+ if (!cpi->use_svc && cpi->oxcf.rc_mode == VPX_CBR &&
+ content != VP9E_CONTENT_SCREEN) {
+ // More aggressive short circuit for speed 8.
+ sf->short_circuit_low_temp_var = 3;
+      // Use level 2 for noisy cases as there is a regression in some
+ // noisy clips with level 3.
+ if (cpi->noise_estimate.enabled && cm->width >= 1280 &&
+ cm->height >= 720) {
+ NOISE_LEVEL noise_level =
+ vp9_noise_estimate_extract_level(&cpi->noise_estimate);
+ if (noise_level >= kMedium) sf->short_circuit_low_temp_var = 2;
+ }
+ // Since the short_circuit_low_temp_var is used, reduce the
+ // adaptive_rd_thresh level.
+ if (cm->width * cm->height > 352 * 288)
+ sf->adaptive_rd_thresh = 1;
+ else
+ sf->adaptive_rd_thresh = 2;
+ }
+ sf->limit_newmv_early_exit = 0;
+ sf->use_simple_block_yrd = 1;
+ if (cm->width * cm->height > 352 * 288) sf->cb_pred_filter_search = 2;
+ }
+
+ if (speed >= 9) {
+ // Only keep INTRA_DC mode for speed 9.
+ if (!is_keyframe) {
+ int i = 0;
+ for (i = 0; i < BLOCK_SIZES; ++i)
+ sf->intra_y_mode_bsize_mask[i] = INTRA_DC;
+ }
+ sf->cb_pred_filter_search = 2;
+ sf->mv.enable_adaptive_subpel_force_stop = 1;
+ sf->mv.adapt_subpel_force_stop.mv_thresh = 1;
+ sf->mv.adapt_subpel_force_stop.force_stop_below = QUARTER_PEL;
+ sf->mv.adapt_subpel_force_stop.force_stop_above = HALF_PEL;
+ // Disable partition blocks below 16x16, except for low-resolutions.
+ if (cm->frame_type != KEY_FRAME && cm->width >= 320 && cm->height >= 240)
+ sf->disable_16x16part_nonkey = 1;
+ // Allow for disabling GOLDEN reference, for CBR mode.
+ if (cpi->oxcf.rc_mode == VPX_CBR) sf->disable_golden_ref = 1;
+ if (cpi->rc.avg_frame_low_motion < 70) sf->default_interp_filter = BILINEAR;
+ if (cm->width * cm->height >= 640 * 360) sf->variance_part_thresh_mult = 2;
+ }
+
+ // Disable split to 8x8 for low-resolution at very high Q.
+ // For variance partition (speed >= 6). Ignore the first few frames
+ // as avg_frame_qindex starts at max_q (worst_quality).
+ if (cm->frame_type != KEY_FRAME && cm->width * cm->height <= 320 * 240 &&
+ sf->partition_search_type == VAR_BASED_PARTITION &&
+ cpi->rc.avg_frame_qindex[INTER_FRAME] > 208 &&
+ cpi->common.current_video_frame > 8)
+ sf->disable_16x16part_nonkey = 1;
+
+ if (sf->nonrd_use_ml_partition)
+ sf->partition_search_type = ML_BASED_PARTITION;
+
+ if (sf->use_altref_onepass) {
+ if (cpi->rc.is_src_frame_alt_ref && cm->frame_type != KEY_FRAME) {
+ sf->partition_search_type = FIXED_PARTITION;
+ sf->always_this_block_size = BLOCK_64X64;
+ }
+ if (cpi->count_arf_frame_usage == NULL) {
+ CHECK_MEM_ERROR(
+ &cm->error, cpi->count_arf_frame_usage,
+ (uint8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1),
+ sizeof(*cpi->count_arf_frame_usage)));
+ }
+ if (cpi->count_lastgolden_frame_usage == NULL)
+ CHECK_MEM_ERROR(
+ &cm->error, cpi->count_lastgolden_frame_usage,
+ (uint8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1),
+ sizeof(*cpi->count_lastgolden_frame_usage)));
+ }
+ if (svc->previous_frame_is_intra_only) {
+ sf->partition_search_type = FIXED_PARTITION;
+ sf->always_this_block_size = BLOCK_64X64;
+ }
+ // Special case for screen content: increase motion search on base spatial
+ // layer when high motion is detected or previous SL0 frame was dropped.
+ if (cpi->oxcf.content == VP9E_CONTENT_SCREEN && cpi->oxcf.speed >= 5 &&
+ (svc->high_num_blocks_with_motion || svc->last_layer_dropped[0])) {
+ sf->mv.search_method = NSTEP;
+ // TODO(marpan/jianj): Tune this setting for screensharing. For now use
+ // small step_param for all spatial layers.
+ sf->mv.fullpel_search_step_param = 2;
+ }
+ // TODO(marpan): There is regression for aq-mode=3 speed <= 4, force it
+ // off for now.
+ if (speed <= 3 && cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+ cpi->oxcf.aq_mode = 0;
+}
+
+void vp9_set_speed_features_framesize_dependent(VP9_COMP *cpi, int speed) {
+ SPEED_FEATURES *const sf = &cpi->sf;
+ const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+ RD_OPT *const rd = &cpi->rd;
+ int i;
+
+ // best quality defaults
+  // Some speed-up features are enabled even for best quality, as they have
+  // minimal impact on quality.
+ sf->partition_search_breakout_thr.dist = (1 << 19);
+ sf->partition_search_breakout_thr.rate = 80;
+ sf->rd_ml_partition.search_early_termination = 0;
+ sf->rd_ml_partition.search_breakout = 0;
+
+ if (oxcf->mode == REALTIME)
+ set_rt_speed_feature_framesize_dependent(cpi, sf, speed);
+#if !CONFIG_REALTIME_ONLY
+ else if (oxcf->mode == GOOD)
+ set_good_speed_feature_framesize_dependent(cpi, sf, speed);
+#endif
+
+ if (sf->disable_split_mask == DISABLE_ALL_SPLIT) {
+ sf->adaptive_pred_interp_filter = 0;
+ }
+
+ if (cpi->encode_breakout && oxcf->mode == REALTIME &&
+ sf->encode_breakout_thresh > cpi->encode_breakout) {
+ cpi->encode_breakout = sf->encode_breakout_thresh;
+ }
+
+ // Check for masked out split cases.
+ for (i = 0; i < MAX_REFS; ++i) {
+ if (sf->disable_split_mask & (1 << i)) {
+ rd->thresh_mult_sub8x8[i] = INT_MAX;
+ }
+ }
+
+  // With row-based multi-threading, the following speed features have to be
+  // disabled to guarantee that bitstreams encoded with a single thread and
+  // with multiple threads match.
+  // adaptive_rd_thresh can still be used in realtime when
+  // adaptive_rd_thresh_row_mt is enabled, since it is then defined per row
+  // for the non-rd pickmode.
+ if (!sf->adaptive_rd_thresh_row_mt && cpi->row_mt_bit_exact &&
+ oxcf->max_threads > 1)
+ sf->adaptive_rd_thresh = 0;
+}
+
+void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi, int speed) {
+ SPEED_FEATURES *const sf = &cpi->sf;
+#if !CONFIG_REALTIME_ONLY
+ VP9_COMMON *const cm = &cpi->common;
+#endif
+ MACROBLOCK *const x = &cpi->td.mb;
+ const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+ int i;
+
+ // best quality defaults
+ sf->frame_parameter_update = 1;
+ sf->mv.search_method = NSTEP;
+ sf->recode_loop = ALLOW_RECODE_FIRST;
+ sf->mv.subpel_search_method = SUBPEL_TREE;
+ sf->mv.subpel_search_level = 2;
+ sf->mv.subpel_force_stop = EIGHTH_PEL;
+ sf->optimize_coefficients = !is_lossless_requested(&cpi->oxcf);
+ sf->mv.reduce_first_step_size = 0;
+ sf->coeff_prob_appx_step = 1;
+ sf->mv.auto_mv_step_size = 0;
+ sf->mv.fullpel_search_step_param = 6;
+ sf->mv.use_downsampled_sad = 0;
+ sf->comp_inter_joint_search_iter_level = 0;
+ sf->tx_size_search_method = USE_FULL_RD;
+ sf->use_lp32x32fdct = 0;
+ sf->adaptive_motion_search = 0;
+ sf->enhanced_full_pixel_motion_search = 1;
+ sf->adaptive_pred_interp_filter = 0;
+ sf->adaptive_mode_search = 0;
+ sf->prune_single_mode_based_on_mv_diff_mode_rate = 0;
+ sf->cb_pred_filter_search = 0;
+ sf->early_term_interp_search_plane_rd = 0;
+ sf->cb_partition_search = 0;
+ sf->motion_field_mode_search = 0;
+ sf->alt_ref_search_fp = 0;
+ sf->use_quant_fp = 0;
+ sf->reference_masking = 0;
+ sf->partition_search_type = SEARCH_PARTITION;
+ sf->less_rectangular_check = 0;
+ sf->use_square_partition_only = 0;
+ sf->use_square_only_thresh_high = BLOCK_SIZES;
+ sf->use_square_only_thresh_low = BLOCK_4X4;
+ sf->auto_min_max_partition_size = NOT_IN_USE;
+ sf->rd_auto_partition_min_limit = BLOCK_4X4;
+ sf->default_max_partition_size = BLOCK_64X64;
+ sf->default_min_partition_size = BLOCK_4X4;
+ sf->adjust_partitioning_from_last_frame = 0;
+ sf->last_partitioning_redo_frequency = 4;
+ sf->disable_split_mask = 0;
+ sf->mode_search_skip_flags = 0;
+ sf->force_frame_boost = 0;
+ sf->max_delta_qindex = 0;
+ sf->disable_filter_search_var_thresh = 0;
+ sf->adaptive_interp_filter_search = 0;
+ sf->allow_txfm_domain_distortion = 0;
+ sf->tx_domain_thresh = 99.0;
+ sf->trellis_opt_tx_rd.method =
+ sf->optimize_coefficients ? ENABLE_TRELLIS_OPT : DISABLE_TRELLIS_OPT;
+ sf->trellis_opt_tx_rd.thresh = 99.0;
+ sf->allow_acl = 1;
+ sf->enable_tpl_model = oxcf->enable_tpl_model;
+ sf->prune_ref_frame_for_rect_partitions = 0;
+ sf->temporal_filter_search_method = MESH;
+ sf->allow_skip_txfm_ac_dc = 0;
+
+ for (i = 0; i < TX_SIZES; i++) {
+ sf->intra_y_mode_mask[i] = INTRA_ALL;
+ sf->intra_uv_mode_mask[i] = INTRA_ALL;
+ }
+ sf->use_rd_breakout = 0;
+ sf->skip_encode_sb = 0;
+ sf->use_uv_intra_rd_estimate = 0;
+ sf->allow_skip_recode = 0;
+ sf->lpf_pick = LPF_PICK_FROM_FULL_IMAGE;
+ sf->use_fast_coef_updates = TWO_LOOP;
+ sf->use_fast_coef_costing = 0;
+ sf->mode_skip_start = MAX_MODES; // Mode index at which the mode skip mask is set
+ sf->schedule_mode_search = 0;
+ sf->use_nonrd_pick_mode = 0;
+ for (i = 0; i < BLOCK_SIZES; ++i) sf->inter_mode_mask[i] = INTER_ALL;
+ sf->max_intra_bsize = BLOCK_64X64;
+ sf->reuse_inter_pred_sby = 0;
+ // This setting only takes effect when partition_search_type is set
+ // to FIXED_PARTITION.
+ sf->always_this_block_size = BLOCK_16X16;
+ sf->search_type_check_frequency = 50;
+ sf->encode_breakout_thresh = 0;
+ // Recode loop tolerance %.
+ sf->recode_tolerance_low = 12;
+ sf->recode_tolerance_high = 25;
+ sf->default_interp_filter = SWITCHABLE;
+ sf->simple_model_rd_from_var = 0;
+ sf->short_circuit_flat_blocks = 0;
+ sf->short_circuit_low_temp_var = 0;
+ sf->limit_newmv_early_exit = 0;
+ sf->bias_golden = 0;
+ sf->base_mv_aggressive = 0;
+ sf->rd_ml_partition.prune_rect_thresh[0] = -1;
+ sf->rd_ml_partition.prune_rect_thresh[1] = -1;
+ sf->rd_ml_partition.prune_rect_thresh[2] = -1;
+ sf->rd_ml_partition.prune_rect_thresh[3] = -1;
+ sf->rd_ml_partition.var_pruning = 0;
+ sf->use_accurate_subpel_search = USE_8_TAPS;
+
+ // Some speed-up features are enabled even for best quality, as they have
+ // minimal impact on quality.
+ sf->adaptive_rd_thresh = 1;
+ sf->tx_size_search_breakout = 1;
+ sf->tx_size_search_depth = 2;
+
+ sf->exhaustive_searches_thresh =
+ (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ? (1 << 20)
+ : INT_MAX;
+ {
+ const int mesh_density_level =
+ (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ? 0 : 1;
+ for (i = 0; i < MAX_MESH_STEP; ++i) {
+ sf->mesh_patterns[i].range =
+ best_quality_mesh_pattern[mesh_density_level][i].range;
+ sf->mesh_patterns[i].interval =
+ best_quality_mesh_pattern[mesh_density_level][i].interval;
+ }
+ }
+
+ if (oxcf->mode == REALTIME)
+ set_rt_speed_feature_framesize_independent(cpi, sf, speed, oxcf->content);
+#if !CONFIG_REALTIME_ONLY
+ else if (oxcf->mode == GOOD)
+ set_good_speed_feature_framesize_independent(cpi, cm, sf, speed);
+#endif
+
+ cpi->diamond_search_sad = vp9_diamond_search_sad;
+
+ // Slow quant, dct and trellis not worthwhile for first pass
+ // so make sure they are always turned off.
+ if (oxcf->pass == 1) sf->optimize_coefficients = 0;
+
+ // No recode for 1 pass.
+ if (oxcf->pass == 0) {
+ sf->recode_loop = DISALLOW_RECODE;
+ sf->optimize_coefficients = 0;
+ }
+
+ if (sf->mv.subpel_force_stop == FULL_PEL) {
+ // Whole pel only
+ cpi->find_fractional_mv_step = vp9_skip_sub_pixel_tree;
+ } else if (sf->mv.subpel_search_method == SUBPEL_TREE) {
+ cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree;
+ } else if (sf->mv.subpel_search_method == SUBPEL_TREE_PRUNED) {
+ cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree_pruned;
+ } else if (sf->mv.subpel_search_method == SUBPEL_TREE_PRUNED_MORE) {
+ cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree_pruned_more;
+ } else if (sf->mv.subpel_search_method == SUBPEL_TREE_PRUNED_EVENMORE) {
+ cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree_pruned_evenmore;
+ }
+
+ // This is only used in motion vector unit test.
+ if (cpi->oxcf.motion_vector_unit_test == 1)
+ cpi->find_fractional_mv_step = vp9_return_max_sub_pixel_mv;
+ else if (cpi->oxcf.motion_vector_unit_test == 2)
+ cpi->find_fractional_mv_step = vp9_return_min_sub_pixel_mv;
+
+ x->optimize = sf->optimize_coefficients == 1 && oxcf->pass != 1;
+
+ x->min_partition_size = sf->default_min_partition_size;
+ x->max_partition_size = sf->default_max_partition_size;
+
+ if (!cpi->oxcf.frame_periodic_boost) {
+ sf->max_delta_qindex = 0;
+ }
+
+ // With row based multi-threading, the following speed features
+ // have to be disabled to guarantee that bitstreams encoded with single thread
+ // and multiple threads match.
+ // It can be used in realtime when adaptive_rd_thresh_row_mt is enabled since
+ // adaptive_rd_thresh is defined per-row for non-rd pickmode.
+ if (!sf->adaptive_rd_thresh_row_mt && cpi->row_mt_bit_exact &&
+ oxcf->max_threads > 1)
+ sf->adaptive_rd_thresh = 0;
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_speed_features.h b/media/libvpx/libvpx/vp9/encoder/vp9_speed_features.h
new file mode 100644
index 0000000000..70c61fe00d
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_speed_features.h
@@ -0,0 +1,674 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_VP9_SPEED_FEATURES_H_
+#define VPX_VP9_ENCODER_VP9_SPEED_FEATURES_H_
+
+#include "vp9/common/vp9_enums.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+enum {
+ INTRA_ALL = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED) | (1 << D45_PRED) |
+ (1 << D135_PRED) | (1 << D117_PRED) | (1 << D153_PRED) |
+ (1 << D207_PRED) | (1 << D63_PRED) | (1 << TM_PRED),
+ INTRA_DC = (1 << DC_PRED),
+ INTRA_DC_TM = (1 << DC_PRED) | (1 << TM_PRED),
+ INTRA_DC_H_V = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED),
+ INTRA_DC_TM_H_V =
+ (1 << DC_PRED) | (1 << TM_PRED) | (1 << V_PRED) | (1 << H_PRED)
+};
+
+enum {
+ INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV) | (1 << NEWMV),
+ INTER_NEAREST = (1 << NEARESTMV),
+ INTER_NEAREST_NEW = (1 << NEARESTMV) | (1 << NEWMV),
+ INTER_NEAREST_ZERO = (1 << NEARESTMV) | (1 << ZEROMV),
+ INTER_NEAREST_NEW_ZERO = (1 << NEARESTMV) | (1 << ZEROMV) | (1 << NEWMV),
+ INTER_NEAREST_NEAR_NEW = (1 << NEARESTMV) | (1 << NEARMV) | (1 << NEWMV),
+ INTER_NEAREST_NEAR_ZERO = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV),
+};
+
+enum {
+ DISABLE_ALL_INTER_SPLIT = (1 << THR_COMP_GA) | (1 << THR_COMP_LA) |
+ (1 << THR_ALTR) | (1 << THR_GOLD) | (1 << THR_LAST),
+
+ DISABLE_ALL_SPLIT = (1 << THR_INTRA) | DISABLE_ALL_INTER_SPLIT,
+
+ DISABLE_COMPOUND_SPLIT = (1 << THR_COMP_GA) | (1 << THR_COMP_LA),
+
+ LAST_AND_INTRA_SPLIT_ONLY = (1 << THR_COMP_GA) | (1 << THR_COMP_LA) |
+ (1 << THR_ALTR) | (1 << THR_GOLD)
+};
+
+typedef enum {
+ DIAMOND = 0,
+ NSTEP = 1,
+ HEX = 2,
+ BIGDIA = 3,
+ SQUARE = 4,
+ FAST_HEX = 5,
+ FAST_DIAMOND = 6,
+ MESH = 7
+} SEARCH_METHODS;
+
+typedef enum {
+ // No recode.
+ DISALLOW_RECODE = 0,
+ // Allow recode for KF and exceeding maximum frame bandwidth.
+ ALLOW_RECODE_KFMAXBW = 1,
+ // Allow recode only for KF/ARF/GF frames.
+ ALLOW_RECODE_KFARFGF = 2,
+ // Allow recode for ARF/GF/KF and first normal frame in each group.
+ ALLOW_RECODE_FIRST = 3,
+ // Allow recode for all frames based on bitrate constraints.
+ ALLOW_RECODE = 4,
+} RECODE_LOOP_TYPE;
+
+typedef enum {
+ SUBPEL_TREE = 0,
+ SUBPEL_TREE_PRUNED = 1, // Prunes 1/2-pel searches
+ SUBPEL_TREE_PRUNED_MORE = 2, // Prunes 1/2-pel searches more aggressively
+ SUBPEL_TREE_PRUNED_EVENMORE = 3, // Prunes 1/2- and 1/4-pel searches
+ // Other methods to come
+} SUBPEL_SEARCH_METHODS;
+
+typedef enum {
+ NO_MOTION_THRESHOLD = 0,
+ LOW_MOTION_THRESHOLD = 7
+} MOTION_THRESHOLD;
+
+typedef enum {
+ USE_FULL_RD = 0,
+ USE_LARGESTALL,
+ USE_TX_8X8
+} TX_SIZE_SEARCH_METHOD;
+
+typedef enum {
+ NOT_IN_USE = 0,
+ RELAXED_NEIGHBORING_MIN_MAX = 1,
+ STRICT_NEIGHBORING_MIN_MAX = 2
+} AUTO_MIN_MAX_MODE;
+
+typedef enum {
+ // Try the full image with different values.
+ LPF_PICK_FROM_FULL_IMAGE,
+ // Try a small portion of the image with different values.
+ LPF_PICK_FROM_SUBIMAGE,
+ // Estimate the level based on quantizer and frame type
+ LPF_PICK_FROM_Q,
+ // Pick 0 to disable LPF if LPF was enabled last frame
+ LPF_PICK_MINIMAL_LPF
+} LPF_PICK_METHOD;
+
+typedef enum {
+ // Terminate search early based on distortion so far compared to
+ // qp step, distortion in the neighborhood of the frame, etc.
+ FLAG_EARLY_TERMINATE = 1 << 0,
+
+ // Skips comp inter modes if the best so far is an intra mode.
+ FLAG_SKIP_COMP_BESTINTRA = 1 << 1,
+
+ // Skips oblique intra modes if the best so far is an inter mode.
+ FLAG_SKIP_INTRA_BESTINTER = 1 << 3,
+
+ // Skips oblique intra modes at angles 27, 63, 117, 153 if the best
+ // intra so far is not one of the neighboring directions.
+ FLAG_SKIP_INTRA_DIRMISMATCH = 1 << 4,
+
+ // Skips intra modes other than DC_PRED if the source variance is small
+ FLAG_SKIP_INTRA_LOWVAR = 1 << 5,
+} MODE_SEARCH_SKIP_LOGIC;
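+
+// Illustrative usage (not a specific preset): these flags are combined with
+// bitwise OR into sf->mode_search_skip_flags, e.g.
+//   FLAG_SKIP_INTRA_BESTINTER | FLAG_SKIP_INTRA_LOWVAR.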
+
+typedef enum {
+ FLAG_SKIP_EIGHTTAP = 1 << EIGHTTAP,
+ FLAG_SKIP_EIGHTTAP_SMOOTH = 1 << EIGHTTAP_SMOOTH,
+ FLAG_SKIP_EIGHTTAP_SHARP = 1 << EIGHTTAP_SHARP,
+} INTERP_FILTER_MASK;
+
+typedef enum {
+ // Search partitions using RD/NONRD criterion.
+ SEARCH_PARTITION,
+
+ // Always use a fixed size partition.
+ FIXED_PARTITION,
+
+ REFERENCE_PARTITION,
+
+ // Use an arbitrary partitioning scheme based on source variance within
+ // a 64X64 SB.
+ VAR_BASED_PARTITION,
+
+ // Use non-fixed partitions based on source variance.
+ SOURCE_VAR_BASED_PARTITION,
+
+ // Make partition decisions with machine learning models.
+ ML_BASED_PARTITION
+} PARTITION_SEARCH_TYPE;
+
+typedef enum {
+ // Does a dry run to see if any of the contexts need to be updated or not,
+ // before the final run.
+ TWO_LOOP = 0,
+
+ // No dry run, also only half the coef contexts and bands are updated.
+ // The rest are not updated at all.
+ ONE_LOOP_REDUCED = 1
+} FAST_COEFF_UPDATE;
+
+typedef enum { EIGHTH_PEL, QUARTER_PEL, HALF_PEL, FULL_PEL } SUBPEL_FORCE_STOP;
+
+typedef struct ADAPT_SUBPEL_FORCE_STOP {
+ // Threshold for full pixel motion vector.
+ int mv_thresh;
+
+ // subpel_force_stop if full pixel MV is below the threshold.
+ SUBPEL_FORCE_STOP force_stop_below;
+
+ // subpel_force_stop if full pixel MV is equal to or above the threshold.
+ SUBPEL_FORCE_STOP force_stop_above;
+} ADAPT_SUBPEL_FORCE_STOP;
+
+typedef struct MV_SPEED_FEATURES {
+ // Motion search method (Diamond, NSTEP, Hex, Big Diamond, Square, etc).
+ SEARCH_METHODS search_method;
+
+ // This parameter controls which step in the n-step process we start at.
+ // It's changed adaptively based on circumstances.
+ int reduce_first_step_size;
+
+ // If this is set to 1, we limit the motion search range to 2 times the
+ // largest motion vector found in the last frame.
+ int auto_mv_step_size;
+
+ // Subpel_search_method can only be subpel_tree which does a subpixel
+ // logarithmic search that keeps stepping at 1/2 pixel units until
+ // you stop getting a gain, and then goes on to 1/4 and repeats
+ // the same process. Along the way it skips many diagonals.
+ SUBPEL_SEARCH_METHODS subpel_search_method;
+
+ // Subpel MV search level. Can take values 0 - 2. Higher values mean more
+ // extensive subpel search.
+ int subpel_search_level;
+
+ // When to stop subpel motion search.
+ SUBPEL_FORCE_STOP subpel_force_stop;
+
+ // If it's enabled, different subpel_force_stop will be used for different MV.
+ int enable_adaptive_subpel_force_stop;
+
+ ADAPT_SUBPEL_FORCE_STOP adapt_subpel_force_stop;
+
+ // This variable sets the step_param used in full pel motion search.
+ int fullpel_search_step_param;
+
+ // Whether to downsample the rows in sad calculation during motion search.
+ // This is only active when there are at least 8 rows.
+ int use_downsampled_sad;
+} MV_SPEED_FEATURES;
+
+typedef struct PARTITION_SEARCH_BREAKOUT_THR {
+ int64_t dist;
+ int rate;
+} PARTITION_SEARCH_BREAKOUT_THR;
+
+#define MAX_MESH_STEP 4
+
+typedef struct MESH_PATTERN {
+ int range;
+ int interval;
+} MESH_PATTERN;
+
+typedef enum {
+ // No reaction to rate control on a detected slide/scene change.
+ NO_DETECTION = 0,
+
+ // Set to larger Q (max_q set by user) based only on the
+ // detected slide/scene change and current/past Q.
+ FAST_DETECTION_MAXQ = 1,
+
+ // Based on (first pass) encoded frame, if large frame size is detected
+ // then set to higher Q for the second re-encode. This involves 2 pass
+ // encoding on slide change, so slower than 1, but more accurate for
+ // detecting overshoot.
+ RE_ENCODE_MAXQ = 2
+} OVERSHOOT_DETECTION_CBR_RT;
+
+typedef enum {
+ USE_2_TAPS = 0,
+ USE_4_TAPS,
+ USE_8_TAPS,
+ USE_8_TAPS_SHARP,
+} SUBPEL_SEARCH_TYPE;
+
+typedef enum {
+ // Disable trellis coefficient optimization
+ DISABLE_TRELLIS_OPT,
+ // Enable trellis coefficient optimization
+ ENABLE_TRELLIS_OPT,
+ // Enable trellis coefficient optimization based on source variance of the
+ // prediction block during transform RD
+ ENABLE_TRELLIS_OPT_TX_RD_SRC_VAR,
+ // Enable trellis coefficient optimization based on residual mse of the
+ // transform block during transform RD
+ ENABLE_TRELLIS_OPT_TX_RD_RESIDUAL_MSE,
+} ENABLE_TRELLIS_OPT_METHOD;
+
+typedef struct TRELLIS_OPT_CONTROL {
+ ENABLE_TRELLIS_OPT_METHOD method;
+ double thresh;
+} TRELLIS_OPT_CONTROL;
+
+typedef struct SPEED_FEATURES {
+ MV_SPEED_FEATURES mv;
+
+ // Frame level coding parameter update
+ int frame_parameter_update;
+
+ RECODE_LOOP_TYPE recode_loop;
+
+ // Trellis (dynamic programming) optimization of quantized values (+1, 0).
+ int optimize_coefficients;
+
+ // Always set to 0. If on it enables 0 cost background transmission
+ // (except for the initial transmission of the segmentation). The feature is
+ // disabled because the addition of very large block sizes makes the
+ // backgrounds very cheap to encode, and the segmentation we have
+ // adds overhead.
+ int static_segmentation;
+
+ // The best compound predictor is found using an iterative log search process
+ // that searches for best ref0 mv using error of combined predictor and then
+ // searches for best ref1 mv. This sf determines the number of iterations of
+ // this process based on block size. The sf becomes more aggressive from level
+ // 0 to 2. The following table indicates the number of iterations w.r.t bsize:
+ // -----------------------------------------------
+ // |sf (level)|bsize < 8X8| [8X8, 16X16] | > 16X16 |
+ // | 0 | 4 | 4 | 4 |
+ // | 1 | 0 | 2 | 4 |
+ // | 2 | 0 | 0 | 0 |
+ // -----------------------------------------------
+ // Here, 0 iterations indicate using the best single motion vector selected
+ // for each ref frame without any iterative refinement.
+ int comp_inter_joint_search_iter_level;
+
+ // This variable is used to cap the maximum number of times we skip testing a
+ // mode to be evaluated. A high value means we will be faster.
+ // Turned off when (row_mt_bit_exact == 1 && adaptive_rd_thresh_row_mt == 0).
+ int adaptive_rd_thresh;
+
+ // Flag to use adaptive_rd_thresh when row-mt is enabled, only for non-rd
+ // pickmode.
+ int adaptive_rd_thresh_row_mt;
+
+ // Enables skipping the reconstruction step (idct, recon) in the
+ // intermediate steps assuming the last frame didn't have too many intra
+ // blocks and the q is less than a threshold.
+ int skip_encode_sb;
+ int skip_encode_frame;
+ // Speed feature to allow or disallow skipping of recode at block
+ // level within a frame.
+ int allow_skip_recode;
+
+ // Coefficient probability model approximation step size
+ int coeff_prob_appx_step;
+
+ // Enable uniform quantizer followed by trellis coefficient optimization
+ // during transform RD
+ TRELLIS_OPT_CONTROL trellis_opt_tx_rd;
+
+ // Enable asymptotic closed-loop encoding decision for key frame and
+ // alternate reference frames.
+ int allow_acl;
+
+ // Temporal dependency model based encoding mode optimization
+ int enable_tpl_model;
+
+ // Use transform domain distortion. Use pixel domain distortion in speed 0
+ // and certain situations in higher speed to improve the RD model precision.
+ int allow_txfm_domain_distortion;
+ double tx_domain_thresh;
+
+ // The threshold is to determine how slow the motion is; it is used when
+ // use_lastframe_partitioning is set to LAST_FRAME_PARTITION_LOW_MOTION
+ MOTION_THRESHOLD lf_motion_threshold;
+
+ // Determine which method we use to determine transform size. We can choose
+ // between options like full rd, largest for prediction size, largest
+ // for intra and model coefs for the rest.
+ TX_SIZE_SEARCH_METHOD tx_size_search_method;
+
+ // How many levels of tx size to search, starting from the largest.
+ int tx_size_search_depth;
+
+ // Low precision 32x32 fdct keeps everything in 16 bits and thus is less
+ // precise but significantly faster than the non lp version.
+ int use_lp32x32fdct;
+
+ // After looking at the first set of modes (set by index here), skip
+ // checking modes for reference frames that don't match the reference frame
+ // of the best so far.
+ int mode_skip_start;
+
+ // TODO(JBB): Remove this.
+ int reference_masking;
+
+ PARTITION_SEARCH_TYPE partition_search_type;
+
+ // Used if partition_search_type = FIXED_PARTITION
+ BLOCK_SIZE always_this_block_size;
+
+ // Skip rectangular partition test when partition type none gives better
+ // rd than partition type split.
+ int less_rectangular_check;
+
+ // Disable testing non-square partitions (e.g. 16x32) for block sizes larger than
+ // use_square_only_thresh_high or smaller than use_square_only_thresh_low.
+ int use_square_partition_only;
+ BLOCK_SIZE use_square_only_thresh_high;
+ BLOCK_SIZE use_square_only_thresh_low;
+
+ // Prune reference frames for rectangular partitions.
+ int prune_ref_frame_for_rect_partitions;
+
+ // Sets min and max partition sizes for this 64x64 region based on the
+ // same 64x64 in last encoded frame, and the left and above neighbor.
+ AUTO_MIN_MAX_MODE auto_min_max_partition_size;
+ // Ensures the rd based auto partition search will always
+ // go down at least to the specified level.
+ BLOCK_SIZE rd_auto_partition_min_limit;
+
+ // Min and max partition size we enable (block_size) as per auto
+ // min max, but also used by adjust partitioning, and pick_partitioning.
+ BLOCK_SIZE default_min_partition_size;
+ BLOCK_SIZE default_max_partition_size;
+
+ // Whether or not we allow partitions one smaller or one greater than the last
+ // frame's partitioning. Only used if use_lastframe_partitioning is set.
+ int adjust_partitioning_from_last_frame;
+
+ // How frequently we re do the partitioning from scratch. Only used if
+ // use_lastframe_partitioning is set.
+ int last_partitioning_redo_frequency;
+
+ // Disables sub 8x8 blocksizes in different scenarios: Choices are to disable
+ // it always, to allow it for only Last frame and Intra, disable it for all
+ // inter modes or to enable it always.
+ int disable_split_mask;
+
+ // TODO(jingning): combine the related motion search speed features
+ // This allows us to use motion search at other sizes as a starting
+ // point for this motion search and limits the search range around it.
+ int adaptive_motion_search;
+
+ // Do extra full pixel motion search to obtain better motion vector.
+ int enhanced_full_pixel_motion_search;
+
+ // Threshold for allowing exhaustive motion search.
+ int exhaustive_searches_thresh;
+
+ // Pattern to be used for any exhaustive mesh searches.
+ MESH_PATTERN mesh_patterns[MAX_MESH_STEP];
+
+ int schedule_mode_search;
+
+ // Allows sub 8x8 modes to use the prediction filter that was determined
+ // best for 8x8 mode. If set to 0 we always re check all the filters for
+ // sizes less than 8x8, 1 means we check all filter modes if no 8x8 filter
+ // was selected, and 2 means we use 8 tap if no 8x8 filter mode was selected.
+ int adaptive_pred_interp_filter;
+
+ // Adaptive prediction mode search
+ int adaptive_mode_search;
+
+ // Prune NEAREST and ZEROMV single reference modes based on motion vector
+ // difference and mode rate
+ int prune_single_mode_based_on_mv_diff_mode_rate;
+
+ // Chessboard pattern prediction for interp filter. Aggressiveness increases
+ // with levels.
+ // 0: disable
+ // 1: cb pattern in eval when filter is not switchable
+ // 2: cb pattern prediction for filter search
+ int cb_pred_filter_search;
+
+ // This variable enables an early termination of interpolation filter eval
+ // based on the current rd cost after processing each plane
+ int early_term_interp_search_plane_rd;
+
+ int cb_partition_search;
+
+ int motion_field_mode_search;
+
+ int alt_ref_search_fp;
+
+ // Fast quantization process path
+ int use_quant_fp;
+
+ // Use finer quantizer in every other few frames that run variable block
+ // partition type search.
+ int force_frame_boost;
+
+ // Maximally allowed base quantization index fluctuation.
+ int max_delta_qindex;
+
+ // Implements various heuristics to skip searching modes
+ // The heuristics selected are based on flags
+ // defined in the MODE_SEARCH_SKIP_HEURISTICS enum
+ unsigned int mode_search_skip_flags;
+
+ // A source variance threshold below which filter search is disabled
+ // Choose a very large value (UINT_MAX) to use 8-tap always
+ unsigned int disable_filter_search_var_thresh;
+
+ // These bit masks allow you to enable or disable intra modes for each
+ // transform size separately.
+ int intra_y_mode_mask[TX_SIZES];
+ int intra_uv_mode_mask[TX_SIZES];
+
+ // These bit masks allow you to enable or disable intra modes for each
+ // prediction block size separately.
+ int intra_y_mode_bsize_mask[BLOCK_SIZES];
+
+ // This variable enables an early break out of mode testing if the model for
+ // rd built from the prediction signal indicates a value that's much
+ // higher than the best rd we've seen so far.
+ int use_rd_breakout;
+
+ // This enables us to use an estimate for intra rd based on dc mode rather
+ // than choosing an actual uv mode in the stage of encoding before the actual
+ // final encode.
+ int use_uv_intra_rd_estimate;
+
+ // This feature controls how the loop filter level is determined.
+ LPF_PICK_METHOD lpf_pick;
+
+ // This feature limits the number of coefficients updates we actually do
+ // by only looking at counts from 1/2 the bands.
+ FAST_COEFF_UPDATE use_fast_coef_updates;
+
+ // This flag controls the use of non-RD mode decision.
+ int use_nonrd_pick_mode;
+
+ // A binary mask indicating if NEARESTMV, NEARMV, ZEROMV, NEWMV
+ // modes are used in order from LSB to MSB for each BLOCK_SIZE.
+ int inter_mode_mask[BLOCK_SIZES];
+
+ // This feature controls whether we do the expensive context update and
+ // calculation in the rd coefficient costing loop.
+ int use_fast_coef_costing;
+
+ // This feature controls the tolerance vs. target used in deciding whether to
+ // recode a frame. It has no meaning if recode is disabled.
+ int recode_tolerance_low;
+ int recode_tolerance_high;
+
+ // This variable controls the maximum block size where intra blocks can be
+ // used in inter frames.
+ // TODO(aconverse): Fold this into one of the other many mode skips
+ BLOCK_SIZE max_intra_bsize;
+
+ // The frequency that we check if SOURCE_VAR_BASED_PARTITION or
+ // FIXED_PARTITION search type should be used.
+ int search_type_check_frequency;
+
+ // When partition is pre-set, the inter prediction result from pick_inter_mode
+ // can be reused in final block encoding process. It is enabled only for real-
+ // time mode speed 6.
+ int reuse_inter_pred_sby;
+
+ // This variable sets the encode_breakout threshold. Currently, it is only
+ // enabled in real time mode.
+ int encode_breakout_thresh;
+
+ // default interp filter choice
+ INTERP_FILTER default_interp_filter;
+
+ // Early termination in transform size search, which only applies while
+ // tx_size_search_method is USE_FULL_RD.
+ int tx_size_search_breakout;
+
+ // adaptive interp_filter search to allow skip of certain filter types.
+ int adaptive_interp_filter_search;
+
+ // mask for skip evaluation of certain interp_filter type.
+ INTERP_FILTER_MASK interp_filter_search_mask;
+
+ // Partition search early breakout thresholds.
+ PARTITION_SEARCH_BREAKOUT_THR partition_search_breakout_thr;
+
+ struct {
+ // Use ML-based partition search early breakout.
+ int search_breakout;
+ // Higher values mean more aggressiveness for partition search breakout that
+ // results in better encoding speed but worse compression performance.
+ float search_breakout_thresh[3];
+
+ // Machine-learning based partition search early termination
+ int search_early_termination;
+
+ // Machine-learning based partition search pruning using prediction residue
+ // variance.
+ int var_pruning;
+
+ // Threshold values used for ML based rectangular partition search pruning.
+ // If < 0, the feature is turned off.
+ // Higher values mean more aggressiveness to skip rectangular partition
+ // search that results in better encoding speed but worse coding
+ // performance.
+ int prune_rect_thresh[4];
+ } rd_ml_partition;
+
+ // Fast approximation of vp9_model_rd_from_var_lapndz
+ int simple_model_rd_from_var;
+
+ // Skip a number of expensive mode evaluations for blocks with zero source
+ // variance.
+ int short_circuit_flat_blocks;
+
+ // Skip a number of expensive mode evaluations for blocks with very low
+ // temporal variance. If the low temporal variance flag is set for a block,
+ // do the following:
+ // 1: Skip all golden modes and ALL INTRA for bsize >= 32x32.
+ // 2: Skip golden non-zeromv and newmv-last for bsize >= 16x16, skip ALL
+ // INTRA for bsize >= 32x32 and vert/horz INTRA for bsize 16x16, 16x32 and
+ // 32x16.
+ // 3: Same as (2), but also skip golden zeromv.
+ int short_circuit_low_temp_var;
+
+ // Limits the rd-threshold update for early exit for the newmv-last mode,
+ // for non-rd mode.
+ int limit_newmv_early_exit;
+
+ // Adds a bias against golden reference, for non-rd mode.
+ int bias_golden;
+
+ // Bias to use the base mv and skip 1/4 subpel search when using the base mv
+ // in the enhancement layer.
+ int base_mv_aggressive;
+
+ // Global flag to enable partition copy from the previous frame.
+ int copy_partition_flag;
+
+ // Compute the source sad for every superblock of the frame,
+ // prior to encoding the frame, to be used to bypass some encoder decisions.
+ int use_source_sad;
+
+ int use_simple_block_yrd;
+
+ // If source sad of superblock is high (> adapt_partition_thresh), will switch
+ // from VARIANCE_PARTITION to REFERENCE_PARTITION (which selects partition
+ // based on the nonrd-pickmode).
+ int adapt_partition_source_sad;
+ int adapt_partition_thresh;
+
+ // Enable use of alt-refs in 1 pass VBR.
+ int use_altref_onepass;
+
+ // Enable use of compound prediction, for nonrd_pickmode with nonzero lag.
+ int use_compound_nonrd_pickmode;
+
+ // Always use nonrd_pick_intra for all block sizes on keyframes.
+ int nonrd_keyframe;
+
+ // For SVC: enables use of partition from lower spatial resolution.
+ int svc_use_lowres_part;
+
+ // Flag to indicate process for handling overshoot on slide/scene change,
+ // for real-time CBR mode.
+ OVERSHOOT_DETECTION_CBR_RT overshoot_detection_cbr_rt;
+
+ // Disable partitioning of 16x16 blocks.
+ int disable_16x16part_nonkey;
+
+ // Allow for disabling golden reference.
+ int disable_golden_ref;
+
+ // Allow sub-pixel search to use interpolation filters with different taps in
+ // order to achieve accurate motion search result.
+ SUBPEL_SEARCH_TYPE use_accurate_subpel_search;
+
+ // Search method used by temporal filtering in full_pixel_motion_search.
+ SEARCH_METHODS temporal_filter_search_method;
+
+ // Use machine learning based partition search.
+ int nonrd_use_ml_partition;
+
+ // Multiplier for the base threshold for variance partitioning.
+ int variance_part_thresh_mult;
+
+ // Force subpel motion filter to always use SMOOTH_FILTER.
+ int force_smooth_interpol;
+
+ // For real-time mode: force DC only under intra search when content
+ // does not have high source SAD.
+ int rt_intra_dc_only_low_content;
+
+ // The encoder has a feature that skips forward transform and quantization
+ // based on a model rd estimation to reduce encoding time.
+ // However, this feature is dangerous since it could lead to bad perceptual
+ // quality. This flag is added to guard the feature.
+ int allow_skip_txfm_ac_dc;
+} SPEED_FEATURES;
+
+struct VP9_COMP;
+
+void vp9_set_speed_features_framesize_independent(struct VP9_COMP *cpi,
+ int speed);
+void vp9_set_speed_features_framesize_dependent(struct VP9_COMP *cpi,
+ int speed);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP9_ENCODER_VP9_SPEED_FEATURES_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_subexp.c b/media/libvpx/libvpx/vp9/encoder/vp9_subexp.c
new file mode 100644
index 0000000000..3953253dbb
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_subexp.c
@@ -0,0 +1,196 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include "vpx_dsp/bitwriter.h"
+
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_entropy.h"
+#include "vp9/encoder/vp9_cost.h"
+#include "vp9/encoder/vp9_subexp.h"
+
+static const uint8_t update_bits[255] = {
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 0,
+};
+#define MIN_DELP_BITS 5
+
+static int recenter_nonneg(int v, int m) {
+ if (v > (m << 1))
+ return v;
+ else if (v >= m)
+ return ((v - m) << 1);
+ else
+ return ((m - v) << 1) - 1;
+}
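+
+// Worked example (illustrative): recenter_nonneg(5, 3) takes the v >= m branch
+// and returns (5 - 3) << 1 = 4; recenter_nonneg(1, 3) returns
+// ((3 - 1) << 1) - 1 = 3; recenter_nonneg(8, 3) exceeds 2 * m and is returned
+// unchanged. Values close to m therefore map to small recentered codes.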
+
+static int remap_prob(int v, int m) {
+ int i;
+ static const uint8_t map_table[MAX_PROB - 1] = {
+ // generated by:
+ // map_table[j] = split_index(j, MAX_PROB - 1, MODULUS_PARAM);
+ 20, 21, 22, 23, 24, 25, 0, 26, 27, 28, 29, 30, 31, 32, 33,
+ 34, 35, 36, 37, 1, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 2, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61,
+ 3, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 4, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 5, 86, 87, 88,
+ 89, 90, 91, 92, 93, 94, 95, 96, 97, 6, 98, 99, 100, 101, 102,
+ 103, 104, 105, 106, 107, 108, 109, 7, 110, 111, 112, 113, 114, 115, 116,
+ 117, 118, 119, 120, 121, 8, 122, 123, 124, 125, 126, 127, 128, 129, 130,
+ 131, 132, 133, 9, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144,
+ 145, 10, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 11,
+ 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 12, 170, 171,
+ 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 13, 182, 183, 184, 185,
+ 186, 187, 188, 189, 190, 191, 192, 193, 14, 194, 195, 196, 197, 198, 199,
+ 200, 201, 202, 203, 204, 205, 15, 206, 207, 208, 209, 210, 211, 212, 213,
+ 214, 215, 216, 217, 16, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227,
+ 228, 229, 17, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241,
+ 18, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 19,
+ };
+ v--;
+ m--;
+ if ((m << 1) <= MAX_PROB)
+ i = recenter_nonneg(v, m) - 1;
+ else
+ i = recenter_nonneg(MAX_PROB - 1 - v, MAX_PROB - 1 - m) - 1;
+
+ assert(i >= 0 && (size_t)i < sizeof(map_table));
+ i = map_table[i];
+ return i;
+}
+
+static int prob_diff_update_cost(vpx_prob newp, vpx_prob oldp) {
+ int delp = remap_prob(newp, oldp);
+ return update_bits[delp] << VP9_PROB_COST_SHIFT;
+}
+
+static void encode_uniform(vpx_writer *w, int v) {
+ const int l = 8;
+ const int m = (1 << l) - 191;
+ if (v < m) {
+ vpx_write_literal(w, v, l - 1);
+ } else {
+ vpx_write_literal(w, m + ((v - m) >> 1), l - 1);
+ vpx_write_literal(w, (v - m) & 1, 1);
+ }
+}
+
+static INLINE int write_bit_gte(vpx_writer *w, int word, int test) {
+ vpx_write_literal(w, word >= test, 1);
+ return word >= test;
+}
+
+static void encode_term_subexp(vpx_writer *w, int word) {
+ if (!write_bit_gte(w, word, 16)) {
+ vpx_write_literal(w, word, 4);
+ } else if (!write_bit_gte(w, word, 32)) {
+ vpx_write_literal(w, word - 16, 4);
+ } else if (!write_bit_gte(w, word, 64)) {
+ vpx_write_literal(w, word - 32, 5);
+ } else {
+ encode_uniform(w, word - 64);
+ }
+}
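+
+// Worked example (illustrative): encode_term_subexp(w, 10) writes a 0 prefix
+// bit followed by 10 as a 4-bit literal (5 bits total), while
+// encode_term_subexp(w, 40) writes prefix bits 1, 1, 0 followed by 40 - 32 = 8
+// as a 5-bit literal (8 bits total), matching the lengths in update_bits[].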
+
+void vp9_write_prob_diff_update(vpx_writer *w, vpx_prob newp, vpx_prob oldp) {
+ const int delp = remap_prob(newp, oldp);
+ encode_term_subexp(w, delp);
+}
+
+int64_t vp9_prob_diff_update_savings_search(const unsigned int *ct,
+ vpx_prob oldp, vpx_prob *bestp,
+ vpx_prob upd) {
+ const int64_t old_b = cost_branch256(ct, oldp);
+ int64_t bestsavings = 0;
+ vpx_prob newp, bestnewp = oldp;
+ const int step = *bestp > oldp ? -1 : 1;
+ const int upd_cost = vp9_cost_one(upd) - vp9_cost_zero(upd);
+
+ if (old_b > upd_cost + (MIN_DELP_BITS << VP9_PROB_COST_SHIFT)) {
+ for (newp = *bestp; newp != oldp; newp += step) {
+ const int64_t new_b = cost_branch256(ct, newp);
+ const int64_t update_b = prob_diff_update_cost(newp, oldp) + upd_cost;
+ const int64_t savings = old_b - new_b - update_b;
+ if (savings > bestsavings) {
+ bestsavings = savings;
+ bestnewp = newp;
+ }
+ }
+ }
+ *bestp = bestnewp;
+ return bestsavings;
+}
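+
+// Illustrative note: the candidate search above only runs when the current
+// branch cost old_b exceeds the minimum cost of signaling an update (the
+// update flag plus at least MIN_DELP_BITS bits for the delta). With
+// hypothetical costs old_b = 5000, new_b = 3000 and update_b = 800, the
+// savings would be 5000 - 3000 - 800 = 1200 > 0, so that candidate wins.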
+
+int64_t vp9_prob_diff_update_savings_search_model(const unsigned int *ct,
+ const vpx_prob oldp,
+ vpx_prob *bestp, vpx_prob upd,
+ int stepsize) {
+ int64_t i, old_b, new_b, update_b, savings, bestsavings;
+ int64_t newp;
+ const int64_t step_sign = *bestp > oldp ? -1 : 1;
+ const int64_t step = stepsize * step_sign;
+ const int64_t upd_cost = vp9_cost_one(upd) - vp9_cost_zero(upd);
+ const vpx_prob *newplist, *oldplist;
+ vpx_prob bestnewp;
+ oldplist = vp9_pareto8_full[oldp - 1];
+ old_b = cost_branch256(ct + 2 * PIVOT_NODE, oldp);
+ for (i = UNCONSTRAINED_NODES; i < ENTROPY_NODES; ++i)
+ old_b += cost_branch256(ct + 2 * i, oldplist[i - UNCONSTRAINED_NODES]);
+
+ bestsavings = 0;
+ bestnewp = oldp;
+
+ assert(stepsize > 0);
+
+ if (old_b > upd_cost + (MIN_DELP_BITS << VP9_PROB_COST_SHIFT)) {
+ for (newp = *bestp; (newp - oldp) * step_sign < 0; newp += step) {
+ if (newp < 1 || newp > 255) continue;
+ newplist = vp9_pareto8_full[newp - 1];
+ new_b = cost_branch256(ct + 2 * PIVOT_NODE, (vpx_prob)newp);
+ for (i = UNCONSTRAINED_NODES; i < ENTROPY_NODES; ++i)
+ new_b += cost_branch256(ct + 2 * i, newplist[i - UNCONSTRAINED_NODES]);
+ update_b = prob_diff_update_cost((vpx_prob)newp, oldp) + upd_cost;
+ savings = old_b - new_b - update_b;
+ if (savings > bestsavings) {
+ bestsavings = savings;
+ bestnewp = (vpx_prob)newp;
+ }
+ }
+ }
+
+ *bestp = bestnewp;
+ return bestsavings;
+}
+
+void vp9_cond_prob_diff_update(vpx_writer *w, vpx_prob *oldp,
+ const unsigned int ct[2]) {
+ const vpx_prob upd = DIFF_UPDATE_PROB;
+ vpx_prob newp = get_binary_prob(ct[0], ct[1]);
+ const int64_t savings =
+ vp9_prob_diff_update_savings_search(ct, *oldp, &newp, upd);
+ assert(newp >= 1);
+ if (savings > 0) {
+ vpx_write(w, 1, upd);
+ vp9_write_prob_diff_update(w, newp, *oldp);
+ *oldp = newp;
+ } else {
+ vpx_write(w, 0, upd);
+ }
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_subexp.h b/media/libvpx/libvpx/vp9/encoder/vp9_subexp.h
new file mode 100644
index 0000000000..2d016d24c5
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_subexp.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_VP9_SUBEXP_H_
+#define VPX_VP9_ENCODER_VP9_SUBEXP_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "vpx_dsp/prob.h"
+
+struct vpx_writer;
+
+void vp9_write_prob_diff_update(struct vpx_writer *w, vpx_prob newp,
+ vpx_prob oldp);
+
+void vp9_cond_prob_diff_update(struct vpx_writer *w, vpx_prob *oldp,
+ const unsigned int ct[2]);
+
+int64_t vp9_prob_diff_update_savings_search(const unsigned int *ct,
+ vpx_prob oldp, vpx_prob *bestp,
+ vpx_prob upd);
+
+int64_t vp9_prob_diff_update_savings_search_model(const unsigned int *ct,
+ const vpx_prob oldp,
+ vpx_prob *bestp, vpx_prob upd,
+ int stepsize);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP9_ENCODER_VP9_SUBEXP_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_svc_layercontext.c b/media/libvpx/libvpx/vp9/encoder/vp9_svc_layercontext.c
new file mode 100644
index 0000000000..e4721271d9
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_svc_layercontext.c
@@ -0,0 +1,1348 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+
+#include "vp9/encoder/vp9_aq_cyclicrefresh.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_svc_layercontext.h"
+#include "vp9/encoder/vp9_extend.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+#define SMALL_FRAME_WIDTH 32
+#define SMALL_FRAME_HEIGHT 16
+
+static void swap_ptr(void *a, void *b) {
+ void **a_p = (void **)a;
+ void **b_p = (void **)b;
+ void *c = *a_p;
+ *a_p = *b_p;
+ *b_p = c;
+}
+
+void vp9_init_layer_context(VP9_COMP *const cpi) {
+ SVC *const svc = &cpi->svc;
+ const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+ int mi_rows = cpi->common.mi_rows;
+ int mi_cols = cpi->common.mi_cols;
+ int sl, tl, i;
+ int alt_ref_idx = svc->number_spatial_layers;
+
+ svc->spatial_layer_id = 0;
+ svc->temporal_layer_id = 0;
+ svc->force_zero_mode_spatial_ref = 0;
+ svc->use_base_mv = 0;
+ svc->use_partition_reuse = 0;
+ svc->use_gf_temporal_ref = 1;
+ svc->use_gf_temporal_ref_current_layer = 0;
+ svc->scaled_temp_is_alloc = 0;
+ svc->scaled_one_half = 0;
+ svc->current_superframe = 0;
+ svc->non_reference_frame = 0;
+ svc->skip_enhancement_layer = 0;
+ svc->disable_inter_layer_pred = INTER_LAYER_PRED_ON;
+ svc->framedrop_mode = CONSTRAINED_LAYER_DROP;
+ svc->set_intra_only_frame = 0;
+ svc->previous_frame_is_intra_only = 0;
+ svc->superframe_has_layer_sync = 0;
+ svc->use_set_ref_frame_config = 0;
+ svc->num_encoded_top_layer = 0;
+ svc->simulcast_mode = 0;
+ svc->single_layer_svc = 0;
+ svc->resize_set = 0;
+
+ for (i = 0; i < REF_FRAMES; ++i) {
+ svc->fb_idx_spatial_layer_id[i] = 0xff;
+ svc->fb_idx_temporal_layer_id[i] = 0xff;
+ svc->fb_idx_base[i] = 0;
+ }
+ for (sl = 0; sl < oxcf->ss_number_layers; ++sl) {
+ svc->last_layer_dropped[sl] = 0;
+ svc->drop_spatial_layer[sl] = 0;
+ svc->ext_frame_flags[sl] = 0;
+ svc->lst_fb_idx[sl] = 0;
+ svc->gld_fb_idx[sl] = 1;
+ svc->alt_fb_idx[sl] = 2;
+ svc->downsample_filter_type[sl] = BILINEAR;
+ svc->downsample_filter_phase[sl] = 8; // Set to 8 for averaging filter.
+ svc->framedrop_thresh[sl] = oxcf->drop_frames_water_mark;
+ svc->fb_idx_upd_tl0[sl] = -1;
+ svc->drop_count[sl] = 0;
+ svc->spatial_layer_sync[sl] = 0;
+ svc->force_drop_constrained_from_above[sl] = 0;
+ }
+ svc->max_consec_drop = INT_MAX;
+
+ svc->buffer_gf_temporal_ref[1].idx = 7;
+ svc->buffer_gf_temporal_ref[0].idx = 6;
+ svc->buffer_gf_temporal_ref[1].is_used = 0;
+ svc->buffer_gf_temporal_ref[0].is_used = 0;
+
+ if (cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.pass == 2) {
+ if (vpx_realloc_frame_buffer(&cpi->svc.empty_frame.img, SMALL_FRAME_WIDTH,
+ SMALL_FRAME_HEIGHT, cpi->common.subsampling_x,
+ cpi->common.subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ cpi->common.use_highbitdepth,
+#endif
+ VP9_ENC_BORDER_IN_PIXELS,
+ cpi->common.byte_alignment, NULL, NULL, NULL))
+ vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate empty frame for multiple frame "
+ "contexts");
+
+ memset(cpi->svc.empty_frame.img.buffer_alloc, 0x80,
+ cpi->svc.empty_frame.img.buffer_alloc_sz);
+ }
+
+ for (sl = 0; sl < oxcf->ss_number_layers; ++sl) {
+ for (tl = 0; tl < oxcf->ts_number_layers; ++tl) {
+ int layer = LAYER_IDS_TO_IDX(sl, tl, oxcf->ts_number_layers);
+ LAYER_CONTEXT *const lc = &svc->layer_context[layer];
+ RATE_CONTROL *const lrc = &lc->rc;
+ lc->current_video_frame_in_layer = 0;
+ lc->layer_size = 0;
+ lc->frames_from_key_frame = 0;
+ lc->last_frame_type = FRAME_TYPES;
+ lrc->ni_av_qi = oxcf->worst_allowed_q;
+ lrc->total_actual_bits = 0;
+ lrc->total_target_vs_actual = 0;
+ lrc->ni_tot_qi = 0;
+ lrc->tot_q = 0.0;
+ lrc->avg_q = 0.0;
+ lrc->ni_frames = 0;
+ lrc->decimation_count = 0;
+ lrc->decimation_factor = 0;
+ lrc->worst_quality = oxcf->worst_allowed_q;
+ lrc->best_quality = oxcf->best_allowed_q;
+
+ for (i = 0; i < RATE_FACTOR_LEVELS; ++i) {
+ lrc->rate_correction_factors[i] = 1.0;
+ }
+
+ if (cpi->oxcf.rc_mode == VPX_CBR) {
+ lc->target_bandwidth = oxcf->layer_target_bitrate[layer];
+ lrc->last_q[INTER_FRAME] = oxcf->worst_allowed_q;
+ lrc->avg_frame_qindex[INTER_FRAME] = oxcf->worst_allowed_q;
+ lrc->avg_frame_qindex[KEY_FRAME] = oxcf->worst_allowed_q;
+ } else {
+ lc->target_bandwidth = oxcf->layer_target_bitrate[layer];
+ lrc->last_q[KEY_FRAME] = oxcf->best_allowed_q;
+ lrc->last_q[INTER_FRAME] = oxcf->best_allowed_q;
+ lrc->avg_frame_qindex[KEY_FRAME] =
+ (oxcf->worst_allowed_q + oxcf->best_allowed_q) / 2;
+ lrc->avg_frame_qindex[INTER_FRAME] =
+ (oxcf->worst_allowed_q + oxcf->best_allowed_q) / 2;
+ if (oxcf->ss_enable_auto_arf[sl])
+ lc->alt_ref_idx = alt_ref_idx++;
+ else
+ lc->alt_ref_idx = INVALID_IDX;
+ lc->gold_ref_idx = INVALID_IDX;
+ }
+
+ lrc->buffer_level =
+ oxcf->starting_buffer_level_ms * lc->target_bandwidth / 1000;
+ lrc->bits_off_target = lrc->buffer_level;
+
+ // Initialize the cyclic refresh parameters. If spatial layers are used
+ // (i.e., ss_number_layers > 1), these need to be updated per spatial
+ // layer.
+ // Cyclic refresh is only applied on base temporal layer.
+ if (oxcf->ss_number_layers > 1 && tl == 0) {
+ size_t last_coded_q_map_size;
+ size_t consec_zero_mv_size;
+ VP9_COMMON *const cm = &cpi->common;
+ lc->sb_index = 0;
+ lc->actual_num_seg1_blocks = 0;
+ lc->actual_num_seg2_blocks = 0;
+ lc->counter_encode_maxq_scene_change = 0;
+ CHECK_MEM_ERROR(&cm->error, lc->map,
+ vpx_malloc(mi_rows * mi_cols * sizeof(*lc->map)));
+ memset(lc->map, 0, mi_rows * mi_cols);
+ last_coded_q_map_size =
+ mi_rows * mi_cols * sizeof(*lc->last_coded_q_map);
+ CHECK_MEM_ERROR(&cm->error, lc->last_coded_q_map,
+ vpx_malloc(last_coded_q_map_size));
+ assert(MAXQ <= 255);
+ memset(lc->last_coded_q_map, MAXQ, last_coded_q_map_size);
+ consec_zero_mv_size = mi_rows * mi_cols * sizeof(*lc->consec_zero_mv);
+ CHECK_MEM_ERROR(&cm->error, lc->consec_zero_mv,
+ vpx_malloc(consec_zero_mv_size));
+ memset(lc->consec_zero_mv, 0, consec_zero_mv_size);
+ }
+ }
+ }
+
+ // Still have extra buffer for base layer golden frame
+ if (!(svc->number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) &&
+ alt_ref_idx < REF_FRAMES)
+ svc->layer_context[0].gold_ref_idx = alt_ref_idx;
+}
+
+// Update the layer context from a change_config() call.
+void vp9_update_layer_context_change_config(VP9_COMP *const cpi,
+ const int target_bandwidth) {
+ SVC *const svc = &cpi->svc;
+ const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ int sl, tl, layer = 0, spatial_layer_target;
+ float bitrate_alloc = 1.0;
+ int num_spatial_layers_nonzero_rate = 0;
+
+ cpi->svc.temporal_layering_mode = oxcf->temporal_layering_mode;
+
+ if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING) {
+ for (sl = 0; sl < oxcf->ss_number_layers; ++sl) {
+ for (tl = 0; tl < oxcf->ts_number_layers; ++tl) {
+ layer = LAYER_IDS_TO_IDX(sl, tl, oxcf->ts_number_layers);
+ svc->layer_context[layer].target_bandwidth =
+ oxcf->layer_target_bitrate[layer];
+ }
+
+ layer = LAYER_IDS_TO_IDX(
+ sl,
+ ((oxcf->ts_number_layers - 1) < 0 ? 0 : (oxcf->ts_number_layers - 1)),
+ oxcf->ts_number_layers);
+ spatial_layer_target = svc->layer_context[layer].target_bandwidth =
+ oxcf->layer_target_bitrate[layer];
+
+ for (tl = 0; tl < oxcf->ts_number_layers; ++tl) {
+ LAYER_CONTEXT *const lc =
+ &svc->layer_context[sl * oxcf->ts_number_layers + tl];
+ RATE_CONTROL *const lrc = &lc->rc;
+
+ lc->spatial_layer_target_bandwidth = spatial_layer_target;
+ if (target_bandwidth != 0) {
+ bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth;
+ }
+ lrc->starting_buffer_level =
+ (int64_t)(rc->starting_buffer_level * bitrate_alloc);
+ lrc->optimal_buffer_level =
+ (int64_t)(rc->optimal_buffer_level * bitrate_alloc);
+ lrc->maximum_buffer_size =
+ (int64_t)(rc->maximum_buffer_size * bitrate_alloc);
+ lrc->bits_off_target =
+ VPXMIN(lrc->bits_off_target, lrc->maximum_buffer_size);
+ lrc->buffer_level = VPXMIN(lrc->buffer_level, lrc->maximum_buffer_size);
+ lc->framerate = cpi->framerate / oxcf->ts_rate_decimator[tl];
+ lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate);
+ lrc->max_frame_bandwidth = rc->max_frame_bandwidth;
+ lrc->worst_quality = rc->worst_quality;
+ lrc->best_quality = rc->best_quality;
+ }
+ }
+ } else {
+ int layer_end;
+
+ if (svc->number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) {
+ layer_end = svc->number_temporal_layers;
+ } else {
+ layer_end = svc->number_spatial_layers;
+ }
+
+ for (layer = 0; layer < layer_end; ++layer) {
+ LAYER_CONTEXT *const lc = &svc->layer_context[layer];
+ RATE_CONTROL *const lrc = &lc->rc;
+
+ lc->target_bandwidth = oxcf->layer_target_bitrate[layer];
+
+ if (target_bandwidth != 0) {
+ bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth;
+ }
+ // Update buffer-related quantities.
+ lrc->starting_buffer_level =
+ (int64_t)(rc->starting_buffer_level * bitrate_alloc);
+ lrc->optimal_buffer_level =
+ (int64_t)(rc->optimal_buffer_level * bitrate_alloc);
+ lrc->maximum_buffer_size =
+ (int64_t)(rc->maximum_buffer_size * bitrate_alloc);
+ lrc->bits_off_target =
+ VPXMIN(lrc->bits_off_target, lrc->maximum_buffer_size);
+ lrc->buffer_level = VPXMIN(lrc->buffer_level, lrc->maximum_buffer_size);
+ // Update framerate-related quantities.
+ if (svc->number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) {
+ lc->framerate = cpi->framerate / oxcf->ts_rate_decimator[layer];
+ } else {
+ lc->framerate = cpi->framerate;
+ }
+ lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate);
+ lrc->max_frame_bandwidth = rc->max_frame_bandwidth;
+ // Update qp-related quantities.
+ lrc->worst_quality = rc->worst_quality;
+ lrc->best_quality = rc->best_quality;
+ }
+ }
+ for (sl = 0; sl < oxcf->ss_number_layers; ++sl) {
+ // Check the bitrate of the spatial layer.
+ layer = LAYER_IDS_TO_IDX(sl, oxcf->ts_number_layers - 1,
+ oxcf->ts_number_layers);
+ if (oxcf->layer_target_bitrate[layer] > 0)
+ num_spatial_layers_nonzero_rate += 1;
+ }
+ if (num_spatial_layers_nonzero_rate == 1)
+ svc->single_layer_svc = 1;
+ else
+ svc->single_layer_svc = 0;
+}
+
+static LAYER_CONTEXT *get_layer_context(VP9_COMP *const cpi) {
+ if (is_one_pass_svc(cpi))
+ return &cpi->svc.layer_context[cpi->svc.spatial_layer_id *
+ cpi->svc.number_temporal_layers +
+ cpi->svc.temporal_layer_id];
+ else
+ return (cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR)
+ ? &cpi->svc.layer_context[cpi->svc.temporal_layer_id]
+ : &cpi->svc.layer_context[cpi->svc.spatial_layer_id];
+}
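+
+// Illustrative example: in one-pass SVC with 3 temporal layers per spatial
+// layer, spatial layer 1 / temporal layer 2 maps to
+// layer_context[1 * 3 + 2], i.e. layer_context[5].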
+
+void vp9_update_temporal_layer_framerate(VP9_COMP *const cpi) {
+ SVC *const svc = &cpi->svc;
+ const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+ LAYER_CONTEXT *const lc = get_layer_context(cpi);
+ RATE_CONTROL *const lrc = &lc->rc;
+ // Index into spatial+temporal arrays.
+ const int st_idx = svc->spatial_layer_id * svc->number_temporal_layers +
+ svc->temporal_layer_id;
+ const int tl = svc->temporal_layer_id;
+
+ lc->framerate = cpi->framerate / oxcf->ts_rate_decimator[tl];
+ lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate);
+ lrc->max_frame_bandwidth = cpi->rc.max_frame_bandwidth;
+ // Update the average layer frame size (non-cumulative per-frame-bw).
+ if (tl == 0) {
+ lc->avg_frame_size = lrc->avg_frame_bandwidth;
+ } else {
+ const double prev_layer_framerate =
+ cpi->framerate / oxcf->ts_rate_decimator[tl - 1];
+ const int prev_layer_target_bandwidth =
+ oxcf->layer_target_bitrate[st_idx - 1];
+ lc->avg_frame_size =
+ (int)round((lc->target_bandwidth - prev_layer_target_bandwidth) /
+ (lc->framerate - prev_layer_framerate));
+ }
+}
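+
+// Illustrative example (hypothetical rates): with TL0 targeting 200000 bps at
+// 15 fps and TL1 targeting a cumulative 400000 bps at 30 fps, the frames that
+// belong only to TL1 average (400000 - 200000) / (30 - 15), about 13333 bits.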
+
+void vp9_update_spatial_layer_framerate(VP9_COMP *const cpi, double framerate) {
+ const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+ LAYER_CONTEXT *const lc = get_layer_context(cpi);
+ RATE_CONTROL *const lrc = &lc->rc;
+
+ lc->framerate = framerate;
+ lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate);
+ lrc->min_frame_bandwidth =
+ (int)(lrc->avg_frame_bandwidth * oxcf->two_pass_vbrmin_section / 100);
+ lrc->max_frame_bandwidth = (int)(((int64_t)lrc->avg_frame_bandwidth *
+ oxcf->two_pass_vbrmax_section) /
+ 100);
+ vp9_rc_set_gf_interval_range(cpi, lrc);
+}
+
+void vp9_restore_layer_context(VP9_COMP *const cpi) {
+ LAYER_CONTEXT *const lc = get_layer_context(cpi);
+ const int old_frame_since_key = cpi->rc.frames_since_key;
+ const int old_frame_to_key = cpi->rc.frames_to_key;
+ const int old_ext_use_post_encode_drop = cpi->rc.ext_use_post_encode_drop;
+
+ cpi->rc = lc->rc;
+ cpi->twopass = lc->twopass;
+ cpi->oxcf.target_bandwidth = lc->target_bandwidth;
+ cpi->alt_ref_source = lc->alt_ref_source;
+ // Check if it is one_pass_cbr_svc mode and lc->speed > 0 (real-time mode
+ // does not use speed = 0).
+ if (is_one_pass_svc(cpi) && lc->speed > 0) {
+ cpi->oxcf.speed = lc->speed;
+ }
+ cpi->loopfilter_ctrl = lc->loopfilter_ctrl;
+ // Reset the frames_since_key and frames_to_key counters to their values
+ // before the layer restore. Keep these defined for the stream (not layer).
+ if (cpi->svc.number_temporal_layers > 1 ||
+ cpi->svc.number_spatial_layers > 1) {
+ cpi->rc.frames_since_key = old_frame_since_key;
+ cpi->rc.frames_to_key = old_frame_to_key;
+ }
+ cpi->rc.ext_use_post_encode_drop = old_ext_use_post_encode_drop;
+ // For spatial-svc, allow cyclic-refresh to be applied on the spatial layers,
+ // for the base temporal layer.
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+ cpi->svc.number_spatial_layers > 1 && cpi->svc.temporal_layer_id == 0) {
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ swap_ptr(&cr->map, &lc->map);
+ swap_ptr(&cr->last_coded_q_map, &lc->last_coded_q_map);
+ swap_ptr(&cpi->consec_zero_mv, &lc->consec_zero_mv);
+ cr->sb_index = lc->sb_index;
+ cr->actual_num_seg1_blocks = lc->actual_num_seg1_blocks;
+ cr->actual_num_seg2_blocks = lc->actual_num_seg2_blocks;
+ cr->counter_encode_maxq_scene_change = lc->counter_encode_maxq_scene_change;
+ }
+}
+
+void vp9_save_layer_context(VP9_COMP *const cpi) {
+ const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+ LAYER_CONTEXT *const lc = get_layer_context(cpi);
+
+ lc->rc = cpi->rc;
+ lc->twopass = cpi->twopass;
+ lc->target_bandwidth = (int)oxcf->target_bandwidth;
+ lc->alt_ref_source = cpi->alt_ref_source;
+ lc->frame_qp = cpi->common.base_qindex;
+ lc->MBs = cpi->common.MBs;
+
+ // For spatial-svc, allow cyclic-refresh to be applied on the spatial layers,
+ // for the base temporal layer.
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+ cpi->svc.number_spatial_layers > 1 && cpi->svc.temporal_layer_id == 0) {
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ signed char *temp = lc->map;
+ uint8_t *temp2 = lc->last_coded_q_map;
+ uint8_t *temp3 = lc->consec_zero_mv;
+ lc->map = cr->map;
+ cr->map = temp;
+ lc->last_coded_q_map = cr->last_coded_q_map;
+ cr->last_coded_q_map = temp2;
+ lc->consec_zero_mv = cpi->consec_zero_mv;
+ cpi->consec_zero_mv = temp3;
+ lc->sb_index = cr->sb_index;
+ lc->actual_num_seg1_blocks = cr->actual_num_seg1_blocks;
+ lc->actual_num_seg2_blocks = cr->actual_num_seg2_blocks;
+ lc->counter_encode_maxq_scene_change = cr->counter_encode_maxq_scene_change;
+ lc->qindex_delta[0] = cr->qindex_delta[0];
+ lc->qindex_delta[1] = cr->qindex_delta[1];
+ lc->qindex_delta[2] = cr->qindex_delta[2];
+ }
+}
+
+#if !CONFIG_REALTIME_ONLY
+void vp9_init_second_pass_spatial_svc(VP9_COMP *cpi) {
+ SVC *const svc = &cpi->svc;
+ int i;
+
+ for (i = 0; i < svc->number_spatial_layers; ++i) {
+ TWO_PASS *const twopass = &svc->layer_context[i].twopass;
+
+ svc->spatial_layer_id = i;
+ vp9_init_second_pass(cpi);
+
+ twopass->total_stats.spatial_layer_id = i;
+ twopass->total_left_stats.spatial_layer_id = i;
+ }
+ svc->spatial_layer_id = 0;
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+void vp9_inc_frame_in_layer(VP9_COMP *const cpi) {
+ LAYER_CONTEXT *const lc =
+ &cpi->svc.layer_context[cpi->svc.spatial_layer_id *
+ cpi->svc.number_temporal_layers];
+ ++lc->current_video_frame_in_layer;
+ ++lc->frames_from_key_frame;
+ if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)
+ ++cpi->svc.current_superframe;
+}
+
+void get_layer_resolution(const int width_org, const int height_org,
+ const int num, const int den, int *width_out,
+ int *height_out) {
+ int w, h;
+
+ if (width_out == NULL || height_out == NULL || den == 0) return;
+
+ w = width_org * num / den;
+ h = height_org * num / den;
+
+ // Make height and width even to keep the Chrome player happy.
+ w += w % 2;
+ h += h % 2;
+
+ *width_out = w;
+ *height_out = h;
+}
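+
+// Illustrative example: scaling 640x360 by num/den = 1/2 yields 320x180; a
+// 639-wide input would give w = 319, which the w += w % 2 adjustment bumps
+// to 320 so both dimensions stay even.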
+
+static void reset_fb_idx_unused(VP9_COMP *const cpi) {
+ // If a reference frame is not referenced or refreshed, then set the
+ // fb_idx for that reference to the first one used/referenced.
+ // This is to avoid setting fb_idx for a reference to a slot that is not
+ // used/needed (i.e., since that reference is not referenced or refreshed).
+ MV_REFERENCE_FRAME ref_frame;
+ MV_REFERENCE_FRAME first_ref = 0;
+ int first_fb_idx = 0;
+ int fb_idx[3] = { cpi->lst_fb_idx, cpi->gld_fb_idx, cpi->alt_fb_idx };
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
+ if (cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) {
+ first_ref = ref_frame;
+ first_fb_idx = fb_idx[ref_frame - 1];
+ break;
+ }
+ }
+ if (first_ref > 0) {
+ if (first_ref != LAST_FRAME && !(cpi->ref_frame_flags & VP9_LAST_FLAG) &&
+ !cpi->ext_refresh_last_frame)
+ cpi->lst_fb_idx = first_fb_idx;
+ else if (first_ref != GOLDEN_FRAME &&
+ !(cpi->ref_frame_flags & VP9_GOLD_FLAG) &&
+ !cpi->ext_refresh_golden_frame)
+ cpi->gld_fb_idx = first_fb_idx;
+ else if (first_ref != ALTREF_FRAME &&
+ !(cpi->ref_frame_flags & VP9_ALT_FLAG) &&
+ !cpi->ext_refresh_alt_ref_frame)
+ cpi->alt_fb_idx = first_fb_idx;
+ }
+}
+
+// Never refresh any reference frame buffers on top temporal layers in
+// simulcast mode, which has interlayer prediction disabled.
+static void non_reference_frame_simulcast(VP9_COMP *const cpi) {
+ if (cpi->svc.temporal_layer_id == cpi->svc.number_temporal_layers - 1 &&
+ cpi->svc.temporal_layer_id > 0) {
+ cpi->ext_refresh_last_frame = 0;
+ cpi->ext_refresh_golden_frame = 0;
+ cpi->ext_refresh_alt_ref_frame = 0;
+ }
+}
+
+// The function sets proper ref_frame_flags, buffer indices, and buffer update
+// variables for temporal layering mode 3, which uses the 0-2-1-2 temporal
+// layering scheme.
+static void set_flags_and_fb_idx_for_temporal_mode3(VP9_COMP *const cpi) {
+ int frame_num_within_temporal_struct = 0;
+ int spatial_id, temporal_id;
+ spatial_id = cpi->svc.spatial_layer_id = cpi->svc.spatial_layer_to_encode;
+ frame_num_within_temporal_struct =
+ cpi->svc
+ .layer_context[cpi->svc.spatial_layer_id *
+ cpi->svc.number_temporal_layers]
+ .current_video_frame_in_layer %
+ 4;
+ temporal_id = cpi->svc.temporal_layer_id =
+ (frame_num_within_temporal_struct & 1)
+ ? 2
+ : (frame_num_within_temporal_struct >> 1);
+ cpi->ext_refresh_last_frame = cpi->ext_refresh_golden_frame =
+ cpi->ext_refresh_alt_ref_frame = 0;
+ if (!temporal_id) {
+ cpi->ext_refresh_frame_flags_pending = 1;
+ cpi->ext_refresh_last_frame = 1;
+ if (!spatial_id) {
+ cpi->ref_frame_flags = VP9_LAST_FLAG;
+ } else if (cpi->svc.layer_context[temporal_id].is_key_frame) {
+ // base layer is a key frame.
+ cpi->ref_frame_flags = VP9_LAST_FLAG;
+ cpi->ext_refresh_last_frame = 0;
+ cpi->ext_refresh_golden_frame = 1;
+ } else {
+ cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG;
+ }
+ } else if (temporal_id == 1) {
+ cpi->ext_refresh_frame_flags_pending = 1;
+ cpi->ext_refresh_alt_ref_frame = 1;
+ if (!spatial_id) {
+ cpi->ref_frame_flags = VP9_LAST_FLAG;
+ } else {
+ cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG;
+ }
+ } else {
+ if (frame_num_within_temporal_struct == 1) {
+      // The first tl2 picture.
+ if (spatial_id == cpi->svc.number_spatial_layers - 1) { // top layer
+ cpi->ext_refresh_frame_flags_pending = 1;
+ if (!spatial_id)
+ cpi->ref_frame_flags = VP9_LAST_FLAG;
+ else
+ cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG;
+ } else if (!spatial_id) {
+ cpi->ext_refresh_frame_flags_pending = 1;
+ cpi->ext_refresh_alt_ref_frame = 1;
+ cpi->ref_frame_flags = VP9_LAST_FLAG;
+ } else if (spatial_id < cpi->svc.number_spatial_layers - 1) {
+ cpi->ext_refresh_frame_flags_pending = 1;
+ cpi->ext_refresh_alt_ref_frame = 1;
+ cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG;
+ }
+ } else {
+      // The second tl2 picture.
+ if (spatial_id == cpi->svc.number_spatial_layers - 1) { // top layer
+ cpi->ext_refresh_frame_flags_pending = 1;
+ if (!spatial_id)
+ cpi->ref_frame_flags = VP9_LAST_FLAG;
+ else
+ cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG;
+ } else if (!spatial_id) {
+ cpi->ext_refresh_frame_flags_pending = 1;
+ cpi->ref_frame_flags = VP9_LAST_FLAG;
+ cpi->ext_refresh_alt_ref_frame = 1;
+ } else { // top layer
+ cpi->ext_refresh_frame_flags_pending = 1;
+ cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG;
+ cpi->ext_refresh_alt_ref_frame = 1;
+ }
+ }
+ }
+ if (temporal_id == 0) {
+ cpi->lst_fb_idx = spatial_id;
+ if (spatial_id) {
+ if (cpi->svc.layer_context[temporal_id].is_key_frame) {
+ cpi->lst_fb_idx = spatial_id - 1;
+ cpi->gld_fb_idx = spatial_id;
+ } else {
+ cpi->gld_fb_idx = spatial_id - 1;
+ }
+ } else {
+ cpi->gld_fb_idx = 0;
+ }
+ cpi->alt_fb_idx = 0;
+ } else if (temporal_id == 1) {
+ cpi->lst_fb_idx = spatial_id;
+ cpi->gld_fb_idx = cpi->svc.number_spatial_layers + spatial_id - 1;
+ cpi->alt_fb_idx = cpi->svc.number_spatial_layers + spatial_id;
+ } else if (frame_num_within_temporal_struct == 1) {
+ cpi->lst_fb_idx = spatial_id;
+ cpi->gld_fb_idx = cpi->svc.number_spatial_layers + spatial_id - 1;
+ cpi->alt_fb_idx = cpi->svc.number_spatial_layers + spatial_id;
+ } else {
+ cpi->lst_fb_idx = cpi->svc.number_spatial_layers + spatial_id;
+ cpi->gld_fb_idx = cpi->svc.number_spatial_layers + spatial_id - 1;
+ cpi->alt_fb_idx = cpi->svc.number_spatial_layers + spatial_id;
+ }
+
+ if (cpi->svc.simulcast_mode) non_reference_frame_simulcast(cpi);
+
+ reset_fb_idx_unused(cpi);
+}
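A small sketch (a hypothetical helper, not part of the patch) of the frame-index to temporal-id mapping that the 0-2-1-2 mode above relies on:

    /* Position within the 4-frame pattern decides the temporal layer. */
    static int tl3_temporal_id(unsigned int frame_in_layer) {
      const int f = frame_in_layer % 4;
      return (f & 1) ? 2 : (f >> 1); /* frames 0,1,2,3 -> layers 0,2,1,2 */
    }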
+
+// The function sets proper ref_frame_flags, buffer indices, and buffer update
+// variables for temporal layering mode 2, which uses the 0-1-0-1 temporal
+// layering scheme.
+static void set_flags_and_fb_idx_for_temporal_mode2(VP9_COMP *const cpi) {
+ int spatial_id, temporal_id;
+ spatial_id = cpi->svc.spatial_layer_id = cpi->svc.spatial_layer_to_encode;
+ temporal_id = cpi->svc.temporal_layer_id =
+ cpi->svc
+ .layer_context[cpi->svc.spatial_layer_id *
+ cpi->svc.number_temporal_layers]
+ .current_video_frame_in_layer &
+ 1;
+ cpi->ext_refresh_last_frame = cpi->ext_refresh_golden_frame =
+ cpi->ext_refresh_alt_ref_frame = 0;
+ if (!temporal_id) {
+ cpi->ext_refresh_frame_flags_pending = 1;
+ cpi->ext_refresh_last_frame = 1;
+ if (!spatial_id) {
+ cpi->ref_frame_flags = VP9_LAST_FLAG;
+ } else if (cpi->svc.layer_context[temporal_id].is_key_frame) {
+ // base layer is a key frame.
+ cpi->ref_frame_flags = VP9_LAST_FLAG;
+ cpi->ext_refresh_last_frame = 0;
+ cpi->ext_refresh_golden_frame = 1;
+ } else {
+ cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG;
+ }
+ } else if (temporal_id == 1) {
+ cpi->ext_refresh_frame_flags_pending = 1;
+ cpi->ext_refresh_alt_ref_frame = 1;
+ if (!spatial_id) {
+ cpi->ref_frame_flags = VP9_LAST_FLAG;
+ } else {
+ if (spatial_id == cpi->svc.number_spatial_layers - 1)
+ cpi->ext_refresh_alt_ref_frame = 0;
+ cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG;
+ }
+ }
+
+ if (temporal_id == 0) {
+ cpi->lst_fb_idx = spatial_id;
+ if (spatial_id) {
+ if (cpi->svc.layer_context[temporal_id].is_key_frame) {
+ cpi->lst_fb_idx = spatial_id - 1;
+ cpi->gld_fb_idx = spatial_id;
+ } else {
+ cpi->gld_fb_idx = spatial_id - 1;
+ }
+ } else {
+ cpi->gld_fb_idx = 0;
+ }
+ cpi->alt_fb_idx = 0;
+ } else if (temporal_id == 1) {
+ cpi->lst_fb_idx = spatial_id;
+ cpi->gld_fb_idx = cpi->svc.number_spatial_layers + spatial_id - 1;
+ cpi->alt_fb_idx = cpi->svc.number_spatial_layers + spatial_id;
+ }
+
+ if (cpi->svc.simulcast_mode) non_reference_frame_simulcast(cpi);
+
+ reset_fb_idx_unused(cpi);
+}
+
+// The function sets proper ref_frame_flags, buffer indices, and buffer update
+// variables for temporal layering mode 0, which has no temporal layering.
+static void set_flags_and_fb_idx_for_temporal_mode_noLayering(
+ VP9_COMP *const cpi) {
+ int spatial_id;
+ spatial_id = cpi->svc.spatial_layer_id = cpi->svc.spatial_layer_to_encode;
+ cpi->ext_refresh_last_frame = cpi->ext_refresh_golden_frame =
+ cpi->ext_refresh_alt_ref_frame = 0;
+ cpi->ext_refresh_frame_flags_pending = 1;
+ cpi->ext_refresh_last_frame = 1;
+ if (!spatial_id) {
+ cpi->ref_frame_flags = VP9_LAST_FLAG;
+ } else if (cpi->svc.layer_context[0].is_key_frame) {
+ cpi->ref_frame_flags = VP9_LAST_FLAG;
+ cpi->ext_refresh_last_frame = 0;
+ cpi->ext_refresh_golden_frame = 1;
+ } else {
+ cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG;
+ }
+ cpi->lst_fb_idx = spatial_id;
+ if (spatial_id) {
+ if (cpi->svc.layer_context[0].is_key_frame) {
+ cpi->lst_fb_idx = spatial_id - 1;
+ cpi->gld_fb_idx = spatial_id;
+ } else {
+ cpi->gld_fb_idx = spatial_id - 1;
+ }
+ } else {
+ cpi->gld_fb_idx = 0;
+ }
+
+ if (cpi->svc.simulcast_mode) non_reference_frame_simulcast(cpi);
+
+ reset_fb_idx_unused(cpi);
+}
+
+static void set_flags_and_fb_idx_bypass_via_set_ref_frame_config(
+ VP9_COMP *const cpi) {
+ SVC *const svc = &cpi->svc;
+ int sl = svc->spatial_layer_id = svc->spatial_layer_to_encode;
+ cpi->svc.temporal_layer_id = cpi->svc.temporal_layer_id_per_spatial[sl];
+ cpi->ext_refresh_frame_flags_pending = 1;
+ cpi->lst_fb_idx = svc->lst_fb_idx[sl];
+ cpi->gld_fb_idx = svc->gld_fb_idx[sl];
+ cpi->alt_fb_idx = svc->alt_fb_idx[sl];
+ cpi->ext_refresh_last_frame = 0;
+ cpi->ext_refresh_golden_frame = 0;
+ cpi->ext_refresh_alt_ref_frame = 0;
+ cpi->ref_frame_flags = 0;
+ if (svc->reference_last[sl]) cpi->ref_frame_flags |= VP9_LAST_FLAG;
+ if (svc->reference_golden[sl]) cpi->ref_frame_flags |= VP9_GOLD_FLAG;
+ if (svc->reference_altref[sl]) cpi->ref_frame_flags |= VP9_ALT_FLAG;
+}
+
+void vp9_copy_flags_ref_update_idx(VP9_COMP *const cpi) {
+ SVC *const svc = &cpi->svc;
+ int sl = svc->spatial_layer_id;
+ svc->lst_fb_idx[sl] = cpi->lst_fb_idx;
+ svc->gld_fb_idx[sl] = cpi->gld_fb_idx;
+ svc->alt_fb_idx[sl] = cpi->alt_fb_idx;
+  // For the fixed SVC mode: pass the refresh_lst/gld/alt_frame flags to
+  // update_buffer_slot; this is needed for the GET_SVC_REF_FRAME_CONFIG API.
+ if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS) {
+ int ref;
+ for (ref = 0; ref < REF_FRAMES; ++ref) {
+ svc->update_buffer_slot[sl] &= ~(1 << ref);
+ if ((ref == svc->lst_fb_idx[sl] && cpi->refresh_last_frame) ||
+ (ref == svc->gld_fb_idx[sl] && cpi->refresh_golden_frame) ||
+ (ref == svc->alt_fb_idx[sl] && cpi->refresh_alt_ref_frame))
+ svc->update_buffer_slot[sl] |= (1 << ref);
+ }
+ }
+
+ // TODO(jianj): Remove these 3, deprecated.
+ svc->update_last[sl] = (uint8_t)cpi->refresh_last_frame;
+ svc->update_golden[sl] = (uint8_t)cpi->refresh_golden_frame;
+ svc->update_altref[sl] = (uint8_t)cpi->refresh_alt_ref_frame;
+
+ svc->reference_last[sl] = (uint8_t)(cpi->ref_frame_flags & VP9_LAST_FLAG);
+ svc->reference_golden[sl] = (uint8_t)(cpi->ref_frame_flags & VP9_GOLD_FLAG);
+ svc->reference_altref[sl] = (uint8_t)(cpi->ref_frame_flags & VP9_ALT_FLAG);
+}
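A minimal sketch of the per-slot mask update performed by the loop above, reduced to a single slot (the helper name is hypothetical, for illustration only):

    static int set_slot_bit(int mask, int fb_idx, int refresh) {
      mask &= ~(1 << fb_idx);             /* clear the slot's bit first */
      if (refresh) mask |= (1 << fb_idx); /* set it only if the slot is refreshed */
      return mask;
    }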
+
+int vp9_one_pass_svc_start_layer(VP9_COMP *const cpi) {
+ int width = 0, height = 0;
+ SVC *const svc = &cpi->svc;
+ LAYER_CONTEXT *lc = NULL;
+ int scaling_factor_num = 1;
+ int scaling_factor_den = 1;
+ svc->skip_enhancement_layer = 0;
+
+ if (svc->disable_inter_layer_pred == INTER_LAYER_PRED_OFF &&
+ svc->number_spatial_layers > 1 && svc->number_spatial_layers <= 3 &&
+ svc->number_temporal_layers <= 3)
+ svc->simulcast_mode = 1;
+ else
+ svc->simulcast_mode = 0;
+
+ if (svc->number_spatial_layers > 1) {
+ svc->use_base_mv = 1;
+ svc->use_partition_reuse = 1;
+ }
+ svc->force_zero_mode_spatial_ref = 1;
+ svc->mi_stride[svc->spatial_layer_id] = cpi->common.mi_stride;
+ svc->mi_rows[svc->spatial_layer_id] = cpi->common.mi_rows;
+ svc->mi_cols[svc->spatial_layer_id] = cpi->common.mi_cols;
+
+  // For constrained_from_above drop mode: before encoding the superframe
+  // (i.e., at the SL0 frame) check all spatial layers (starting from the top)
+  // for a possible drop, and if so, set a flag to force dropping that layer
+  // and all its lower layers.
+ if (svc->spatial_layer_to_encode == svc->first_spatial_layer_to_encode) {
+ int sl;
+ for (sl = 0; sl < svc->number_spatial_layers; sl++)
+ svc->force_drop_constrained_from_above[sl] = 0;
+ if (svc->framedrop_mode == CONSTRAINED_FROM_ABOVE_DROP) {
+ for (sl = svc->number_spatial_layers - 1;
+ sl >= svc->first_spatial_layer_to_encode; sl--) {
+ int layer = sl * svc->number_temporal_layers + svc->temporal_layer_id;
+ LAYER_CONTEXT *const sl_lc = &svc->layer_context[layer];
+ cpi->rc = sl_lc->rc;
+ cpi->oxcf.target_bandwidth = sl_lc->target_bandwidth;
+ if (vp9_test_drop(cpi)) {
+ int sl2;
+ // Set flag to force drop in encoding for this mode.
+ for (sl2 = sl; sl2 >= svc->first_spatial_layer_to_encode; sl2--)
+ svc->force_drop_constrained_from_above[sl2] = 1;
+ break;
+ }
+ }
+ }
+ }
+
+ if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_0212) {
+ set_flags_and_fb_idx_for_temporal_mode3(cpi);
+ } else if (svc->temporal_layering_mode ==
+ VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING) {
+ set_flags_and_fb_idx_for_temporal_mode_noLayering(cpi);
+ } else if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_0101) {
+ set_flags_and_fb_idx_for_temporal_mode2(cpi);
+ } else if (svc->temporal_layering_mode ==
+ VP9E_TEMPORAL_LAYERING_MODE_BYPASS &&
+ svc->use_set_ref_frame_config) {
+ set_flags_and_fb_idx_bypass_via_set_ref_frame_config(cpi);
+ }
+
+ if (cpi->lst_fb_idx == svc->buffer_gf_temporal_ref[0].idx ||
+ cpi->gld_fb_idx == svc->buffer_gf_temporal_ref[0].idx ||
+ cpi->alt_fb_idx == svc->buffer_gf_temporal_ref[0].idx)
+ svc->buffer_gf_temporal_ref[0].is_used = 1;
+ if (cpi->lst_fb_idx == svc->buffer_gf_temporal_ref[1].idx ||
+ cpi->gld_fb_idx == svc->buffer_gf_temporal_ref[1].idx ||
+ cpi->alt_fb_idx == svc->buffer_gf_temporal_ref[1].idx)
+ svc->buffer_gf_temporal_ref[1].is_used = 1;
+
+  // For the fixed (non-flexible, non-bypass) SVC mode:
+  // If the long term temporal reference is enabled at the sequence level
+  // (use_gf_temporal_ref == 1), and inter-layer prediction is disabled (on
+  // inter-frames), we can use golden as a second temporal reference
+  // (since the spatial/inter-layer reference is disabled).
+  // We check that the fb_idx for this reference (buffer_gf_temporal_ref.idx)
+  // is unused (slots 7 and 6 should be available for a 3x3 layer system).
+  // For now this second temporal reference is only used for the highest and
+  // next to highest spatial layers (i.e., the top and middle layers for
+  // 3 spatial layers).
+ svc->use_gf_temporal_ref_current_layer = 0;
+ if (svc->use_gf_temporal_ref && !svc->buffer_gf_temporal_ref[0].is_used &&
+ !svc->buffer_gf_temporal_ref[1].is_used &&
+ svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS &&
+ svc->disable_inter_layer_pred != INTER_LAYER_PRED_ON &&
+ svc->number_spatial_layers <= 3 && svc->number_temporal_layers <= 3 &&
+ svc->spatial_layer_id >= svc->number_spatial_layers - 2) {
+ // Enable the second (long-term) temporal reference at the frame-level.
+ svc->use_gf_temporal_ref_current_layer = 1;
+ }
+
+ // Check if current superframe has any layer sync, only check once on
+ // base layer.
+ if (svc->spatial_layer_id == 0) {
+ int sl = 0;
+ // Default is no sync.
+ svc->superframe_has_layer_sync = 0;
+ for (sl = 0; sl < svc->number_spatial_layers; ++sl) {
+ if (cpi->svc.spatial_layer_sync[sl]) svc->superframe_has_layer_sync = 1;
+ }
+ }
+
+ // Reset the drop flags for all spatial layers, on the
+ // first_spatial_layer_to_encode.
+ if (svc->spatial_layer_id == svc->first_spatial_layer_to_encode) {
+ vp9_zero(svc->drop_spatial_layer);
+ // TODO(jianj/marpan): Investigate why setting svc->lst/gld/alt_fb_idx
+ // causes an issue with frame dropping and temporal layers, when the frame
+ // flags are passed via the encode call (bypass mode). Issue is that we're
+ // resetting ext_refresh_frame_flags_pending to 0 on frame drops.
+ if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS) {
+ memset(&svc->lst_fb_idx, -1, sizeof(svc->lst_fb_idx));
+ memset(&svc->gld_fb_idx, -1, sizeof(svc->lst_fb_idx));
+ memset(&svc->alt_fb_idx, -1, sizeof(svc->lst_fb_idx));
+      // In bypass mode these are set via the API before the superframe is
+      // encoded and passed to the encoder layer by layer, so don't reset them
+      // on layer 0 in that mode.
+ vp9_zero(svc->update_buffer_slot);
+ vp9_zero(svc->reference_last);
+ vp9_zero(svc->reference_golden);
+ vp9_zero(svc->reference_altref);
+ // TODO(jianj): Remove these 3, deprecated.
+ vp9_zero(svc->update_last);
+ vp9_zero(svc->update_golden);
+ vp9_zero(svc->update_altref);
+ }
+ }
+
+ lc = &svc->layer_context[svc->spatial_layer_id * svc->number_temporal_layers +
+ svc->temporal_layer_id];
+
+ // Setting the worst/best_quality via the encoder control: SET_SVC_PARAMETERS,
+ // only for non-BYPASS mode for now.
+ if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS ||
+ svc->use_set_ref_frame_config) {
+ RATE_CONTROL *const lrc = &lc->rc;
+ lrc->worst_quality = vp9_quantizer_to_qindex(lc->max_q);
+ lrc->best_quality = vp9_quantizer_to_qindex(lc->min_q);
+ if (cpi->fixed_qp_onepass) {
+ lrc->worst_quality = cpi->rc.worst_quality;
+ lrc->best_quality = cpi->rc.best_quality;
+ }
+ }
+
+ if (cpi->oxcf.resize_mode == RESIZE_DYNAMIC && svc->single_layer_svc == 1 &&
+ svc->spatial_layer_id == svc->first_spatial_layer_to_encode &&
+ cpi->resize_state != ORIG) {
+ scaling_factor_num = lc->scaling_factor_num_resize;
+ scaling_factor_den = lc->scaling_factor_den_resize;
+ } else {
+ scaling_factor_num = lc->scaling_factor_num;
+ scaling_factor_den = lc->scaling_factor_den;
+ }
+
+ get_layer_resolution(cpi->oxcf.width, cpi->oxcf.height, scaling_factor_num,
+ scaling_factor_den, &width, &height);
+
+  // Use EIGHTTAP_SMOOTH for low resolutions.
+ if (width * height <= 320 * 240)
+ svc->downsample_filter_type[svc->spatial_layer_id] = EIGHTTAP_SMOOTH;
+ // For scale factors > 0.75, set the phase to 0 (aligns decimated pixel
+ // to source pixel).
+ if (scaling_factor_num > (3 * scaling_factor_den) >> 2)
+ svc->downsample_filter_phase[svc->spatial_layer_id] = 0;
+
+  // The use of use_base_mv or partition_reuse assumes a 2x2 down-scale.
+  // For now, turn off base motion vectors and partition reuse if the spatial
+  // scale factor for any layer is not 2, except for the case of 3 spatial
+  // layers with a 4x4 scale factor for the base layer.
+ // TODO(marpan): Fix this to allow for use_base_mv for scale factors != 2.
+ if (svc->number_spatial_layers > 1) {
+ int sl;
+ for (sl = 0; sl < svc->number_spatial_layers - 1; ++sl) {
+ lc = &svc->layer_context[sl * svc->number_temporal_layers +
+ svc->temporal_layer_id];
+ if ((lc->scaling_factor_num != lc->scaling_factor_den >> 1) &&
+ !(lc->scaling_factor_num == lc->scaling_factor_den >> 2 && sl == 0 &&
+ svc->number_spatial_layers == 3)) {
+ svc->use_base_mv = 0;
+ svc->use_partition_reuse = 0;
+ break;
+ }
+ }
+    // For non-zero spatial layers: if the previous spatial layer was dropped,
+    // disable the base_mv and partition_reuse features.
+ if (svc->spatial_layer_id > 0 &&
+ svc->drop_spatial_layer[svc->spatial_layer_id - 1]) {
+ svc->use_base_mv = 0;
+ svc->use_partition_reuse = 0;
+ }
+ }
+
+ svc->non_reference_frame = 0;
+ if (cpi->common.frame_type != KEY_FRAME && !cpi->ext_refresh_last_frame &&
+ !cpi->ext_refresh_golden_frame && !cpi->ext_refresh_alt_ref_frame)
+ svc->non_reference_frame = 1;
+  // For flexible mode, where update_buffer_slot is used, we need to check
+  // that no buffer slot is refreshed.
+ if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) {
+ if (svc->update_buffer_slot[svc->spatial_layer_id] != 0)
+ svc->non_reference_frame = 0;
+ }
+
+ if (svc->spatial_layer_id == 0) {
+ svc->high_source_sad_superframe = 0;
+ svc->high_num_blocks_with_motion = 0;
+ }
+
+ if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS &&
+ svc->last_layer_dropped[svc->spatial_layer_id] &&
+ svc->fb_idx_upd_tl0[svc->spatial_layer_id] != -1 &&
+ !svc->layer_context[svc->temporal_layer_id].is_key_frame) {
+    // For fixed/non-flexible mode, if the previous frame (same spatial layer
+    // from the previous superframe) was dropped, make sure the lst_fb_idx
+    // for this frame corresponds to the buffer index updated on the last
+    // encoded TL0 frame (with the same spatial layer).
+ cpi->lst_fb_idx = svc->fb_idx_upd_tl0[svc->spatial_layer_id];
+ }
+
+ if (vp9_set_size_literal(cpi, width, height) != 0)
+ return VPX_CODEC_INVALID_PARAM;
+
+ return 0;
+}
+
+struct lookahead_entry *vp9_svc_lookahead_pop(VP9_COMP *const cpi,
+ struct lookahead_ctx *ctx,
+ int drain) {
+ struct lookahead_entry *buf = NULL;
+ if (ctx->sz && (drain || ctx->sz == ctx->max_sz - MAX_PRE_FRAMES)) {
+ buf = vp9_lookahead_peek(ctx, 0);
+ if (buf != NULL) {
+      // Only remove the buffer when popping the highest layer.
+ if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) {
+ vp9_lookahead_pop(ctx, drain);
+ }
+ }
+ }
+ return buf;
+}
+
+void vp9_free_svc_cyclic_refresh(VP9_COMP *const cpi) {
+ int sl, tl;
+ SVC *const svc = &cpi->svc;
+ const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+ for (sl = 0; sl < oxcf->ss_number_layers; ++sl) {
+ for (tl = 0; tl < oxcf->ts_number_layers; ++tl) {
+ int layer = LAYER_IDS_TO_IDX(sl, tl, oxcf->ts_number_layers);
+ LAYER_CONTEXT *const lc = &svc->layer_context[layer];
+ if (lc->map) vpx_free(lc->map);
+ if (lc->last_coded_q_map) vpx_free(lc->last_coded_q_map);
+ if (lc->consec_zero_mv) vpx_free(lc->consec_zero_mv);
+ }
+ }
+}
+
+// Reset on key frame: reset counters, references and buffer updates.
+void vp9_svc_reset_temporal_layers(VP9_COMP *const cpi, int is_key) {
+ int sl, tl;
+ SVC *const svc = &cpi->svc;
+ LAYER_CONTEXT *lc = NULL;
+ for (sl = 0; sl < svc->number_spatial_layers; ++sl) {
+ for (tl = 0; tl < svc->number_temporal_layers; ++tl) {
+ lc = &cpi->svc.layer_context[sl * svc->number_temporal_layers + tl];
+ lc->current_video_frame_in_layer = 0;
+ if (is_key) lc->frames_from_key_frame = 0;
+ }
+ }
+ if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_0212) {
+ set_flags_and_fb_idx_for_temporal_mode3(cpi);
+ } else if (svc->temporal_layering_mode ==
+ VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING) {
+ set_flags_and_fb_idx_for_temporal_mode_noLayering(cpi);
+ } else if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_0101) {
+ set_flags_and_fb_idx_for_temporal_mode2(cpi);
+ }
+ vp9_update_temporal_layer_framerate(cpi);
+ vp9_restore_layer_context(cpi);
+}
+
+void vp9_svc_check_reset_layer_rc_flag(VP9_COMP *const cpi) {
+ SVC *svc = &cpi->svc;
+ int sl, tl;
+ for (sl = 0; sl < svc->number_spatial_layers; ++sl) {
+ // Check for reset based on avg_frame_bandwidth for spatial layer sl.
+ const int spatial_layer_idx = LAYER_IDS_TO_IDX(
+ sl, svc->number_temporal_layers - 1, svc->number_temporal_layers);
+ LAYER_CONTEXT *lc = &svc->layer_context[spatial_layer_idx];
+ RATE_CONTROL *lrc = &lc->rc;
+ if (lrc->avg_frame_bandwidth > (3 * lrc->last_avg_frame_bandwidth >> 1) ||
+ lrc->avg_frame_bandwidth < (lrc->last_avg_frame_bandwidth >> 1)) {
+ // Reset for all temporal layers with spatial layer sl.
+ for (tl = 0; tl < svc->number_temporal_layers; ++tl) {
+ int temporal_layer_idx =
+ LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
+ lrc = &svc->layer_context[temporal_layer_idx].rc;
+ lrc->rc_1_frame = 0;
+ lrc->rc_2_frame = 0;
+ lrc->bits_off_target = lrc->optimal_buffer_level;
+ lrc->buffer_level = lrc->optimal_buffer_level;
+ }
+ }
+ }
+}
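A quick numeric check of the reset condition above (values are illustrative only):

    /* With last_avg_frame_bandwidth == 10000, a reset triggers once
     *   avg_frame_bandwidth > (3 * 10000) >> 1 == 15000
     * or
     *   avg_frame_bandwidth < 10000 >> 1 == 5000. */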
+
+void vp9_svc_constrain_inter_layer_pred(VP9_COMP *const cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ SVC *const svc = &cpi->svc;
+ const int sl = svc->spatial_layer_id;
+ // Check for disabling inter-layer (spatial) prediction, if
+ // svc.disable_inter_layer_pred is set. If the previous spatial layer was
+ // dropped then disable the prediction from this (scaled) reference.
+ // For INTER_LAYER_PRED_OFF_NONKEY: inter-layer prediction is disabled
+ // on key frames or if any spatial layer is a sync layer.
+ if ((svc->disable_inter_layer_pred == INTER_LAYER_PRED_OFF_NONKEY &&
+ !svc->layer_context[svc->temporal_layer_id].is_key_frame &&
+ !svc->superframe_has_layer_sync) ||
+ svc->disable_inter_layer_pred == INTER_LAYER_PRED_OFF ||
+ svc->drop_spatial_layer[sl - 1]) {
+ MV_REFERENCE_FRAME ref_frame;
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
+ if (yv12 != NULL &&
+ (cpi->ref_frame_flags & ref_frame_to_flag(ref_frame))) {
+ const struct scale_factors *const scale_fac =
+ &cm->frame_refs[ref_frame - 1].sf;
+ if (vp9_is_scaled(scale_fac)) {
+ cpi->ref_frame_flags &= (~ref_frame_to_flag(ref_frame));
+ // Point golden/altref frame buffer index to last.
+ if (!svc->simulcast_mode) {
+ if (ref_frame == GOLDEN_FRAME)
+ cpi->gld_fb_idx = cpi->lst_fb_idx;
+ else if (ref_frame == ALTREF_FRAME)
+ cpi->alt_fb_idx = cpi->lst_fb_idx;
+ }
+ }
+ }
+ }
+ }
+ // For fixed/non-flexible SVC: check for disabling inter-layer prediction.
+ // If the reference for inter-layer prediction (the reference that is scaled)
+ // is not the previous spatial layer from the same superframe, then we disable
+ // inter-layer prediction. Only need to check when inter_layer prediction is
+ // not set to OFF mode.
+ if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS &&
+ svc->disable_inter_layer_pred != INTER_LAYER_PRED_OFF) {
+ // We only use LAST and GOLDEN for prediction in real-time mode, so we
+ // check both here.
+ MV_REFERENCE_FRAME ref_frame;
+ for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ref_frame++) {
+ struct scale_factors *scale_fac = &cm->frame_refs[ref_frame - 1].sf;
+ if (vp9_is_scaled(scale_fac)) {
+ // If this reference was updated on the previous spatial layer of the
+ // current superframe, then we keep this reference (don't disable).
+ // Otherwise we disable the inter-layer prediction.
+        // This condition is verified by checking if the current frame buffer
+        // index is equal to any of the slots for the previous spatial layer,
+        // and, if so, checking whether that slot was updated/refreshed. If
+        // that is the case, then this reference is valid for inter-layer
+        // prediction under the mode INTER_LAYER_PRED_ON_CONSTRAINED.
+ int fb_idx =
+ ref_frame == LAST_FRAME ? cpi->lst_fb_idx : cpi->gld_fb_idx;
+ int ref_flag = ref_frame == LAST_FRAME ? VP9_LAST_FLAG : VP9_GOLD_FLAG;
+ int disable = 1;
+ if (fb_idx < 0) continue;
+ if ((fb_idx == svc->lst_fb_idx[sl - 1] &&
+ (svc->update_buffer_slot[sl - 1] & (1 << fb_idx))) ||
+ (fb_idx == svc->gld_fb_idx[sl - 1] &&
+ (svc->update_buffer_slot[sl - 1] & (1 << fb_idx))) ||
+ (fb_idx == svc->alt_fb_idx[sl - 1] &&
+ (svc->update_buffer_slot[sl - 1] & (1 << fb_idx))))
+ disable = 0;
+ if (disable) cpi->ref_frame_flags &= (~ref_flag);
+ }
+ }
+ }
+}
+
+void vp9_svc_assert_constraints_pattern(VP9_COMP *const cpi) {
+ SVC *const svc = &cpi->svc;
+  // For fixed/non-flexible mode, the following constraints are expected
+  // when inter-layer prediction is on (default).
+ if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS &&
+ svc->disable_inter_layer_pred == INTER_LAYER_PRED_ON &&
+ svc->framedrop_mode != LAYER_DROP) {
+ if (!svc->layer_context[svc->temporal_layer_id].is_key_frame) {
+ // On non-key frames: LAST is always temporal reference, GOLDEN is
+ // spatial reference.
+ if (svc->temporal_layer_id == 0)
+ // Base temporal only predicts from base temporal.
+ assert(svc->fb_idx_temporal_layer_id[cpi->lst_fb_idx] == 0);
+ else
+ // Non-base temporal only predicts from lower temporal layer.
+ assert(svc->fb_idx_temporal_layer_id[cpi->lst_fb_idx] <
+ svc->temporal_layer_id);
+ if (svc->spatial_layer_id > 0 && cpi->ref_frame_flags & VP9_GOLD_FLAG &&
+ svc->spatial_layer_id > svc->first_spatial_layer_to_encode) {
+ // Non-base spatial only predicts from lower spatial layer with same
+ // temporal_id.
+ assert(svc->fb_idx_spatial_layer_id[cpi->gld_fb_idx] ==
+ svc->spatial_layer_id - 1);
+ assert(svc->fb_idx_temporal_layer_id[cpi->gld_fb_idx] ==
+ svc->temporal_layer_id);
+ }
+ } else if (svc->spatial_layer_id > 0 &&
+ svc->spatial_layer_id > svc->first_spatial_layer_to_encode) {
+ // Only 1 reference for frame whose base is key; reference may be LAST
+ // or GOLDEN, so we check both.
+ if (cpi->ref_frame_flags & VP9_LAST_FLAG) {
+ assert(svc->fb_idx_spatial_layer_id[cpi->lst_fb_idx] ==
+ svc->spatial_layer_id - 1);
+ assert(svc->fb_idx_temporal_layer_id[cpi->lst_fb_idx] ==
+ svc->temporal_layer_id);
+ } else if (cpi->ref_frame_flags & VP9_GOLD_FLAG) {
+ assert(svc->fb_idx_spatial_layer_id[cpi->gld_fb_idx] ==
+ svc->spatial_layer_id - 1);
+ assert(svc->fb_idx_temporal_layer_id[cpi->gld_fb_idx] ==
+ svc->temporal_layer_id);
+ }
+ }
+ } else if (svc->use_gf_temporal_ref_current_layer &&
+ !svc->layer_context[svc->temporal_layer_id].is_key_frame) {
+    // For the usage of golden as a second long term reference: the
+    // temporal_layer_id of that reference must be the base temporal layer 0,
+    // and its spatial_layer_id must be the same as the current
+    // spatial_layer_id. If not, disable the feature.
+ // TODO(marpan): Investigate when this can happen, and maybe put this check
+ // and reset in a different place.
+ if (svc->fb_idx_spatial_layer_id[cpi->gld_fb_idx] !=
+ svc->spatial_layer_id ||
+ svc->fb_idx_temporal_layer_id[cpi->gld_fb_idx] != 0)
+ svc->use_gf_temporal_ref_current_layer = 0;
+ }
+}
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+int vp9_denoise_svc_non_key(VP9_COMP *const cpi) {
+ int layer =
+ LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id, cpi->svc.temporal_layer_id,
+ cpi->svc.number_temporal_layers);
+ LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer];
+ return denoise_svc(cpi) && !lc->is_key_frame;
+}
+#endif
+
+void vp9_svc_check_spatial_layer_sync(VP9_COMP *const cpi) {
+ SVC *const svc = &cpi->svc;
+ // Only for superframes whose base is not key, as those are
+ // already sync frames.
+ if (!svc->layer_context[svc->temporal_layer_id].is_key_frame) {
+ if (svc->spatial_layer_id == 0) {
+ // On base spatial layer: if the current superframe has a layer sync then
+ // reset the pattern counters and reset to base temporal layer.
+ if (svc->superframe_has_layer_sync)
+ vp9_svc_reset_temporal_layers(cpi, cpi->common.frame_type == KEY_FRAME);
+ }
+ // If the layer sync is set for this current spatial layer then
+ // disable the temporal reference.
+ if (svc->spatial_layer_id > 0 &&
+ svc->spatial_layer_sync[svc->spatial_layer_id]) {
+ cpi->ref_frame_flags &= (~VP9_LAST_FLAG);
+ if (svc->use_gf_temporal_ref_current_layer) {
+ int index = svc->spatial_layer_id;
+          // If golden is used as a second reference: remove it from
+          // prediction, reset its refresh period to 0, and update the reference.
+ svc->use_gf_temporal_ref_current_layer = 0;
+ cpi->rc.baseline_gf_interval = 0;
+ cpi->rc.frames_till_gf_update_due = 0;
+ // On layer sync frame we must update the buffer index used for long
+ // term reference. Use the alt_ref since it is not used or updated on
+ // sync frames.
+ if (svc->number_spatial_layers == 3) index = svc->spatial_layer_id - 1;
+ assert(index >= 0);
+ cpi->alt_fb_idx = svc->buffer_gf_temporal_ref[index].idx;
+ cpi->ext_refresh_alt_ref_frame = 1;
+ }
+ }
+ }
+}
+
+void vp9_svc_update_ref_frame_buffer_idx(VP9_COMP *const cpi) {
+ SVC *const svc = &cpi->svc;
+ int i = 0;
+  // Update the usage of frame buffer indices for the base spatial layer.
+ if (svc->spatial_layer_id == 0) {
+ if ((cpi->ref_frame_flags & VP9_LAST_FLAG) || cpi->refresh_last_frame)
+ svc->fb_idx_base[cpi->lst_fb_idx] = 1;
+ if ((cpi->ref_frame_flags & VP9_GOLD_FLAG) || cpi->refresh_golden_frame)
+ svc->fb_idx_base[cpi->gld_fb_idx] = 1;
+ if ((cpi->ref_frame_flags & VP9_ALT_FLAG) || cpi->refresh_alt_ref_frame)
+ svc->fb_idx_base[cpi->alt_fb_idx] = 1;
+ // For bypass/flexible mode: check for refresh slots.
+ if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) {
+ for (i = 0; i < REF_FRAMES; ++i)
+ if (svc->update_buffer_slot[0] & (1 << i)) svc->fb_idx_base[i] = 1;
+ }
+ }
+}
+
+static void vp9_svc_update_ref_frame_bypass_mode(VP9_COMP *const cpi) {
+  // For the flexible/bypass SVC mode: check for refreshing other buffer
+  // slots.
+ SVC *const svc = &cpi->svc;
+ VP9_COMMON *const cm = &cpi->common;
+ BufferPool *const pool = cm->buffer_pool;
+ int i;
+ for (i = 0; i < REF_FRAMES; i++) {
+ if ((cm->frame_type == KEY_FRAME && !svc->simulcast_mode) ||
+ svc->update_buffer_slot[svc->spatial_layer_id] & (1 << i)) {
+ ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[i], cm->new_fb_idx);
+ svc->fb_idx_spatial_layer_id[i] = svc->spatial_layer_id;
+ svc->fb_idx_temporal_layer_id[i] = svc->temporal_layer_id;
+ }
+ }
+}
+
+void vp9_svc_update_ref_frame(VP9_COMP *const cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ SVC *const svc = &cpi->svc;
+ BufferPool *const pool = cm->buffer_pool;
+
+ if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS &&
+ svc->use_set_ref_frame_config) {
+ vp9_svc_update_ref_frame_bypass_mode(cpi);
+ } else if (cm->frame_type == KEY_FRAME && !svc->simulcast_mode) {
+ // Keep track of frame index for each reference frame.
+ int i;
+ // On key frame update all reference frame slots.
+ for (i = 0; i < REF_FRAMES; i++) {
+ svc->fb_idx_spatial_layer_id[i] = svc->spatial_layer_id;
+ svc->fb_idx_temporal_layer_id[i] = svc->temporal_layer_id;
+ // LAST/GOLDEN/ALTREF is already updated above.
+ if (i != cpi->lst_fb_idx && i != cpi->gld_fb_idx && i != cpi->alt_fb_idx)
+ ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[i], cm->new_fb_idx);
+ }
+ } else {
+ if (cpi->refresh_last_frame) {
+ svc->fb_idx_spatial_layer_id[cpi->lst_fb_idx] = svc->spatial_layer_id;
+ svc->fb_idx_temporal_layer_id[cpi->lst_fb_idx] = svc->temporal_layer_id;
+ }
+ if (cpi->refresh_golden_frame) {
+ svc->fb_idx_spatial_layer_id[cpi->gld_fb_idx] = svc->spatial_layer_id;
+ svc->fb_idx_temporal_layer_id[cpi->gld_fb_idx] = svc->temporal_layer_id;
+ }
+ if (cpi->refresh_alt_ref_frame) {
+ svc->fb_idx_spatial_layer_id[cpi->alt_fb_idx] = svc->spatial_layer_id;
+ svc->fb_idx_temporal_layer_id[cpi->alt_fb_idx] = svc->temporal_layer_id;
+ }
+ }
+ // Copy flags from encoder to SVC struct.
+ vp9_copy_flags_ref_update_idx(cpi);
+ vp9_svc_update_ref_frame_buffer_idx(cpi);
+}
+
+void vp9_svc_adjust_frame_rate(VP9_COMP *const cpi) {
+ int64_t this_duration =
+ cpi->svc.timebase_fac * cpi->svc.duration[cpi->svc.spatial_layer_id];
+ vp9_new_framerate(cpi, 10000000.0 / this_duration);
+}
+
+void vp9_svc_adjust_avg_frame_qindex(VP9_COMP *const cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ SVC *const svc = &cpi->svc;
+ RATE_CONTROL *const rc = &cpi->rc;
+  // On key frames in CBR mode: reset the avg_frame_qindex for the base layer
+  // (to a level closer to worst_quality) if the overshoot is significant.
+  // Reset it for all temporal layers on the base spatial layer.
+ if (cm->frame_type == KEY_FRAME && cpi->oxcf.rc_mode == VPX_CBR &&
+ !svc->simulcast_mode &&
+ rc->projected_frame_size > 3 * rc->avg_frame_bandwidth) {
+ int tl;
+ rc->avg_frame_qindex[INTER_FRAME] =
+ VPXMAX(rc->avg_frame_qindex[INTER_FRAME],
+ (cm->base_qindex + rc->worst_quality) >> 1);
+ for (tl = 0; tl < svc->number_temporal_layers; ++tl) {
+ const int layer = LAYER_IDS_TO_IDX(0, tl, svc->number_temporal_layers);
+ LAYER_CONTEXT *lc = &svc->layer_context[layer];
+ RATE_CONTROL *lrc = &lc->rc;
+ lrc->avg_frame_qindex[INTER_FRAME] = rc->avg_frame_qindex[INTER_FRAME];
+ }
+ }
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_svc_layercontext.h b/media/libvpx/libvpx/vp9/encoder/vp9_svc_layercontext.h
new file mode 100644
index 0000000000..90dec5e20a
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_svc_layercontext.h
@@ -0,0 +1,288 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_VP9_SVC_LAYERCONTEXT_H_
+#define VPX_VP9_ENCODER_VP9_SVC_LAYERCONTEXT_H_
+
+#include "vpx/vpx_encoder.h"
+
+#include "vp9/encoder/vp9_ratectrl.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum {
+  // Inter-layer prediction is on for all frames.
+ INTER_LAYER_PRED_ON,
+ // Inter-layer prediction is off on all frames.
+ INTER_LAYER_PRED_OFF,
+ // Inter-layer prediction is off on non-key frames and non-sync frames.
+ INTER_LAYER_PRED_OFF_NONKEY,
+  // Inter-layer prediction is on for all frames, but constrained such
+  // that any layer S (> 0) can only predict from the previous spatial
+  // layer S-1, within the same superframe.
+ INTER_LAYER_PRED_ON_CONSTRAINED
+} INTER_LAYER_PRED;
+
+typedef struct BUFFER_LONGTERM_REF {
+ int idx;
+ int is_used;
+} BUFFER_LONGTERM_REF;
+
+typedef struct {
+ RATE_CONTROL rc;
+ int target_bandwidth;
+ int spatial_layer_target_bandwidth; // Target for the spatial layer.
+ double framerate;
+ int avg_frame_size;
+ int max_q;
+ int min_q;
+ int scaling_factor_num;
+ int scaling_factor_den;
+ // Scaling factors used for internal resize scaling for single layer SVC.
+ int scaling_factor_num_resize;
+ int scaling_factor_den_resize;
+ TWO_PASS twopass;
+ vpx_fixed_buf_t rc_twopass_stats_in;
+ unsigned int current_video_frame_in_layer;
+ int is_key_frame;
+ int frames_from_key_frame;
+ FRAME_TYPE last_frame_type;
+ struct lookahead_entry *alt_ref_source;
+ int alt_ref_idx;
+ int gold_ref_idx;
+ int has_alt_frame;
+ size_t layer_size;
+  // Cyclic refresh parameters (aq-mode=3) that need to be updated per frame.
+  // TODO(jianj/marpan): Is it better to use the full cyclic refresh struct?
+ int sb_index;
+ signed char *map;
+ uint8_t *last_coded_q_map;
+ uint8_t *consec_zero_mv;
+ int actual_num_seg1_blocks;
+ int actual_num_seg2_blocks;
+ int counter_encode_maxq_scene_change;
+ int qindex_delta[3];
+ uint8_t speed;
+ int loopfilter_ctrl;
+ int frame_qp;
+ int MBs;
+} LAYER_CONTEXT;
+
+typedef struct SVC {
+ int spatial_layer_id;
+ int temporal_layer_id;
+ int number_spatial_layers;
+ int number_temporal_layers;
+
+ int spatial_layer_to_encode;
+
+ // Workaround for multiple frame contexts
+ enum { ENCODED = 0, ENCODING, NEED_TO_ENCODE } encode_empty_frame_state;
+ struct lookahead_entry empty_frame;
+ int encode_intra_empty_frame;
+
+  // Store scaled source frames to be used by the temporal filter to generate
+  // an alt-ref frame.
+ YV12_BUFFER_CONFIG scaled_frames[MAX_LAG_BUFFERS];
+ // Temp buffer used for 2-stage down-sampling, for real-time mode.
+ YV12_BUFFER_CONFIG scaled_temp;
+ int scaled_one_half;
+ int scaled_temp_is_alloc;
+
+ // Layer context used for rate control in one pass temporal CBR mode or
+ // two pass spatial mode.
+ LAYER_CONTEXT layer_context[VPX_MAX_LAYERS];
+ // Indicates what sort of temporal layering is used.
+ // Currently, this only works for CBR mode.
+ VP9E_TEMPORAL_LAYERING_MODE temporal_layering_mode;
+ // Frame flags and buffer indexes for each spatial layer, set by the
+ // application (external settings).
+ int ext_frame_flags[VPX_MAX_LAYERS];
+ int lst_fb_idx[VPX_MAX_LAYERS];
+ int gld_fb_idx[VPX_MAX_LAYERS];
+ int alt_fb_idx[VPX_MAX_LAYERS];
+ int force_zero_mode_spatial_ref;
+ // Sequence level flag to enable second (long term) temporal reference.
+ int use_gf_temporal_ref;
+ // Frame level flag to enable second (long term) temporal reference.
+ int use_gf_temporal_ref_current_layer;
+  // Allow a second reference for at most the 2 highest resolution layers.
+ BUFFER_LONGTERM_REF buffer_gf_temporal_ref[2];
+ int current_superframe;
+ int non_reference_frame;
+ int use_base_mv;
+ int use_partition_reuse;
+ // Used to control the downscaling filter for source scaling, for 1 pass CBR.
+ // downsample_filter_phase: = 0 will do sub-sampling (no weighted average),
+ // = 8 will center the target pixel and get a symmetric averaging filter.
+ // downsample_filter_type: 4 filters may be used: eighttap_regular,
+ // eighttap_smooth, eighttap_sharp, and bilinear.
+ INTERP_FILTER downsample_filter_type[VPX_SS_MAX_LAYERS];
+ int downsample_filter_phase[VPX_SS_MAX_LAYERS];
+
+ BLOCK_SIZE *prev_partition_svc;
+ int mi_stride[VPX_MAX_LAYERS];
+ int mi_rows[VPX_MAX_LAYERS];
+ int mi_cols[VPX_MAX_LAYERS];
+
+ int first_layer_denoise;
+
+ int skip_enhancement_layer;
+
+ int lower_layer_qindex;
+
+ int last_layer_dropped[VPX_MAX_LAYERS];
+ int drop_spatial_layer[VPX_MAX_LAYERS];
+ int framedrop_thresh[VPX_MAX_LAYERS];
+ int drop_count[VPX_MAX_LAYERS];
+ int force_drop_constrained_from_above[VPX_MAX_LAYERS];
+ int max_consec_drop;
+ SVC_LAYER_DROP_MODE framedrop_mode;
+
+ INTER_LAYER_PRED disable_inter_layer_pred;
+
+  // Flags to indicate a scene change and a high number of motion blocks in
+  // the current superframe; scene detection is currently checked for each
+  // superframe prior to encoding, on the full resolution source.
+ int high_source_sad_superframe;
+ int high_num_blocks_with_motion;
+
+ // Flags used to get SVC pattern info.
+ int update_buffer_slot[VPX_SS_MAX_LAYERS];
+ uint8_t reference_last[VPX_SS_MAX_LAYERS];
+ uint8_t reference_golden[VPX_SS_MAX_LAYERS];
+ uint8_t reference_altref[VPX_SS_MAX_LAYERS];
+ // TODO(jianj): Remove these last 3, deprecated.
+ uint8_t update_last[VPX_SS_MAX_LAYERS];
+ uint8_t update_golden[VPX_SS_MAX_LAYERS];
+ uint8_t update_altref[VPX_SS_MAX_LAYERS];
+
+ // Keep track of the frame buffer index updated/refreshed on the base
+ // temporal superframe.
+ int fb_idx_upd_tl0[VPX_SS_MAX_LAYERS];
+
+ // Keep track of the spatial and temporal layer id of the frame that last
+ // updated the frame buffer index.
+ uint8_t fb_idx_spatial_layer_id[REF_FRAMES];
+ uint8_t fb_idx_temporal_layer_id[REF_FRAMES];
+
+ int spatial_layer_sync[VPX_SS_MAX_LAYERS];
+ // Quantizer for each spatial layer.
+ int base_qindex[VPX_SS_MAX_LAYERS];
+ uint8_t set_intra_only_frame;
+ uint8_t previous_frame_is_intra_only;
+ uint8_t superframe_has_layer_sync;
+
+ uint8_t fb_idx_base[REF_FRAMES];
+
+ int use_set_ref_frame_config;
+
+ int temporal_layer_id_per_spatial[VPX_SS_MAX_LAYERS];
+
+ int first_spatial_layer_to_encode;
+
+ // Parameters for allowing framerate per spatial layer, and buffer
+ // update based on timestamps.
+ int64_t duration[VPX_SS_MAX_LAYERS];
+ int64_t timebase_fac;
+ int64_t time_stamp_superframe;
+ int64_t time_stamp_prev[VPX_SS_MAX_LAYERS];
+
+ int num_encoded_top_layer;
+
+ // Every spatial layer on a superframe whose base is key is key too.
+ int simulcast_mode;
+
+ // Flag to indicate SVC is dynamically switched to a single layer.
+ int single_layer_svc;
+ int resize_set;
+} SVC;
+
+struct VP9_COMP;
+
+// Initialize layer context data from init_config().
+void vp9_init_layer_context(struct VP9_COMP *const cpi);
+
+// Update the layer context from a change_config() call.
+void vp9_update_layer_context_change_config(struct VP9_COMP *const cpi,
+ const int target_bandwidth);
+
+// Prior to encoding the frame, update framerate-related quantities
+// for the current temporal layer.
+void vp9_update_temporal_layer_framerate(struct VP9_COMP *const cpi);
+
+// Update framerate-related quantities for the current spatial layer.
+void vp9_update_spatial_layer_framerate(struct VP9_COMP *const cpi,
+ double framerate);
+
+// Prior to encoding the frame, set the layer context, for the current layer
+// to be encoded, to the cpi struct.
+void vp9_restore_layer_context(struct VP9_COMP *const cpi);
+
+// Save the layer context after encoding the frame.
+void vp9_save_layer_context(struct VP9_COMP *const cpi);
+
+// Initialize second pass rc for spatial svc.
+void vp9_init_second_pass_spatial_svc(struct VP9_COMP *cpi);
+
+void get_layer_resolution(const int width_org, const int height_org,
+ const int num, const int den, int *width_out,
+ int *height_out);
+
+// Increment number of video frames in layer
+void vp9_inc_frame_in_layer(struct VP9_COMP *const cpi);
+
+// Check if current layer is key frame in spatial upper layer
+int vp9_is_upper_layer_key_frame(const struct VP9_COMP *const cpi);
+
+// Get the next source buffer to encode
+struct lookahead_entry *vp9_svc_lookahead_pop(struct VP9_COMP *const cpi,
+ struct lookahead_ctx *ctx,
+ int drain);
+
+// Start a frame and initialize svc parameters
+int vp9_svc_start_frame(struct VP9_COMP *const cpi);
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+int vp9_denoise_svc_non_key(struct VP9_COMP *const cpi);
+#endif
+
+void vp9_copy_flags_ref_update_idx(struct VP9_COMP *const cpi);
+
+int vp9_one_pass_svc_start_layer(struct VP9_COMP *const cpi);
+
+void vp9_free_svc_cyclic_refresh(struct VP9_COMP *const cpi);
+
+void vp9_svc_reset_temporal_layers(struct VP9_COMP *const cpi, int is_key);
+
+void vp9_svc_check_reset_layer_rc_flag(struct VP9_COMP *const cpi);
+
+void vp9_svc_constrain_inter_layer_pred(struct VP9_COMP *const cpi);
+
+void vp9_svc_assert_constraints_pattern(struct VP9_COMP *const cpi);
+
+void vp9_svc_check_spatial_layer_sync(struct VP9_COMP *const cpi);
+
+void vp9_svc_update_ref_frame_buffer_idx(struct VP9_COMP *const cpi);
+
+void vp9_svc_update_ref_frame_key_simulcast(struct VP9_COMP *const cpi);
+
+void vp9_svc_update_ref_frame(struct VP9_COMP *const cpi);
+
+void vp9_svc_adjust_frame_rate(struct VP9_COMP *const cpi);
+
+void vp9_svc_adjust_avg_frame_qindex(struct VP9_COMP *const cpi);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP9_ENCODER_VP9_SVC_LAYERCONTEXT_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_temporal_filter.c b/media/libvpx/libvpx/vp9/encoder/vp9_temporal_filter.c
new file mode 100644
index 0000000000..986553a4a8
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_temporal_filter.c
@@ -0,0 +1,1205 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <math.h>
+#include <limits.h>
+
+#include "vp9/common/vp9_alloccommon.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/common/vp9_quant_common.h"
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/encoder/vp9_encodeframe.h"
+#include "vp9/encoder/vp9_ethread.h"
+#include "vp9/encoder/vp9_extend.h"
+#include "vp9/encoder/vp9_firstpass.h"
+#include "vp9/encoder/vp9_mcomp.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_quantize.h"
+#include "vp9/encoder/vp9_ratectrl.h"
+#include "vp9/encoder/vp9_segmentation.h"
+#include "vp9/encoder/vp9_temporal_filter.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/vpx_timer.h"
+#include "vpx_scale/vpx_scale.h"
+
+static int fixed_divide[512];
+static unsigned int index_mult[14] = { 0, 0, 0, 0, 49152,
+ 39322, 32768, 28087, 24576, 21846,
+ 19661, 17874, 0, 15124 };
+#if CONFIG_VP9_HIGHBITDEPTH
+static int64_t highbd_index_mult[14] = { 0U, 0U, 0U,
+ 0U, 3221225472U, 2576980378U,
+ 2147483648U, 1840700270U, 1610612736U,
+ 1431655766U, 1288490189U, 1171354718U,
+ 0U, 991146300U };
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+static void temporal_filter_predictors_mb_c(
+ MACROBLOCKD *xd, uint8_t *y_mb_ptr, uint8_t *u_mb_ptr, uint8_t *v_mb_ptr,
+ int stride, int uv_block_width, int uv_block_height, int mv_row, int mv_col,
+ uint8_t *pred, struct scale_factors *scale, int x, int y, MV *blk_mvs,
+ int use_32x32) {
+ const int which_mv = 0;
+ const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP_SHARP];
+ int i, j, k = 0, ys = (BH >> 1), xs = (BW >> 1);
+
+ enum mv_precision mv_precision_uv;
+ int uv_stride;
+ if (uv_block_width == (BW >> 1)) {
+ uv_stride = (stride + 1) >> 1;
+ mv_precision_uv = MV_PRECISION_Q4;
+ } else {
+ uv_stride = stride;
+ mv_precision_uv = MV_PRECISION_Q3;
+ }
+#if !CONFIG_VP9_HIGHBITDEPTH
+ (void)xd;
+#endif
+
+ if (use_32x32) {
+ const MV mv = { mv_row, mv_col };
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ vp9_highbd_build_inter_predictor(CONVERT_TO_SHORTPTR(y_mb_ptr), stride,
+ CONVERT_TO_SHORTPTR(&pred[0]), BW, &mv,
+ scale, BW, BH, which_mv, kernel,
+ MV_PRECISION_Q3, x, y, xd->bd);
+
+ vp9_highbd_build_inter_predictor(
+ CONVERT_TO_SHORTPTR(u_mb_ptr), uv_stride,
+ CONVERT_TO_SHORTPTR(&pred[BLK_PELS]), uv_block_width, &mv, scale,
+ uv_block_width, uv_block_height, which_mv, kernel, mv_precision_uv, x,
+ y, xd->bd);
+
+ vp9_highbd_build_inter_predictor(
+ CONVERT_TO_SHORTPTR(v_mb_ptr), uv_stride,
+ CONVERT_TO_SHORTPTR(&pred[(BLK_PELS << 1)]), uv_block_width, &mv,
+ scale, uv_block_width, uv_block_height, which_mv, kernel,
+ mv_precision_uv, x, y, xd->bd);
+ return;
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ vp9_build_inter_predictor(y_mb_ptr, stride, &pred[0], BW, &mv, scale, BW,
+ BH, which_mv, kernel, MV_PRECISION_Q3, x, y);
+
+ vp9_build_inter_predictor(u_mb_ptr, uv_stride, &pred[BLK_PELS],
+ uv_block_width, &mv, scale, uv_block_width,
+ uv_block_height, which_mv, kernel,
+ mv_precision_uv, x, y);
+
+ vp9_build_inter_predictor(v_mb_ptr, uv_stride, &pred[(BLK_PELS << 1)],
+ uv_block_width, &mv, scale, uv_block_width,
+ uv_block_height, which_mv, kernel,
+ mv_precision_uv, x, y);
+ return;
+ }
+
+  // When use_32x32 = 0, construct the 32x32 predictor from 4 16x16
+  // predictors.
+ // Y predictor
+ for (i = 0; i < BH; i += ys) {
+ for (j = 0; j < BW; j += xs) {
+ const MV mv = blk_mvs[k];
+ const int y_offset = i * stride + j;
+ const int p_offset = i * BW + j;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ vp9_highbd_build_inter_predictor(
+ CONVERT_TO_SHORTPTR(y_mb_ptr + y_offset), stride,
+ CONVERT_TO_SHORTPTR(&pred[p_offset]), BW, &mv, scale, xs, ys,
+ which_mv, kernel, MV_PRECISION_Q3, x, y, xd->bd);
+ } else {
+ vp9_build_inter_predictor(y_mb_ptr + y_offset, stride, &pred[p_offset],
+ BW, &mv, scale, xs, ys, which_mv, kernel,
+ MV_PRECISION_Q3, x, y);
+ }
+#else
+ vp9_build_inter_predictor(y_mb_ptr + y_offset, stride, &pred[p_offset],
+ BW, &mv, scale, xs, ys, which_mv, kernel,
+ MV_PRECISION_Q3, x, y);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ k++;
+ }
+ }
+
+ // U and V predictors
+ ys = (uv_block_height >> 1);
+ xs = (uv_block_width >> 1);
+ k = 0;
+
+ for (i = 0; i < uv_block_height; i += ys) {
+ for (j = 0; j < uv_block_width; j += xs) {
+ const MV mv = blk_mvs[k];
+ const int uv_offset = i * uv_stride + j;
+ const int p_offset = i * uv_block_width + j;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ vp9_highbd_build_inter_predictor(
+ CONVERT_TO_SHORTPTR(u_mb_ptr + uv_offset), uv_stride,
+ CONVERT_TO_SHORTPTR(&pred[BLK_PELS + p_offset]), uv_block_width,
+ &mv, scale, xs, ys, which_mv, kernel, mv_precision_uv, x, y,
+ xd->bd);
+
+ vp9_highbd_build_inter_predictor(
+ CONVERT_TO_SHORTPTR(v_mb_ptr + uv_offset), uv_stride,
+ CONVERT_TO_SHORTPTR(&pred[(BLK_PELS << 1) + p_offset]),
+ uv_block_width, &mv, scale, xs, ys, which_mv, kernel,
+ mv_precision_uv, x, y, xd->bd);
+ } else {
+ vp9_build_inter_predictor(u_mb_ptr + uv_offset, uv_stride,
+ &pred[BLK_PELS + p_offset], uv_block_width,
+ &mv, scale, xs, ys, which_mv, kernel,
+ mv_precision_uv, x, y);
+
+ vp9_build_inter_predictor(v_mb_ptr + uv_offset, uv_stride,
+ &pred[(BLK_PELS << 1) + p_offset],
+ uv_block_width, &mv, scale, xs, ys, which_mv,
+ kernel, mv_precision_uv, x, y);
+ }
+#else
+ vp9_build_inter_predictor(u_mb_ptr + uv_offset, uv_stride,
+ &pred[BLK_PELS + p_offset], uv_block_width, &mv,
+ scale, xs, ys, which_mv, kernel,
+ mv_precision_uv, x, y);
+
+ vp9_build_inter_predictor(v_mb_ptr + uv_offset, uv_stride,
+ &pred[(BLK_PELS << 1) + p_offset],
+ uv_block_width, &mv, scale, xs, ys, which_mv,
+ kernel, mv_precision_uv, x, y);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ k++;
+ }
+ }
+}
+
+void vp9_temporal_filter_init(void) {
+ int i;
+
+ fixed_divide[0] = 0;
+ for (i = 1; i < 512; ++i) fixed_divide[i] = 0x80000 / i;
+}
+
+static INLINE int mod_index(int sum_dist, int index, int rounding, int strength,
+ int filter_weight) {
+ int mod;
+
+ assert(index >= 0 && index <= 13);
+ assert(index_mult[index] != 0);
+
+ mod =
+ ((unsigned int)clamp(sum_dist, 0, UINT16_MAX) * index_mult[index]) >> 16;
+ mod += rounding;
+ mod >>= strength;
+
+ mod = VPXMIN(16, mod);
+
+ mod = 16 - mod;
+ mod *= filter_weight;
+
+ return mod;
+}
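A reading of the fixed-point table used above (an approximation inferred from the constants, not stated in the source): index_mult[i] is roughly (3 << 16) / i, so the multiply-and-shift divides the summed squared difference by the sample count without an integer division. A hypothetical rough equivalent of that first step, before rounding, the strength shift and the cap at 16:

    static int approx_mod(int sum_dist, int index) {
      /* ~ (sum_dist * index_mult[index]) >> 16, under the assumed relation above */
      return (3 * sum_dist) / index;
    }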
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE int highbd_mod_index(int sum_dist, int index, int rounding,
+ int strength, int filter_weight) {
+ int mod;
+
+ assert(index >= 0 && index <= 13);
+ assert(highbd_index_mult[index] != 0);
+
+ mod = (int)((clamp(sum_dist, 0, INT32_MAX) * highbd_index_mult[index]) >> 32);
+ mod += rounding;
+ mod >>= strength;
+
+ mod = VPXMIN(16, mod);
+
+ mod = 16 - mod;
+ mod *= filter_weight;
+
+ return mod;
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+static INLINE int get_filter_weight(unsigned int i, unsigned int j,
+ unsigned int block_height,
+ unsigned int block_width,
+ const int *const blk_fw, int use_32x32) {
+  // With use_32x32 set, blk_fw[0] ~ blk_fw[3] are all the same.
+ if (use_32x32) {
+ return blk_fw[0];
+ }
+
+ if (i < block_height / 2) {
+ if (j < block_width / 2) {
+ return blk_fw[0];
+ }
+
+ return blk_fw[1];
+ }
+
+ if (j < block_width / 2) {
+ return blk_fw[2];
+ }
+
+ return blk_fw[3];
+}
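An illustrative call (hypothetical values) showing the quadrant selection above for the per-16x16 sub-block weights:

    const int blk_fw[4] = { 2, 0, 1, 2 };
    /* Pixel (i, j) = (20, 5) of a 32x32 block falls in the lower-left quadrant,
     * so this returns blk_fw[2] == 1. */
    const int w = get_filter_weight(20, 5, 32, 32, blk_fw, /*use_32x32=*/0);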
+
+void vp9_apply_temporal_filter_c(
+ const uint8_t *y_frame1, int y_stride, const uint8_t *y_pred,
+ int y_buf_stride, const uint8_t *u_frame1, const uint8_t *v_frame1,
+ int uv_stride, const uint8_t *u_pred, const uint8_t *v_pred,
+ int uv_buf_stride, unsigned int block_width, unsigned int block_height,
+ int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32,
+ uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator,
+ uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count) {
+ unsigned int i, j, k, m;
+ int modifier;
+ const int rounding = (1 << strength) >> 1;
+ const unsigned int uv_block_width = block_width >> ss_x;
+ const unsigned int uv_block_height = block_height >> ss_y;
+ DECLARE_ALIGNED(16, uint16_t, y_diff_sse[BLK_PELS]);
+ DECLARE_ALIGNED(16, uint16_t, u_diff_sse[BLK_PELS]);
+ DECLARE_ALIGNED(16, uint16_t, v_diff_sse[BLK_PELS]);
+
+ int idx = 0, idy;
+
+ assert(strength >= 0);
+ assert(strength <= 6);
+
+ memset(y_diff_sse, 0, BLK_PELS * sizeof(uint16_t));
+ memset(u_diff_sse, 0, BLK_PELS * sizeof(uint16_t));
+ memset(v_diff_sse, 0, BLK_PELS * sizeof(uint16_t));
+
+  // Calculate diff^2 for each pixel of the block.
+ // TODO(yunqing): the following code needs to be optimized.
+ for (i = 0; i < block_height; i++) {
+ for (j = 0; j < block_width; j++) {
+ const int16_t diff =
+ y_frame1[i * (int)y_stride + j] - y_pred[i * (int)block_width + j];
+ y_diff_sse[idx++] = diff * diff;
+ }
+ }
+ idx = 0;
+ for (i = 0; i < uv_block_height; i++) {
+ for (j = 0; j < uv_block_width; j++) {
+ const int16_t diffu =
+ u_frame1[i * uv_stride + j] - u_pred[i * uv_buf_stride + j];
+ const int16_t diffv =
+ v_frame1[i * uv_stride + j] - v_pred[i * uv_buf_stride + j];
+ u_diff_sse[idx] = diffu * diffu;
+ v_diff_sse[idx] = diffv * diffv;
+ idx++;
+ }
+ }
+
+ for (i = 0, k = 0, m = 0; i < block_height; i++) {
+ for (j = 0; j < block_width; j++) {
+ const int pixel_value = y_pred[i * y_buf_stride + j];
+ const int filter_weight =
+ get_filter_weight(i, j, block_height, block_width, blk_fw, use_32x32);
+
+ // non-local mean approach
+ int y_index = 0;
+
+ const int uv_r = i >> ss_y;
+ const int uv_c = j >> ss_x;
+ modifier = 0;
+
+ for (idy = -1; idy <= 1; ++idy) {
+ for (idx = -1; idx <= 1; ++idx) {
+ const int row = (int)i + idy;
+ const int col = (int)j + idx;
+
+ if (row >= 0 && row < (int)block_height && col >= 0 &&
+ col < (int)block_width) {
+ modifier += y_diff_sse[row * (int)block_width + col];
+ ++y_index;
+ }
+ }
+ }
+
+ assert(y_index > 0);
+
+ modifier += u_diff_sse[uv_r * uv_block_width + uv_c];
+ modifier += v_diff_sse[uv_r * uv_block_width + uv_c];
+
+ y_index += 2;
+
+ modifier =
+ mod_index(modifier, y_index, rounding, strength, filter_weight);
+
+ y_count[k] += modifier;
+ y_accumulator[k] += modifier * pixel_value;
+
+ ++k;
+
+ // Process chroma component
+ if (!(i & ss_y) && !(j & ss_x)) {
+ const int u_pixel_value = u_pred[uv_r * uv_buf_stride + uv_c];
+ const int v_pixel_value = v_pred[uv_r * uv_buf_stride + uv_c];
+
+ // non-local mean approach
+ int cr_index = 0;
+ int u_mod = 0, v_mod = 0;
+ int y_diff = 0;
+
+ for (idy = -1; idy <= 1; ++idy) {
+ for (idx = -1; idx <= 1; ++idx) {
+ const int row = uv_r + idy;
+ const int col = uv_c + idx;
+
+ if (row >= 0 && row < (int)uv_block_height && col >= 0 &&
+ col < (int)uv_block_width) {
+ u_mod += u_diff_sse[row * uv_block_width + col];
+ v_mod += v_diff_sse[row * uv_block_width + col];
+ ++cr_index;
+ }
+ }
+ }
+
+ assert(cr_index > 0);
+
+ for (idy = 0; idy < 1 + ss_y; ++idy) {
+ for (idx = 0; idx < 1 + ss_x; ++idx) {
+ const int row = (uv_r << ss_y) + idy;
+ const int col = (uv_c << ss_x) + idx;
+ y_diff += y_diff_sse[row * (int)block_width + col];
+ ++cr_index;
+ }
+ }
+
+ u_mod += y_diff;
+ v_mod += y_diff;
+
+ u_mod = mod_index(u_mod, cr_index, rounding, strength, filter_weight);
+ v_mod = mod_index(v_mod, cr_index, rounding, strength, filter_weight);
+
+ u_count[m] += u_mod;
+ u_accumulator[m] += u_mod * u_pixel_value;
+ v_count[m] += v_mod;
+ v_accumulator[m] += v_mod * v_pixel_value;
+
+ ++m;
+ } // Complete YUV pixel
+ }
+ }
+}
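A rough numeric walk-through of the weighting above (values are illustrative; the final accumulate/normalize of the filtered pixel happens outside this function):

    /* strength = 6 -> rounding = 32. Suppose the 3x3 luma neighbourhood plus
     * the two chroma terms sum to 640 over y_index = 11 samples:
     *   (640 * index_mult[11]) >> 16 ~= 174
     *   (174 + 32) >> 6              ==   3
     *   weight = (16 - 3) * filter_weight == 13 * filter_weight            */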
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_apply_temporal_filter_c(
+ const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,
+ int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,
+ int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre,
+ int uv_pre_stride, unsigned int block_width, unsigned int block_height,
+ int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32,
+ uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, uint16_t *u_count,
+ uint32_t *v_accum, uint16_t *v_count) {
+ const int uv_block_width = block_width >> ss_x;
+ const int uv_block_height = block_height >> ss_y;
+ const int y_diff_stride = BW;
+ const int uv_diff_stride = BW;
+
+ DECLARE_ALIGNED(16, uint32_t, y_diff_sse[BLK_PELS]);
+ DECLARE_ALIGNED(16, uint32_t, u_diff_sse[BLK_PELS]);
+ DECLARE_ALIGNED(16, uint32_t, v_diff_sse[BLK_PELS]);
+
+ const int rounding = (1 << strength) >> 1;
+
+ // Loop variables
+ int row, col;
+ int uv_row, uv_col;
+ int row_step, col_step;
+
+ memset(y_diff_sse, 0, BLK_PELS * sizeof(uint32_t));
+ memset(u_diff_sse, 0, BLK_PELS * sizeof(uint32_t));
+ memset(v_diff_sse, 0, BLK_PELS * sizeof(uint32_t));
+
+ // Get the square diffs
+ for (row = 0; row < (int)block_height; row++) {
+ for (col = 0; col < (int)block_width; col++) {
+ const int diff =
+ y_src[row * y_src_stride + col] - y_pre[row * y_pre_stride + col];
+ y_diff_sse[row * y_diff_stride + col] = diff * diff;
+ }
+ }
+
+ for (row = 0; row < uv_block_height; row++) {
+ for (col = 0; col < uv_block_width; col++) {
+ const int u_diff =
+ u_src[row * uv_src_stride + col] - u_pre[row * uv_pre_stride + col];
+ const int v_diff =
+ v_src[row * uv_src_stride + col] - v_pre[row * uv_pre_stride + col];
+ u_diff_sse[row * uv_diff_stride + col] = u_diff * u_diff;
+ v_diff_sse[row * uv_diff_stride + col] = v_diff * v_diff;
+ }
+ }
+
+ // Apply the filter to luma
+ for (row = 0; row < (int)block_height; row++) {
+ for (col = 0; col < (int)block_width; col++) {
+ const int filter_weight = get_filter_weight(
+ row, col, block_height, block_width, blk_fw, use_32x32);
+
+ // First we get the modifier for the current y pixel
+ const int y_pixel = y_pre[row * y_pre_stride + col];
+ int y_num_used = 0;
+ int y_mod = 0;
+
+ // Sum the neighboring 3x3 y pixels
+ for (row_step = -1; row_step <= 1; row_step++) {
+ for (col_step = -1; col_step <= 1; col_step++) {
+ const int sub_row = row + row_step;
+ const int sub_col = col + col_step;
+
+ if (sub_row >= 0 && sub_row < (int)block_height && sub_col >= 0 &&
+ sub_col < (int)block_width) {
+ y_mod += y_diff_sse[sub_row * y_diff_stride + sub_col];
+ y_num_used++;
+ }
+ }
+ }
+
+ // Sum the corresponding uv pixels to the current y modifier
+ // Note we are rounding down instead of rounding to the nearest pixel.
+ uv_row = row >> ss_y;
+ uv_col = col >> ss_x;
+ y_mod += u_diff_sse[uv_row * uv_diff_stride + uv_col];
+ y_mod += v_diff_sse[uv_row * uv_diff_stride + uv_col];
+
+ y_num_used += 2;
+
+ // Set the modifier
+ y_mod = highbd_mod_index(y_mod, y_num_used, rounding, strength,
+ filter_weight);
+
+ // Accumulate the result
+ y_count[row * block_width + col] += y_mod;
+ y_accum[row * block_width + col] += y_mod * y_pixel;
+ }
+ }
+
+ // Apply the filter to chroma
+ for (uv_row = 0; uv_row < uv_block_height; uv_row++) {
+ for (uv_col = 0; uv_col < uv_block_width; uv_col++) {
+ const int y_row = uv_row << ss_y;
+ const int y_col = uv_col << ss_x;
+ const int filter_weight = get_filter_weight(
+ uv_row, uv_col, uv_block_height, uv_block_width, blk_fw, use_32x32);
+
+ const int u_pixel = u_pre[uv_row * uv_pre_stride + uv_col];
+ const int v_pixel = v_pre[uv_row * uv_pre_stride + uv_col];
+
+ int uv_num_used = 0;
+ int u_mod = 0, v_mod = 0;
+
+      // Sum the neighboring 3x3 chroma pixels to the chroma modifier
+ for (row_step = -1; row_step <= 1; row_step++) {
+ for (col_step = -1; col_step <= 1; col_step++) {
+ const int sub_row = uv_row + row_step;
+ const int sub_col = uv_col + col_step;
+
+ if (sub_row >= 0 && sub_row < uv_block_height && sub_col >= 0 &&
+ sub_col < uv_block_width) {
+ u_mod += u_diff_sse[sub_row * uv_diff_stride + sub_col];
+ v_mod += v_diff_sse[sub_row * uv_diff_stride + sub_col];
+ uv_num_used++;
+ }
+ }
+ }
+
+      // Sum all the luma pixels associated with the current chroma pixel
+ for (row_step = 0; row_step < 1 + ss_y; row_step++) {
+ for (col_step = 0; col_step < 1 + ss_x; col_step++) {
+ const int sub_row = y_row + row_step;
+ const int sub_col = y_col + col_step;
+ const int y_diff = y_diff_sse[sub_row * y_diff_stride + sub_col];
+
+ u_mod += y_diff;
+ v_mod += y_diff;
+ uv_num_used++;
+ }
+ }
+
+ // Set the modifier
+ u_mod = highbd_mod_index(u_mod, uv_num_used, rounding, strength,
+ filter_weight);
+ v_mod = highbd_mod_index(v_mod, uv_num_used, rounding, strength,
+ filter_weight);
+
+ // Accumulate the result
+ u_count[uv_row * uv_block_width + uv_col] += u_mod;
+ u_accum[uv_row * uv_block_width + uv_col] += u_mod * u_pixel;
+ v_count[uv_row * uv_block_width + uv_col] += v_mod;
+ v_accum[uv_row * uv_block_width + uv_col] += v_mod * v_pixel;
+ }
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
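
Editor's note: both filter variants above feed the accumulated squared differences through mod_index()/highbd_mod_index(), which are defined earlier in this file, to obtain a per-pixel blending weight. The sketch below is an illustration only, assuming the classic VP9 strength-based weighting (the real helpers use a fixed-point multiply rather than the division shown here):

```c
/* Illustrative sketch only, not part of the patch: how a sum of squared
 * differences over num_pixels neighbours becomes a blending weight.
 * Small differences give large weights (strong averaging with the predictor),
 * large differences give 0 (the predictor pixel is ignored). */
static int sketch_mod_index(int sum_sq_diff, int num_pixels, int rounding,
                            int strength, int filter_weight) {
  int mod = sum_sq_diff * 3 / num_pixels;  /* scaled average difference */
  mod = (mod + rounding) >> strength;      /* strength sets the fall-off rate */
  if (mod > 16) mod = 16;
  mod = 16 - mod;
  return mod * filter_weight;              /* filter_weight is 0, 1 or 2 */
}
```

The accumulator/count arrays then hold the weighted sum of predictor pixels and the sum of weights, which vp9_temporal_filter_iterate_row_c() later in this file normalizes into the AltRef frame.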
+
+static uint32_t temporal_filter_find_matching_mb_c(
+ VP9_COMP *cpi, ThreadData *td, uint8_t *arf_frame_buf,
+ uint8_t *frame_ptr_buf, int stride, MV *ref_mv, MV *blk_mvs,
+ int *blk_bestsme) {
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
+ const SEARCH_METHODS search_method = MESH;
+ const SEARCH_METHODS search_method_16 = cpi->sf.temporal_filter_search_method;
+ int step_param;
+ int sadpb = x->sadperbit16;
+ uint32_t bestsme = UINT_MAX;
+ uint32_t distortion;
+ uint32_t sse;
+ int cost_list[5];
+ const MvLimits tmp_mv_limits = x->mv_limits;
+
+ MV best_ref_mv1 = { 0, 0 };
+ MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */
+
+ // Save input state
+ struct buf_2d src = x->plane[0].src;
+ struct buf_2d pre = xd->plane[0].pre[0];
+ int i, j, k = 0;
+
+ best_ref_mv1_full.col = best_ref_mv1.col >> 3;
+ best_ref_mv1_full.row = best_ref_mv1.row >> 3;
+
+ // Setup frame pointers
+ x->plane[0].src.buf = arf_frame_buf;
+ x->plane[0].src.stride = stride;
+ xd->plane[0].pre[0].buf = frame_ptr_buf;
+ xd->plane[0].pre[0].stride = stride;
+
+ step_param = mv_sf->reduce_first_step_size;
+ step_param = VPXMIN(step_param, MAX_MVSEARCH_STEPS - 2);
+
+ vp9_set_mv_search_range(&x->mv_limits, &best_ref_mv1);
+
+ vp9_full_pixel_search(cpi, x, TF_BLOCK, &best_ref_mv1_full, step_param,
+ search_method, sadpb, cond_cost_list(cpi, cost_list),
+ &best_ref_mv1, ref_mv, 0, 0);
+
+ /* restore UMV window */
+ x->mv_limits = tmp_mv_limits;
+
+ // find_fractional_mv_step parameters: best_ref_mv1 is for mv rate cost
+ // calculation. The start full mv and the search result are stored in
+ // ref_mv.
+ bestsme = cpi->find_fractional_mv_step(
+ x, ref_mv, &best_ref_mv1, cpi->common.allow_high_precision_mv,
+ x->errorperbit, &cpi->fn_ptr[TF_BLOCK], 0, mv_sf->subpel_search_level,
+ cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, BW,
+ BH, USE_8_TAPS_SHARP);
+
+  // Do motion search on the 4 16x16 sub-blocks.
+ best_ref_mv1.row = ref_mv->row;
+ best_ref_mv1.col = ref_mv->col;
+ best_ref_mv1_full.col = best_ref_mv1.col >> 3;
+ best_ref_mv1_full.row = best_ref_mv1.row >> 3;
+
+ for (i = 0; i < BH; i += SUB_BH) {
+ for (j = 0; j < BW; j += SUB_BW) {
+ // Setup frame pointers
+ x->plane[0].src.buf = arf_frame_buf + i * stride + j;
+ x->plane[0].src.stride = stride;
+ xd->plane[0].pre[0].buf = frame_ptr_buf + i * stride + j;
+ xd->plane[0].pre[0].stride = stride;
+
+ vp9_set_mv_search_range(&x->mv_limits, &best_ref_mv1);
+ vp9_full_pixel_search(cpi, x, TF_SUB_BLOCK, &best_ref_mv1_full,
+ step_param, search_method_16, sadpb,
+ cond_cost_list(cpi, cost_list), &best_ref_mv1,
+ &blk_mvs[k], 0, 0);
+ /* restore UMV window */
+ x->mv_limits = tmp_mv_limits;
+
+ blk_bestsme[k] = cpi->find_fractional_mv_step(
+ x, &blk_mvs[k], &best_ref_mv1, cpi->common.allow_high_precision_mv,
+ x->errorperbit, &cpi->fn_ptr[TF_SUB_BLOCK], 0,
+ mv_sf->subpel_search_level, cond_cost_list(cpi, cost_list), NULL,
+ NULL, &distortion, &sse, NULL, SUB_BW, SUB_BH, USE_8_TAPS_SHARP);
+ k++;
+ }
+ }
+
+ // Restore input state
+ x->plane[0].src = src;
+ xd->plane[0].pre[0] = pre;
+
+ return bestsme;
+}
+
+void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td,
+ int mb_row, int mb_col_start,
+ int mb_col_end) {
+ ARNRFilterData *arnr_filter_data = &cpi->arnr_filter_data;
+ YV12_BUFFER_CONFIG **frames = arnr_filter_data->frames;
+ int frame_count = arnr_filter_data->frame_count;
+ int alt_ref_index = arnr_filter_data->alt_ref_index;
+ int strength = arnr_filter_data->strength;
+ struct scale_factors *scale = &arnr_filter_data->sf;
+ int byte;
+ int frame;
+ int mb_col;
+ int mb_cols = (frames[alt_ref_index]->y_crop_width + BW - 1) >> BW_LOG2;
+ int mb_rows = (frames[alt_ref_index]->y_crop_height + BH - 1) >> BH_LOG2;
+ DECLARE_ALIGNED(16, uint32_t, accumulator[BLK_PELS * 3]);
+ DECLARE_ALIGNED(16, uint16_t, count[BLK_PELS * 3]);
+ MACROBLOCKD *mbd = &td->mb.e_mbd;
+ YV12_BUFFER_CONFIG *f = frames[alt_ref_index];
+ uint8_t *dst1, *dst2;
+#if CONFIG_VP9_HIGHBITDEPTH
+ DECLARE_ALIGNED(16, uint16_t, predictor16[BLK_PELS * 3]);
+ DECLARE_ALIGNED(16, uint8_t, predictor8[BLK_PELS * 3]);
+ uint8_t *predictor;
+#else
+ DECLARE_ALIGNED(16, uint8_t, predictor[BLK_PELS * 3]);
+#endif
+ const int mb_uv_height = BH >> mbd->plane[1].subsampling_y;
+ const int mb_uv_width = BW >> mbd->plane[1].subsampling_x;
+ // Addition of the tile col level offsets
+ int mb_y_offset = mb_row * BH * (f->y_stride) + BW * mb_col_start;
+ int mb_uv_offset =
+ mb_row * mb_uv_height * f->uv_stride + mb_uv_width * mb_col_start;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ predictor = CONVERT_TO_BYTEPTR(predictor16);
+ } else {
+ predictor = predictor8;
+ }
+#endif
+
+  // Source frames are extended by 16 pixels. This is different from the
+  // L/A/G reference frames, which have a border of 32 (VP9_ENC_BORDER_IN_PIXELS).
+ // A 6/8 tap filter is used for motion search. This requires 2 pixels
+ // before and 3 pixels after. So the largest Y mv on a border would
+ // then be 16 - VP9_INTERP_EXTEND. The UV blocks are half the size of the
+ // Y and therefore only extended by 8. The largest mv that a UV block
+ // can support is 8 - VP9_INTERP_EXTEND. A UV mv is half of a Y mv.
+ // (16 - VP9_INTERP_EXTEND) >> 1 which is greater than
+ // 8 - VP9_INTERP_EXTEND.
+ // To keep the mv in play for both Y and UV planes the max that it
+ // can be on a border is therefore 16 - (2*VP9_INTERP_EXTEND+1).
+ td->mb.mv_limits.row_min = -((mb_row * BH) + (17 - 2 * VP9_INTERP_EXTEND));
+ td->mb.mv_limits.row_max =
+ ((mb_rows - 1 - mb_row) * BH) + (17 - 2 * VP9_INTERP_EXTEND);
+
+ for (mb_col = mb_col_start; mb_col < mb_col_end; mb_col++) {
+ int i, j, k;
+ int stride;
+ MV ref_mv;
+
+ vp9_zero_array(accumulator, BLK_PELS * 3);
+ vp9_zero_array(count, BLK_PELS * 3);
+
+ td->mb.mv_limits.col_min = -((mb_col * BW) + (17 - 2 * VP9_INTERP_EXTEND));
+ td->mb.mv_limits.col_max =
+ ((mb_cols - 1 - mb_col) * BW) + (17 - 2 * VP9_INTERP_EXTEND);
+
+ if (cpi->oxcf.content == VP9E_CONTENT_FILM) {
+ unsigned int src_variance;
+ struct buf_2d src;
+
+ src.buf = f->y_buffer + mb_y_offset;
+ src.stride = f->y_stride;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ src_variance =
+ vp9_high_get_sby_perpixel_variance(cpi, &src, TF_BLOCK, mbd->bd);
+ } else {
+ src_variance = vp9_get_sby_perpixel_variance(cpi, &src, TF_BLOCK);
+ }
+#else
+ src_variance = vp9_get_sby_perpixel_variance(cpi, &src, TF_BLOCK);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ if (src_variance <= 2) {
+ strength = VPXMAX(0, arnr_filter_data->strength - 2);
+ }
+ }
+
+ for (frame = 0; frame < frame_count; frame++) {
+ // MVs for 4 16x16 sub blocks.
+ MV blk_mvs[4];
+ // Filter weights for 4 16x16 sub blocks.
+ int blk_fw[4] = { 0, 0, 0, 0 };
+ int use_32x32 = 0;
+
+ if (frames[frame] == NULL) continue;
+
+ ref_mv.row = 0;
+ ref_mv.col = 0;
+ blk_mvs[0] = kZeroMv;
+ blk_mvs[1] = kZeroMv;
+ blk_mvs[2] = kZeroMv;
+ blk_mvs[3] = kZeroMv;
+
+ if (frame == alt_ref_index) {
+ blk_fw[0] = blk_fw[1] = blk_fw[2] = blk_fw[3] = 2;
+ use_32x32 = 1;
+ } else {
+ const int thresh_low = 10000;
+ const int thresh_high = 20000;
+ int blk_bestsme[4] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX };
+
+ // Find best match in this frame by MC
+ int err = temporal_filter_find_matching_mb_c(
+ cpi, td, frames[alt_ref_index]->y_buffer + mb_y_offset,
+ frames[frame]->y_buffer + mb_y_offset, frames[frame]->y_stride,
+ &ref_mv, blk_mvs, blk_bestsme);
+
+ int err16 =
+ blk_bestsme[0] + blk_bestsme[1] + blk_bestsme[2] + blk_bestsme[3];
+ int max_err = INT_MIN, min_err = INT_MAX;
+ for (k = 0; k < 4; k++) {
+ if (min_err > blk_bestsme[k]) min_err = blk_bestsme[k];
+ if (max_err < blk_bestsme[k]) max_err = blk_bestsme[k];
+ }
+
+ if (((err * 15 < (err16 << 4)) && max_err - min_err < 10000) ||
+ ((err * 14 < (err16 << 4)) && max_err - min_err < 5000)) {
+ use_32x32 = 1;
+          // Assign a higher weight to the matching MB if its error
+          // score is lower. If not applying MC, the default behavior
+          // is to weight all MBs equally.
+ blk_fw[0] = err < (thresh_low << THR_SHIFT) ? 2
+ : err < (thresh_high << THR_SHIFT) ? 1
+ : 0;
+ blk_fw[1] = blk_fw[2] = blk_fw[3] = blk_fw[0];
+ } else {
+ use_32x32 = 0;
+ for (k = 0; k < 4; k++)
+ blk_fw[k] = blk_bestsme[k] < thresh_low ? 2
+ : blk_bestsme[k] < thresh_high ? 1
+ : 0;
+ }
+
+ for (k = 0; k < 4; k++) {
+ switch (abs(frame - alt_ref_index)) {
+ case 1: blk_fw[k] = VPXMIN(blk_fw[k], 2); break;
+ case 2:
+ case 3: blk_fw[k] = VPXMIN(blk_fw[k], 1); break;
+ default: break;
+ }
+ }
+ }
+
+ if (blk_fw[0] | blk_fw[1] | blk_fw[2] | blk_fw[3]) {
+ // Construct the predictors
+ temporal_filter_predictors_mb_c(
+ mbd, frames[frame]->y_buffer + mb_y_offset,
+ frames[frame]->u_buffer + mb_uv_offset,
+ frames[frame]->v_buffer + mb_uv_offset, frames[frame]->y_stride,
+ mb_uv_width, mb_uv_height, ref_mv.row, ref_mv.col, predictor, scale,
+ mb_col * BW, mb_row * BH, blk_mvs, use_32x32);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ int adj_strength = strength + 2 * (mbd->bd - 8);
+ // Apply the filter (YUV)
+ vp9_highbd_apply_temporal_filter(
+ CONVERT_TO_SHORTPTR(f->y_buffer + mb_y_offset), f->y_stride,
+ CONVERT_TO_SHORTPTR(predictor), BW,
+ CONVERT_TO_SHORTPTR(f->u_buffer + mb_uv_offset),
+ CONVERT_TO_SHORTPTR(f->v_buffer + mb_uv_offset), f->uv_stride,
+ CONVERT_TO_SHORTPTR(predictor + BLK_PELS),
+ CONVERT_TO_SHORTPTR(predictor + (BLK_PELS << 1)), mb_uv_width, BW,
+ BH, mbd->plane[1].subsampling_x, mbd->plane[1].subsampling_y,
+ adj_strength, blk_fw, use_32x32, accumulator, count,
+ accumulator + BLK_PELS, count + BLK_PELS,
+ accumulator + (BLK_PELS << 1), count + (BLK_PELS << 1));
+ } else {
+ // Apply the filter (YUV)
+ vp9_apply_temporal_filter(
+ f->y_buffer + mb_y_offset, f->y_stride, predictor, BW,
+ f->u_buffer + mb_uv_offset, f->v_buffer + mb_uv_offset,
+ f->uv_stride, predictor + BLK_PELS, predictor + (BLK_PELS << 1),
+ mb_uv_width, BW, BH, mbd->plane[1].subsampling_x,
+ mbd->plane[1].subsampling_y, strength, blk_fw, use_32x32,
+ accumulator, count, accumulator + BLK_PELS, count + BLK_PELS,
+ accumulator + (BLK_PELS << 1), count + (BLK_PELS << 1));
+ }
+#else
+ // Apply the filter (YUV)
+ vp9_apply_temporal_filter(
+ f->y_buffer + mb_y_offset, f->y_stride, predictor, BW,
+ f->u_buffer + mb_uv_offset, f->v_buffer + mb_uv_offset,
+ f->uv_stride, predictor + BLK_PELS, predictor + (BLK_PELS << 1),
+ mb_uv_width, BW, BH, mbd->plane[1].subsampling_x,
+ mbd->plane[1].subsampling_y, strength, blk_fw, use_32x32,
+ accumulator, count, accumulator + BLK_PELS, count + BLK_PELS,
+ accumulator + (BLK_PELS << 1), count + (BLK_PELS << 1));
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ }
+ }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ uint16_t *dst1_16;
+ uint16_t *dst2_16;
+ // Normalize filter output to produce AltRef frame
+ dst1 = cpi->alt_ref_buffer.y_buffer;
+ dst1_16 = CONVERT_TO_SHORTPTR(dst1);
+ stride = cpi->alt_ref_buffer.y_stride;
+ byte = mb_y_offset;
+ for (i = 0, k = 0; i < BH; i++) {
+ for (j = 0; j < BW; j++, k++) {
+ unsigned int pval = accumulator[k] + (count[k] >> 1);
+ pval *= fixed_divide[count[k]];
+ pval >>= 19;
+
+ dst1_16[byte] = (uint16_t)pval;
+
+ // move to next pixel
+ byte++;
+ }
+
+ byte += stride - BW;
+ }
+
+ dst1 = cpi->alt_ref_buffer.u_buffer;
+ dst2 = cpi->alt_ref_buffer.v_buffer;
+ dst1_16 = CONVERT_TO_SHORTPTR(dst1);
+ dst2_16 = CONVERT_TO_SHORTPTR(dst2);
+ stride = cpi->alt_ref_buffer.uv_stride;
+ byte = mb_uv_offset;
+ for (i = 0, k = BLK_PELS; i < mb_uv_height; i++) {
+ for (j = 0; j < mb_uv_width; j++, k++) {
+ int m = k + BLK_PELS;
+
+ // U
+ unsigned int pval = accumulator[k] + (count[k] >> 1);
+ pval *= fixed_divide[count[k]];
+ pval >>= 19;
+ dst1_16[byte] = (uint16_t)pval;
+
+ // V
+ pval = accumulator[m] + (count[m] >> 1);
+ pval *= fixed_divide[count[m]];
+ pval >>= 19;
+ dst2_16[byte] = (uint16_t)pval;
+
+ // move to next pixel
+ byte++;
+ }
+
+ byte += stride - mb_uv_width;
+ }
+ } else {
+ // Normalize filter output to produce AltRef frame
+ dst1 = cpi->alt_ref_buffer.y_buffer;
+ stride = cpi->alt_ref_buffer.y_stride;
+ byte = mb_y_offset;
+ for (i = 0, k = 0; i < BH; i++) {
+ for (j = 0; j < BW; j++, k++) {
+ unsigned int pval = accumulator[k] + (count[k] >> 1);
+ pval *= fixed_divide[count[k]];
+ pval >>= 19;
+
+ dst1[byte] = (uint8_t)pval;
+
+ // move to next pixel
+ byte++;
+ }
+ byte += stride - BW;
+ }
+
+ dst1 = cpi->alt_ref_buffer.u_buffer;
+ dst2 = cpi->alt_ref_buffer.v_buffer;
+ stride = cpi->alt_ref_buffer.uv_stride;
+ byte = mb_uv_offset;
+ for (i = 0, k = BLK_PELS; i < mb_uv_height; i++) {
+ for (j = 0; j < mb_uv_width; j++, k++) {
+ int m = k + BLK_PELS;
+
+ // U
+ unsigned int pval = accumulator[k] + (count[k] >> 1);
+ pval *= fixed_divide[count[k]];
+ pval >>= 19;
+ dst1[byte] = (uint8_t)pval;
+
+ // V
+ pval = accumulator[m] + (count[m] >> 1);
+ pval *= fixed_divide[count[m]];
+ pval >>= 19;
+ dst2[byte] = (uint8_t)pval;
+
+ // move to next pixel
+ byte++;
+ }
+ byte += stride - mb_uv_width;
+ }
+ }
+#else
+ // Normalize filter output to produce AltRef frame
+ dst1 = cpi->alt_ref_buffer.y_buffer;
+ stride = cpi->alt_ref_buffer.y_stride;
+ byte = mb_y_offset;
+ for (i = 0, k = 0; i < BH; i++) {
+ for (j = 0; j < BW; j++, k++) {
+ unsigned int pval = accumulator[k] + (count[k] >> 1);
+ pval *= fixed_divide[count[k]];
+ pval >>= 19;
+
+ dst1[byte] = (uint8_t)pval;
+
+ // move to next pixel
+ byte++;
+ }
+ byte += stride - BW;
+ }
+
+ dst1 = cpi->alt_ref_buffer.u_buffer;
+ dst2 = cpi->alt_ref_buffer.v_buffer;
+ stride = cpi->alt_ref_buffer.uv_stride;
+ byte = mb_uv_offset;
+ for (i = 0, k = BLK_PELS; i < mb_uv_height; i++) {
+ for (j = 0; j < mb_uv_width; j++, k++) {
+ int m = k + BLK_PELS;
+
+ // U
+ unsigned int pval = accumulator[k] + (count[k] >> 1);
+ pval *= fixed_divide[count[k]];
+ pval >>= 19;
+ dst1[byte] = (uint8_t)pval;
+
+ // V
+ pval = accumulator[m] + (count[m] >> 1);
+ pval *= fixed_divide[count[m]];
+ pval >>= 19;
+ dst2[byte] = (uint8_t)pval;
+
+ // move to next pixel
+ byte++;
+ }
+ byte += stride - mb_uv_width;
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ mb_y_offset += BW;
+ mb_uv_offset += mb_uv_width;
+ }
+}
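
Editor's note: the normalization loops above avoid a per-pixel integer division. Assuming fixed_divide[i] holds (1 << 19) / i (the table is built in vp9_temporal_filter_init(), outside this hunk, which matches the >> 19 above), the multiply-and-shift is a rounded average of the accumulator by the weight count. A minimal standalone sketch of that step:

```c
/* Illustration only: fixed-point averaging used when writing AltRef pixels. */
#include <stdint.h>

static uint8_t normalize_pixel(uint32_t accum, uint16_t count) {
  const uint32_t divide = (1u << 19) / count;  /* stand-in for fixed_divide[count] */
  uint32_t pval = accum + (count >> 1);        /* + count/2 rounds to nearest */
  pval *= divide;
  pval >>= 19;                                 /* ~= (accum + count / 2) / count */
  return (uint8_t)pval;
}
```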
+
+static void temporal_filter_iterate_tile_c(VP9_COMP *cpi, int tile_row,
+ int tile_col) {
+ VP9_COMMON *const cm = &cpi->common;
+ const int tile_cols = 1 << cm->log2_tile_cols;
+ TileInfo *tile_info =
+ &cpi->tile_data[tile_row * tile_cols + tile_col].tile_info;
+ const int mb_row_start = (tile_info->mi_row_start) >> TF_SHIFT;
+ const int mb_row_end = (tile_info->mi_row_end + TF_ROUND) >> TF_SHIFT;
+ const int mb_col_start = (tile_info->mi_col_start) >> TF_SHIFT;
+ const int mb_col_end = (tile_info->mi_col_end + TF_ROUND) >> TF_SHIFT;
+ int mb_row;
+
+ for (mb_row = mb_row_start; mb_row < mb_row_end; mb_row++) {
+ vp9_temporal_filter_iterate_row_c(cpi, &cpi->td, mb_row, mb_col_start,
+ mb_col_end);
+ }
+}
+
+static void temporal_filter_iterate_c(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ const int tile_cols = 1 << cm->log2_tile_cols;
+ const int tile_rows = 1 << cm->log2_tile_rows;
+ int tile_row, tile_col;
+ vp9_init_tile_data(cpi);
+
+ for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
+ for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
+ temporal_filter_iterate_tile_c(cpi, tile_row, tile_col);
+ }
+ }
+}
+
+// Apply buffer limits and context specific adjustments to arnr filter.
+static void adjust_arnr_filter(VP9_COMP *cpi, int distance, int group_boost,
+ int *arnr_frames, int *arnr_strength) {
+ const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+ const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+ const int frames_after_arf =
+ vp9_lookahead_depth(cpi->lookahead) - distance - 1;
+ int frames_fwd = (cpi->oxcf.arnr_max_frames - 1) >> 1;
+ int frames_bwd;
+ int q, frames, base_strength, strength;
+
+ // Context dependent two pass adjustment to strength.
+ if (oxcf->pass == 2) {
+ base_strength = oxcf->arnr_strength + cpi->twopass.arnr_strength_adjustment;
+ // Clip to allowed range.
+ base_strength = VPXMIN(6, VPXMAX(0, base_strength));
+ } else {
+ base_strength = oxcf->arnr_strength;
+ }
+
+ // Define the forward and backwards filter limits for this arnr group.
+ if (frames_fwd > frames_after_arf) frames_fwd = frames_after_arf;
+ if (frames_fwd > distance) frames_fwd = distance;
+
+ frames_bwd = frames_fwd;
+
+ // For even length filter there is one more frame backward
+ // than forward: e.g. len=6 ==> bbbAff, len=7 ==> bbbAfff.
+ if (frames_bwd < distance) frames_bwd += (oxcf->arnr_max_frames + 1) & 0x1;
+
+ // Set the baseline active filter size.
+ frames = frames_bwd + 1 + frames_fwd;
+
+ // Adjust the strength based on active max q.
+ if (cpi->common.current_video_frame > 1)
+ q = ((int)vp9_convert_qindex_to_q(cpi->rc.avg_frame_qindex[INTER_FRAME],
+ cpi->common.bit_depth));
+ else
+ q = ((int)vp9_convert_qindex_to_q(cpi->rc.avg_frame_qindex[KEY_FRAME],
+ cpi->common.bit_depth));
+ if (q > 16) {
+ strength = base_strength;
+ } else {
+ strength = base_strength - ((16 - q) / 2);
+ if (strength < 0) strength = 0;
+ }
+
+ // Adjust number of frames in filter and strength based on gf boost level.
+ if (frames > group_boost / 150) {
+ frames = group_boost / 150;
+ frames += !(frames & 1);
+ }
+
+ if (strength > group_boost / 300) {
+ strength = group_boost / 300;
+ }
+
+ // Adjustments for second level arf in multi arf case.
+  // Leave commented-out placeholder for possible filtering adjustment with
+ // new multi-layer arf code.
+ // if (cpi->oxcf.pass == 2 && cpi->multi_arf_allowed)
+ // if (gf_group->rf_level[gf_group->index] != GF_ARF_STD) strength >>= 1;
+
+  // TODO(jingning): Skip temporal filtering for intermediate frames that will
+  // be used as show_existing_frame. Need to further explore the possibility
+  // of applying a certain filter to them.
+ if (gf_group->arf_src_offset[gf_group->index] <
+ cpi->rc.baseline_gf_interval - 1)
+ frames = 1;
+
+ *arnr_frames = frames;
+ *arnr_strength = strength;
+}
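
Editor's note: the forward/backward split computed above keeps at most one extra frame on the backward side for an even filter length, as the bbbAff / bbbAfff comment indicates. A tiny standalone check of that arithmetic, assuming the lookahead is long enough that neither frames_after_arf nor distance clips the result:

```c
/* Illustration only: reproduce the frames_fwd/frames_bwd split from
 * adjust_arnr_filter() for an unconstrained lookahead. */
#include <stdio.h>

static void split(int arnr_max_frames) {
  int frames_fwd = (arnr_max_frames - 1) >> 1;
  int frames_bwd = frames_fwd;
  /* Even-length filters take the extra frame from the past. */
  frames_bwd += (arnr_max_frames + 1) & 1;
  printf("len=%d => %d back, ARF, %d forward\n", arnr_max_frames, frames_bwd,
         frames_fwd);
}

int main(void) {
  split(6);  /* len=6 => 3 back, ARF, 2 forward  (bbbAff)  */
  split(7);  /* len=7 => 3 back, ARF, 3 forward  (bbbAfff) */
  return 0;
}
```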
+
+void vp9_temporal_filter(VP9_COMP *cpi, int distance) {
+ VP9_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+ ARNRFilterData *arnr_filter_data = &cpi->arnr_filter_data;
+ int frame;
+ int frames_to_blur;
+ int start_frame;
+ int strength;
+ int frames_to_blur_backward;
+ int frames_to_blur_forward;
+ struct scale_factors *sf = &arnr_filter_data->sf;
+ YV12_BUFFER_CONFIG **frames = arnr_filter_data->frames;
+ int rdmult;
+
+ // Apply context specific adjustments to the arnr filter parameters.
+ adjust_arnr_filter(cpi, distance, rc->gfu_boost, &frames_to_blur, &strength);
+ frames_to_blur_backward = (frames_to_blur / 2);
+ frames_to_blur_forward = ((frames_to_blur - 1) / 2);
+ start_frame = distance + frames_to_blur_forward;
+
+ arnr_filter_data->strength = strength;
+ arnr_filter_data->frame_count = frames_to_blur;
+ arnr_filter_data->alt_ref_index = frames_to_blur_backward;
+
+ // Setup frame pointers, NULL indicates frame not included in filter.
+ for (frame = 0; frame < frames_to_blur; ++frame) {
+ const int which_buffer = start_frame - frame;
+ struct lookahead_entry *buf =
+ vp9_lookahead_peek(cpi->lookahead, which_buffer);
+ frames[frames_to_blur - 1 - frame] = &buf->img;
+ }
+
+ if (frames_to_blur > 0) {
+ // Setup scaling factors. Scaling on each of the arnr frames is not
+ // supported.
+ if (cpi->use_svc) {
+      // In spatial SVC the scaling factors might be less than 1/2.
+ // So we will use non-normative scaling.
+ int frame_used = 0;
+#if CONFIG_VP9_HIGHBITDEPTH
+ vp9_setup_scale_factors_for_frame(
+ sf, get_frame_new_buffer(cm)->y_crop_width,
+ get_frame_new_buffer(cm)->y_crop_height,
+ get_frame_new_buffer(cm)->y_crop_width,
+ get_frame_new_buffer(cm)->y_crop_height, cm->use_highbitdepth);
+#else
+ vp9_setup_scale_factors_for_frame(
+ sf, get_frame_new_buffer(cm)->y_crop_width,
+ get_frame_new_buffer(cm)->y_crop_height,
+ get_frame_new_buffer(cm)->y_crop_width,
+ get_frame_new_buffer(cm)->y_crop_height);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ for (frame = 0; frame < frames_to_blur; ++frame) {
+ if (cm->mi_cols * MI_SIZE != frames[frame]->y_width ||
+ cm->mi_rows * MI_SIZE != frames[frame]->y_height) {
+ if (vpx_realloc_frame_buffer(&cpi->svc.scaled_frames[frame_used],
+ cm->width, cm->height, cm->subsampling_x,
+ cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
+ VP9_ENC_BORDER_IN_PIXELS,
+ cm->byte_alignment, NULL, NULL, NULL)) {
+ vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+ "Failed to reallocate alt_ref_buffer");
+ }
+ frames[frame] = vp9_scale_if_required(
+ cm, frames[frame], &cpi->svc.scaled_frames[frame_used], 0,
+ EIGHTTAP, 0);
+ ++frame_used;
+ }
+ }
+ cm->mi = cm->mip + cm->mi_stride + 1;
+ xd->mi = cm->mi_grid_visible;
+ xd->mi[0] = cm->mi;
+ } else {
+// ARF is produced at the native frame size and resized when coded.
+#if CONFIG_VP9_HIGHBITDEPTH
+ vp9_setup_scale_factors_for_frame(
+ sf, frames[0]->y_crop_width, frames[0]->y_crop_height,
+ frames[0]->y_crop_width, frames[0]->y_crop_height,
+ cm->use_highbitdepth);
+#else
+ vp9_setup_scale_factors_for_frame(
+ sf, frames[0]->y_crop_width, frames[0]->y_crop_height,
+ frames[0]->y_crop_width, frames[0]->y_crop_height);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ }
+ }
+
+  // Initialize errorperbit and sadperbit.
+ rdmult = vp9_compute_rd_mult_based_on_qindex(cpi, ARNR_FILT_QINDEX);
+ set_error_per_bit(&cpi->td.mb, rdmult);
+ vp9_initialize_me_consts(cpi, &cpi->td.mb, ARNR_FILT_QINDEX);
+
+ if (!cpi->row_mt)
+ temporal_filter_iterate_c(cpi);
+ else
+ vp9_temporal_filter_row_mt(cpi);
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_temporal_filter.h b/media/libvpx/libvpx/vp9/encoder/vp9_temporal_filter.h
new file mode 100644
index 0000000000..553a468280
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_temporal_filter.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_VP9_TEMPORAL_FILTER_H_
+#define VPX_VP9_ENCODER_VP9_TEMPORAL_FILTER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define ARNR_FILT_QINDEX 128
+static const MV kZeroMv = { 0, 0 };
+
+// Block size used in temporal filtering
+#define TF_BLOCK BLOCK_32X32
+#define BH 32
+#define BH_LOG2 5
+#define BW 32
+#define BW_LOG2 5
+#define BLK_PELS ((BH) * (BW)) // Pixels in the block
+#define TF_SHIFT 2
+#define TF_ROUND 3
+#define THR_SHIFT 2
+#define TF_SUB_BLOCK BLOCK_16X16
+#define SUB_BH 16
+#define SUB_BW 16
+
+void vp9_temporal_filter_init(void);
+void vp9_temporal_filter(VP9_COMP *cpi, int distance);
+
+void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td,
+ int mb_row, int mb_col_start,
+ int mb_col_end);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP9_ENCODER_VP9_TEMPORAL_FILTER_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_temporal_filter_constants.h b/media/libvpx/libvpx/vp9/encoder/vp9_temporal_filter_constants.h
new file mode 100644
index 0000000000..8776dfc068
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_temporal_filter_constants.h
@@ -0,0 +1,410 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_TEMPORAL_FILTER_CONSTANTS_H_
+#define VPX_VP9_ENCODER_TEMPORAL_FILTER_CONSTANTS_H_
+#include "./vpx_config.h"
+
+// Division using multiplication and shifting. The C implementation does:
+// modifier *= 3;
+// modifier /= index;
+// where 'modifier' is a set of summed values and 'index' is the number of
+// summed values.
+//
+// This equation works out to (m * 3) / i which reduces to:
+// m * 3/4
+// m * 1/2
+// m * 1/3
+//
+// By pairing the multiply with a down shift by 16 (_mm_mulhi_epu16):
+// m * C / 65536
+// we can pick a constant C that replicates the division.
+//
+// m * 49152 / 65536 = m * 3/4
+// m * 32768 / 65536 = m * 1/2
+// m * 21846 / 65536 = m * 0.3333
+//
+// These are loaded using an instruction expecting int16_t values but are used
+// with _mm_mulhi_epu16(), which treats them as unsigned.
+#define NEIGHBOR_CONSTANT_4 (int16_t)49152
+#define NEIGHBOR_CONSTANT_5 (int16_t)39322
+#define NEIGHBOR_CONSTANT_6 (int16_t)32768
+#define NEIGHBOR_CONSTANT_7 (int16_t)28087
+#define NEIGHBOR_CONSTANT_8 (int16_t)24576
+#define NEIGHBOR_CONSTANT_9 (int16_t)21846
+#define NEIGHBOR_CONSTANT_10 (int16_t)19661
+#define NEIGHBOR_CONSTANT_11 (int16_t)17874
+#define NEIGHBOR_CONSTANT_13 (int16_t)15124
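
Editor's note: a small self-contained check of the multiply-and-shift trick these constants encode; in the SIMD code the shift comes for free from _mm_mulhi_epu16, which the explicit >> 16 below stands in for:

```c
/* Illustration only: (m * NEIGHBOR_CONSTANT_k) >> 16 approximates m * 3 / k,
 * i.e. the "modifier *= 3; modifier /= index;" step of the C filter. */
#include <stdint.h>
#include <stdio.h>

int main(void) {
  const uint32_t constant_9 = 21846;  /* NEIGHBOR_CONSTANT_9 ~= 65536 * 3 / 9 */
  uint32_t sum;
  for (sum = 0; sum <= 700; sum += 100) {
    const uint32_t approx = (sum * constant_9) >> 16;
    const uint32_t exact = sum * 3 / 9;
    printf("sum=%u approx=%u exact=%u\n", sum, approx, exact);
  }
  return 0;
}
```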
+
+DECLARE_ALIGNED(16, static const int16_t, LEFT_CORNER_NEIGHBORS_PLUS_1[8]) = {
+ NEIGHBOR_CONSTANT_5, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
+ NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
+ NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7
+};
+
+DECLARE_ALIGNED(16, static const int16_t, RIGHT_CORNER_NEIGHBORS_PLUS_1[8]) = {
+ NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
+ NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
+ NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_5
+};
+
+DECLARE_ALIGNED(16, static const int16_t, LEFT_EDGE_NEIGHBORS_PLUS_1[8]) = {
+ NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10
+};
+
+DECLARE_ALIGNED(16, static const int16_t, RIGHT_EDGE_NEIGHBORS_PLUS_1[8]) = {
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_7
+};
+
+DECLARE_ALIGNED(16, static const int16_t, MIDDLE_EDGE_NEIGHBORS_PLUS_1[8]) = {
+ NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
+ NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
+ NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7
+};
+
+DECLARE_ALIGNED(16, static const int16_t, MIDDLE_CENTER_NEIGHBORS_PLUS_1[8]) = {
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10
+};
+
+DECLARE_ALIGNED(16, static const int16_t, LEFT_CORNER_NEIGHBORS_PLUS_2[8]) = {
+ NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
+ NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
+ NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8
+};
+
+DECLARE_ALIGNED(16, static const int16_t, RIGHT_CORNER_NEIGHBORS_PLUS_2[8]) = {
+ NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
+ NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
+ NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_6
+};
+
+DECLARE_ALIGNED(16, static const int16_t, LEFT_EDGE_NEIGHBORS_PLUS_2[8]) = {
+ NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
+ NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
+ NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11
+};
+
+DECLARE_ALIGNED(16, static const int16_t, RIGHT_EDGE_NEIGHBORS_PLUS_2[8]) = {
+ NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
+ NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
+ NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_8
+};
+
+DECLARE_ALIGNED(16, static const int16_t, MIDDLE_EDGE_NEIGHBORS_PLUS_2[8]) = {
+ NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
+ NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
+ NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8
+};
+
+DECLARE_ALIGNED(16, static const int16_t, MIDDLE_CENTER_NEIGHBORS_PLUS_2[8]) = {
+ NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
+ NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
+ NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11
+};
+
+DECLARE_ALIGNED(16, static const int16_t, TWO_CORNER_NEIGHBORS_PLUS_2[8]) = {
+ NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
+ NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
+ NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_6
+};
+
+DECLARE_ALIGNED(16, static const int16_t, TWO_EDGE_NEIGHBORS_PLUS_2[8]) = {
+ NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
+ NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
+ NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_8
+};
+
+DECLARE_ALIGNED(16, static const int16_t, LEFT_CORNER_NEIGHBORS_PLUS_4[8]) = {
+ NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10
+};
+
+DECLARE_ALIGNED(16, static const int16_t, RIGHT_CORNER_NEIGHBORS_PLUS_4[8]) = {
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_8
+};
+
+DECLARE_ALIGNED(16, static const int16_t, LEFT_EDGE_NEIGHBORS_PLUS_4[8]) = {
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
+ NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
+ NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13
+};
+
+DECLARE_ALIGNED(16, static const int16_t, RIGHT_EDGE_NEIGHBORS_PLUS_4[8]) = {
+ NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
+ NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
+ NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_10
+};
+
+DECLARE_ALIGNED(16, static const int16_t, MIDDLE_EDGE_NEIGHBORS_PLUS_4[8]) = {
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10
+};
+
+DECLARE_ALIGNED(16, static const int16_t, MIDDLE_CENTER_NEIGHBORS_PLUS_4[8]) = {
+ NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
+ NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
+ NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13
+};
+
+DECLARE_ALIGNED(16, static const int16_t, TWO_CORNER_NEIGHBORS_PLUS_4[8]) = {
+ NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_8
+};
+
+DECLARE_ALIGNED(16, static const int16_t, TWO_EDGE_NEIGHBORS_PLUS_4[8]) = {
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
+ NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
+ NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_10
+};
+
+static const int16_t *const LUMA_LEFT_COLUMN_NEIGHBORS[2] = {
+ LEFT_CORNER_NEIGHBORS_PLUS_2, LEFT_EDGE_NEIGHBORS_PLUS_2
+};
+
+static const int16_t *const LUMA_MIDDLE_COLUMN_NEIGHBORS[2] = {
+ MIDDLE_EDGE_NEIGHBORS_PLUS_2, MIDDLE_CENTER_NEIGHBORS_PLUS_2
+};
+
+static const int16_t *const LUMA_RIGHT_COLUMN_NEIGHBORS[2] = {
+ RIGHT_CORNER_NEIGHBORS_PLUS_2, RIGHT_EDGE_NEIGHBORS_PLUS_2
+};
+
+static const int16_t *const CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS[2] = {
+ LEFT_CORNER_NEIGHBORS_PLUS_1, LEFT_EDGE_NEIGHBORS_PLUS_1
+};
+
+static const int16_t *const CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS[2] = {
+ MIDDLE_EDGE_NEIGHBORS_PLUS_1, MIDDLE_CENTER_NEIGHBORS_PLUS_1
+};
+
+static const int16_t *const CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS[2] = {
+ RIGHT_CORNER_NEIGHBORS_PLUS_1, RIGHT_EDGE_NEIGHBORS_PLUS_1
+};
+
+static const int16_t *const CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS[2] = {
+ LEFT_CORNER_NEIGHBORS_PLUS_2, LEFT_EDGE_NEIGHBORS_PLUS_2
+};
+
+static const int16_t *const CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = {
+ MIDDLE_EDGE_NEIGHBORS_PLUS_2, MIDDLE_CENTER_NEIGHBORS_PLUS_2
+};
+
+static const int16_t *const CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS[2] = {
+ RIGHT_CORNER_NEIGHBORS_PLUS_2, RIGHT_EDGE_NEIGHBORS_PLUS_2
+};
+
+static const int16_t *const CHROMA_SINGLE_SS_SINGLE_COLUMN_NEIGHBORS[2] = {
+ TWO_CORNER_NEIGHBORS_PLUS_2, TWO_EDGE_NEIGHBORS_PLUS_2
+};
+
+static const int16_t *const CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS[2] = {
+ LEFT_CORNER_NEIGHBORS_PLUS_4, LEFT_EDGE_NEIGHBORS_PLUS_4
+};
+
+static const int16_t *const CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = {
+ MIDDLE_EDGE_NEIGHBORS_PLUS_4, MIDDLE_CENTER_NEIGHBORS_PLUS_4
+};
+
+static const int16_t *const CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS[2] = {
+ RIGHT_CORNER_NEIGHBORS_PLUS_4, RIGHT_EDGE_NEIGHBORS_PLUS_4
+};
+
+static const int16_t *const CHROMA_DOUBLE_SS_SINGLE_COLUMN_NEIGHBORS[2] = {
+ TWO_CORNER_NEIGHBORS_PLUS_4, TWO_EDGE_NEIGHBORS_PLUS_4
+};
+
+#if CONFIG_VP9_HIGHBITDEPTH
+#define HIGHBD_NEIGHBOR_CONSTANT_4 (uint32_t)3221225472U
+#define HIGHBD_NEIGHBOR_CONSTANT_5 (uint32_t)2576980378U
+#define HIGHBD_NEIGHBOR_CONSTANT_6 (uint32_t)2147483648U
+#define HIGHBD_NEIGHBOR_CONSTANT_7 (uint32_t)1840700270U
+#define HIGHBD_NEIGHBOR_CONSTANT_8 (uint32_t)1610612736U
+#define HIGHBD_NEIGHBOR_CONSTANT_9 (uint32_t)1431655766U
+#define HIGHBD_NEIGHBOR_CONSTANT_10 (uint32_t)1288490189U
+#define HIGHBD_NEIGHBOR_CONSTANT_11 (uint32_t)1171354718U
+#define HIGHBD_NEIGHBOR_CONSTANT_13 (uint32_t)991146300U
+
+DECLARE_ALIGNED(16, static const uint32_t,
+ HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_1[4]) = {
+ HIGHBD_NEIGHBOR_CONSTANT_5, HIGHBD_NEIGHBOR_CONSTANT_7,
+ HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_7
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+ HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_1[4]) = {
+ HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_7,
+ HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_5
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+ HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_1[4]) = {
+ HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_10,
+ HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+ HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_1[4]) = {
+ HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10,
+ HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_7
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+ HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_1[4]) = {
+ HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_7,
+ HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_7
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+ HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_1[4]) = {
+ HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10,
+ HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+ HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_2[4]) = {
+ HIGHBD_NEIGHBOR_CONSTANT_6, HIGHBD_NEIGHBOR_CONSTANT_8,
+ HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_8
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+ HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_2[4]) = {
+ HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_8,
+ HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_6
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+ HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_2[4]) = {
+ HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_11,
+ HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_11
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+ HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_2[4]) = {
+ HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_11,
+ HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_8
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+ HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_2[4]) = {
+ HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_8,
+ HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_8
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+ HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_2[4]) = {
+ HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_11,
+ HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_11
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+ HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_4[4]) = {
+ HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_10,
+ HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+ HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_4[4]) = {
+ HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10,
+ HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_8
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+ HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_4[4]) = {
+ HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_13,
+ HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_13
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+ HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_4[4]) = {
+ HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_13,
+ HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_10
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+ HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_4[4]) = {
+ HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10,
+ HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+ HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_4[4]) = {
+ HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_13,
+ HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_13
+};
+
+static const uint32_t *const HIGHBD_LUMA_LEFT_COLUMN_NEIGHBORS[2] = {
+ HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_2, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_2
+};
+
+static const uint32_t *const HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS[2] = {
+ HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_2, HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_2
+};
+
+static const uint32_t *const HIGHBD_LUMA_RIGHT_COLUMN_NEIGHBORS[2] = {
+ HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_2, HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_2
+};
+
+static const uint32_t *const HIGHBD_CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS[2] = {
+ HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_1, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_1
+};
+
+static const uint32_t *const HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS[2] = {
+ HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_1, HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_1
+};
+
+static const uint32_t *const HIGHBD_CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS[2] = {
+ HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_1, HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_1
+};
+
+static const uint32_t
+ *const HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS[2] = {
+ HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_2, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_2
+ };
+
+static const uint32_t
+ *const HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = {
+ HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_2, HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_2
+ };
+
+static const uint32_t
+ *const HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS[2] = {
+ HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_2, HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_2
+ };
+
+static const uint32_t
+ *const HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS[2] = {
+ HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_4, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_4
+ };
+
+static const uint32_t
+ *const HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = {
+ HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_4, HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_4
+ };
+
+static const uint32_t
+ *const HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS[2] = {
+ HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_4, HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_4
+ };
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+#define DIST_STRIDE ((BW) + 2)
+
+#endif // VPX_VP9_ENCODER_TEMPORAL_FILTER_CONSTANTS_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_tokenize.c b/media/libvpx/libvpx/vp9/encoder/vp9_tokenize.c
new file mode 100644
index 0000000000..6c6c04493f
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_tokenize.c
@@ -0,0 +1,490 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "vpx_mem/vpx_mem.h"
+
+#include "vp9/common/vp9_entropy.h"
+#include "vp9/common/vp9_pred_common.h"
+#include "vp9/common/vp9_scan.h"
+
+#include "vp9/encoder/vp9_cost.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_tokenize.h"
+
+static const TOKENVALUE dct_cat_lt_10_value_tokens[] = {
+ { 9, 63 }, { 9, 61 }, { 9, 59 }, { 9, 57 }, { 9, 55 }, { 9, 53 }, { 9, 51 },
+ { 9, 49 }, { 9, 47 }, { 9, 45 }, { 9, 43 }, { 9, 41 }, { 9, 39 }, { 9, 37 },
+ { 9, 35 }, { 9, 33 }, { 9, 31 }, { 9, 29 }, { 9, 27 }, { 9, 25 }, { 9, 23 },
+ { 9, 21 }, { 9, 19 }, { 9, 17 }, { 9, 15 }, { 9, 13 }, { 9, 11 }, { 9, 9 },
+ { 9, 7 }, { 9, 5 }, { 9, 3 }, { 9, 1 }, { 8, 31 }, { 8, 29 }, { 8, 27 },
+ { 8, 25 }, { 8, 23 }, { 8, 21 }, { 8, 19 }, { 8, 17 }, { 8, 15 }, { 8, 13 },
+ { 8, 11 }, { 8, 9 }, { 8, 7 }, { 8, 5 }, { 8, 3 }, { 8, 1 }, { 7, 15 },
+ { 7, 13 }, { 7, 11 }, { 7, 9 }, { 7, 7 }, { 7, 5 }, { 7, 3 }, { 7, 1 },
+ { 6, 7 }, { 6, 5 }, { 6, 3 }, { 6, 1 }, { 5, 3 }, { 5, 1 }, { 4, 1 },
+ { 3, 1 }, { 2, 1 }, { 1, 1 }, { 0, 0 }, { 1, 0 }, { 2, 0 }, { 3, 0 },
+ { 4, 0 }, { 5, 0 }, { 5, 2 }, { 6, 0 }, { 6, 2 }, { 6, 4 }, { 6, 6 },
+ { 7, 0 }, { 7, 2 }, { 7, 4 }, { 7, 6 }, { 7, 8 }, { 7, 10 }, { 7, 12 },
+ { 7, 14 }, { 8, 0 }, { 8, 2 }, { 8, 4 }, { 8, 6 }, { 8, 8 }, { 8, 10 },
+ { 8, 12 }, { 8, 14 }, { 8, 16 }, { 8, 18 }, { 8, 20 }, { 8, 22 }, { 8, 24 },
+ { 8, 26 }, { 8, 28 }, { 8, 30 }, { 9, 0 }, { 9, 2 }, { 9, 4 }, { 9, 6 },
+ { 9, 8 }, { 9, 10 }, { 9, 12 }, { 9, 14 }, { 9, 16 }, { 9, 18 }, { 9, 20 },
+ { 9, 22 }, { 9, 24 }, { 9, 26 }, { 9, 28 }, { 9, 30 }, { 9, 32 }, { 9, 34 },
+ { 9, 36 }, { 9, 38 }, { 9, 40 }, { 9, 42 }, { 9, 44 }, { 9, 46 }, { 9, 48 },
+ { 9, 50 }, { 9, 52 }, { 9, 54 }, { 9, 56 }, { 9, 58 }, { 9, 60 }, { 9, 62 }
+};
+const TOKENVALUE *vp9_dct_cat_lt_10_value_tokens =
+ dct_cat_lt_10_value_tokens +
+ (sizeof(dct_cat_lt_10_value_tokens) / sizeof(*dct_cat_lt_10_value_tokens)) /
+ 2;
+// The corresponding costs of the extrabits for the tokens in the above table
+// are stored in the table below. The values are obtained from looking up the
+// entry for the specified extrabits in the table corresponding to the token
+// (as defined in the cost element of vp9_extra_bits).
+// e.g. {9, 63} maps to cat5_cost[63 >> 1], {1, 1} maps to sign_cost[1 >> 1]
+static const int dct_cat_lt_10_value_cost[] = {
+ 3773, 3750, 3704, 3681, 3623, 3600, 3554, 3531, 3432, 3409, 3363, 3340, 3282,
+ 3259, 3213, 3190, 3136, 3113, 3067, 3044, 2986, 2963, 2917, 2894, 2795, 2772,
+ 2726, 2703, 2645, 2622, 2576, 2553, 3197, 3116, 3058, 2977, 2881, 2800, 2742,
+ 2661, 2615, 2534, 2476, 2395, 2299, 2218, 2160, 2079, 2566, 2427, 2334, 2195,
+ 2023, 1884, 1791, 1652, 1893, 1696, 1453, 1256, 1229, 864, 512, 512, 512,
+ 512, 0, 512, 512, 512, 512, 864, 1229, 1256, 1453, 1696, 1893, 1652,
+ 1791, 1884, 2023, 2195, 2334, 2427, 2566, 2079, 2160, 2218, 2299, 2395, 2476,
+ 2534, 2615, 2661, 2742, 2800, 2881, 2977, 3058, 3116, 3197, 2553, 2576, 2622,
+ 2645, 2703, 2726, 2772, 2795, 2894, 2917, 2963, 2986, 3044, 3067, 3113, 3136,
+ 3190, 3213, 3259, 3282, 3340, 3363, 3409, 3432, 3531, 3554, 3600, 3623, 3681,
+ 3704, 3750, 3773,
+};
+const int *vp9_dct_cat_lt_10_value_cost =
+ dct_cat_lt_10_value_cost +
+ (sizeof(dct_cat_lt_10_value_cost) / sizeof(*dct_cat_lt_10_value_cost)) / 2;
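
Editor's note: both exported pointers above are offset to the middle element of their tables, so callers can index them directly with a signed coefficient value; the tables cover values from -66 to 66, i.e. everything below the CATEGORY6 range. A hedged usage sketch (the helper below is illustrative, not a real libvpx function; field names follow the TOKENVALUE initializers shown above):

```c
/* Illustration only: direct signed indexing into the centred tables.
 * Valid for coefficient values v with |v| <= 66 (tokens below CATEGORY6). */
static void small_coeff_token_and_cost(int v, int16_t *token, int *extra_cost) {
  const TOKENVALUE tv = vp9_dct_cat_lt_10_value_tokens[v];
  *token = tv.token;                              /* e.g. ONE_TOKEN, CATEGORY5_TOKEN */
  *extra_cost = vp9_dct_cat_lt_10_value_cost[v];  /* cost of sign + extra bits */
}
```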
+
+// Array indices are identical to previously-existing CONTEXT_NODE indices
+/* clang-format off */
+const vpx_tree_index vp9_coef_tree[TREE_SIZE(ENTROPY_TOKENS)] = {
+ -EOB_TOKEN, 2, // 0 = EOB
+ -ZERO_TOKEN, 4, // 1 = ZERO
+ -ONE_TOKEN, 6, // 2 = ONE
+ 8, 12, // 3 = LOW_VAL
+ -TWO_TOKEN, 10, // 4 = TWO
+ -THREE_TOKEN, -FOUR_TOKEN, // 5 = THREE
+ 14, 16, // 6 = HIGH_LOW
+ -CATEGORY1_TOKEN, -CATEGORY2_TOKEN, // 7 = CAT_ONE
+ 18, 20, // 8 = CAT_THREEFOUR
+ -CATEGORY3_TOKEN, -CATEGORY4_TOKEN, // 9 = CAT_THREE
+ -CATEGORY5_TOKEN, -CATEGORY6_TOKEN // 10 = CAT_FIVE
+};
+/* clang-format on */
+
+static const int16_t zero_cost[] = { 0 };
+static const int16_t sign_cost[1] = { 512 };
+static const int16_t cat1_cost[1 << 1] = { 864, 1229 };
+static const int16_t cat2_cost[1 << 2] = { 1256, 1453, 1696, 1893 };
+static const int16_t cat3_cost[1 << 3] = { 1652, 1791, 1884, 2023,
+ 2195, 2334, 2427, 2566 };
+static const int16_t cat4_cost[1 << 4] = { 2079, 2160, 2218, 2299, 2395, 2476,
+ 2534, 2615, 2661, 2742, 2800, 2881,
+ 2977, 3058, 3116, 3197 };
+static const int16_t cat5_cost[1 << 5] = {
+ 2553, 2576, 2622, 2645, 2703, 2726, 2772, 2795, 2894, 2917, 2963,
+ 2986, 3044, 3067, 3113, 3136, 3190, 3213, 3259, 3282, 3340, 3363,
+ 3409, 3432, 3531, 3554, 3600, 3623, 3681, 3704, 3750, 3773
+};
+const int16_t vp9_cat6_low_cost[256] = {
+ 3378, 3390, 3401, 3413, 3435, 3447, 3458, 3470, 3517, 3529, 3540, 3552, 3574,
+ 3586, 3597, 3609, 3671, 3683, 3694, 3706, 3728, 3740, 3751, 3763, 3810, 3822,
+ 3833, 3845, 3867, 3879, 3890, 3902, 3973, 3985, 3996, 4008, 4030, 4042, 4053,
+ 4065, 4112, 4124, 4135, 4147, 4169, 4181, 4192, 4204, 4266, 4278, 4289, 4301,
+ 4323, 4335, 4346, 4358, 4405, 4417, 4428, 4440, 4462, 4474, 4485, 4497, 4253,
+ 4265, 4276, 4288, 4310, 4322, 4333, 4345, 4392, 4404, 4415, 4427, 4449, 4461,
+ 4472, 4484, 4546, 4558, 4569, 4581, 4603, 4615, 4626, 4638, 4685, 4697, 4708,
+ 4720, 4742, 4754, 4765, 4777, 4848, 4860, 4871, 4883, 4905, 4917, 4928, 4940,
+ 4987, 4999, 5010, 5022, 5044, 5056, 5067, 5079, 5141, 5153, 5164, 5176, 5198,
+ 5210, 5221, 5233, 5280, 5292, 5303, 5315, 5337, 5349, 5360, 5372, 4988, 5000,
+ 5011, 5023, 5045, 5057, 5068, 5080, 5127, 5139, 5150, 5162, 5184, 5196, 5207,
+ 5219, 5281, 5293, 5304, 5316, 5338, 5350, 5361, 5373, 5420, 5432, 5443, 5455,
+ 5477, 5489, 5500, 5512, 5583, 5595, 5606, 5618, 5640, 5652, 5663, 5675, 5722,
+ 5734, 5745, 5757, 5779, 5791, 5802, 5814, 5876, 5888, 5899, 5911, 5933, 5945,
+ 5956, 5968, 6015, 6027, 6038, 6050, 6072, 6084, 6095, 6107, 5863, 5875, 5886,
+ 5898, 5920, 5932, 5943, 5955, 6002, 6014, 6025, 6037, 6059, 6071, 6082, 6094,
+ 6156, 6168, 6179, 6191, 6213, 6225, 6236, 6248, 6295, 6307, 6318, 6330, 6352,
+ 6364, 6375, 6387, 6458, 6470, 6481, 6493, 6515, 6527, 6538, 6550, 6597, 6609,
+ 6620, 6632, 6654, 6666, 6677, 6689, 6751, 6763, 6774, 6786, 6808, 6820, 6831,
+ 6843, 6890, 6902, 6913, 6925, 6947, 6959, 6970, 6982
+};
+const uint16_t vp9_cat6_high_cost[64] = {
+ 88, 2251, 2727, 4890, 3148, 5311, 5787, 7950, 3666, 5829, 6305,
+ 8468, 6726, 8889, 9365, 11528, 3666, 5829, 6305, 8468, 6726, 8889,
+ 9365, 11528, 7244, 9407, 9883, 12046, 10304, 12467, 12943, 15106, 3666,
+ 5829, 6305, 8468, 6726, 8889, 9365, 11528, 7244, 9407, 9883, 12046,
+ 10304, 12467, 12943, 15106, 7244, 9407, 9883, 12046, 10304, 12467, 12943,
+ 15106, 10822, 12985, 13461, 15624, 13882, 16045, 16521, 18684
+};
+
+#if CONFIG_VP9_HIGHBITDEPTH
+const uint16_t vp9_cat6_high10_high_cost[256] = {
+ 94, 2257, 2733, 4896, 3154, 5317, 5793, 7956, 3672, 5835, 6311,
+ 8474, 6732, 8895, 9371, 11534, 3672, 5835, 6311, 8474, 6732, 8895,
+ 9371, 11534, 7250, 9413, 9889, 12052, 10310, 12473, 12949, 15112, 3672,
+ 5835, 6311, 8474, 6732, 8895, 9371, 11534, 7250, 9413, 9889, 12052,
+ 10310, 12473, 12949, 15112, 7250, 9413, 9889, 12052, 10310, 12473, 12949,
+ 15112, 10828, 12991, 13467, 15630, 13888, 16051, 16527, 18690, 4187, 6350,
+ 6826, 8989, 7247, 9410, 9886, 12049, 7765, 9928, 10404, 12567, 10825,
+ 12988, 13464, 15627, 7765, 9928, 10404, 12567, 10825, 12988, 13464, 15627,
+ 11343, 13506, 13982, 16145, 14403, 16566, 17042, 19205, 7765, 9928, 10404,
+ 12567, 10825, 12988, 13464, 15627, 11343, 13506, 13982, 16145, 14403, 16566,
+ 17042, 19205, 11343, 13506, 13982, 16145, 14403, 16566, 17042, 19205, 14921,
+ 17084, 17560, 19723, 17981, 20144, 20620, 22783, 4187, 6350, 6826, 8989,
+ 7247, 9410, 9886, 12049, 7765, 9928, 10404, 12567, 10825, 12988, 13464,
+ 15627, 7765, 9928, 10404, 12567, 10825, 12988, 13464, 15627, 11343, 13506,
+ 13982, 16145, 14403, 16566, 17042, 19205, 7765, 9928, 10404, 12567, 10825,
+ 12988, 13464, 15627, 11343, 13506, 13982, 16145, 14403, 16566, 17042, 19205,
+ 11343, 13506, 13982, 16145, 14403, 16566, 17042, 19205, 14921, 17084, 17560,
+ 19723, 17981, 20144, 20620, 22783, 8280, 10443, 10919, 13082, 11340, 13503,
+ 13979, 16142, 11858, 14021, 14497, 16660, 14918, 17081, 17557, 19720, 11858,
+ 14021, 14497, 16660, 14918, 17081, 17557, 19720, 15436, 17599, 18075, 20238,
+ 18496, 20659, 21135, 23298, 11858, 14021, 14497, 16660, 14918, 17081, 17557,
+ 19720, 15436, 17599, 18075, 20238, 18496, 20659, 21135, 23298, 15436, 17599,
+ 18075, 20238, 18496, 20659, 21135, 23298, 19014, 21177, 21653, 23816, 22074,
+ 24237, 24713, 26876
+};
+const uint16_t vp9_cat6_high12_high_cost[1024] = {
+ 100, 2263, 2739, 4902, 3160, 5323, 5799, 7962, 3678, 5841, 6317,
+ 8480, 6738, 8901, 9377, 11540, 3678, 5841, 6317, 8480, 6738, 8901,
+ 9377, 11540, 7256, 9419, 9895, 12058, 10316, 12479, 12955, 15118, 3678,
+ 5841, 6317, 8480, 6738, 8901, 9377, 11540, 7256, 9419, 9895, 12058,
+ 10316, 12479, 12955, 15118, 7256, 9419, 9895, 12058, 10316, 12479, 12955,
+ 15118, 10834, 12997, 13473, 15636, 13894, 16057, 16533, 18696, 4193, 6356,
+ 6832, 8995, 7253, 9416, 9892, 12055, 7771, 9934, 10410, 12573, 10831,
+ 12994, 13470, 15633, 7771, 9934, 10410, 12573, 10831, 12994, 13470, 15633,
+ 11349, 13512, 13988, 16151, 14409, 16572, 17048, 19211, 7771, 9934, 10410,
+ 12573, 10831, 12994, 13470, 15633, 11349, 13512, 13988, 16151, 14409, 16572,
+ 17048, 19211, 11349, 13512, 13988, 16151, 14409, 16572, 17048, 19211, 14927,
+ 17090, 17566, 19729, 17987, 20150, 20626, 22789, 4193, 6356, 6832, 8995,
+ 7253, 9416, 9892, 12055, 7771, 9934, 10410, 12573, 10831, 12994, 13470,
+ 15633, 7771, 9934, 10410, 12573, 10831, 12994, 13470, 15633, 11349, 13512,
+ 13988, 16151, 14409, 16572, 17048, 19211, 7771, 9934, 10410, 12573, 10831,
+ 12994, 13470, 15633, 11349, 13512, 13988, 16151, 14409, 16572, 17048, 19211,
+ 11349, 13512, 13988, 16151, 14409, 16572, 17048, 19211, 14927, 17090, 17566,
+ 19729, 17987, 20150, 20626, 22789, 8286, 10449, 10925, 13088, 11346, 13509,
+ 13985, 16148, 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, 11864,
+ 14027, 14503, 16666, 14924, 17087, 17563, 19726, 15442, 17605, 18081, 20244,
+ 18502, 20665, 21141, 23304, 11864, 14027, 14503, 16666, 14924, 17087, 17563,
+ 19726, 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304, 15442, 17605,
+ 18081, 20244, 18502, 20665, 21141, 23304, 19020, 21183, 21659, 23822, 22080,
+ 24243, 24719, 26882, 4193, 6356, 6832, 8995, 7253, 9416, 9892, 12055,
+ 7771, 9934, 10410, 12573, 10831, 12994, 13470, 15633, 7771, 9934, 10410,
+ 12573, 10831, 12994, 13470, 15633, 11349, 13512, 13988, 16151, 14409, 16572,
+ 17048, 19211, 7771, 9934, 10410, 12573, 10831, 12994, 13470, 15633, 11349,
+ 13512, 13988, 16151, 14409, 16572, 17048, 19211, 11349, 13512, 13988, 16151,
+ 14409, 16572, 17048, 19211, 14927, 17090, 17566, 19729, 17987, 20150, 20626,
+ 22789, 8286, 10449, 10925, 13088, 11346, 13509, 13985, 16148, 11864, 14027,
+ 14503, 16666, 14924, 17087, 17563, 19726, 11864, 14027, 14503, 16666, 14924,
+ 17087, 17563, 19726, 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304,
+ 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, 15442, 17605, 18081,
+ 20244, 18502, 20665, 21141, 23304, 15442, 17605, 18081, 20244, 18502, 20665,
+ 21141, 23304, 19020, 21183, 21659, 23822, 22080, 24243, 24719, 26882, 8286,
+ 10449, 10925, 13088, 11346, 13509, 13985, 16148, 11864, 14027, 14503, 16666,
+ 14924, 17087, 17563, 19726, 11864, 14027, 14503, 16666, 14924, 17087, 17563,
+ 19726, 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304, 11864, 14027,
+ 14503, 16666, 14924, 17087, 17563, 19726, 15442, 17605, 18081, 20244, 18502,
+ 20665, 21141, 23304, 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304,
+ 19020, 21183, 21659, 23822, 22080, 24243, 24719, 26882, 12379, 14542, 15018,
+ 17181, 15439, 17602, 18078, 20241, 15957, 18120, 18596, 20759, 19017, 21180,
+ 21656, 23819, 15957, 18120, 18596, 20759, 19017, 21180, 21656, 23819, 19535,
+ 21698, 22174, 24337, 22595, 24758, 25234, 27397, 15957, 18120, 18596, 20759,
+ 19017, 21180, 21656, 23819, 19535, 21698, 22174, 24337, 22595, 24758, 25234,
+ 27397, 19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397, 23113, 25276,
+ 25752, 27915, 26173, 28336, 28812, 30975, 4193, 6356, 6832, 8995, 7253,
+ 9416, 9892, 12055, 7771, 9934, 10410, 12573, 10831, 12994, 13470, 15633,
+ 7771, 9934, 10410, 12573, 10831, 12994, 13470, 15633, 11349, 13512, 13988,
+ 16151, 14409, 16572, 17048, 19211, 7771, 9934, 10410, 12573, 10831, 12994,
+ 13470, 15633, 11349, 13512, 13988, 16151, 14409, 16572, 17048, 19211, 11349,
+ 13512, 13988, 16151, 14409, 16572, 17048, 19211, 14927, 17090, 17566, 19729,
+ 17987, 20150, 20626, 22789, 8286, 10449, 10925, 13088, 11346, 13509, 13985,
+ 16148, 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, 11864, 14027,
+ 14503, 16666, 14924, 17087, 17563, 19726, 15442, 17605, 18081, 20244, 18502,
+ 20665, 21141, 23304, 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726,
+ 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304, 15442, 17605, 18081,
+ 20244, 18502, 20665, 21141, 23304, 19020, 21183, 21659, 23822, 22080, 24243,
+ 24719, 26882, 8286, 10449, 10925, 13088, 11346, 13509, 13985, 16148, 11864,
+ 14027, 14503, 16666, 14924, 17087, 17563, 19726, 11864, 14027, 14503, 16666,
+ 14924, 17087, 17563, 19726, 15442, 17605, 18081, 20244, 18502, 20665, 21141,
+ 23304, 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, 15442, 17605,
+ 18081, 20244, 18502, 20665, 21141, 23304, 15442, 17605, 18081, 20244, 18502,
+ 20665, 21141, 23304, 19020, 21183, 21659, 23822, 22080, 24243, 24719, 26882,
+ 12379, 14542, 15018, 17181, 15439, 17602, 18078, 20241, 15957, 18120, 18596,
+ 20759, 19017, 21180, 21656, 23819, 15957, 18120, 18596, 20759, 19017, 21180,
+ 21656, 23819, 19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397, 15957,
+ 18120, 18596, 20759, 19017, 21180, 21656, 23819, 19535, 21698, 22174, 24337,
+ 22595, 24758, 25234, 27397, 19535, 21698, 22174, 24337, 22595, 24758, 25234,
+ 27397, 23113, 25276, 25752, 27915, 26173, 28336, 28812, 30975, 8286, 10449,
+ 10925, 13088, 11346, 13509, 13985, 16148, 11864, 14027, 14503, 16666, 14924,
+ 17087, 17563, 19726, 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726,
+ 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304, 11864, 14027, 14503,
+ 16666, 14924, 17087, 17563, 19726, 15442, 17605, 18081, 20244, 18502, 20665,
+ 21141, 23304, 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304, 19020,
+ 21183, 21659, 23822, 22080, 24243, 24719, 26882, 12379, 14542, 15018, 17181,
+ 15439, 17602, 18078, 20241, 15957, 18120, 18596, 20759, 19017, 21180, 21656,
+ 23819, 15957, 18120, 18596, 20759, 19017, 21180, 21656, 23819, 19535, 21698,
+ 22174, 24337, 22595, 24758, 25234, 27397, 15957, 18120, 18596, 20759, 19017,
+ 21180, 21656, 23819, 19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397,
+ 19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397, 23113, 25276, 25752,
+ 27915, 26173, 28336, 28812, 30975, 12379, 14542, 15018, 17181, 15439, 17602,
+ 18078, 20241, 15957, 18120, 18596, 20759, 19017, 21180, 21656, 23819, 15957,
+ 18120, 18596, 20759, 19017, 21180, 21656, 23819, 19535, 21698, 22174, 24337,
+ 22595, 24758, 25234, 27397, 15957, 18120, 18596, 20759, 19017, 21180, 21656,
+ 23819, 19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397, 19535, 21698,
+ 22174, 24337, 22595, 24758, 25234, 27397, 23113, 25276, 25752, 27915, 26173,
+ 28336, 28812, 30975, 16472, 18635, 19111, 21274, 19532, 21695, 22171, 24334,
+ 20050, 22213, 22689, 24852, 23110, 25273, 25749, 27912, 20050, 22213, 22689,
+ 24852, 23110, 25273, 25749, 27912, 23628, 25791, 26267, 28430, 26688, 28851,
+ 29327, 31490, 20050, 22213, 22689, 24852, 23110, 25273, 25749, 27912, 23628,
+ 25791, 26267, 28430, 26688, 28851, 29327, 31490, 23628, 25791, 26267, 28430,
+ 26688, 28851, 29327, 31490, 27206, 29369, 29845, 32008, 30266, 32429, 32905,
+ 35068
+};
+#endif
+
+const vp9_extra_bit vp9_extra_bits[ENTROPY_TOKENS] = {
+ { 0, 0, 0, zero_cost }, // ZERO_TOKEN
+ { 0, 0, 1, sign_cost }, // ONE_TOKEN
+ { 0, 0, 2, sign_cost }, // TWO_TOKEN
+ { 0, 0, 3, sign_cost }, // THREE_TOKEN
+ { 0, 0, 4, sign_cost }, // FOUR_TOKEN
+ { vp9_cat1_prob, 1, CAT1_MIN_VAL, cat1_cost }, // CATEGORY1_TOKEN
+ { vp9_cat2_prob, 2, CAT2_MIN_VAL, cat2_cost }, // CATEGORY2_TOKEN
+ { vp9_cat3_prob, 3, CAT3_MIN_VAL, cat3_cost }, // CATEGORY3_TOKEN
+ { vp9_cat4_prob, 4, CAT4_MIN_VAL, cat4_cost }, // CATEGORY4_TOKEN
+ { vp9_cat5_prob, 5, CAT5_MIN_VAL, cat5_cost }, // CATEGORY5_TOKEN
+ { vp9_cat6_prob, 14, CAT6_MIN_VAL, 0 }, // CATEGORY6_TOKEN
+ { 0, 0, 0, zero_cost } // EOB_TOKEN
+};
+
+#if CONFIG_VP9_HIGHBITDEPTH
+const vp9_extra_bit vp9_extra_bits_high10[ENTROPY_TOKENS] = {
+ { 0, 0, 0, zero_cost }, // ZERO
+ { 0, 0, 1, sign_cost }, // ONE
+ { 0, 0, 2, sign_cost }, // TWO
+ { 0, 0, 3, sign_cost }, // THREE
+ { 0, 0, 4, sign_cost }, // FOUR
+ { vp9_cat1_prob, 1, CAT1_MIN_VAL, cat1_cost }, // CAT1
+ { vp9_cat2_prob, 2, CAT2_MIN_VAL, cat2_cost }, // CAT2
+ { vp9_cat3_prob, 3, CAT3_MIN_VAL, cat3_cost }, // CAT3
+ { vp9_cat4_prob, 4, CAT4_MIN_VAL, cat4_cost }, // CAT4
+ { vp9_cat5_prob, 5, CAT5_MIN_VAL, cat5_cost }, // CAT5
+ { vp9_cat6_prob_high12 + 2, 16, CAT6_MIN_VAL, 0 }, // CAT6
+ { 0, 0, 0, zero_cost } // EOB
+};
+const vp9_extra_bit vp9_extra_bits_high12[ENTROPY_TOKENS] = {
+ { 0, 0, 0, zero_cost }, // ZERO
+ { 0, 0, 1, sign_cost }, // ONE
+ { 0, 0, 2, sign_cost }, // TWO
+ { 0, 0, 3, sign_cost }, // THREE
+ { 0, 0, 4, sign_cost }, // FOUR
+ { vp9_cat1_prob, 1, CAT1_MIN_VAL, cat1_cost }, // CAT1
+ { vp9_cat2_prob, 2, CAT2_MIN_VAL, cat2_cost }, // CAT2
+ { vp9_cat3_prob, 3, CAT3_MIN_VAL, cat3_cost }, // CAT3
+ { vp9_cat4_prob, 4, CAT4_MIN_VAL, cat4_cost }, // CAT4
+ { vp9_cat5_prob, 5, CAT5_MIN_VAL, cat5_cost }, // CAT5
+ { vp9_cat6_prob_high12, 18, CAT6_MIN_VAL, 0 }, // CAT6
+ { 0, 0, 0, zero_cost } // EOB
+};
+#endif
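+
+/* Editor's note -- an illustrative walk-through, not part of the upstream
+ * change: each coefficient magnitude maps to one token plus "extra" bits.
+ * Magnitudes 0..4 are coded directly by ZERO..FOUR_TOKEN (plus a sign bit
+ * for the non-zero ones). Larger magnitudes fall into categories whose base
+ * values are CAT1_MIN_VAL=5, CAT2=7, CAT3=11, CAT4=19, CAT5=35 and CAT6=67,
+ * with 1, 2, 3, 4, 5 and 14 extra magnitude bits respectively (16/18 bits
+ * for the 10/12-bit tables above). For example, a quantized value of -12
+ * falls into CATEGORY3_TOKEN: base 11, the extra magnitude bits code
+ * 12 - 11 = 1, and one more bit codes the negative sign. */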
+
+const struct vp9_token vp9_coef_encodings[ENTROPY_TOKENS] = {
+ { 2, 2 }, { 6, 3 }, { 28, 5 }, { 58, 6 }, { 59, 6 }, { 60, 6 },
+ { 61, 6 }, { 124, 7 }, { 125, 7 }, { 126, 7 }, { 127, 7 }, { 0, 1 }
+};
+
+struct tokenize_b_args {
+ VP9_COMP *cpi;
+ ThreadData *td;
+ TOKENEXTRA **tp;
+};
+
+static void set_entropy_context_b(int plane, int block, int row, int col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ void *arg) {
+ struct tokenize_b_args *const args = arg;
+ ThreadData *const td = args->td;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblock_plane *p = &x->plane[plane];
+ struct macroblockd_plane *pd = &xd->plane[plane];
+ vp9_set_contexts(xd, pd, plane_bsize, tx_size, p->eobs[block] > 0, col, row);
+}
+
+static INLINE void add_token(TOKENEXTRA **t, const vpx_prob *context_tree,
+ int16_t token, EXTRABIT extra,
+ unsigned int *counts) {
+ (*t)->context_tree = context_tree;
+ (*t)->token = token;
+ (*t)->extra = extra;
+ (*t)++;
+ ++counts[token];
+}
+
+static INLINE void add_token_no_extra(TOKENEXTRA **t,
+ const vpx_prob *context_tree,
+ int16_t token, unsigned int *counts) {
+ (*t)->context_tree = context_tree;
+ (*t)->token = token;
+ (*t)++;
+ ++counts[token];
+}
+
+static void tokenize_b(int plane, int block, int row, int col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) {
+ struct tokenize_b_args *const args = arg;
+ VP9_COMP *cpi = args->cpi;
+ ThreadData *const td = args->td;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ TOKENEXTRA **tp = args->tp;
+ uint8_t token_cache[32 * 32];
+ struct macroblock_plane *p = &x->plane[plane];
+ struct macroblockd_plane *pd = &xd->plane[plane];
+ MODE_INFO *mi = xd->mi[0];
+ int pt; /* near block/prev token context index */
+ int c;
+ TOKENEXTRA *t = *tp; /* store tokens starting here */
+ int eob = p->eobs[block];
+ const PLANE_TYPE type = get_plane_type(plane);
+ const tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ const int16_t *scan, *nb;
+ const ScanOrder *so;
+ const int ref = is_inter_block(mi);
+ unsigned int(*const counts)[COEFF_CONTEXTS][ENTROPY_TOKENS] =
+ td->rd_counts.coef_counts[tx_size][type][ref];
+ vpx_prob(*const coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] =
+ cpi->common.fc->coef_probs[tx_size][type][ref];
+ unsigned int(*const eob_branch)[COEFF_CONTEXTS] =
+ td->counts->eob_branch[tx_size][type][ref];
+ const uint8_t *const band = get_band_translate(tx_size);
+ const int tx_eob = 16 << (tx_size << 1);
+ int16_t token;
+ EXTRABIT extra;
+ pt = get_entropy_context(tx_size, pd->above_context + col,
+ pd->left_context + row);
+ so = get_scan(xd, tx_size, type, block);
+ scan = so->scan;
+ nb = so->neighbors;
+ c = 0;
+
+ while (c < eob) {
+ int v = 0;
+ v = qcoeff[scan[c]];
+ ++eob_branch[band[c]][pt];
+
+ while (!v) {
+ add_token_no_extra(&t, coef_probs[band[c]][pt], ZERO_TOKEN,
+ counts[band[c]][pt]);
+
+ token_cache[scan[c]] = 0;
+ ++c;
+ pt = get_coef_context(nb, token_cache, c);
+ v = qcoeff[scan[c]];
+ }
+
+ vp9_get_token_extra(v, &token, &extra);
+
+ add_token(&t, coef_probs[band[c]][pt], token, extra, counts[band[c]][pt]);
+
+ token_cache[scan[c]] = vp9_pt_energy_class[token];
+ ++c;
+ pt = get_coef_context(nb, token_cache, c);
+ }
+ if (c < tx_eob) {
+ ++eob_branch[band[c]][pt];
+ add_token_no_extra(&t, coef_probs[band[c]][pt], EOB_TOKEN,
+ counts[band[c]][pt]);
+ }
+
+ *tp = t;
+
+ vp9_set_contexts(xd, pd, plane_bsize, tx_size, c > 0, col, row);
+}
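+
+/* Editor's note -- illustrative sketch of the loop above, not part of the
+ * upstream change. With a 4x4 block (tx_eob = 16), eob = 3 and quantized
+ * coefficients {4, 0, -1} along the scan order, tokenize_b() emits:
+ *   c=0: FOUR_TOKEN  (context pt from the above/left entropy context)
+ *   c=1: ZERO_TOKEN  (pt recomputed from token_cache via the scan neighbors)
+ *   c=2: ONE_TOKEN   (with its sign carried in the extra bits)
+ * and, because c (= 3) < tx_eob, a closing EOB_TOKEN. token_cache stores
+ * vp9_pt_energy_class[token] per position, so later contexts depend on the
+ * energy of already-coded neighbors rather than on raw coefficient values. */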
+
+struct is_skippable_args {
+ uint16_t *eobs;
+ int *skippable;
+};
+
+static void is_skippable(int plane, int block, int row, int col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *argv) {
+ struct is_skippable_args *args = argv;
+ (void)plane;
+ (void)plane_bsize;
+ (void)tx_size;
+ (void)row;
+ (void)col;
+ args->skippable[0] &= (!args->eobs[block]);
+}
+
+// TODO(yaowu): rewrite and optimize this function to remove the usage of
+// vp9_foreach_transformed_block_in_plane() and simplify is_skippable().
+int vp9_is_skippable_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
+ int result = 1;
+ struct is_skippable_args args = { x->plane[plane].eobs, &result };
+ vp9_foreach_transformed_block_in_plane(&x->e_mbd, bsize, plane, is_skippable,
+ &args);
+ return result;
+}
+
+static void has_high_freq_coeff(int plane, int block, int row, int col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ void *argv) {
+ struct is_skippable_args *args = argv;
+ int eobs = (tx_size == TX_4X4) ? 3 : 10;
+ (void)plane;
+ (void)plane_bsize;
+ (void)row;
+ (void)col;
+ *(args->skippable) |= (args->eobs[block] > eobs);
+}
+
+int vp9_has_high_freq_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
+ int result = 0;
+ struct is_skippable_args args = { x->plane[plane].eobs, &result };
+ vp9_foreach_transformed_block_in_plane(&x->e_mbd, bsize, plane,
+ has_high_freq_coeff, &args);
+ return result;
+}
+
+void vp9_tokenize_sb(VP9_COMP *cpi, ThreadData *td, TOKENEXTRA **t, int dry_run,
+ int seg_skip, BLOCK_SIZE bsize) {
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MODE_INFO *const mi = xd->mi[0];
+ const int ctx = vp9_get_skip_context(xd);
+ struct tokenize_b_args arg = { cpi, td, t };
+
+ if (seg_skip) {
+ assert(mi->skip);
+ }
+
+ if (mi->skip) {
+ if (!dry_run && !seg_skip) ++td->counts->skip[ctx][1];
+ reset_skip_context(xd, bsize);
+ return;
+ }
+
+ if (!dry_run) {
+ ++td->counts->skip[ctx][0];
+ vp9_foreach_transformed_block(xd, bsize, tokenize_b, &arg);
+ } else {
+ vp9_foreach_transformed_block(xd, bsize, set_entropy_context_b, &arg);
+ }
+}
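+
+/* Editor's note (illustrative summary, not part of the upstream change):
+ * vp9_tokenize_sb() is the per-superblock entry point. A skipped block
+ * (mi->skip, asserted when the segment forces skip) produces no coefficient
+ * tokens and simply resets the above/left entropy contexts; otherwise every
+ * transform block is visited either by tokenize_b() (real tokenization plus
+ * count updates) or, on a dry run, by set_entropy_context_b(), which only
+ * refreshes the contexts so later decisions see a consistent state. */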
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_tokenize.h b/media/libvpx/libvpx/vp9/encoder/vp9_tokenize.h
new file mode 100644
index 0000000000..6407ff9237
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_tokenize.h
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_VP9_TOKENIZE_H_
+#define VPX_VP9_ENCODER_VP9_TOKENIZE_H_
+
+#include "vp9/common/vp9_entropy.h"
+
+#include "vp9/encoder/vp9_block.h"
+#include "vp9/encoder/vp9_treewriter.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define EOSB_TOKEN 127 // Not signalled, encoder only
+
+#if CONFIG_VP9_HIGHBITDEPTH
+typedef int32_t EXTRABIT;
+#else
+typedef int16_t EXTRABIT;
+#endif
+
+typedef struct {
+ int16_t token;
+ EXTRABIT extra;
+} TOKENVALUE;
+
+typedef struct {
+ const vpx_prob *context_tree;
+ int16_t token;
+ EXTRABIT extra;
+} TOKENEXTRA;
+
+extern const vpx_tree_index vp9_coef_tree[];
+extern const vpx_tree_index vp9_coef_con_tree[];
+extern const struct vp9_token vp9_coef_encodings[];
+
+int vp9_is_skippable_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
+int vp9_has_high_freq_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
+
+struct VP9_COMP;
+struct ThreadData;
+
+void vp9_tokenize_sb(struct VP9_COMP *cpi, struct ThreadData *td,
+ TOKENEXTRA **t, int dry_run, int seg_skip,
+ BLOCK_SIZE bsize);
+
+typedef struct {
+ const vpx_prob *prob;
+ int len;
+ int base_val;
+ const int16_t *cost;
+} vp9_extra_bit;
+
+// indexed by token value
+extern const vp9_extra_bit vp9_extra_bits[ENTROPY_TOKENS];
+#if CONFIG_VP9_HIGHBITDEPTH
+extern const vp9_extra_bit vp9_extra_bits_high10[ENTROPY_TOKENS];
+extern const vp9_extra_bit vp9_extra_bits_high12[ENTROPY_TOKENS];
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+extern const int16_t *vp9_dct_value_cost_ptr;
+/* TODO: The Token field should be broken out into a separate char array to
+ * improve cache locality, since it's needed for costing when the rest of the
+ * fields are not.
+ */
+extern const TOKENVALUE *vp9_dct_value_tokens_ptr;
+extern const TOKENVALUE *vp9_dct_cat_lt_10_value_tokens;
+extern const int *vp9_dct_cat_lt_10_value_cost;
+extern const int16_t vp9_cat6_low_cost[256];
+extern const uint16_t vp9_cat6_high_cost[64];
+extern const uint16_t vp9_cat6_high10_high_cost[256];
+extern const uint16_t vp9_cat6_high12_high_cost[1024];
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE const uint16_t *vp9_get_high_cost_table(int bit_depth) {
+ return bit_depth == 8 ? vp9_cat6_high_cost
+ : (bit_depth == 10 ? vp9_cat6_high10_high_cost
+ : vp9_cat6_high12_high_cost);
+}
+#else
+static INLINE const uint16_t *vp9_get_high_cost_table(int bit_depth) {
+ (void)bit_depth;
+ return vp9_cat6_high_cost;
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+static INLINE void vp9_get_token_extra(int v, int16_t *token, EXTRABIT *extra) {
+ if (v >= CAT6_MIN_VAL || v <= -CAT6_MIN_VAL) {
+ *token = CATEGORY6_TOKEN;
+ if (v >= CAT6_MIN_VAL)
+ *extra = 2 * v - 2 * CAT6_MIN_VAL;
+ else
+ *extra = -2 * v - 2 * CAT6_MIN_VAL + 1;
+ return;
+ }
+ *token = vp9_dct_cat_lt_10_value_tokens[v].token;
+ *extra = vp9_dct_cat_lt_10_value_tokens[v].extra;
+}
+static INLINE int16_t vp9_get_token(int v) {
+ if (v >= CAT6_MIN_VAL || v <= -CAT6_MIN_VAL) return 10;
+ return vp9_dct_cat_lt_10_value_tokens[v].token;
+}
+
+static INLINE int vp9_get_token_cost(int v, int16_t *token,
+ const uint16_t *cat6_high_table) {
+ if (v >= CAT6_MIN_VAL || v <= -CAT6_MIN_VAL) {
+ EXTRABIT extrabits;
+ *token = CATEGORY6_TOKEN;
+ extrabits = abs(v) - CAT6_MIN_VAL;
+ return vp9_cat6_low_cost[extrabits & 0xff] +
+ cat6_high_table[extrabits >> 8];
+ }
+ *token = vp9_dct_cat_lt_10_value_tokens[v].token;
+ return vp9_dct_cat_lt_10_value_cost[v];
+}
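+
+/* Editor's note -- worked example, not part of the upstream change: for a
+ * CATEGORY6 value the cost is split into two table lookups so the tables
+ * stay small. With v = 100, extrabits = 100 - CAT6_MIN_VAL = 33, so the
+ * returned cost is vp9_cat6_low_cost[33] + cat6_high_table[0]. The high
+ * table is indexed by extrabits >> 8, which is why it needs 64 entries for
+ * 8-bit input (14 extra bits), 256 for 10-bit (16 bits) and 1024 for
+ * 12-bit (18 bits), matching the declarations above. */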
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP9_ENCODER_VP9_TOKENIZE_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.c b/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.c
new file mode 100644
index 0000000000..9f4bafdf83
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.c
@@ -0,0 +1,1485 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+
+#include "./vpx_dsp_rtcd.h"
+#if CONFIG_NON_GREEDY_MV
+#include "vp9/common/vp9_mvref_common.h"
+#endif
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/common/vp9_reconintra.h"
+#include "vp9/common/vp9_scan.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_tpl_model.h"
+
+static void init_gop_frames(VP9_COMP *cpi, GF_PICTURE *gf_picture,
+ const GF_GROUP *gf_group, int *tpl_group_frames) {
+ VP9_COMMON *cm = &cpi->common;
+ int frame_idx = 0;
+ int i;
+ int gld_index = -1;
+ int alt_index = -1;
+ int lst_index = -1;
+ int arf_index_stack[MAX_ARF_LAYERS];
+ int arf_stack_size = 0;
+ int extend_frame_count = 0;
+ int pframe_qindex = cpi->tpl_stats[2].base_qindex;
+ int frame_gop_offset = 0;
+
+ RefCntBuffer *frame_bufs = cm->buffer_pool->frame_bufs;
+ int8_t recon_frame_index[REFS_PER_FRAME + MAX_ARF_LAYERS];
+
+ memset(recon_frame_index, -1, sizeof(recon_frame_index));
+ stack_init(arf_index_stack, MAX_ARF_LAYERS);
+
+ for (i = 0; i < FRAME_BUFFERS; ++i) {
+ if (frame_bufs[i].ref_count == 0) {
+ alloc_frame_mvs(cm, i);
+ if (vpx_realloc_frame_buffer(&frame_bufs[i].buf, cm->width, cm->height,
+ cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
+ VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+ NULL, NULL, NULL))
+ vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate frame buffer");
+
+ recon_frame_index[frame_idx] = i;
+ ++frame_idx;
+
+ if (frame_idx >= REFS_PER_FRAME + cpi->oxcf.enable_auto_arf) break;
+ }
+ }
+
+ for (i = 0; i < REFS_PER_FRAME + 1; ++i) {
+ assert(recon_frame_index[i] >= 0);
+ cpi->tpl_recon_frames[i] = &frame_bufs[recon_frame_index[i]].buf;
+ }
+
+ *tpl_group_frames = 0;
+
+ // Initialize Golden reference frame.
+ gf_picture[0].frame = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+ for (i = 0; i < 3; ++i) gf_picture[0].ref_frame[i] = -1;
+ gf_picture[0].update_type = gf_group->update_type[0];
+ gld_index = 0;
+ ++*tpl_group_frames;
+
+ // Initialize base layer ARF frame
+ gf_picture[1].frame = cpi->Source;
+ gf_picture[1].ref_frame[0] = gld_index;
+ gf_picture[1].ref_frame[1] = lst_index;
+ gf_picture[1].ref_frame[2] = alt_index;
+ gf_picture[1].update_type = gf_group->update_type[1];
+ alt_index = 1;
+ ++*tpl_group_frames;
+
+ // Initialize P frames
+ for (frame_idx = 2; frame_idx < MAX_ARF_GOP_SIZE; ++frame_idx) {
+ struct lookahead_entry *buf;
+ frame_gop_offset = gf_group->frame_gop_index[frame_idx];
+ buf = vp9_lookahead_peek(cpi->lookahead, frame_gop_offset - 1);
+
+ if (buf == NULL) break;
+
+ gf_picture[frame_idx].frame = &buf->img;
+ gf_picture[frame_idx].ref_frame[0] = gld_index;
+ gf_picture[frame_idx].ref_frame[1] = lst_index;
+ gf_picture[frame_idx].ref_frame[2] = alt_index;
+ gf_picture[frame_idx].update_type = gf_group->update_type[frame_idx];
+
+ switch (gf_group->update_type[frame_idx]) {
+ case ARF_UPDATE:
+ stack_push(arf_index_stack, alt_index, arf_stack_size);
+ ++arf_stack_size;
+ alt_index = frame_idx;
+ break;
+ case LF_UPDATE: lst_index = frame_idx; break;
+ case OVERLAY_UPDATE:
+ gld_index = frame_idx;
+ alt_index = stack_pop(arf_index_stack, arf_stack_size);
+ --arf_stack_size;
+ break;
+ case USE_BUF_FRAME:
+ lst_index = alt_index;
+ alt_index = stack_pop(arf_index_stack, arf_stack_size);
+ --arf_stack_size;
+ break;
+ default: break;
+ }
+
+ ++*tpl_group_frames;
+
+ // The length of the group of pictures is baseline_gf_interval, plus the
+ // golden frame carried over from the last GOP, plus the overlay frame
+ // that closes this GOP.
+ if (frame_idx == gf_group->gf_group_size) break;
+ }
+
+ alt_index = -1;
+ ++frame_idx;
+ ++frame_gop_offset;
+
+ // Extend two frames outside the current gf group.
+ for (; frame_idx < MAX_LAG_BUFFERS && extend_frame_count < 2; ++frame_idx) {
+ struct lookahead_entry *buf =
+ vp9_lookahead_peek(cpi->lookahead, frame_gop_offset - 1);
+
+ if (buf == NULL) break;
+
+ cpi->tpl_stats[frame_idx].base_qindex = pframe_qindex;
+
+ gf_picture[frame_idx].frame = &buf->img;
+ gf_picture[frame_idx].ref_frame[0] = gld_index;
+ gf_picture[frame_idx].ref_frame[1] = lst_index;
+ gf_picture[frame_idx].ref_frame[2] = alt_index;
+ gf_picture[frame_idx].update_type = LF_UPDATE;
+ lst_index = frame_idx;
+ ++*tpl_group_frames;
+ ++extend_frame_count;
+ ++frame_gop_offset;
+ }
+}
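+
+/* Editor's note (illustrative summary, not part of the upstream change):
+ * after init_gop_frames() the gf_picture[] list holds the golden frame from
+ * the previous GOP at index 0, the base-layer ARF at index 1, and then the
+ * remaining frames of the group in coding order, each with ref_frame[]
+ * pointing at the current {golden, last, altref} indices within the list.
+ * Up to two extra LF_UPDATE frames past the group are appended so motion
+ * flow can be propagated a little beyond the GOP boundary. */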
+
+static void init_tpl_stats(VP9_COMP *cpi) {
+ int frame_idx;
+ for (frame_idx = 0; frame_idx < MAX_ARF_GOP_SIZE; ++frame_idx) {
+ TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx];
+ memset(tpl_frame->tpl_stats_ptr, 0,
+ tpl_frame->height * tpl_frame->width *
+ sizeof(*tpl_frame->tpl_stats_ptr));
+ tpl_frame->is_valid = 0;
+ }
+}
+
+static void free_tpl_frame_stats_list(VpxTplGopStats *tpl_gop_stats) {
+ int frame_idx;
+ for (frame_idx = 0; frame_idx < tpl_gop_stats->size; ++frame_idx) {
+ vpx_free(tpl_gop_stats->frame_stats_list[frame_idx].block_stats_list);
+ }
+ vpx_free(tpl_gop_stats->frame_stats_list);
+}
+
+static void init_tpl_stats_before_propagation(
+ struct vpx_internal_error_info *error_info, VpxTplGopStats *tpl_gop_stats,
+ TplDepFrame *tpl_stats, int tpl_gop_frames) {
+ int frame_idx;
+ free_tpl_frame_stats_list(tpl_gop_stats);
+ CHECK_MEM_ERROR(
+ error_info, tpl_gop_stats->frame_stats_list,
+ vpx_calloc(tpl_gop_frames, sizeof(*tpl_gop_stats->frame_stats_list)));
+ tpl_gop_stats->size = tpl_gop_frames;
+ for (frame_idx = 0; frame_idx < tpl_gop_frames; ++frame_idx) {
+ const int mi_rows = tpl_stats[frame_idx].height;
+ const int mi_cols = tpl_stats[frame_idx].width;
+ CHECK_MEM_ERROR(
+ error_info, tpl_gop_stats->frame_stats_list[frame_idx].block_stats_list,
+ vpx_calloc(
+ mi_rows * mi_cols,
+ sizeof(
+ *tpl_gop_stats->frame_stats_list[frame_idx].block_stats_list)));
+ tpl_gop_stats->frame_stats_list[frame_idx].num_blocks = mi_rows * mi_cols;
+ }
+}
+
+#if CONFIG_NON_GREEDY_MV
+static uint32_t full_pixel_motion_search(VP9_COMP *cpi, ThreadData *td,
+ MotionField *motion_field,
+ int frame_idx, uint8_t *cur_frame_buf,
+ uint8_t *ref_frame_buf, int stride,
+ BLOCK_SIZE bsize, int mi_row,
+ int mi_col, MV *mv) {
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
+ int step_param;
+ uint32_t bestsme = UINT_MAX;
+ const MvLimits tmp_mv_limits = x->mv_limits;
+ // lambda is used to adjust the importance of motion vector consistency.
+ // TODO(angiebird): Figure out lambda's proper value.
+ const int lambda = cpi->tpl_stats[frame_idx].lambda;
+ int_mv nb_full_mvs[NB_MVS_NUM];
+ int nb_full_mv_num;
+
+ MV best_ref_mv1 = { 0, 0 };
+ MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */
+
+ best_ref_mv1_full.col = best_ref_mv1.col >> 3;
+ best_ref_mv1_full.row = best_ref_mv1.row >> 3;
+
+ // Setup frame pointers
+ x->plane[0].src.buf = cur_frame_buf;
+ x->plane[0].src.stride = stride;
+ xd->plane[0].pre[0].buf = ref_frame_buf;
+ xd->plane[0].pre[0].stride = stride;
+
+ step_param = mv_sf->reduce_first_step_size;
+ step_param = VPXMIN(step_param, MAX_MVSEARCH_STEPS - 2);
+
+ vp9_set_mv_search_range(&x->mv_limits, &best_ref_mv1);
+
+ nb_full_mv_num =
+ vp9_prepare_nb_full_mvs(motion_field, mi_row, mi_col, nb_full_mvs);
+ vp9_full_pixel_diamond_new(cpi, x, bsize, &best_ref_mv1_full, step_param,
+ lambda, 1, nb_full_mvs, nb_full_mv_num, mv);
+
+ /* restore UMV window */
+ x->mv_limits = tmp_mv_limits;
+
+ return bestsme;
+}
+
+static uint32_t sub_pixel_motion_search(VP9_COMP *cpi, ThreadData *td,
+ uint8_t *cur_frame_buf,
+ uint8_t *ref_frame_buf, int stride,
+ BLOCK_SIZE bsize, MV *mv) {
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
+ uint32_t bestsme = UINT_MAX;
+ uint32_t distortion;
+ uint32_t sse;
+ int cost_list[5];
+
+ MV best_ref_mv1 = { 0, 0 };
+
+ // Setup frame pointers
+ x->plane[0].src.buf = cur_frame_buf;
+ x->plane[0].src.stride = stride;
+ xd->plane[0].pre[0].buf = ref_frame_buf;
+ xd->plane[0].pre[0].stride = stride;
+
+ // TODO(yunqing): may use higher tap interp filter than 2 taps.
+ // Ignore mv costing by sending NULL pointer instead of cost array
+ bestsme = cpi->find_fractional_mv_step(
+ x, mv, &best_ref_mv1, cpi->common.allow_high_precision_mv, x->errorperbit,
+ &cpi->fn_ptr[bsize], 0, mv_sf->subpel_search_level,
+ cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0, 0,
+ USE_2_TAPS);
+
+ return bestsme;
+}
+
+#else // CONFIG_NON_GREEDY_MV
+static uint32_t motion_compensated_prediction(VP9_COMP *cpi, ThreadData *td,
+ uint8_t *cur_frame_buf,
+ uint8_t *ref_frame_buf,
+ int stride, BLOCK_SIZE bsize,
+ MV *mv) {
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
+ const SEARCH_METHODS search_method = NSTEP;
+ int step_param;
+ int sadpb = x->sadperbit16;
+ uint32_t bestsme = UINT_MAX;
+ uint32_t distortion;
+ uint32_t sse;
+ int cost_list[5];
+ const MvLimits tmp_mv_limits = x->mv_limits;
+
+ MV best_ref_mv1 = { 0, 0 };
+ MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */
+
+ best_ref_mv1_full.col = best_ref_mv1.col >> 3;
+ best_ref_mv1_full.row = best_ref_mv1.row >> 3;
+
+ // Setup frame pointers
+ x->plane[0].src.buf = cur_frame_buf;
+ x->plane[0].src.stride = stride;
+ xd->plane[0].pre[0].buf = ref_frame_buf;
+ xd->plane[0].pre[0].stride = stride;
+
+ step_param = mv_sf->reduce_first_step_size;
+ step_param = VPXMIN(step_param, MAX_MVSEARCH_STEPS - 2);
+
+ vp9_set_mv_search_range(&x->mv_limits, &best_ref_mv1);
+
+ vp9_full_pixel_search(cpi, x, bsize, &best_ref_mv1_full, step_param,
+ search_method, sadpb, cond_cost_list(cpi, cost_list),
+ &best_ref_mv1, mv, 0, 0);
+
+ /* restore UMV window */
+ x->mv_limits = tmp_mv_limits;
+
+ // TODO(yunqing): may use higher tap interp filter than 2 taps.
+ // Ignore mv costing by sending NULL pointer instead of cost array
+ bestsme = cpi->find_fractional_mv_step(
+ x, mv, &best_ref_mv1, cpi->common.allow_high_precision_mv, x->errorperbit,
+ &cpi->fn_ptr[bsize], 0, mv_sf->subpel_search_level,
+ cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0, 0,
+ USE_2_TAPS);
+
+ return bestsme;
+}
+#endif
+
+static int get_overlap_area(int grid_pos_row, int grid_pos_col, int ref_pos_row,
+ int ref_pos_col, int block, BLOCK_SIZE bsize) {
+ int width = 0, height = 0;
+ int bw = 4 << b_width_log2_lookup[bsize];
+ int bh = 4 << b_height_log2_lookup[bsize];
+
+ switch (block) {
+ case 0:
+ width = grid_pos_col + bw - ref_pos_col;
+ height = grid_pos_row + bh - ref_pos_row;
+ break;
+ case 1:
+ width = ref_pos_col + bw - grid_pos_col;
+ height = grid_pos_row + bh - ref_pos_row;
+ break;
+ case 2:
+ width = grid_pos_col + bw - ref_pos_col;
+ height = ref_pos_row + bh - grid_pos_row;
+ break;
+ case 3:
+ width = ref_pos_col + bw - grid_pos_col;
+ height = ref_pos_row + bh - grid_pos_row;
+ break;
+ default: assert(0);
+ }
+
+ return width * height;
+}
+
+static int round_floor(int ref_pos, int bsize_pix) {
+ int round;
+ if (ref_pos < 0)
+ round = -(1 + (-ref_pos - 1) / bsize_pix);
+ else
+ round = ref_pos / bsize_pix;
+
+ return round;
+}
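+
+/* Editor's note -- worked example, not part of the upstream change: a
+ * motion-compensated block straddles at most four blocks of the reference
+ * frame's aligned grid, and get_overlap_area() returns the pixel overlap
+ * with each of them (block 0..3 = top-left, top-right, bottom-left,
+ * bottom-right neighbor). With bw = bh = 16 and ref_pos = (row 5, col 9),
+ * the grid base is (0, 0) and the four overlaps are 11x7, 11x9, 5x7 and
+ * 5x9 pixels, i.e. 77 + 99 + 35 + 45 = 256 = bw * bh, so the propagated
+ * contribution is split exactly across the four destination blocks. */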
+
+static void tpl_model_store(TplDepStats *tpl_stats, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, int stride) {
+ const int mi_height = num_8x8_blocks_high_lookup[bsize];
+ const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+ const TplDepStats *src_stats = &tpl_stats[mi_row * stride + mi_col];
+ int idx, idy;
+
+ for (idy = 0; idy < mi_height; ++idy) {
+ for (idx = 0; idx < mi_width; ++idx) {
+ TplDepStats *tpl_ptr = &tpl_stats[(mi_row + idy) * stride + mi_col + idx];
+ const int64_t mc_flow = tpl_ptr->mc_flow;
+ const int64_t mc_ref_cost = tpl_ptr->mc_ref_cost;
+ *tpl_ptr = *src_stats;
+ tpl_ptr->mc_flow = mc_flow;
+ tpl_ptr->mc_ref_cost = mc_ref_cost;
+ tpl_ptr->mc_dep_cost = tpl_ptr->intra_cost + tpl_ptr->mc_flow;
+ }
+ }
+}
+
+static void tpl_store_before_propagation(VpxTplBlockStats *tpl_block_stats,
+ TplDepStats *tpl_stats, int mi_row,
+ int mi_col, BLOCK_SIZE bsize,
+ int stride, int64_t recon_error,
+ int64_t rate_cost) {
+ const int mi_height = num_8x8_blocks_high_lookup[bsize];
+ const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+ const TplDepStats *src_stats = &tpl_stats[mi_row * stride + mi_col];
+ int idx, idy;
+
+ for (idy = 0; idy < mi_height; ++idy) {
+ for (idx = 0; idx < mi_width; ++idx) {
+ VpxTplBlockStats *tpl_block_stats_ptr =
+ &tpl_block_stats[(mi_row + idy) * stride + mi_col + idx];
+ tpl_block_stats_ptr->inter_cost = src_stats->inter_cost;
+ tpl_block_stats_ptr->intra_cost = src_stats->intra_cost;
+ tpl_block_stats_ptr->recrf_dist = recon_error << TPL_DEP_COST_SCALE_LOG2;
+ tpl_block_stats_ptr->recrf_rate = rate_cost << TPL_DEP_COST_SCALE_LOG2;
+ tpl_block_stats_ptr->mv_r = src_stats->mv.as_mv.row;
+ tpl_block_stats_ptr->mv_c = src_stats->mv.as_mv.col;
+ tpl_block_stats_ptr->ref_frame_index = src_stats->ref_frame_index;
+ }
+ }
+}
+
+static void tpl_model_update_b(TplDepFrame *tpl_frame, TplDepStats *tpl_stats,
+ int mi_row, int mi_col, const BLOCK_SIZE bsize) {
+ TplDepFrame *ref_tpl_frame = &tpl_frame[tpl_stats->ref_frame_index];
+ TplDepStats *ref_stats = ref_tpl_frame->tpl_stats_ptr;
+ MV mv = tpl_stats->mv.as_mv;
+ int mv_row = mv.row >> 3;
+ int mv_col = mv.col >> 3;
+
+ int ref_pos_row = mi_row * MI_SIZE + mv_row;
+ int ref_pos_col = mi_col * MI_SIZE + mv_col;
+
+ const int bw = 4 << b_width_log2_lookup[bsize];
+ const int bh = 4 << b_height_log2_lookup[bsize];
+ const int mi_height = num_8x8_blocks_high_lookup[bsize];
+ const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+ const int pix_num = bw * bh;
+
+ // top-left on grid block location in pixel
+ int grid_pos_row_base = round_floor(ref_pos_row, bh) * bh;
+ int grid_pos_col_base = round_floor(ref_pos_col, bw) * bw;
+ int block;
+
+ for (block = 0; block < 4; ++block) {
+ int grid_pos_row = grid_pos_row_base + bh * (block >> 1);
+ int grid_pos_col = grid_pos_col_base + bw * (block & 0x01);
+
+ if (grid_pos_row >= 0 && grid_pos_row < ref_tpl_frame->mi_rows * MI_SIZE &&
+ grid_pos_col >= 0 && grid_pos_col < ref_tpl_frame->mi_cols * MI_SIZE) {
+ int overlap_area = get_overlap_area(
+ grid_pos_row, grid_pos_col, ref_pos_row, ref_pos_col, block, bsize);
+ int ref_mi_row = round_floor(grid_pos_row, bh) * mi_height;
+ int ref_mi_col = round_floor(grid_pos_col, bw) * mi_width;
+
+ int64_t mc_flow = tpl_stats->mc_dep_cost -
+ (tpl_stats->mc_dep_cost * tpl_stats->inter_cost) /
+ tpl_stats->intra_cost;
+
+ int idx, idy;
+
+ for (idy = 0; idy < mi_height; ++idy) {
+ for (idx = 0; idx < mi_width; ++idx) {
+ TplDepStats *des_stats =
+ &ref_stats[(ref_mi_row + idy) * ref_tpl_frame->stride +
+ (ref_mi_col + idx)];
+
+ des_stats->mc_flow += (mc_flow * overlap_area) / pix_num;
+ des_stats->mc_ref_cost +=
+ ((tpl_stats->intra_cost - tpl_stats->inter_cost) * overlap_area) /
+ pix_num;
+ assert(overlap_area >= 0);
+ }
+ }
+ }
+ }
+}
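+
+/* Editor's note -- worked example, not part of the upstream change: the
+ * quantity propagated back to the reference frame is
+ *   mc_flow = mc_dep_cost * (1 - inter_cost / intra_cost)
+ * i.e. the part of this block's dependency cost that motion compensation
+ * actually explains. With mc_dep_cost = 1000, inter_cost = 200 and
+ * intra_cost = 800, mc_flow = 1000 - (1000 * 200) / 800 = 750. A reference
+ * block overlapped by 64 of the 256 prediction pixels then receives
+ * 750 * 64 / 256 = 187 added to its mc_flow, and (800 - 200) * 64 / 256
+ * = 150 added to its mc_ref_cost. */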
+
+static void tpl_model_update(TplDepFrame *tpl_frame, TplDepStats *tpl_stats,
+ int mi_row, int mi_col, const BLOCK_SIZE bsize) {
+ int idx, idy;
+ const int mi_height = num_8x8_blocks_high_lookup[bsize];
+ const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+
+ for (idy = 0; idy < mi_height; ++idy) {
+ for (idx = 0; idx < mi_width; ++idx) {
+ TplDepStats *tpl_ptr =
+ &tpl_stats[(mi_row + idy) * tpl_frame->stride + (mi_col + idx)];
+ tpl_model_update_b(tpl_frame, tpl_ptr, mi_row + idy, mi_col + idx,
+ BLOCK_8X8);
+ }
+ }
+}
+
+static void get_quantize_error(MACROBLOCK *x, int plane, tran_low_t *coeff,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff,
+ TX_SIZE tx_size, int64_t *recon_error,
+ int64_t *sse, uint16_t *eob) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const ScanOrder *const scan_order = &vp9_default_scan_orders[tx_size];
+ int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]];
+ const int shift = tx_size == TX_32X32 ? 0 : 2;
+
+ // skip block condition should be handled before this is called.
+ assert(!x->skip_block);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ vp9_highbd_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp,
+ qcoeff, dqcoeff, pd->dequant, eob,
+ scan_order->scan, scan_order->iscan);
+ } else {
+ vp9_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp, qcoeff,
+ dqcoeff, pd->dequant, eob, scan_order->scan,
+ scan_order->iscan);
+ }
+#else
+ vp9_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp, qcoeff,
+ dqcoeff, pd->dequant, eob, scan_order->scan,
+ scan_order->iscan);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ *recon_error = vp9_block_error(coeff, dqcoeff, pix_num, sse) >> shift;
+ *recon_error = VPXMAX(*recon_error, 1);
+
+ *sse = (*sse) >> shift;
+ *sse = VPXMAX(*sse, 1);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff,
+ TX_SIZE tx_size) {
+ // TODO(sdeng): Implement SIMD based high bit-depth Hadamard transforms.
+ switch (tx_size) {
+ case TX_8X8: vpx_highbd_hadamard_8x8(src_diff, bw, coeff); break;
+ case TX_16X16: vpx_highbd_hadamard_16x16(src_diff, bw, coeff); break;
+ case TX_32X32: vpx_highbd_hadamard_32x32(src_diff, bw, coeff); break;
+ default: assert(0);
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+void vp9_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff,
+ TX_SIZE tx_size) {
+ switch (tx_size) {
+ case TX_8X8: vpx_hadamard_8x8(src_diff, bw, coeff); break;
+ case TX_16X16: vpx_hadamard_16x16(src_diff, bw, coeff); break;
+ case TX_32X32: vpx_hadamard_32x32(src_diff, bw, coeff); break;
+ default: assert(0);
+ }
+}
+
+static void set_mv_limits(const VP9_COMMON *cm, MACROBLOCK *x, int mi_row,
+ int mi_col) {
+ x->mv_limits.row_min = -((mi_row * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND));
+ x->mv_limits.row_max =
+ (cm->mi_rows - 1 - mi_row) * MI_SIZE + (17 - 2 * VP9_INTERP_EXTEND);
+ x->mv_limits.col_min = -((mi_col * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND));
+ x->mv_limits.col_max =
+ ((cm->mi_cols - 1 - mi_col) * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND);
+}
+
+static int rate_estimator(const tran_low_t *qcoeff, int eob, TX_SIZE tx_size) {
+ const ScanOrder *const scan_order = &vp9_scan_orders[tx_size][DCT_DCT];
+ int rate_cost = 1;
+ int idx;
+ assert((1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]) >= eob);
+ for (idx = 0; idx < eob; ++idx) {
+ unsigned int abs_level = abs(qcoeff[scan_order->scan[idx]]);
+ rate_cost += get_msb(abs_level + 1) + 1 + (abs_level > 0);
+ }
+
+ return (rate_cost << VP9_PROB_COST_SHIFT);
+}
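+
+/* Editor's note -- worked example, not part of the upstream change:
+ * rate_estimator() is a cheap bit-count proxy, not a real entropy coder.
+ * Each of the first eob coefficients in scan order contributes
+ * get_msb(|q| + 1) + 1 + (|q| > 0) "bits". For scanned values {3, 0, -1}
+ * with eob = 3 this gives 1 + (2+1+1) + (0+1+0) + (1+1+1) = 9, which is
+ * then scaled into the probability-cost domain by VP9_PROB_COST_SHIFT. */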
+
+static void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
+ struct scale_factors *sf, GF_PICTURE *gf_picture,
+ int frame_idx, TplDepFrame *tpl_frame,
+ int16_t *src_diff, tran_low_t *coeff,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff, int mi_row,
+ int mi_col, BLOCK_SIZE bsize, TX_SIZE tx_size,
+ YV12_BUFFER_CONFIG *ref_frame[], uint8_t *predictor,
+ int64_t *recon_error, int64_t *rate_cost,
+ int64_t *sse) {
+ VP9_COMMON *cm = &cpi->common;
+ ThreadData *td = &cpi->td;
+
+ const int bw = 4 << b_width_log2_lookup[bsize];
+ const int bh = 4 << b_height_log2_lookup[bsize];
+ const int pix_num = bw * bh;
+ int best_rf_idx = -1;
+ int_mv best_mv;
+ int64_t best_inter_cost = INT64_MAX;
+ int64_t inter_cost;
+ int rf_idx;
+ const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP];
+
+ int64_t best_intra_cost = INT64_MAX;
+ int64_t intra_cost;
+ PREDICTION_MODE mode;
+ int mb_y_offset = mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE;
+ MODE_INFO mi_above, mi_left;
+ const int mi_height = num_8x8_blocks_high_lookup[bsize];
+ const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+ TplDepStats *tpl_stats =
+ &tpl_frame->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col];
+
+ xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8);
+ xd->mb_to_bottom_edge = ((cm->mi_rows - 1 - mi_row) * MI_SIZE) * 8;
+ xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8);
+ xd->mb_to_right_edge = ((cm->mi_cols - 1 - mi_col) * MI_SIZE) * 8;
+ xd->above_mi = (mi_row > 0) ? &mi_above : NULL;
+ xd->left_mi = (mi_col > 0) ? &mi_left : NULL;
+
+ // Intra prediction search
+ for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
+ uint8_t *src, *dst;
+ int src_stride, dst_stride;
+
+ src = xd->cur_buf->y_buffer + mb_y_offset;
+ src_stride = xd->cur_buf->y_stride;
+
+ dst = &predictor[0];
+ dst_stride = bw;
+
+ xd->mi[0]->sb_type = bsize;
+ xd->mi[0]->ref_frame[0] = INTRA_FRAME;
+
+ vp9_predict_intra_block(xd, b_width_log2_lookup[bsize], tx_size, mode, src,
+ src_stride, dst, dst_stride, 0, 0, 0);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ vpx_highbd_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst,
+ dst_stride, xd->bd);
+ vp9_highbd_wht_fwd_txfm(src_diff, bw, coeff, tx_size);
+ intra_cost = vpx_highbd_satd(coeff, pix_num);
+ } else {
+ vpx_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst,
+ dst_stride);
+ vp9_wht_fwd_txfm(src_diff, bw, coeff, tx_size);
+ intra_cost = vpx_satd(coeff, pix_num);
+ }
+#else
+ vpx_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst, dst_stride);
+ vp9_wht_fwd_txfm(src_diff, bw, coeff, tx_size);
+ intra_cost = vpx_satd(coeff, pix_num);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ if (intra_cost < best_intra_cost) best_intra_cost = intra_cost;
+ }
+
+ // Motion compensated prediction
+ best_mv.as_int = 0;
+
+ set_mv_limits(cm, x, mi_row, mi_col);
+
+ for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) {
+ int_mv mv;
+#if CONFIG_NON_GREEDY_MV
+ MotionField *motion_field;
+#endif
+ if (ref_frame[rf_idx] == NULL) continue;
+
+#if CONFIG_NON_GREEDY_MV
+ (void)td;
+ motion_field = vp9_motion_field_info_get_motion_field(
+ &cpi->motion_field_info, frame_idx, rf_idx, bsize);
+ mv = vp9_motion_field_mi_get_mv(motion_field, mi_row, mi_col);
+#else
+ motion_compensated_prediction(cpi, td, xd->cur_buf->y_buffer + mb_y_offset,
+ ref_frame[rf_idx]->y_buffer + mb_y_offset,
+ xd->cur_buf->y_stride, bsize, &mv.as_mv);
+#endif
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ vp9_highbd_build_inter_predictor(
+ CONVERT_TO_SHORTPTR(ref_frame[rf_idx]->y_buffer + mb_y_offset),
+ ref_frame[rf_idx]->y_stride, CONVERT_TO_SHORTPTR(&predictor[0]), bw,
+ &mv.as_mv, sf, bw, bh, 0, kernel, MV_PRECISION_Q3, mi_col * MI_SIZE,
+ mi_row * MI_SIZE, xd->bd);
+ vpx_highbd_subtract_block(
+ bh, bw, src_diff, bw, xd->cur_buf->y_buffer + mb_y_offset,
+ xd->cur_buf->y_stride, &predictor[0], bw, xd->bd);
+ vp9_highbd_wht_fwd_txfm(src_diff, bw, coeff, tx_size);
+ inter_cost = vpx_highbd_satd(coeff, pix_num);
+ } else {
+ vp9_build_inter_predictor(
+ ref_frame[rf_idx]->y_buffer + mb_y_offset,
+ ref_frame[rf_idx]->y_stride, &predictor[0], bw, &mv.as_mv, sf, bw, bh,
+ 0, kernel, MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE);
+ vpx_subtract_block(bh, bw, src_diff, bw,
+ xd->cur_buf->y_buffer + mb_y_offset,
+ xd->cur_buf->y_stride, &predictor[0], bw);
+ vp9_wht_fwd_txfm(src_diff, bw, coeff, tx_size);
+ inter_cost = vpx_satd(coeff, pix_num);
+ }
+#else
+ vp9_build_inter_predictor(ref_frame[rf_idx]->y_buffer + mb_y_offset,
+ ref_frame[rf_idx]->y_stride, &predictor[0], bw,
+ &mv.as_mv, sf, bw, bh, 0, kernel, MV_PRECISION_Q3,
+ mi_col * MI_SIZE, mi_row * MI_SIZE);
+ vpx_subtract_block(bh, bw, src_diff, bw,
+ xd->cur_buf->y_buffer + mb_y_offset,
+ xd->cur_buf->y_stride, &predictor[0], bw);
+ vp9_wht_fwd_txfm(src_diff, bw, coeff, tx_size);
+ inter_cost = vpx_satd(coeff, pix_num);
+#endif
+
+ if (inter_cost < best_inter_cost) {
+ uint16_t eob = 0;
+ best_rf_idx = rf_idx;
+ best_inter_cost = inter_cost;
+ best_mv.as_int = mv.as_int;
+ // Since best_inter_cost is initialized as INT64_MAX, recon_error and
+ // rate_cost will be calculated with the best reference frame.
+ get_quantize_error(x, 0, coeff, qcoeff, dqcoeff, tx_size, recon_error,
+ sse, &eob);
+ *rate_cost = rate_estimator(qcoeff, eob, tx_size);
+ }
+ }
+ best_intra_cost = VPXMAX(best_intra_cost, 1);
+ best_inter_cost = VPXMIN(best_intra_cost, best_inter_cost);
+ tpl_stats->inter_cost = VPXMAX(
+ 1, (best_inter_cost << TPL_DEP_COST_SCALE_LOG2) / (mi_height * mi_width));
+ tpl_stats->intra_cost = VPXMAX(
+ 1, (best_intra_cost << TPL_DEP_COST_SCALE_LOG2) / (mi_height * mi_width));
+ tpl_stats->ref_frame_index = gf_picture[frame_idx].ref_frame[best_rf_idx];
+ tpl_stats->mv.as_int = best_mv.as_int;
+}
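+
+/* Editor's note (illustrative summary, not part of the upstream change):
+ * mode_estimation() scores a block with the SATD of its Hadamard-transformed
+ * residual: best_intra_cost is the minimum over the DC_PRED..TM_PRED intra
+ * modes and best_inter_cost the minimum over the available reference frames
+ * after motion search. The inter cost is clamped so it never exceeds the
+ * intra cost, and both are normalized per 8x8 unit (with
+ * TPL_DEP_COST_SCALE_LOG2) before being written to tpl_stats along with the
+ * winning reference index and motion vector; the reconstruction error and
+ * estimated rate of the best inter candidate are returned through the
+ * output pointers for the pre-propagation stats. */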
+
+#if CONFIG_NON_GREEDY_MV
+static int get_block_src_pred_buf(MACROBLOCKD *xd, GF_PICTURE *gf_picture,
+ int frame_idx, int rf_idx, int mi_row,
+ int mi_col, struct buf_2d *src,
+ struct buf_2d *pre) {
+ const int mb_y_offset =
+ mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE;
+ YV12_BUFFER_CONFIG *ref_frame = NULL;
+ int ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx];
+ if (ref_frame_idx != -1) {
+ ref_frame = gf_picture[ref_frame_idx].frame;
+ src->buf = xd->cur_buf->y_buffer + mb_y_offset;
+ src->stride = xd->cur_buf->y_stride;
+ pre->buf = ref_frame->y_buffer + mb_y_offset;
+ pre->stride = ref_frame->y_stride;
+ assert(src->stride == pre->stride);
+ return 1;
+ } else {
+ printf("invalid ref_frame_idx");
+ assert(ref_frame_idx != -1);
+ return 0;
+ }
+}
+
+#define kMvPreCheckLines 5
+#define kMvPreCheckSize 15
+
+#define MV_REF_POS_NUM 3
+POSITION mv_ref_pos[MV_REF_POS_NUM] = {
+ { -1, 0 },
+ { 0, -1 },
+ { -1, -1 },
+};
+
+static int_mv *get_select_mv(VP9_COMP *cpi, TplDepFrame *tpl_frame, int mi_row,
+ int mi_col) {
+ return &cpi->select_mv_arr[mi_row * tpl_frame->stride + mi_col];
+}
+
+static int_mv find_ref_mv(int mv_mode, VP9_COMP *cpi, TplDepFrame *tpl_frame,
+ BLOCK_SIZE bsize, int mi_row, int mi_col) {
+ int i;
+ const int mi_height = num_8x8_blocks_high_lookup[bsize];
+ const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+ int_mv nearest_mv, near_mv, invalid_mv;
+ nearest_mv.as_int = INVALID_MV;
+ near_mv.as_int = INVALID_MV;
+ invalid_mv.as_int = INVALID_MV;
+ for (i = 0; i < MV_REF_POS_NUM; ++i) {
+ int nb_row = mi_row + mv_ref_pos[i].row * mi_height;
+ int nb_col = mi_col + mv_ref_pos[i].col * mi_width;
+ assert(mv_ref_pos[i].row <= 0);
+ assert(mv_ref_pos[i].col <= 0);
+ if (nb_row >= 0 && nb_col >= 0) {
+ if (nearest_mv.as_int == INVALID_MV) {
+ nearest_mv = *get_select_mv(cpi, tpl_frame, nb_row, nb_col);
+ } else {
+ int_mv mv = *get_select_mv(cpi, tpl_frame, nb_row, nb_col);
+ if (mv.as_int == nearest_mv.as_int) {
+ continue;
+ } else {
+ near_mv = mv;
+ break;
+ }
+ }
+ }
+ }
+ if (nearest_mv.as_int == INVALID_MV) {
+ nearest_mv.as_mv.row = 0;
+ nearest_mv.as_mv.col = 0;
+ }
+ if (near_mv.as_int == INVALID_MV) {
+ near_mv.as_mv.row = 0;
+ near_mv.as_mv.col = 0;
+ }
+ if (mv_mode == NEAREST_MV_MODE) {
+ return nearest_mv;
+ }
+ if (mv_mode == NEAR_MV_MODE) {
+ return near_mv;
+ }
+ assert(0);
+ return invalid_mv;
+}
+
+static int_mv get_mv_from_mv_mode(int mv_mode, VP9_COMP *cpi,
+ MotionField *motion_field,
+ TplDepFrame *tpl_frame, BLOCK_SIZE bsize,
+ int mi_row, int mi_col) {
+ int_mv mv;
+ switch (mv_mode) {
+ case ZERO_MV_MODE:
+ mv.as_mv.row = 0;
+ mv.as_mv.col = 0;
+ break;
+ case NEW_MV_MODE:
+ mv = vp9_motion_field_mi_get_mv(motion_field, mi_row, mi_col);
+ break;
+ case NEAREST_MV_MODE:
+ mv = find_ref_mv(mv_mode, cpi, tpl_frame, bsize, mi_row, mi_col);
+ break;
+ case NEAR_MV_MODE:
+ mv = find_ref_mv(mv_mode, cpi, tpl_frame, bsize, mi_row, mi_col);
+ break;
+ default:
+ mv.as_int = INVALID_MV;
+ assert(0);
+ break;
+ }
+ return mv;
+}
+
+static double get_mv_dist(int mv_mode, VP9_COMP *cpi, MACROBLOCKD *xd,
+ GF_PICTURE *gf_picture, MotionField *motion_field,
+ int frame_idx, TplDepFrame *tpl_frame, int rf_idx,
+ BLOCK_SIZE bsize, int mi_row, int mi_col,
+ int_mv *mv) {
+ uint32_t sse;
+ struct buf_2d src;
+ struct buf_2d pre;
+ MV full_mv;
+ *mv = get_mv_from_mv_mode(mv_mode, cpi, motion_field, tpl_frame, bsize,
+ mi_row, mi_col);
+ full_mv = get_full_mv(&mv->as_mv);
+ if (get_block_src_pred_buf(xd, gf_picture, frame_idx, rf_idx, mi_row, mi_col,
+ &src, &pre)) {
+ // TODO(angiebird): Consider subpixel when computing the sse.
+ cpi->fn_ptr[bsize].vf(src.buf, src.stride, get_buf_from_mv(&pre, &full_mv),
+ pre.stride, &sse);
+ return (double)(sse << VP9_DIST_SCALE_LOG2);
+ } else {
+ assert(0);
+ return 0;
+ }
+}
+
+static int get_mv_mode_cost(int mv_mode) {
+ // TODO(angiebird): The probabilities are roughly inferred from
+ // default_inter_mode_probs. Check if there is a better way to set the
+ // probabilities.
+ const int zero_mv_prob = 16;
+ const int new_mv_prob = 24 * 1;
+ const int ref_mv_prob = 256 - zero_mv_prob - new_mv_prob;
+ assert(zero_mv_prob + new_mv_prob + ref_mv_prob == 256);
+ switch (mv_mode) {
+ case ZERO_MV_MODE: return vp9_prob_cost[zero_mv_prob]; break;
+ case NEW_MV_MODE: return vp9_prob_cost[new_mv_prob]; break;
+ case NEAREST_MV_MODE: return vp9_prob_cost[ref_mv_prob]; break;
+ case NEAR_MV_MODE: return vp9_prob_cost[ref_mv_prob]; break;
+ default: assert(0); return -1;
+ }
+}
+
+static INLINE double get_mv_diff_cost(MV *new_mv, MV *ref_mv) {
+ double mv_diff_cost = log2(1 + abs(new_mv->row - ref_mv->row)) +
+ log2(1 + abs(new_mv->col - ref_mv->col));
+ mv_diff_cost *= (1 << VP9_PROB_COST_SHIFT);
+ return mv_diff_cost;
+}
+static double get_mv_cost(int mv_mode, VP9_COMP *cpi, MotionField *motion_field,
+ TplDepFrame *tpl_frame, BLOCK_SIZE bsize, int mi_row,
+ int mi_col) {
+ double mv_cost = get_mv_mode_cost(mv_mode);
+ if (mv_mode == NEW_MV_MODE) {
+ MV new_mv = get_mv_from_mv_mode(mv_mode, cpi, motion_field, tpl_frame,
+ bsize, mi_row, mi_col)
+ .as_mv;
+ MV nearest_mv = get_mv_from_mv_mode(NEAREST_MV_MODE, cpi, motion_field,
+ tpl_frame, bsize, mi_row, mi_col)
+ .as_mv;
+ MV near_mv = get_mv_from_mv_mode(NEAR_MV_MODE, cpi, motion_field, tpl_frame,
+ bsize, mi_row, mi_col)
+ .as_mv;
+ double nearest_cost = get_mv_diff_cost(&new_mv, &nearest_mv);
+ double near_cost = get_mv_diff_cost(&new_mv, &near_mv);
+ mv_cost += nearest_cost < near_cost ? nearest_cost : near_cost;
+ }
+ return mv_cost;
+}
+
+static double eval_mv_mode(int mv_mode, VP9_COMP *cpi, MACROBLOCK *x,
+ GF_PICTURE *gf_picture, MotionField *motion_field,
+ int frame_idx, TplDepFrame *tpl_frame, int rf_idx,
+ BLOCK_SIZE bsize, int mi_row, int mi_col,
+ int_mv *mv) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ double mv_dist =
+ get_mv_dist(mv_mode, cpi, xd, gf_picture, motion_field, frame_idx,
+ tpl_frame, rf_idx, bsize, mi_row, mi_col, mv);
+ double mv_cost =
+ get_mv_cost(mv_mode, cpi, motion_field, tpl_frame, bsize, mi_row, mi_col);
+ double mult = 180;
+
+ return mv_cost + mult * log2f(1 + mv_dist);
+}
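+
+/* Editor's note (illustrative summary, not part of the upstream change):
+ * under CONFIG_NON_GREEDY_MV each candidate mv mode is ranked by
+ *   rd = mv_cost + 180 * log2f(1 + mv_dist)
+ * where mv_dist is the (scaled) SSE of the prediction using that motion
+ * vector and mv_cost combines a rough mode probability cost with, for
+ * NEW_MV_MODE, the cheaper of the log2 mv differences against the NEAREST
+ * and NEAR candidates. find_best_ref_mv_mode() then picks the cheapest of
+ * the non-NEW modes for each block. */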
+
+static int find_best_ref_mv_mode(VP9_COMP *cpi, MACROBLOCK *x,
+ GF_PICTURE *gf_picture,
+ MotionField *motion_field, int frame_idx,
+ TplDepFrame *tpl_frame, int rf_idx,
+ BLOCK_SIZE bsize, int mi_row, int mi_col,
+ double *rd, int_mv *mv) {
+ int best_mv_mode = ZERO_MV_MODE;
+ int update = 0;
+ int mv_mode;
+ *rd = 0;
+ for (mv_mode = 0; mv_mode < MAX_MV_MODE; ++mv_mode) {
+ double this_rd;
+ int_mv this_mv;
+ if (mv_mode == NEW_MV_MODE) {
+ continue;
+ }
+ this_rd = eval_mv_mode(mv_mode, cpi, x, gf_picture, motion_field, frame_idx,
+ tpl_frame, rf_idx, bsize, mi_row, mi_col, &this_mv);
+ if (update == 0) {
+ *rd = this_rd;
+ *mv = this_mv;
+ best_mv_mode = mv_mode;
+ update = 1;
+ } else {
+ if (this_rd < *rd) {
+ *rd = this_rd;
+ *mv = this_mv;
+ best_mv_mode = mv_mode;
+ }
+ }
+ }
+ return best_mv_mode;
+}
+
+static void predict_mv_mode(VP9_COMP *cpi, MACROBLOCK *x,
+ GF_PICTURE *gf_picture, MotionField *motion_field,
+ int frame_idx, TplDepFrame *tpl_frame, int rf_idx,
+ BLOCK_SIZE bsize, int mi_row, int mi_col) {
+ const int mi_height = num_8x8_blocks_high_lookup[bsize];
+ const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+ int tmp_mv_mode_arr[kMvPreCheckSize];
+ int *mv_mode_arr = tpl_frame->mv_mode_arr[rf_idx];
+ double *rd_diff_arr = tpl_frame->rd_diff_arr[rf_idx];
+ int_mv *select_mv_arr = cpi->select_mv_arr;
+ int_mv tmp_select_mv_arr[kMvPreCheckSize];
+ int stride = tpl_frame->stride;
+ double new_mv_rd = 0;
+ double no_new_mv_rd = 0;
+ double this_new_mv_rd = 0;
+ double this_no_new_mv_rd = 0;
+ int idx;
+ int tmp_idx;
+ assert(kMvPreCheckSize == (kMvPreCheckLines * (kMvPreCheckLines + 1)) >> 1);
+
+ // no new mv
+ // diagonal scan order
+ tmp_idx = 0;
+ for (idx = 0; idx < kMvPreCheckLines; ++idx) {
+ int r;
+ for (r = 0; r <= idx; ++r) {
+ int c = idx - r;
+ int nb_row = mi_row + r * mi_height;
+ int nb_col = mi_col + c * mi_width;
+ if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) {
+ double this_rd;
+ int_mv *mv = &select_mv_arr[nb_row * stride + nb_col];
+ mv_mode_arr[nb_row * stride + nb_col] = find_best_ref_mv_mode(
+ cpi, x, gf_picture, motion_field, frame_idx, tpl_frame, rf_idx,
+ bsize, nb_row, nb_col, &this_rd, mv);
+ if (r == 0 && c == 0) {
+ this_no_new_mv_rd = this_rd;
+ }
+ no_new_mv_rd += this_rd;
+ tmp_mv_mode_arr[tmp_idx] = mv_mode_arr[nb_row * stride + nb_col];
+ tmp_select_mv_arr[tmp_idx] = select_mv_arr[nb_row * stride + nb_col];
+ ++tmp_idx;
+ }
+ }
+ }
+
+ // new mv
+ mv_mode_arr[mi_row * stride + mi_col] = NEW_MV_MODE;
+ this_new_mv_rd = eval_mv_mode(
+ NEW_MV_MODE, cpi, x, gf_picture, motion_field, frame_idx, tpl_frame,
+ rf_idx, bsize, mi_row, mi_col, &select_mv_arr[mi_row * stride + mi_col]);
+ new_mv_rd = this_new_mv_rd;
+ // We start from idx = 1 because idx = 0 is evaluated as NEW_MV_MODE
+ // beforehand.
+ for (idx = 1; idx < kMvPreCheckLines; ++idx) {
+ int r;
+ for (r = 0; r <= idx; ++r) {
+ int c = idx - r;
+ int nb_row = mi_row + r * mi_height;
+ int nb_col = mi_col + c * mi_width;
+ if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) {
+ double this_rd;
+ int_mv *mv = &select_mv_arr[nb_row * stride + nb_col];
+ mv_mode_arr[nb_row * stride + nb_col] = find_best_ref_mv_mode(
+ cpi, x, gf_picture, motion_field, frame_idx, tpl_frame, rf_idx,
+ bsize, nb_row, nb_col, &this_rd, mv);
+ new_mv_rd += this_rd;
+ }
+ }
+ }
+
+ // update best_mv_mode
+ tmp_idx = 0;
+ if (no_new_mv_rd < new_mv_rd) {
+ for (idx = 0; idx < kMvPreCheckLines; ++idx) {
+ int r;
+ for (r = 0; r <= idx; ++r) {
+ int c = idx - r;
+ int nb_row = mi_row + r * mi_height;
+ int nb_col = mi_col + c * mi_width;
+ if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) {
+ mv_mode_arr[nb_row * stride + nb_col] = tmp_mv_mode_arr[tmp_idx];
+ select_mv_arr[nb_row * stride + nb_col] = tmp_select_mv_arr[tmp_idx];
+ ++tmp_idx;
+ }
+ }
+ }
+ rd_diff_arr[mi_row * stride + mi_col] = 0;
+ } else {
+ rd_diff_arr[mi_row * stride + mi_col] =
+ (no_new_mv_rd - this_no_new_mv_rd) - (new_mv_rd - this_new_mv_rd);
+ }
+}
+
+static void predict_mv_mode_arr(VP9_COMP *cpi, MACROBLOCK *x,
+ GF_PICTURE *gf_picture,
+ MotionField *motion_field, int frame_idx,
+ TplDepFrame *tpl_frame, int rf_idx,
+ BLOCK_SIZE bsize) {
+ const int mi_height = num_8x8_blocks_high_lookup[bsize];
+ const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+ const int unit_rows = tpl_frame->mi_rows / mi_height;
+ const int unit_cols = tpl_frame->mi_cols / mi_width;
+ const int max_diagonal_lines = unit_rows + unit_cols - 1;
+ int idx;
+ for (idx = 0; idx < max_diagonal_lines; ++idx) {
+ int r;
+ for (r = VPXMAX(idx - unit_cols + 1, 0); r <= VPXMIN(idx, unit_rows - 1);
+ ++r) {
+ int c = idx - r;
+ int mi_row = r * mi_height;
+ int mi_col = c * mi_width;
+ assert(c >= 0 && c < unit_cols);
+ assert(mi_row >= 0 && mi_row < tpl_frame->mi_rows);
+ assert(mi_col >= 0 && mi_col < tpl_frame->mi_cols);
+ predict_mv_mode(cpi, x, gf_picture, motion_field, frame_idx, tpl_frame,
+ rf_idx, bsize, mi_row, mi_col);
+ }
+ }
+}
+
+static void do_motion_search(VP9_COMP *cpi, ThreadData *td,
+ MotionField *motion_field, int frame_idx,
+ YV12_BUFFER_CONFIG *ref_frame, BLOCK_SIZE bsize,
+ int mi_row, int mi_col) {
+ VP9_COMMON *cm = &cpi->common;
+ MACROBLOCK *x = &td->mb;
+ MACROBLOCKD *xd = &x->e_mbd;
+ const int mb_y_offset =
+ mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE;
+ assert(ref_frame != NULL);
+ set_mv_limits(cm, x, mi_row, mi_col);
+ {
+ int_mv mv = vp9_motion_field_mi_get_mv(motion_field, mi_row, mi_col);
+ uint8_t *cur_frame_buf = xd->cur_buf->y_buffer + mb_y_offset;
+ uint8_t *ref_frame_buf = ref_frame->y_buffer + mb_y_offset;
+ const int stride = xd->cur_buf->y_stride;
+ full_pixel_motion_search(cpi, td, motion_field, frame_idx, cur_frame_buf,
+ ref_frame_buf, stride, bsize, mi_row, mi_col,
+ &mv.as_mv);
+ sub_pixel_motion_search(cpi, td, cur_frame_buf, ref_frame_buf, stride,
+ bsize, &mv.as_mv);
+ vp9_motion_field_mi_set_mv(motion_field, mi_row, mi_col, mv);
+ }
+}
+
+static void build_motion_field(
+ VP9_COMP *cpi, int frame_idx,
+ YV12_BUFFER_CONFIG *ref_frame[MAX_INTER_REF_FRAMES], BLOCK_SIZE bsize) {
+ VP9_COMMON *cm = &cpi->common;
+ ThreadData *td = &cpi->td;
+ TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx];
+ const int mi_height = num_8x8_blocks_high_lookup[bsize];
+ const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+ const int pw = num_4x4_blocks_wide_lookup[bsize] << 2;
+ const int ph = num_4x4_blocks_high_lookup[bsize] << 2;
+ int mi_row, mi_col;
+ int rf_idx;
+
+ tpl_frame->lambda = (pw * ph) >> 2;
+ assert(pw * ph == tpl_frame->lambda << 2);
+
+ for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) {
+ MotionField *motion_field = vp9_motion_field_info_get_motion_field(
+ &cpi->motion_field_info, frame_idx, rf_idx, bsize);
+ if (ref_frame[rf_idx] == NULL) {
+ continue;
+ }
+ vp9_motion_field_reset_mvs(motion_field);
+ for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) {
+ for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) {
+ do_motion_search(cpi, td, motion_field, frame_idx, ref_frame[rf_idx],
+ bsize, mi_row, mi_col);
+ }
+ }
+ }
+}
+#endif // CONFIG_NON_GREEDY_MV
+
+static void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture,
+ int frame_idx, BLOCK_SIZE bsize) {
+ TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx];
+ VpxTplFrameStats *tpl_frame_stats_before_propagation =
+ &cpi->tpl_gop_stats.frame_stats_list[frame_idx];
+ YV12_BUFFER_CONFIG *this_frame = gf_picture[frame_idx].frame;
+ YV12_BUFFER_CONFIG *ref_frame[MAX_INTER_REF_FRAMES] = { NULL, NULL, NULL };
+
+ VP9_COMMON *cm = &cpi->common;
+ struct scale_factors sf;
+ int rdmult, idx;
+ ThreadData *td = &cpi->td;
+ MACROBLOCK *x = &td->mb;
+ MACROBLOCKD *xd = &x->e_mbd;
+ int mi_row, mi_col;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ DECLARE_ALIGNED(16, uint16_t, predictor16[32 * 32 * 3]);
+ DECLARE_ALIGNED(16, uint8_t, predictor8[32 * 32 * 3]);
+ uint8_t *predictor;
+#else
+ DECLARE_ALIGNED(16, uint8_t, predictor[32 * 32 * 3]);
+#endif
+ DECLARE_ALIGNED(16, int16_t, src_diff[32 * 32]);
+ DECLARE_ALIGNED(16, tran_low_t, coeff[32 * 32]);
+ DECLARE_ALIGNED(16, tran_low_t, qcoeff[32 * 32]);
+ DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]);
+
+ const TX_SIZE tx_size = max_txsize_lookup[bsize];
+ const int mi_height = num_8x8_blocks_high_lookup[bsize];
+ const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+
+ tpl_frame_stats_before_propagation->frame_width = cm->width;
+ tpl_frame_stats_before_propagation->frame_height = cm->height;
+ // Setup scaling factor
+#if CONFIG_VP9_HIGHBITDEPTH
+ vp9_setup_scale_factors_for_frame(
+ &sf, this_frame->y_crop_width, this_frame->y_crop_height,
+ this_frame->y_crop_width, this_frame->y_crop_height,
+ cpi->common.use_highbitdepth);
+
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ predictor = CONVERT_TO_BYTEPTR(predictor16);
+ else
+ predictor = predictor8;
+#else
+ vp9_setup_scale_factors_for_frame(
+ &sf, this_frame->y_crop_width, this_frame->y_crop_height,
+ this_frame->y_crop_width, this_frame->y_crop_height);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ // Prepare reference frame pointers. If any reference frame slot is
+ // unavailable, the pointer will be set to NULL.
+ for (idx = 0; idx < MAX_INTER_REF_FRAMES; ++idx) {
+ int rf_idx = gf_picture[frame_idx].ref_frame[idx];
+ if (rf_idx != -1) ref_frame[idx] = gf_picture[rf_idx].frame;
+ }
+
+ xd->mi = cm->mi_grid_visible;
+ xd->mi[0] = cm->mi;
+ xd->cur_buf = this_frame;
+
+ // Get rd multiplier set up.
+ rdmult = vp9_compute_rd_mult_based_on_qindex(cpi, tpl_frame->base_qindex);
+ set_error_per_bit(&cpi->td.mb, rdmult);
+ vp9_initialize_me_consts(cpi, &cpi->td.mb, tpl_frame->base_qindex);
+
+ tpl_frame->is_valid = 1;
+
+ cm->base_qindex = tpl_frame->base_qindex;
+ vp9_frame_init_quantizer(cpi);
+
+#if CONFIG_NON_GREEDY_MV
+ {
+ int square_block_idx;
+ int rf_idx;
+ for (square_block_idx = 0; square_block_idx < SQUARE_BLOCK_SIZES;
+ ++square_block_idx) {
+ BLOCK_SIZE square_bsize = square_block_idx_to_bsize(square_block_idx);
+ build_motion_field(cpi, frame_idx, ref_frame, square_bsize);
+ }
+ for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) {
+ int ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx];
+ if (ref_frame_idx != -1) {
+ MotionField *motion_field = vp9_motion_field_info_get_motion_field(
+ &cpi->motion_field_info, frame_idx, rf_idx, bsize);
+ predict_mv_mode_arr(cpi, x, gf_picture, motion_field, frame_idx,
+ tpl_frame, rf_idx, bsize);
+ }
+ }
+ }
+#endif // CONFIG_NON_GREEDY_MV
+
+ for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) {
+ for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) {
+ int64_t recon_error = 0;
+ int64_t rate_cost = 0;
+ int64_t sse = 0;
+ mode_estimation(cpi, x, xd, &sf, gf_picture, frame_idx, tpl_frame,
+ src_diff, coeff, qcoeff, dqcoeff, mi_row, mi_col, bsize,
+ tx_size, ref_frame, predictor, &recon_error, &rate_cost,
+ &sse);
+ // Motion flow dependency dispenser.
+ tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize,
+ tpl_frame->stride);
+
+ tpl_store_before_propagation(
+ tpl_frame_stats_before_propagation->block_stats_list,
+ tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize, tpl_frame->stride,
+ recon_error, rate_cost);
+
+ tpl_model_update(cpi->tpl_stats, tpl_frame->tpl_stats_ptr, mi_row, mi_col,
+ bsize);
+ }
+ }
+}
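+
+/* Editor's note (illustrative summary, not part of the upstream change):
+ * mc_flow_dispenser() is the per-frame TPL driver. For every bsize-aligned
+ * position it runs mode_estimation(), copies the winning stats onto the
+ * underlying 8x8 grid with tpl_model_store(), exports the pre-propagation
+ * numbers via tpl_store_before_propagation(), and finally calls
+ * tpl_model_update() to back-propagate mc_flow into the tpl_stats of the
+ * reference frames it predicts from. */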
+
+#if CONFIG_NON_GREEDY_MV
+#define DUMP_TPL_STATS 0
+#if DUMP_TPL_STATS
+static void dump_buf(uint8_t *buf, int stride, int row, int col, int h, int w) {
+ int i, j;
+ printf("%d %d\n", h, w);
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ printf("%d ", buf[(row + i) * stride + col + j]);
+ }
+ }
+ printf("\n");
+}
+
+static void dump_frame_buf(const YV12_BUFFER_CONFIG *frame_buf) {
+ dump_buf(frame_buf->y_buffer, frame_buf->y_stride, 0, 0, frame_buf->y_height,
+ frame_buf->y_width);
+ dump_buf(frame_buf->u_buffer, frame_buf->uv_stride, 0, 0,
+ frame_buf->uv_height, frame_buf->uv_width);
+ dump_buf(frame_buf->v_buffer, frame_buf->uv_stride, 0, 0,
+ frame_buf->uv_height, frame_buf->uv_width);
+}
+
+static void dump_tpl_stats(const VP9_COMP *cpi, int tpl_group_frames,
+ const GF_GROUP *gf_group,
+ const GF_PICTURE *gf_picture, BLOCK_SIZE bsize) {
+ int frame_idx;
+ const VP9_COMMON *cm = &cpi->common;
+ int rf_idx;
+ for (frame_idx = 1; frame_idx < tpl_group_frames; ++frame_idx) {
+ for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) {
+ const TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx];
+ int mi_row, mi_col;
+ int ref_frame_idx;
+ const int mi_height = num_8x8_blocks_high_lookup[bsize];
+ const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+ ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx];
+ if (ref_frame_idx != -1) {
+ YV12_BUFFER_CONFIG *ref_frame_buf = gf_picture[ref_frame_idx].frame;
+ const int gf_frame_offset = gf_group->frame_gop_index[frame_idx];
+ const int ref_gf_frame_offset =
+ gf_group->frame_gop_index[ref_frame_idx];
+ printf("=\n");
+ printf(
+ "frame_idx %d mi_rows %d mi_cols %d bsize %d ref_frame_idx %d "
+ "rf_idx %d gf_frame_offset %d ref_gf_frame_offset %d\n",
+ frame_idx, cm->mi_rows, cm->mi_cols, mi_width * MI_SIZE,
+ ref_frame_idx, rf_idx, gf_frame_offset, ref_gf_frame_offset);
+ for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row) {
+ for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) {
+ if ((mi_row % mi_height) == 0 && (mi_col % mi_width) == 0) {
+ int_mv mv = vp9_motion_field_info_get_mv(&cpi->motion_field_info,
+ frame_idx, rf_idx, bsize,
+ mi_row, mi_col);
+ printf("%d %d %d %d\n", mi_row, mi_col, mv.as_mv.row,
+ mv.as_mv.col);
+ }
+ }
+ }
+ for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row) {
+ for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) {
+ if ((mi_row % mi_height) == 0 && (mi_col % mi_width) == 0) {
+ const TplDepStats *tpl_ptr =
+ &tpl_frame
+ ->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col];
+ printf("%f ", tpl_ptr->feature_score);
+ }
+ }
+ }
+ printf("\n");
+
+ for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) {
+ for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) {
+ const int mv_mode =
+ tpl_frame
+ ->mv_mode_arr[rf_idx][mi_row * tpl_frame->stride + mi_col];
+ printf("%d ", mv_mode);
+ }
+ }
+ printf("\n");
+
+ dump_frame_buf(gf_picture[frame_idx].frame);
+ dump_frame_buf(ref_frame_buf);
+ }
+ }
+ }
+}
+#endif // DUMP_TPL_STATS
+#endif // CONFIG_NON_GREEDY_MV
+
+void vp9_init_tpl_buffer(VP9_COMP *cpi) {
+ VP9_COMMON *cm = &cpi->common;
+ int frame;
+
+ const int mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
+ const int mi_rows = mi_cols_aligned_to_sb(cm->mi_rows);
+#if CONFIG_NON_GREEDY_MV
+ int rf_idx;
+
+ vpx_free(cpi->select_mv_arr);
+ CHECK_MEM_ERROR(
+ &cm->error, cpi->select_mv_arr,
+ vpx_calloc(mi_rows * mi_cols * 4, sizeof(*cpi->select_mv_arr)));
+#endif
+
+ // TODO(jingning): Reduce the actual memory use for tpl model build up.
+ for (frame = 0; frame < MAX_ARF_GOP_SIZE; ++frame) {
+ if (cpi->tpl_stats[frame].width >= mi_cols &&
+ cpi->tpl_stats[frame].height >= mi_rows &&
+ cpi->tpl_stats[frame].tpl_stats_ptr)
+ continue;
+
+#if CONFIG_NON_GREEDY_MV
+ for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) {
+ vpx_free(cpi->tpl_stats[frame].mv_mode_arr[rf_idx]);
+ CHECK_MEM_ERROR(
+ &cm->error, cpi->tpl_stats[frame].mv_mode_arr[rf_idx],
+ vpx_calloc(mi_rows * mi_cols * 4,
+ sizeof(*cpi->tpl_stats[frame].mv_mode_arr[rf_idx])));
+ vpx_free(cpi->tpl_stats[frame].rd_diff_arr[rf_idx]);
+ CHECK_MEM_ERROR(
+ &cm->error, cpi->tpl_stats[frame].rd_diff_arr[rf_idx],
+ vpx_calloc(mi_rows * mi_cols * 4,
+ sizeof(*cpi->tpl_stats[frame].rd_diff_arr[rf_idx])));
+ }
+#endif
+ vpx_free(cpi->tpl_stats[frame].tpl_stats_ptr);
+ CHECK_MEM_ERROR(&cm->error, cpi->tpl_stats[frame].tpl_stats_ptr,
+ vpx_calloc(mi_rows * mi_cols,
+ sizeof(*cpi->tpl_stats[frame].tpl_stats_ptr)));
+ cpi->tpl_stats[frame].is_valid = 0;
+ cpi->tpl_stats[frame].width = mi_cols;
+ cpi->tpl_stats[frame].height = mi_rows;
+ cpi->tpl_stats[frame].stride = mi_cols;
+ cpi->tpl_stats[frame].mi_rows = cm->mi_rows;
+ cpi->tpl_stats[frame].mi_cols = cm->mi_cols;
+ }
+
+ for (frame = 0; frame < REF_FRAMES; ++frame) {
+ cpi->enc_frame_buf[frame].mem_valid = 0;
+ cpi->enc_frame_buf[frame].released = 1;
+ }
+}
+
+void vp9_free_tpl_buffer(VP9_COMP *cpi) {
+ int frame;
+#if CONFIG_NON_GREEDY_MV
+ vp9_free_motion_field_info(&cpi->motion_field_info);
+ vpx_free(cpi->select_mv_arr);
+#endif
+ for (frame = 0; frame < MAX_ARF_GOP_SIZE; ++frame) {
+#if CONFIG_NON_GREEDY_MV
+ int rf_idx;
+ for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) {
+ vpx_free(cpi->tpl_stats[frame].mv_mode_arr[rf_idx]);
+ vpx_free(cpi->tpl_stats[frame].rd_diff_arr[rf_idx]);
+ }
+#endif
+ vpx_free(cpi->tpl_stats[frame].tpl_stats_ptr);
+ cpi->tpl_stats[frame].is_valid = 0;
+ }
+ free_tpl_frame_stats_list(&cpi->tpl_gop_stats);
+}
+
+#if CONFIG_RATE_CTRL
+static void accumulate_frame_tpl_stats(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ const GF_GROUP *gf_group = &cpi->twopass.gf_group;
+ int show_frame_count = 0;
+ int frame_idx;
+  // Accumulate tpl stats for each frame in the current group of pictures.
+ for (frame_idx = 1; frame_idx < gf_group->gf_group_size; ++frame_idx) {
+ TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx];
+ TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
+ const int tpl_stride = tpl_frame->stride;
+ int64_t intra_cost_base = 0;
+ int64_t inter_cost_base = 0;
+ int64_t mc_dep_cost_base = 0;
+ int64_t mc_ref_cost_base = 0;
+ int64_t mc_flow_base = 0;
+ int row, col;
+
+ if (!tpl_frame->is_valid) continue;
+
+ for (row = 0; row < cm->mi_rows && tpl_frame->is_valid; ++row) {
+ for (col = 0; col < cm->mi_cols; ++col) {
+ TplDepStats *this_stats = &tpl_stats[row * tpl_stride + col];
+ intra_cost_base += this_stats->intra_cost;
+ inter_cost_base += this_stats->inter_cost;
+ mc_dep_cost_base += this_stats->mc_dep_cost;
+ mc_ref_cost_base += this_stats->mc_ref_cost;
+ mc_flow_base += this_stats->mc_flow;
+ }
+ }
+
+ cpi->tpl_stats_info[show_frame_count].intra_cost = intra_cost_base;
+ cpi->tpl_stats_info[show_frame_count].inter_cost = inter_cost_base;
+ cpi->tpl_stats_info[show_frame_count].mc_dep_cost = mc_dep_cost_base;
+ cpi->tpl_stats_info[show_frame_count].mc_ref_cost = mc_ref_cost_base;
+ cpi->tpl_stats_info[show_frame_count].mc_flow = mc_flow_base;
+
+ ++show_frame_count;
+ }
+}
+#endif // CONFIG_RATE_CTRL
+
+void vp9_setup_tpl_stats(VP9_COMP *cpi) {
+ GF_PICTURE gf_picture[MAX_ARF_GOP_SIZE];
+ const GF_GROUP *gf_group = &cpi->twopass.gf_group;
+ int tpl_group_frames = 0;
+ int frame_idx;
+ cpi->tpl_bsize = BLOCK_32X32;
+
+ init_gop_frames(cpi, gf_picture, gf_group, &tpl_group_frames);
+
+ init_tpl_stats(cpi);
+
+ init_tpl_stats_before_propagation(&cpi->common.error, &cpi->tpl_gop_stats,
+ cpi->tpl_stats, tpl_group_frames);
+
+ // Backward propagation from tpl_group_frames to 1.
+ for (frame_idx = tpl_group_frames - 1; frame_idx > 0; --frame_idx) {
+ if (gf_picture[frame_idx].update_type == USE_BUF_FRAME) continue;
+ mc_flow_dispenser(cpi, gf_picture, frame_idx, cpi->tpl_bsize);
+ }
+#if CONFIG_NON_GREEDY_MV
+ cpi->tpl_ready = 1;
+#if DUMP_TPL_STATS
+ dump_tpl_stats(cpi, tpl_group_frames, gf_group, gf_picture, cpi->tpl_bsize);
+#endif // DUMP_TPL_STATS
+#endif // CONFIG_NON_GREEDY_MV
+
+#if CONFIG_RATE_CTRL
+ if (cpi->oxcf.use_simple_encode_api) {
+ accumulate_frame_tpl_stats(cpi);
+ }
+#endif // CONFIG_RATE_CTRL
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.h b/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.h
new file mode 100644
index 0000000000..04beb22610
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_VP9_TPL_MODEL_H_
+#define VPX_VP9_ENCODER_VP9_TPL_MODEL_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef M_LOG2_E
+#define M_LOG2_E 0.693147180559945309417
+#endif
+#define log2f(x) (log(x) / (float)M_LOG2_E)
+
+#define TPL_DEP_COST_SCALE_LOG2 4
+
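+// One entry per frame in the ARF group: the frame buffer, the gf_picture
+// indices of up to three reference frames (-1 for an unused slot), and the
+// frame update type.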
+typedef struct GF_PICTURE {
+ YV12_BUFFER_CONFIG *frame;
+ int ref_frame[3];
+ FRAME_UPDATE_TYPE update_type;
+} GF_PICTURE;
+
+void vp9_init_tpl_buffer(VP9_COMP *cpi);
+void vp9_setup_tpl_stats(VP9_COMP *cpi);
+void vp9_free_tpl_buffer(VP9_COMP *cpi);
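+
+// Rough call order, as exercised by the encoder code in this change:
+// vp9_init_tpl_buffer() (re)allocates the per-frame stats buffers,
+// vp9_setup_tpl_stats() builds the TPL model for the current ARF group, and
+// vp9_free_tpl_buffer() releases the buffers.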
+
+void vp9_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff,
+ TX_SIZE tx_size);
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff,
+ TX_SIZE tx_size);
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP9_ENCODER_VP9_TPL_MODEL_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_treewriter.c b/media/libvpx/libvpx/vp9/encoder/vp9_treewriter.c
new file mode 100644
index 0000000000..0fc078e0a7
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_treewriter.c
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp9/encoder/vp9_treewriter.h"
+
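+// Recursively walk the tree, accumulating the bit path in 'v' and its length
+// in 'l'. Leaves are stored as non-positive indices, so -j indexes the token
+// table being filled in.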
+static void tree2tok(struct vp9_token *tokens, const vpx_tree_index *tree,
+ int i, int v, int l) {
+ v += v;
+ ++l;
+
+ do {
+ const vpx_tree_index j = tree[i++];
+ if (j <= 0) {
+ tokens[-j].value = v;
+ tokens[-j].len = l;
+ } else {
+ tree2tok(tokens, tree, j, v, l);
+ }
+ } while (++v & 1);
+}
+
+void vp9_tokens_from_tree(struct vp9_token *tokens,
+ const vpx_tree_index *tree) {
+ tree2tok(tokens, tree, 0, 0, 0);
+}
+
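+// Recursively count how often each branch of the tree is taken: a leaf
+// contributes num_events[-index], an internal node the total of its subtree.
+// The left/right totals for each node are written to branch_ct[i >> 1].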
+static unsigned int convert_distribution(unsigned int i, vpx_tree tree,
+ unsigned int branch_ct[][2],
+ const unsigned int num_events[]) {
+ unsigned int left, right;
+
+ if (tree[i] <= 0)
+ left = num_events[-tree[i]];
+ else
+ left = convert_distribution(tree[i], tree, branch_ct, num_events);
+
+ if (tree[i + 1] <= 0)
+ right = num_events[-tree[i + 1]];
+ else
+ right = convert_distribution(tree[i + 1], tree, branch_ct, num_events);
+
+ branch_ct[i >> 1][0] = left;
+ branch_ct[i >> 1][1] = right;
+ return left + right;
+}
+
+void vp9_tree_probs_from_distribution(vpx_tree tree,
+ unsigned int branch_ct[/* n-1 */][2],
+ const unsigned int num_events[/* n */]) {
+ convert_distribution(0, tree, branch_ct, num_events);
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_treewriter.h b/media/libvpx/libvpx/vp9/encoder/vp9_treewriter.h
new file mode 100644
index 0000000000..86c5fa2244
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_treewriter.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_VP9_TREEWRITER_H_
+#define VPX_VP9_ENCODER_VP9_TREEWRITER_H_
+
+#include "vpx_dsp/bitwriter.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp9_tree_probs_from_distribution(vpx_tree tree,
+ unsigned int branch_ct[/* n - 1 */][2],
+ const unsigned int num_events[/* n */]);
+
+struct vp9_token {
+ int value;
+ int len;
+};
+
+void vp9_tokens_from_tree(struct vp9_token *, const vpx_tree_index *);
+
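+// Write the low 'len' bits of 'bits', most significant bit first, descending
+// the tree from node 'i' and coding each bit with probs[i >> 1].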
+static INLINE void vp9_write_tree(vpx_writer *w, const vpx_tree_index *tree,
+ const vpx_prob *probs, int bits, int len,
+ vpx_tree_index i) {
+ do {
+ const int bit = (bits >> --len) & 1;
+ vpx_write(w, bit, probs[i >> 1]);
+ i = tree[i + bit];
+ } while (len);
+}
+
+static INLINE void vp9_write_token(vpx_writer *w, const vpx_tree_index *tree,
+ const vpx_prob *probs,
+ const struct vp9_token *token) {
+ vp9_write_tree(w, tree, probs, token->value, token->len, 0);
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VP9_ENCODER_VP9_TREEWRITER_H_
diff --git a/media/libvpx/libvpx/vp9/encoder/x86/highbd_temporal_filter_sse4.c b/media/libvpx/libvpx/vp9/encoder/x86/highbd_temporal_filter_sse4.c
new file mode 100644
index 0000000000..97f182c660
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/x86/highbd_temporal_filter_sse4.c
@@ -0,0 +1,893 @@
+/*
+ * Copyright (c) 2019 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_temporal_filter.h"
+#include "vp9/encoder/vp9_temporal_filter_constants.h"
+
+// Compute (a-b)**2 for 8 16-bit pixels and store the results as unsigned
+// 32-bit integers in dst.
+static INLINE void highbd_store_dist_8(const uint16_t *a, const uint16_t *b,
+ uint32_t *dst) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i a_reg = _mm_loadu_si128((const __m128i *)a);
+ const __m128i b_reg = _mm_loadu_si128((const __m128i *)b);
+
+ const __m128i a_first = _mm_cvtepu16_epi32(a_reg);
+ const __m128i a_second = _mm_unpackhi_epi16(a_reg, zero);
+ const __m128i b_first = _mm_cvtepu16_epi32(b_reg);
+ const __m128i b_second = _mm_unpackhi_epi16(b_reg, zero);
+
+ __m128i dist_first, dist_second;
+
+ dist_first = _mm_sub_epi32(a_first, b_first);
+ dist_second = _mm_sub_epi32(a_second, b_second);
+ dist_first = _mm_mullo_epi32(dist_first, dist_first);
+ dist_second = _mm_mullo_epi32(dist_second, dist_second);
+
+ _mm_storeu_si128((__m128i *)dst, dist_first);
+ _mm_storeu_si128((__m128i *)(dst + 4), dist_second);
+}
+
+// Sum up three neighboring distortions for the pixels
+static INLINE void highbd_get_sum_4(const uint32_t *dist, __m128i *sum) {
+ __m128i dist_reg, dist_left, dist_right;
+
+ dist_reg = _mm_loadu_si128((const __m128i *)dist);
+ dist_left = _mm_loadu_si128((const __m128i *)(dist - 1));
+ dist_right = _mm_loadu_si128((const __m128i *)(dist + 1));
+
+ *sum = _mm_add_epi32(dist_reg, dist_left);
+ *sum = _mm_add_epi32(*sum, dist_right);
+}
+
+static INLINE void highbd_get_sum_8(const uint32_t *dist, __m128i *sum_first,
+ __m128i *sum_second) {
+ highbd_get_sum_4(dist, sum_first);
+ highbd_get_sum_4(dist + 4, sum_second);
+}
+
+// Average the value based on the number of values summed (9 for pixels away
+// from the border, 4 for pixels in corners, and 6 for other edge values, plus
+// however many values from the y/uv planes are included).
+//
+// Add in the rounding factor and shift, clamp to 16, invert, and multiply by
+// the weight.
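+//
+// In scalar form, each 32-bit lane computes (illustrative only):
+//   m = (sum * mul_constant) >> 32;     // ~ sum * 3 / index
+//   m = (m + rounding) >> strength;
+//   output = (16 - min(m, 16)) * weight;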
+static INLINE void highbd_average_4(__m128i *output, const __m128i *sum,
+ const __m128i *mul_constants,
+ const int strength, const int rounding,
+ const int weight) {
+  // _mm_srl_epi32 uses the lower 64 bit value for the shift.
+ const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength);
+ const __m128i rounding_u32 = _mm_set1_epi32(rounding);
+ const __m128i weight_u32 = _mm_set1_epi32(weight);
+ const __m128i sixteen = _mm_set1_epi32(16);
+ const __m128i zero = _mm_setzero_si128();
+
+ // modifier * 3 / index;
+ const __m128i sum_lo = _mm_unpacklo_epi32(*sum, zero);
+ const __m128i sum_hi = _mm_unpackhi_epi32(*sum, zero);
+ const __m128i const_lo = _mm_unpacklo_epi32(*mul_constants, zero);
+ const __m128i const_hi = _mm_unpackhi_epi32(*mul_constants, zero);
+
+ const __m128i mul_lo = _mm_mul_epu32(sum_lo, const_lo);
+ const __m128i mul_lo_div = _mm_srli_epi64(mul_lo, 32);
+ const __m128i mul_hi = _mm_mul_epu32(sum_hi, const_hi);
+ const __m128i mul_hi_div = _mm_srli_epi64(mul_hi, 32);
+
+ // Now we have
+ // mul_lo: 00 a1 00 a0
+ // mul_hi: 00 a3 00 a2
+ // Unpack as 64 bit words to get even and odd elements
+ // unpack_lo: 00 a2 00 a0
+ // unpack_hi: 00 a3 00 a1
+ // Then we can shift and OR the results to get everything in 32-bits
+ const __m128i mul_even = _mm_unpacklo_epi64(mul_lo_div, mul_hi_div);
+ const __m128i mul_odd = _mm_unpackhi_epi64(mul_lo_div, mul_hi_div);
+ const __m128i mul_odd_shift = _mm_slli_si128(mul_odd, 4);
+ const __m128i mul = _mm_or_si128(mul_even, mul_odd_shift);
+
+ // Round
+ *output = _mm_add_epi32(mul, rounding_u32);
+ *output = _mm_srl_epi32(*output, strength_u128);
+
+ // Multiply with the weight
+ *output = _mm_min_epu32(*output, sixteen);
+ *output = _mm_sub_epi32(sixteen, *output);
+ *output = _mm_mullo_epi32(*output, weight_u32);
+}
+
+static INLINE void highbd_average_8(__m128i *output_0, __m128i *output_1,
+ const __m128i *sum_0_u32,
+ const __m128i *sum_1_u32,
+ const __m128i *mul_constants_0,
+ const __m128i *mul_constants_1,
+ const int strength, const int rounding,
+ const int weight) {
+ highbd_average_4(output_0, sum_0_u32, mul_constants_0, strength, rounding,
+ weight);
+ highbd_average_4(output_1, sum_1_u32, mul_constants_1, strength, rounding,
+ weight);
+}
+
+// Add 'sum_u32' to 'count'. Multiply by 'pred' and add to 'accumulator.'
+static INLINE void highbd_accumulate_and_store_8(const __m128i sum_first_u32,
+ const __m128i sum_second_u32,
+ const uint16_t *pred,
+ uint16_t *count,
+ uint32_t *accumulator) {
+ // Cast down to 16-bit ints
+ const __m128i sum_u16 = _mm_packus_epi32(sum_first_u32, sum_second_u32);
+ const __m128i zero = _mm_setzero_si128();
+
+ __m128i pred_u16 = _mm_loadu_si128((const __m128i *)pred);
+ __m128i count_u16 = _mm_loadu_si128((const __m128i *)count);
+
+ __m128i pred_0_u32, pred_1_u32;
+ __m128i accum_0_u32, accum_1_u32;
+
+ count_u16 = _mm_adds_epu16(count_u16, sum_u16);
+ _mm_storeu_si128((__m128i *)count, count_u16);
+
+ pred_0_u32 = _mm_cvtepu16_epi32(pred_u16);
+ pred_1_u32 = _mm_unpackhi_epi16(pred_u16, zero);
+
+ pred_0_u32 = _mm_mullo_epi32(sum_first_u32, pred_0_u32);
+ pred_1_u32 = _mm_mullo_epi32(sum_second_u32, pred_1_u32);
+
+ accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator);
+ accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4));
+
+ accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32);
+ accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32);
+
+ _mm_storeu_si128((__m128i *)accumulator, accum_0_u32);
+ _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32);
+}
+
+static INLINE void highbd_read_dist_4(const uint32_t *dist, __m128i *dist_reg) {
+ *dist_reg = _mm_loadu_si128((const __m128i *)dist);
+}
+
+static INLINE void highbd_read_dist_8(const uint32_t *dist, __m128i *reg_first,
+ __m128i *reg_second) {
+ highbd_read_dist_4(dist, reg_first);
+ highbd_read_dist_4(dist + 4, reg_second);
+}
+
+static INLINE void highbd_read_chroma_dist_row_8(
+ int ss_x, const uint32_t *u_dist, const uint32_t *v_dist, __m128i *u_first,
+ __m128i *u_second, __m128i *v_first, __m128i *v_second) {
+ if (!ss_x) {
+ // If there is no chroma subsampling in the horizontal direction, then we
+ // need to load 8 entries from chroma.
+ highbd_read_dist_8(u_dist, u_first, u_second);
+ highbd_read_dist_8(v_dist, v_first, v_second);
+ } else { // ss_x == 1
+    // Otherwise, we only need to load 4 entries and duplicate each one.
+ __m128i u_reg, v_reg;
+
+ highbd_read_dist_4(u_dist, &u_reg);
+
+ *u_first = _mm_unpacklo_epi32(u_reg, u_reg);
+ *u_second = _mm_unpackhi_epi32(u_reg, u_reg);
+
+ highbd_read_dist_4(v_dist, &v_reg);
+
+ *v_first = _mm_unpacklo_epi32(v_reg, v_reg);
+ *v_second = _mm_unpackhi_epi32(v_reg, v_reg);
+ }
+}
+
+static void vp9_highbd_apply_temporal_filter_luma_8(
+ const uint16_t *y_pre, int y_pre_stride, unsigned int block_width,
+ unsigned int block_height, int ss_x, int ss_y, int strength,
+ int use_whole_blk, uint32_t *y_accum, uint16_t *y_count,
+ const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist,
+ const uint32_t *const *neighbors_first,
+ const uint32_t *const *neighbors_second, int top_weight,
+ int bottom_weight) {
+ const int rounding = (1 << strength) >> 1;
+ int weight = top_weight;
+
+ __m128i mul_first, mul_second;
+
+ __m128i sum_row_1_first, sum_row_1_second;
+ __m128i sum_row_2_first, sum_row_2_second;
+ __m128i sum_row_3_first, sum_row_3_second;
+
+ __m128i u_first, u_second;
+ __m128i v_first, v_second;
+
+ __m128i sum_row_first;
+ __m128i sum_row_second;
+
+ // Loop variables
+ unsigned int h;
+
+ assert(strength >= 4 && strength <= 14 &&
+ "invalid adjusted temporal filter strength");
+ assert(block_width == 8);
+
+ (void)block_width;
+
+ // First row
+ mul_first = _mm_load_si128((const __m128i *)neighbors_first[0]);
+ mul_second = _mm_load_si128((const __m128i *)neighbors_second[0]);
+
+ // Add luma values
+ highbd_get_sum_8(y_dist, &sum_row_2_first, &sum_row_2_second);
+ highbd_get_sum_8(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second);
+
+ // We don't need to saturate here because the maximum value is UINT12_MAX ** 2
+ // * 9 ~= 2**24 * 9 < 2 ** 28 < INT32_MAX
+ sum_row_first = _mm_add_epi32(sum_row_2_first, sum_row_3_first);
+ sum_row_second = _mm_add_epi32(sum_row_2_second, sum_row_3_second);
+
+ // Add chroma values
+ highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second,
+ &v_first, &v_second);
+
+ // Max value here is 2 ** 24 * (9 + 2), so no saturation is needed
+ sum_row_first = _mm_add_epi32(sum_row_first, u_first);
+ sum_row_second = _mm_add_epi32(sum_row_second, u_second);
+
+ sum_row_first = _mm_add_epi32(sum_row_first, v_first);
+ sum_row_second = _mm_add_epi32(sum_row_second, v_second);
+
+ // Get modifier and store result
+ highbd_average_8(&sum_row_first, &sum_row_second, &sum_row_first,
+ &sum_row_second, &mul_first, &mul_second, strength, rounding,
+ weight);
+
+ highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count,
+ y_accum);
+
+ y_pre += y_pre_stride;
+ y_count += y_pre_stride;
+ y_accum += y_pre_stride;
+ y_dist += DIST_STRIDE;
+
+ u_dist += DIST_STRIDE;
+ v_dist += DIST_STRIDE;
+
+ // Then all the rows except the last one
+ mul_first = _mm_load_si128((const __m128i *)neighbors_first[1]);
+ mul_second = _mm_load_si128((const __m128i *)neighbors_second[1]);
+
+ for (h = 1; h < block_height - 1; ++h) {
+ // Move the weight to bottom half
+ if (!use_whole_blk && h == block_height / 2) {
+ weight = bottom_weight;
+ }
+ // Shift the rows up
+ sum_row_1_first = sum_row_2_first;
+ sum_row_1_second = sum_row_2_second;
+ sum_row_2_first = sum_row_3_first;
+ sum_row_2_second = sum_row_3_second;
+
+ // Add luma values to the modifier
+ sum_row_first = _mm_add_epi32(sum_row_1_first, sum_row_2_first);
+ sum_row_second = _mm_add_epi32(sum_row_1_second, sum_row_2_second);
+
+ highbd_get_sum_8(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second);
+
+ sum_row_first = _mm_add_epi32(sum_row_first, sum_row_3_first);
+ sum_row_second = _mm_add_epi32(sum_row_second, sum_row_3_second);
+
+ // Add chroma values to the modifier
+ if (ss_y == 0 || h % 2 == 0) {
+ // Only calculate the new chroma distortion if we are at a pixel that
+ // corresponds to a new chroma row
+ highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second,
+ &v_first, &v_second);
+
+ u_dist += DIST_STRIDE;
+ v_dist += DIST_STRIDE;
+ }
+
+ sum_row_first = _mm_add_epi32(sum_row_first, u_first);
+ sum_row_second = _mm_add_epi32(sum_row_second, u_second);
+ sum_row_first = _mm_add_epi32(sum_row_first, v_first);
+ sum_row_second = _mm_add_epi32(sum_row_second, v_second);
+
+ // Get modifier and store result
+ highbd_average_8(&sum_row_first, &sum_row_second, &sum_row_first,
+ &sum_row_second, &mul_first, &mul_second, strength,
+ rounding, weight);
+ highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count,
+ y_accum);
+
+ y_pre += y_pre_stride;
+ y_count += y_pre_stride;
+ y_accum += y_pre_stride;
+ y_dist += DIST_STRIDE;
+ }
+
+ // The last row
+ mul_first = _mm_load_si128((const __m128i *)neighbors_first[0]);
+ mul_second = _mm_load_si128((const __m128i *)neighbors_second[0]);
+
+ // Shift the rows up
+ sum_row_1_first = sum_row_2_first;
+ sum_row_1_second = sum_row_2_second;
+ sum_row_2_first = sum_row_3_first;
+ sum_row_2_second = sum_row_3_second;
+
+ // Add luma values to the modifier
+ sum_row_first = _mm_add_epi32(sum_row_1_first, sum_row_2_first);
+ sum_row_second = _mm_add_epi32(sum_row_1_second, sum_row_2_second);
+
+ // Add chroma values to the modifier
+ if (ss_y == 0) {
+ // Only calculate the new chroma distortion if we are at a pixel that
+ // corresponds to a new chroma row
+ highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second,
+ &v_first, &v_second);
+ }
+
+ sum_row_first = _mm_add_epi32(sum_row_first, u_first);
+ sum_row_second = _mm_add_epi32(sum_row_second, u_second);
+ sum_row_first = _mm_add_epi32(sum_row_first, v_first);
+ sum_row_second = _mm_add_epi32(sum_row_second, v_second);
+
+ // Get modifier and store result
+ highbd_average_8(&sum_row_first, &sum_row_second, &sum_row_first,
+ &sum_row_second, &mul_first, &mul_second, strength, rounding,
+ weight);
+ highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count,
+ y_accum);
+}
+
+// Perform temporal filter for the luma component.
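+// The block is filtered in 8-pixel-wide columns: the leftmost and rightmost
+// columns use the LEFT/RIGHT neighbor tables, which account for the smaller
+// number of contributing neighbors at the block edges, while interior columns
+// use the MIDDLE tables. Unless use_whole_blk is set, the subblock weights
+// switch from blk_fw[0]/blk_fw[2] to blk_fw[1]/blk_fw[3] at the horizontal
+// midpoint.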
+static void vp9_highbd_apply_temporal_filter_luma(
+ const uint16_t *y_pre, int y_pre_stride, unsigned int block_width,
+ unsigned int block_height, int ss_x, int ss_y, int strength,
+ const int *blk_fw, int use_whole_blk, uint32_t *y_accum, uint16_t *y_count,
+ const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist) {
+ unsigned int blk_col = 0, uv_blk_col = 0;
+ const unsigned int blk_col_step = 8, uv_blk_col_step = 8 >> ss_x;
+ const unsigned int mid_width = block_width >> 1,
+ last_width = block_width - blk_col_step;
+ int top_weight = blk_fw[0],
+ bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2];
+ const uint32_t *const *neighbors_first;
+ const uint32_t *const *neighbors_second;
+
+ // Left
+ neighbors_first = HIGHBD_LUMA_LEFT_COLUMN_NEIGHBORS;
+ neighbors_second = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS;
+ vp9_highbd_apply_temporal_filter_luma_8(
+ y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y,
+ strength, use_whole_blk, y_accum + blk_col, y_count + blk_col,
+ y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+ neighbors_first, neighbors_second, top_weight, bottom_weight);
+
+ blk_col += blk_col_step;
+ uv_blk_col += uv_blk_col_step;
+
+ // Middle First
+ neighbors_first = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS;
+ for (; blk_col < mid_width;
+ blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+ vp9_highbd_apply_temporal_filter_luma_8(
+ y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y,
+ strength, use_whole_blk, y_accum + blk_col, y_count + blk_col,
+ y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+ neighbors_first, neighbors_second, top_weight, bottom_weight);
+ }
+
+ if (!use_whole_blk) {
+ top_weight = blk_fw[1];
+ bottom_weight = blk_fw[3];
+ }
+
+ // Middle Second
+ for (; blk_col < last_width;
+ blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+ vp9_highbd_apply_temporal_filter_luma_8(
+ y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y,
+ strength, use_whole_blk, y_accum + blk_col, y_count + blk_col,
+ y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+ neighbors_first, neighbors_second, top_weight, bottom_weight);
+ }
+
+ // Right
+ neighbors_second = HIGHBD_LUMA_RIGHT_COLUMN_NEIGHBORS;
+ vp9_highbd_apply_temporal_filter_luma_8(
+ y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y,
+ strength, use_whole_blk, y_accum + blk_col, y_count + blk_col,
+ y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+ neighbors_first, neighbors_second, top_weight, bottom_weight);
+}
+
+// Add a row of luma distortion that corresponds to 8 chroma mods. If we are
+// subsampling in x direction, then we have 16 lumas, else we have 8.
+static INLINE void highbd_add_luma_dist_to_8_chroma_mod(
+ const uint32_t *y_dist, int ss_x, int ss_y, __m128i *u_mod_fst,
+ __m128i *u_mod_snd, __m128i *v_mod_fst, __m128i *v_mod_snd) {
+ __m128i y_reg_fst, y_reg_snd;
+ if (!ss_x) {
+ highbd_read_dist_8(y_dist, &y_reg_fst, &y_reg_snd);
+ if (ss_y == 1) {
+ __m128i y_tmp_fst, y_tmp_snd;
+ highbd_read_dist_8(y_dist + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd);
+ y_reg_fst = _mm_add_epi32(y_reg_fst, y_tmp_fst);
+ y_reg_snd = _mm_add_epi32(y_reg_snd, y_tmp_snd);
+ }
+ } else {
+ // Temporary
+ __m128i y_fst, y_snd;
+
+ // First 8
+ highbd_read_dist_8(y_dist, &y_fst, &y_snd);
+ if (ss_y == 1) {
+ __m128i y_tmp_fst, y_tmp_snd;
+ highbd_read_dist_8(y_dist + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd);
+
+ y_fst = _mm_add_epi32(y_fst, y_tmp_fst);
+ y_snd = _mm_add_epi32(y_snd, y_tmp_snd);
+ }
+
+ y_reg_fst = _mm_hadd_epi32(y_fst, y_snd);
+
+ // Second 8
+ highbd_read_dist_8(y_dist + 8, &y_fst, &y_snd);
+ if (ss_y == 1) {
+ __m128i y_tmp_fst, y_tmp_snd;
+ highbd_read_dist_8(y_dist + 8 + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd);
+
+ y_fst = _mm_add_epi32(y_fst, y_tmp_fst);
+ y_snd = _mm_add_epi32(y_snd, y_tmp_snd);
+ }
+
+ y_reg_snd = _mm_hadd_epi32(y_fst, y_snd);
+ }
+
+ *u_mod_fst = _mm_add_epi32(*u_mod_fst, y_reg_fst);
+ *u_mod_snd = _mm_add_epi32(*u_mod_snd, y_reg_snd);
+ *v_mod_fst = _mm_add_epi32(*v_mod_fst, y_reg_fst);
+ *v_mod_snd = _mm_add_epi32(*v_mod_snd, y_reg_snd);
+}
+
+// Apply temporal filter to the chroma components. This performs temporal
+// filtering on a chroma block of 8 X uv_height. If blk_fw is not NULL, use
+// blk_fw as an array of size 4 for the weights for each of the 4 subblocks,
+// else use top_weight for top half, and bottom weight for bottom half.
+static void vp9_highbd_apply_temporal_filter_chroma_8(
+ const uint16_t *u_pre, const uint16_t *v_pre, int uv_pre_stride,
+ unsigned int uv_block_width, unsigned int uv_block_height, int ss_x,
+ int ss_y, int strength, uint32_t *u_accum, uint16_t *u_count,
+ uint32_t *v_accum, uint16_t *v_count, const uint32_t *y_dist,
+ const uint32_t *u_dist, const uint32_t *v_dist,
+ const uint32_t *const *neighbors_fst, const uint32_t *const *neighbors_snd,
+ int top_weight, int bottom_weight, const int *blk_fw) {
+ const int rounding = (1 << strength) >> 1;
+ int weight = top_weight;
+
+ __m128i mul_fst, mul_snd;
+
+ __m128i u_sum_row_1_fst, u_sum_row_2_fst, u_sum_row_3_fst;
+ __m128i v_sum_row_1_fst, v_sum_row_2_fst, v_sum_row_3_fst;
+ __m128i u_sum_row_1_snd, u_sum_row_2_snd, u_sum_row_3_snd;
+ __m128i v_sum_row_1_snd, v_sum_row_2_snd, v_sum_row_3_snd;
+
+ __m128i u_sum_row_fst, v_sum_row_fst;
+ __m128i u_sum_row_snd, v_sum_row_snd;
+
+ // Loop variable
+ unsigned int h;
+
+ (void)uv_block_width;
+
+ // First row
+ mul_fst = _mm_load_si128((const __m128i *)neighbors_fst[0]);
+ mul_snd = _mm_load_si128((const __m128i *)neighbors_snd[0]);
+
+ // Add chroma values
+ highbd_get_sum_8(u_dist, &u_sum_row_2_fst, &u_sum_row_2_snd);
+ highbd_get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3_fst, &u_sum_row_3_snd);
+
+ u_sum_row_fst = _mm_add_epi32(u_sum_row_2_fst, u_sum_row_3_fst);
+ u_sum_row_snd = _mm_add_epi32(u_sum_row_2_snd, u_sum_row_3_snd);
+
+ highbd_get_sum_8(v_dist, &v_sum_row_2_fst, &v_sum_row_2_snd);
+ highbd_get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3_fst, &v_sum_row_3_snd);
+
+ v_sum_row_fst = _mm_add_epi32(v_sum_row_2_fst, v_sum_row_3_fst);
+ v_sum_row_snd = _mm_add_epi32(v_sum_row_2_snd, v_sum_row_3_snd);
+
+ // Add luma values
+ highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst,
+ &u_sum_row_snd, &v_sum_row_fst,
+ &v_sum_row_snd);
+
+ // Get modifier and store result
+ if (blk_fw) {
+ highbd_average_4(&u_sum_row_fst, &u_sum_row_fst, &mul_fst, strength,
+ rounding, blk_fw[0]);
+ highbd_average_4(&u_sum_row_snd, &u_sum_row_snd, &mul_snd, strength,
+ rounding, blk_fw[1]);
+
+ highbd_average_4(&v_sum_row_fst, &v_sum_row_fst, &mul_fst, strength,
+ rounding, blk_fw[0]);
+ highbd_average_4(&v_sum_row_snd, &v_sum_row_snd, &mul_snd, strength,
+ rounding, blk_fw[1]);
+
+ } else {
+ highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, &u_sum_row_fst,
+ &u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+ weight);
+ highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, &v_sum_row_fst,
+ &v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+ weight);
+ }
+ highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count,
+ u_accum);
+ highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count,
+ v_accum);
+
+ u_pre += uv_pre_stride;
+ u_dist += DIST_STRIDE;
+ v_pre += uv_pre_stride;
+ v_dist += DIST_STRIDE;
+ u_count += uv_pre_stride;
+ u_accum += uv_pre_stride;
+ v_count += uv_pre_stride;
+ v_accum += uv_pre_stride;
+
+ y_dist += DIST_STRIDE * (1 + ss_y);
+
+ // Then all the rows except the last one
+ mul_fst = _mm_load_si128((const __m128i *)neighbors_fst[1]);
+ mul_snd = _mm_load_si128((const __m128i *)neighbors_snd[1]);
+
+ for (h = 1; h < uv_block_height - 1; ++h) {
+ // Move the weight pointer to the bottom half of the blocks
+ if (h == uv_block_height / 2) {
+ if (blk_fw) {
+ blk_fw += 2;
+ } else {
+ weight = bottom_weight;
+ }
+ }
+
+ // Shift the rows up
+ u_sum_row_1_fst = u_sum_row_2_fst;
+ u_sum_row_2_fst = u_sum_row_3_fst;
+ u_sum_row_1_snd = u_sum_row_2_snd;
+ u_sum_row_2_snd = u_sum_row_3_snd;
+
+ v_sum_row_1_fst = v_sum_row_2_fst;
+ v_sum_row_2_fst = v_sum_row_3_fst;
+ v_sum_row_1_snd = v_sum_row_2_snd;
+ v_sum_row_2_snd = v_sum_row_3_snd;
+
+ // Add chroma values
+ u_sum_row_fst = _mm_add_epi32(u_sum_row_1_fst, u_sum_row_2_fst);
+ u_sum_row_snd = _mm_add_epi32(u_sum_row_1_snd, u_sum_row_2_snd);
+ highbd_get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3_fst, &u_sum_row_3_snd);
+ u_sum_row_fst = _mm_add_epi32(u_sum_row_fst, u_sum_row_3_fst);
+ u_sum_row_snd = _mm_add_epi32(u_sum_row_snd, u_sum_row_3_snd);
+
+ v_sum_row_fst = _mm_add_epi32(v_sum_row_1_fst, v_sum_row_2_fst);
+ v_sum_row_snd = _mm_add_epi32(v_sum_row_1_snd, v_sum_row_2_snd);
+ highbd_get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3_fst, &v_sum_row_3_snd);
+ v_sum_row_fst = _mm_add_epi32(v_sum_row_fst, v_sum_row_3_fst);
+ v_sum_row_snd = _mm_add_epi32(v_sum_row_snd, v_sum_row_3_snd);
+
+ // Add luma values
+ highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst,
+ &u_sum_row_snd, &v_sum_row_fst,
+ &v_sum_row_snd);
+
+ // Get modifier and store result
+ if (blk_fw) {
+ highbd_average_4(&u_sum_row_fst, &u_sum_row_fst, &mul_fst, strength,
+ rounding, blk_fw[0]);
+ highbd_average_4(&u_sum_row_snd, &u_sum_row_snd, &mul_snd, strength,
+ rounding, blk_fw[1]);
+
+ highbd_average_4(&v_sum_row_fst, &v_sum_row_fst, &mul_fst, strength,
+ rounding, blk_fw[0]);
+ highbd_average_4(&v_sum_row_snd, &v_sum_row_snd, &mul_snd, strength,
+ rounding, blk_fw[1]);
+
+ } else {
+ highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, &u_sum_row_fst,
+ &u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+ weight);
+ highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, &v_sum_row_fst,
+ &v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+ weight);
+ }
+
+ highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count,
+ u_accum);
+ highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count,
+ v_accum);
+
+ u_pre += uv_pre_stride;
+ u_dist += DIST_STRIDE;
+ v_pre += uv_pre_stride;
+ v_dist += DIST_STRIDE;
+ u_count += uv_pre_stride;
+ u_accum += uv_pre_stride;
+ v_count += uv_pre_stride;
+ v_accum += uv_pre_stride;
+
+ y_dist += DIST_STRIDE * (1 + ss_y);
+ }
+
+ // The last row
+ mul_fst = _mm_load_si128((const __m128i *)neighbors_fst[0]);
+ mul_snd = _mm_load_si128((const __m128i *)neighbors_snd[0]);
+
+ // Shift the rows up
+ u_sum_row_1_fst = u_sum_row_2_fst;
+ u_sum_row_2_fst = u_sum_row_3_fst;
+ u_sum_row_1_snd = u_sum_row_2_snd;
+ u_sum_row_2_snd = u_sum_row_3_snd;
+
+ v_sum_row_1_fst = v_sum_row_2_fst;
+ v_sum_row_2_fst = v_sum_row_3_fst;
+ v_sum_row_1_snd = v_sum_row_2_snd;
+ v_sum_row_2_snd = v_sum_row_3_snd;
+
+ // Add chroma values
+ u_sum_row_fst = _mm_add_epi32(u_sum_row_1_fst, u_sum_row_2_fst);
+ v_sum_row_fst = _mm_add_epi32(v_sum_row_1_fst, v_sum_row_2_fst);
+ u_sum_row_snd = _mm_add_epi32(u_sum_row_1_snd, u_sum_row_2_snd);
+ v_sum_row_snd = _mm_add_epi32(v_sum_row_1_snd, v_sum_row_2_snd);
+
+ // Add luma values
+ highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst,
+ &u_sum_row_snd, &v_sum_row_fst,
+ &v_sum_row_snd);
+
+ // Get modifier and store result
+ if (blk_fw) {
+ highbd_average_4(&u_sum_row_fst, &u_sum_row_fst, &mul_fst, strength,
+ rounding, blk_fw[0]);
+ highbd_average_4(&u_sum_row_snd, &u_sum_row_snd, &mul_snd, strength,
+ rounding, blk_fw[1]);
+
+ highbd_average_4(&v_sum_row_fst, &v_sum_row_fst, &mul_fst, strength,
+ rounding, blk_fw[0]);
+ highbd_average_4(&v_sum_row_snd, &v_sum_row_snd, &mul_snd, strength,
+ rounding, blk_fw[1]);
+
+ } else {
+ highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, &u_sum_row_fst,
+ &u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+ weight);
+ highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, &v_sum_row_fst,
+ &v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+ weight);
+ }
+
+ highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count,
+ u_accum);
+ highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count,
+ v_accum);
+}
+
+// Perform temporal filter for the chroma components.
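+// As with luma, the block is filtered in 8-pixel-wide chroma columns, with
+// LEFT/MIDDLE/RIGHT neighbor tables chosen according to the chroma
+// subsampling mode; a 16x16 block with horizontal subsampling (only 8 chroma
+// pixels per row) is handled as a single left/right column pair.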
+static void vp9_highbd_apply_temporal_filter_chroma(
+ const uint16_t *u_pre, const uint16_t *v_pre, int uv_pre_stride,
+ unsigned int block_width, unsigned int block_height, int ss_x, int ss_y,
+ int strength, const int *blk_fw, int use_whole_blk, uint32_t *u_accum,
+ uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
+ const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist) {
+ const unsigned int uv_width = block_width >> ss_x,
+ uv_height = block_height >> ss_y;
+
+ unsigned int blk_col = 0, uv_blk_col = 0;
+ const unsigned int uv_blk_col_step = 8, blk_col_step = 8 << ss_x;
+ const unsigned int uv_mid_width = uv_width >> 1,
+ uv_last_width = uv_width - uv_blk_col_step;
+ int top_weight = blk_fw[0],
+ bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2];
+ const uint32_t *const *neighbors_fst;
+ const uint32_t *const *neighbors_snd;
+
+ if (uv_width == 8) {
+ // Special Case: We are subsampling in x direction on a 16x16 block. Since
+ // we are operating on a row of 8 chroma pixels, we can't use the usual
+ // left-middle-right pattern.
+ assert(ss_x);
+
+ if (ss_y) {
+ neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS;
+ neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS;
+ } else {
+ neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS;
+ neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS;
+ }
+
+ if (use_whole_blk) {
+ vp9_highbd_apply_temporal_filter_chroma_8(
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+ uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+ u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+ y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+ neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL);
+ } else {
+ vp9_highbd_apply_temporal_filter_chroma_8(
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+ uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+ u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+ y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+ neighbors_fst, neighbors_snd, 0, 0, blk_fw);
+ }
+
+ return;
+ }
+
+ // Left
+ if (ss_x && ss_y) {
+ neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS;
+ neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+ } else if (ss_x || ss_y) {
+ neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS;
+ neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+ } else {
+ neighbors_fst = HIGHBD_CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS;
+ neighbors_snd = HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS;
+ }
+
+ vp9_highbd_apply_temporal_filter_chroma_8(
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+ uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+ u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+ y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst,
+ neighbors_snd, top_weight, bottom_weight, NULL);
+
+ blk_col += blk_col_step;
+ uv_blk_col += uv_blk_col_step;
+
+ // Middle First
+ if (ss_x && ss_y) {
+ neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+ } else if (ss_x || ss_y) {
+ neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+ } else {
+ neighbors_fst = HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS;
+ }
+
+ for (; uv_blk_col < uv_mid_width;
+ blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+ vp9_highbd_apply_temporal_filter_chroma_8(
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+ uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+ u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+ y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+ neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL);
+ }
+
+ if (!use_whole_blk) {
+ top_weight = blk_fw[1];
+ bottom_weight = blk_fw[3];
+ }
+
+ // Middle Second
+ for (; uv_blk_col < uv_last_width;
+ blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+ vp9_highbd_apply_temporal_filter_chroma_8(
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+ uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+ u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+ y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+ neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL);
+ }
+
+ // Right
+ if (ss_x && ss_y) {
+ neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS;
+ } else if (ss_x || ss_y) {
+ neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS;
+ } else {
+ neighbors_snd = HIGHBD_CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS;
+ }
+
+ vp9_highbd_apply_temporal_filter_chroma_8(
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+ uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+ u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+ y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst,
+ neighbors_snd, top_weight, bottom_weight, NULL);
+}
+
+void vp9_highbd_apply_temporal_filter_sse4_1(
+ const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,
+ int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,
+ int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre,
+ int uv_pre_stride, unsigned int block_width, unsigned int block_height,
+ int ss_x, int ss_y, int strength, const int *const blk_fw,
+ int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum,
+ uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count) {
+ const unsigned int chroma_height = block_height >> ss_y,
+ chroma_width = block_width >> ss_x;
+
+ DECLARE_ALIGNED(16, uint32_t, y_dist[BH * DIST_STRIDE]) = { 0 };
+ DECLARE_ALIGNED(16, uint32_t, u_dist[BH * DIST_STRIDE]) = { 0 };
+ DECLARE_ALIGNED(16, uint32_t, v_dist[BH * DIST_STRIDE]) = { 0 };
+
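+  // The *_dist pointers start at offset 1 so the 3-tap neighbor sums can read
+  // one entry to either side; the zero-initialized border entries contribute
+  // nothing to those sums.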
+ uint32_t *y_dist_ptr = y_dist + 1, *u_dist_ptr = u_dist + 1,
+ *v_dist_ptr = v_dist + 1;
+ const uint16_t *y_src_ptr = y_src, *u_src_ptr = u_src, *v_src_ptr = v_src;
+ const uint16_t *y_pre_ptr = y_pre, *u_pre_ptr = u_pre, *v_pre_ptr = v_pre;
+
+ // Loop variables
+ unsigned int row, blk_col;
+
+ assert(block_width <= BW && "block width too large");
+ assert(block_height <= BH && "block height too large");
+ assert(block_width % 16 == 0 && "block width must be multiple of 16");
+ assert(block_height % 2 == 0 && "block height must be even");
+ assert((ss_x == 0 || ss_x == 1) && (ss_y == 0 || ss_y == 1) &&
+ "invalid chroma subsampling");
+ assert(strength >= 4 && strength <= 14 &&
+ "invalid adjusted temporal filter strength");
+ assert(blk_fw[0] >= 0 && "filter weight must be positive");
+ assert(
+ (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) &&
+ "subblock filter weight must be positive");
+  assert(blk_fw[0] <= 2 && "subblock filter weight must be less than 2");
+ assert(
+ (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) &&
+ "subblock filter weight must be less than 2");
+
+ // Precompute the difference squared
+ for (row = 0; row < block_height; row++) {
+ for (blk_col = 0; blk_col < block_width; blk_col += 8) {
+ highbd_store_dist_8(y_src_ptr + blk_col, y_pre_ptr + blk_col,
+ y_dist_ptr + blk_col);
+ }
+ y_src_ptr += y_src_stride;
+ y_pre_ptr += y_pre_stride;
+ y_dist_ptr += DIST_STRIDE;
+ }
+
+ for (row = 0; row < chroma_height; row++) {
+ for (blk_col = 0; blk_col < chroma_width; blk_col += 8) {
+ highbd_store_dist_8(u_src_ptr + blk_col, u_pre_ptr + blk_col,
+ u_dist_ptr + blk_col);
+ highbd_store_dist_8(v_src_ptr + blk_col, v_pre_ptr + blk_col,
+ v_dist_ptr + blk_col);
+ }
+
+ u_src_ptr += uv_src_stride;
+ u_pre_ptr += uv_pre_stride;
+ u_dist_ptr += DIST_STRIDE;
+ v_src_ptr += uv_src_stride;
+ v_pre_ptr += uv_pre_stride;
+ v_dist_ptr += DIST_STRIDE;
+ }
+
+ y_dist_ptr = y_dist + 1;
+ u_dist_ptr = u_dist + 1;
+ v_dist_ptr = v_dist + 1;
+
+ vp9_highbd_apply_temporal_filter_luma(y_pre, y_pre_stride, block_width,
+ block_height, ss_x, ss_y, strength,
+ blk_fw, use_whole_blk, y_accum, y_count,
+ y_dist_ptr, u_dist_ptr, v_dist_ptr);
+
+ vp9_highbd_apply_temporal_filter_chroma(
+ u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y,
+ strength, blk_fw, use_whole_blk, u_accum, u_count, v_accum, v_count,
+ y_dist_ptr, u_dist_ptr, v_dist_ptr);
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/x86/temporal_filter_sse4.c b/media/libvpx/libvpx/vp9/encoder/x86/temporal_filter_sse4.c
new file mode 100644
index 0000000000..7571bfccac
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/x86/temporal_filter_sse4.c
@@ -0,0 +1,875 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_temporal_filter.h"
+#include "vp9/encoder/vp9_temporal_filter_constants.h"
+
+// Read in 8 pixels from a and b as 8-bit unsigned integers, compute the
+// difference squared, and store as unsigned 16-bit integer to dst.
+static INLINE void store_dist_8(const uint8_t *a, const uint8_t *b,
+ uint16_t *dst) {
+ const __m128i a_reg = _mm_loadl_epi64((const __m128i *)a);
+ const __m128i b_reg = _mm_loadl_epi64((const __m128i *)b);
+
+ const __m128i a_first = _mm_cvtepu8_epi16(a_reg);
+ const __m128i b_first = _mm_cvtepu8_epi16(b_reg);
+
+ __m128i dist_first;
+
+ dist_first = _mm_sub_epi16(a_first, b_first);
+ dist_first = _mm_mullo_epi16(dist_first, dist_first);
+
+ _mm_storeu_si128((__m128i *)dst, dist_first);
+}
+
+static INLINE void store_dist_16(const uint8_t *a, const uint8_t *b,
+ uint16_t *dst) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i a_reg = _mm_loadu_si128((const __m128i *)a);
+ const __m128i b_reg = _mm_loadu_si128((const __m128i *)b);
+
+ const __m128i a_first = _mm_cvtepu8_epi16(a_reg);
+ const __m128i a_second = _mm_unpackhi_epi8(a_reg, zero);
+ const __m128i b_first = _mm_cvtepu8_epi16(b_reg);
+ const __m128i b_second = _mm_unpackhi_epi8(b_reg, zero);
+
+ __m128i dist_first, dist_second;
+
+ dist_first = _mm_sub_epi16(a_first, b_first);
+ dist_second = _mm_sub_epi16(a_second, b_second);
+ dist_first = _mm_mullo_epi16(dist_first, dist_first);
+ dist_second = _mm_mullo_epi16(dist_second, dist_second);
+
+ _mm_storeu_si128((__m128i *)dst, dist_first);
+ _mm_storeu_si128((__m128i *)(dst + 8), dist_second);
+}
+
+static INLINE void read_dist_8(const uint16_t *dist, __m128i *dist_reg) {
+ *dist_reg = _mm_loadu_si128((const __m128i *)dist);
+}
+
+static INLINE void read_dist_16(const uint16_t *dist, __m128i *reg_first,
+ __m128i *reg_second) {
+ read_dist_8(dist, reg_first);
+ read_dist_8(dist + 8, reg_second);
+}
+
+// Average the value based on the number of values summed (9 for pixels away
+// from the border, 4 for pixels in corners, and 6 for other edge values).
+//
+// Add in the rounding factor and shift, clamp to 16, invert, and multiply by
+// the weight.
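+//
+// In scalar form, each 16-bit lane computes (illustrative only):
+//   m = (sum * mul_constant) >> 16;     // ~ sum * 3 / index
+//   m = (m + rounding) >> strength;
+//   result = (16 - min(m, 16)) * weight;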
+static INLINE __m128i average_8(__m128i sum, const __m128i *mul_constants,
+ const int strength, const int rounding,
+ const __m128i *weight) {
+ // _mm_srl_epi16 uses the lower 64 bit value for the shift.
+ const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength);
+ const __m128i rounding_u16 = _mm_set1_epi16(rounding);
+ const __m128i weight_u16 = *weight;
+ const __m128i sixteen = _mm_set1_epi16(16);
+
+ // modifier * 3 / index;
+ sum = _mm_mulhi_epu16(sum, *mul_constants);
+
+ sum = _mm_adds_epu16(sum, rounding_u16);
+ sum = _mm_srl_epi16(sum, strength_u128);
+
+ // The maximum input to this comparison is UINT16_MAX * NEIGHBOR_CONSTANT_4
+  // >> 16 (i.e. NEIGHBOR_CONSTANT_4 - 1), which is 49151 / 0xbfff / -16385 as
+  // a signed 16-bit value, so this needs the unsigned epu16 version, which was
+  // not available until SSE4.1.
+ sum = _mm_min_epu16(sum, sixteen);
+
+ sum = _mm_sub_epi16(sixteen, sum);
+
+ return _mm_mullo_epi16(sum, weight_u16);
+}
+
+// Add 'sum_u16' to 'count'. Multiply by 'pred' and add to 'accumulator.'
+static void accumulate_and_store_8(const __m128i sum_u16, const uint8_t *pred,
+ uint16_t *count, uint32_t *accumulator) {
+ const __m128i pred_u8 = _mm_loadl_epi64((const __m128i *)pred);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i count_u16 = _mm_loadu_si128((const __m128i *)count);
+ __m128i pred_u16 = _mm_cvtepu8_epi16(pred_u8);
+ __m128i pred_0_u32, pred_1_u32;
+ __m128i accum_0_u32, accum_1_u32;
+
+ count_u16 = _mm_adds_epu16(count_u16, sum_u16);
+ _mm_storeu_si128((__m128i *)count, count_u16);
+
+ pred_u16 = _mm_mullo_epi16(sum_u16, pred_u16);
+
+ pred_0_u32 = _mm_cvtepu16_epi32(pred_u16);
+ pred_1_u32 = _mm_unpackhi_epi16(pred_u16, zero);
+
+ accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator);
+ accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4));
+
+ accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32);
+ accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32);
+
+ _mm_storeu_si128((__m128i *)accumulator, accum_0_u32);
+ _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32);
+}
+
+static INLINE void accumulate_and_store_16(const __m128i sum_0_u16,
+ const __m128i sum_1_u16,
+ const uint8_t *pred, uint16_t *count,
+ uint32_t *accumulator) {
+ const __m128i pred_u8 = _mm_loadu_si128((const __m128i *)pred);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i count_0_u16 = _mm_loadu_si128((const __m128i *)count),
+ count_1_u16 = _mm_loadu_si128((const __m128i *)(count + 8));
+ __m128i pred_0_u16 = _mm_cvtepu8_epi16(pred_u8),
+ pred_1_u16 = _mm_unpackhi_epi8(pred_u8, zero);
+ __m128i pred_0_u32, pred_1_u32, pred_2_u32, pred_3_u32;
+ __m128i accum_0_u32, accum_1_u32, accum_2_u32, accum_3_u32;
+
+ count_0_u16 = _mm_adds_epu16(count_0_u16, sum_0_u16);
+ _mm_storeu_si128((__m128i *)count, count_0_u16);
+
+ count_1_u16 = _mm_adds_epu16(count_1_u16, sum_1_u16);
+ _mm_storeu_si128((__m128i *)(count + 8), count_1_u16);
+
+ pred_0_u16 = _mm_mullo_epi16(sum_0_u16, pred_0_u16);
+ pred_1_u16 = _mm_mullo_epi16(sum_1_u16, pred_1_u16);
+
+ pred_0_u32 = _mm_cvtepu16_epi32(pred_0_u16);
+ pred_1_u32 = _mm_unpackhi_epi16(pred_0_u16, zero);
+ pred_2_u32 = _mm_cvtepu16_epi32(pred_1_u16);
+ pred_3_u32 = _mm_unpackhi_epi16(pred_1_u16, zero);
+
+ accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator);
+ accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4));
+ accum_2_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 8));
+ accum_3_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 12));
+
+ accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32);
+ accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32);
+ accum_2_u32 = _mm_add_epi32(pred_2_u32, accum_2_u32);
+ accum_3_u32 = _mm_add_epi32(pred_3_u32, accum_3_u32);
+
+ _mm_storeu_si128((__m128i *)accumulator, accum_0_u32);
+ _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32);
+ _mm_storeu_si128((__m128i *)(accumulator + 8), accum_2_u32);
+ _mm_storeu_si128((__m128i *)(accumulator + 12), accum_3_u32);
+}
+
+// Read in 8 pixels from y_dist. For each index i, compute y_dist[i-1] +
+// y_dist[i] + y_dist[i+1] and store in sum as 16-bit unsigned int.
+static INLINE void get_sum_8(const uint16_t *y_dist, __m128i *sum) {
+ __m128i dist_reg, dist_left, dist_right;
+
+ dist_reg = _mm_loadu_si128((const __m128i *)y_dist);
+ dist_left = _mm_loadu_si128((const __m128i *)(y_dist - 1));
+ dist_right = _mm_loadu_si128((const __m128i *)(y_dist + 1));
+
+ *sum = _mm_adds_epu16(dist_reg, dist_left);
+ *sum = _mm_adds_epu16(*sum, dist_right);
+}
+
+// Read in 16 pixels from y_dist. For each index i, compute y_dist[i-1] +
+// y_dist[i] + y_dist[i+1]. Store the result for first 8 pixels in sum_first and
+// the rest in sum_second.
+static INLINE void get_sum_16(const uint16_t *y_dist, __m128i *sum_first,
+ __m128i *sum_second) {
+ get_sum_8(y_dist, sum_first);
+ get_sum_8(y_dist + 8, sum_second);
+}
+
+// Read in a row of chroma values that corresponds to a row of 16 luma values.
+static INLINE void read_chroma_dist_row_16(int ss_x, const uint16_t *u_dist,
+ const uint16_t *v_dist,
+ __m128i *u_first, __m128i *u_second,
+ __m128i *v_first,
+ __m128i *v_second) {
+ if (!ss_x) {
+ // If there is no chroma subsampling in the horizontal direction, then we
+ // need to load 16 entries from chroma.
+ read_dist_16(u_dist, u_first, u_second);
+ read_dist_16(v_dist, v_first, v_second);
+ } else { // ss_x == 1
+ // Otherwise, we only need to load 8 entries
+ __m128i u_reg, v_reg;
+
+ read_dist_8(u_dist, &u_reg);
+
+ *u_first = _mm_unpacklo_epi16(u_reg, u_reg);
+ *u_second = _mm_unpackhi_epi16(u_reg, u_reg);
+
+ read_dist_8(v_dist, &v_reg);
+
+ *v_first = _mm_unpacklo_epi16(v_reg, v_reg);
+ *v_second = _mm_unpackhi_epi16(v_reg, v_reg);
+ }
+}
+
+// Horizontally add adjacent pairs of unsigned 16-bit ints in src and store
+// them as signed 32-bit ints in dst.
+static INLINE void hadd_epu16(__m128i *src, __m128i *dst) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i shift_right = _mm_srli_si128(*src, 2);
+
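+  // The blend mask 170 (0xAA) selects the zero vector in the odd 16-bit
+  // lanes, so 'even' keeps src[0], src[2], ... and 'odd' keeps src[1],
+  // src[3], ... as 32-bit values; adding them sums adjacent pairs.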
+ const __m128i odd = _mm_blend_epi16(shift_right, zero, 170);
+ const __m128i even = _mm_blend_epi16(*src, zero, 170);
+
+ *dst = _mm_add_epi32(even, odd);
+}
+
+// Add a row of luma distortion to 8 corresponding chroma mods.
+static INLINE void add_luma_dist_to_8_chroma_mod(const uint16_t *y_dist,
+ int ss_x, int ss_y,
+ __m128i *u_mod,
+ __m128i *v_mod) {
+ __m128i y_reg;
+ if (!ss_x) {
+ read_dist_8(y_dist, &y_reg);
+ if (ss_y == 1) {
+ __m128i y_tmp;
+ read_dist_8(y_dist + DIST_STRIDE, &y_tmp);
+
+ y_reg = _mm_adds_epu16(y_reg, y_tmp);
+ }
+ } else {
+ __m128i y_first, y_second;
+ read_dist_16(y_dist, &y_first, &y_second);
+ if (ss_y == 1) {
+ __m128i y_tmp_0, y_tmp_1;
+ read_dist_16(y_dist + DIST_STRIDE, &y_tmp_0, &y_tmp_1);
+
+ y_first = _mm_adds_epu16(y_first, y_tmp_0);
+ y_second = _mm_adds_epu16(y_second, y_tmp_1);
+ }
+
+ hadd_epu16(&y_first, &y_first);
+ hadd_epu16(&y_second, &y_second);
+
+ y_reg = _mm_packus_epi32(y_first, y_second);
+ }
+
+ *u_mod = _mm_adds_epu16(*u_mod, y_reg);
+ *v_mod = _mm_adds_epu16(*v_mod, y_reg);
+}
+
+// Apply temporal filter to the luma components. This performs temporal
+// filtering on a luma block of 16 X block_height. Use blk_fw as an array of
+// size 4 for the weights for each of the 4 subblocks if blk_fw is not NULL,
+// else use top_weight for top half, and bottom weight for bottom half.
+static void vp9_apply_temporal_filter_luma_16(
+ const uint8_t *y_pre, int y_pre_stride, unsigned int block_width,
+ unsigned int block_height, int ss_x, int ss_y, int strength,
+ int use_whole_blk, uint32_t *y_accum, uint16_t *y_count,
+ const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist,
+ const int16_t *const *neighbors_first,
+ const int16_t *const *neighbors_second, int top_weight, int bottom_weight,
+ const int *blk_fw) {
+ const int rounding = (1 << strength) >> 1;
+ __m128i weight_first, weight_second;
+
+ __m128i mul_first, mul_second;
+
+ __m128i sum_row_1_first, sum_row_1_second;
+ __m128i sum_row_2_first, sum_row_2_second;
+ __m128i sum_row_3_first, sum_row_3_second;
+
+ __m128i u_first, u_second;
+ __m128i v_first, v_second;
+
+ __m128i sum_row_first;
+ __m128i sum_row_second;
+
+ // Loop variable
+ unsigned int h;
+
+ assert(strength >= 0);
+ assert(strength <= 6);
+
+ assert(block_width == 16);
+ (void)block_width;
+
+ // Initialize the weights
+ if (blk_fw) {
+ weight_first = _mm_set1_epi16(blk_fw[0]);
+ weight_second = _mm_set1_epi16(blk_fw[1]);
+ } else {
+ weight_first = _mm_set1_epi16(top_weight);
+ weight_second = weight_first;
+ }
+
+ // First row
+ mul_first = _mm_load_si128((const __m128i *)neighbors_first[0]);
+ mul_second = _mm_load_si128((const __m128i *)neighbors_second[0]);
+
+ // Add luma values
+ get_sum_16(y_dist, &sum_row_2_first, &sum_row_2_second);
+ get_sum_16(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second);
+
+ sum_row_first = _mm_adds_epu16(sum_row_2_first, sum_row_3_first);
+ sum_row_second = _mm_adds_epu16(sum_row_2_second, sum_row_3_second);
+
+ // Add chroma values
+ read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first,
+ &v_second);
+
+ sum_row_first = _mm_adds_epu16(sum_row_first, u_first);
+ sum_row_second = _mm_adds_epu16(sum_row_second, u_second);
+
+ sum_row_first = _mm_adds_epu16(sum_row_first, v_first);
+ sum_row_second = _mm_adds_epu16(sum_row_second, v_second);
+
+ // Get modifier and store result
+ sum_row_first =
+ average_8(sum_row_first, &mul_first, strength, rounding, &weight_first);
+ sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding,
+ &weight_second);
+ accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count,
+ y_accum);
+
+ y_pre += y_pre_stride;
+ y_count += y_pre_stride;
+ y_accum += y_pre_stride;
+ y_dist += DIST_STRIDE;
+
+ u_dist += DIST_STRIDE;
+ v_dist += DIST_STRIDE;
+
+ // Then all the rows except the last one
+ mul_first = _mm_load_si128((const __m128i *)neighbors_first[1]);
+ mul_second = _mm_load_si128((const __m128i *)neighbors_second[1]);
+
+ for (h = 1; h < block_height - 1; ++h) {
+ // Move the weight to the bottom half
+ if (!use_whole_blk && h == block_height / 2) {
+ if (blk_fw) {
+ weight_first = _mm_set1_epi16(blk_fw[2]);
+ weight_second = _mm_set1_epi16(blk_fw[3]);
+ } else {
+ weight_first = _mm_set1_epi16(bottom_weight);
+ weight_second = weight_first;
+ }
+ }
+ // Shift the rows up
+ sum_row_1_first = sum_row_2_first;
+ sum_row_1_second = sum_row_2_second;
+ sum_row_2_first = sum_row_3_first;
+ sum_row_2_second = sum_row_3_second;
+
+ // Add luma values to the modifier
+ sum_row_first = _mm_adds_epu16(sum_row_1_first, sum_row_2_first);
+ sum_row_second = _mm_adds_epu16(sum_row_1_second, sum_row_2_second);
+
+ get_sum_16(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second);
+
+ sum_row_first = _mm_adds_epu16(sum_row_first, sum_row_3_first);
+ sum_row_second = _mm_adds_epu16(sum_row_second, sum_row_3_second);
+
+ // Add chroma values to the modifier
+ if (ss_y == 0 || h % 2 == 0) {
+ // Only calculate the new chroma distortion if we are at a pixel that
+ // corresponds to a new chroma row
+ read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second,
+ &v_first, &v_second);
+
+ u_dist += DIST_STRIDE;
+ v_dist += DIST_STRIDE;
+ }
+
+ sum_row_first = _mm_adds_epu16(sum_row_first, u_first);
+ sum_row_second = _mm_adds_epu16(sum_row_second, u_second);
+ sum_row_first = _mm_adds_epu16(sum_row_first, v_first);
+ sum_row_second = _mm_adds_epu16(sum_row_second, v_second);
+
+ // Get modifier and store result
+ sum_row_first =
+ average_8(sum_row_first, &mul_first, strength, rounding, &weight_first);
+ sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding,
+ &weight_second);
+ accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count,
+ y_accum);
+
+ y_pre += y_pre_stride;
+ y_count += y_pre_stride;
+ y_accum += y_pre_stride;
+ y_dist += DIST_STRIDE;
+ }
+
+ // The last row
+ mul_first = _mm_load_si128((const __m128i *)neighbors_first[0]);
+ mul_second = _mm_load_si128((const __m128i *)neighbors_second[0]);
+
+ // Shift the rows up
+ sum_row_1_first = sum_row_2_first;
+ sum_row_1_second = sum_row_2_second;
+ sum_row_2_first = sum_row_3_first;
+ sum_row_2_second = sum_row_3_second;
+
+ // Add luma values to the modifier
+ sum_row_first = _mm_adds_epu16(sum_row_1_first, sum_row_2_first);
+ sum_row_second = _mm_adds_epu16(sum_row_1_second, sum_row_2_second);
+
+ // Add chroma values to the modifier
+ if (ss_y == 0) {
+ // Only calculate the new chroma distortion if we are at a pixel that
+ // corresponds to a new chroma row
+ read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first,
+ &v_second);
+ }
+
+ sum_row_first = _mm_adds_epu16(sum_row_first, u_first);
+ sum_row_second = _mm_adds_epu16(sum_row_second, u_second);
+ sum_row_first = _mm_adds_epu16(sum_row_first, v_first);
+ sum_row_second = _mm_adds_epu16(sum_row_second, v_second);
+
+ // Get modifier and store result
+ sum_row_first =
+ average_8(sum_row_first, &mul_first, strength, rounding, &weight_first);
+ sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding,
+ &weight_second);
+ accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count,
+ y_accum);
+}
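+
+// Rough scalar sketch (illustrative, not code from this file) of the
+// per-pixel modifier that the average_8()/accumulate_and_store_16() pair
+// above is understood to apply. The exact scale factors live in the NEIGHBORS
+// tables defined earlier in the file; "mul" below stands in for the
+// (3 / number-of-contributing-distortions) scaling those tables encode as a
+// Q16 multiplier.
+static INLINE void modifier_scalar_sketch(uint32_t sum_dist, int mul,
+                                          int strength, int rounding,
+                                          int weight, uint8_t pre_pixel,
+                                          uint16_t *count, uint32_t *accum) {
+  int mod = (int)(((uint64_t)sum_dist * (uint32_t)mul) >> 16);  // ~sum * 3 / n
+  mod += rounding;         // rounding == (1 << strength) >> 1
+  mod >>= strength;
+  if (mod > 16) mod = 16;  // clamp, then invert: low distortion => high weight
+  mod = 16 - mod;
+  mod *= weight;           // whole-block or per-subblock filter weight
+  *count += (uint16_t)mod;
+  *accum += (uint32_t)mod * pre_pixel;
+}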
+
+// Perform temporal filter for the luma component.
+static void vp9_apply_temporal_filter_luma(
+ const uint8_t *y_pre, int y_pre_stride, unsigned int block_width,
+ unsigned int block_height, int ss_x, int ss_y, int strength,
+ const int *blk_fw, int use_whole_blk, uint32_t *y_accum, uint16_t *y_count,
+ const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist) {
+ unsigned int blk_col = 0, uv_blk_col = 0;
+ const unsigned int blk_col_step = 16, uv_blk_col_step = 16 >> ss_x;
+ const unsigned int mid_width = block_width >> 1,
+ last_width = block_width - blk_col_step;
+ int top_weight = blk_fw[0],
+ bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2];
+ const int16_t *const *neighbors_first;
+ const int16_t *const *neighbors_second;
+
+ if (block_width == 16) {
+ // Special Case: The block width is 16 and we are operating on a row of 16
+ // luma pixels. In this case, we can't use the usual left-middle-right
+ // pattern. We also don't support splitting now.
+ neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS;
+ neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS;
+ if (use_whole_blk) {
+ vp9_apply_temporal_filter_luma_16(
+ y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength,
+ use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
+ u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
+ neighbors_second, top_weight, bottom_weight, NULL);
+ } else {
+ vp9_apply_temporal_filter_luma_16(
+ y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength,
+ use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
+ u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
+ neighbors_second, 0, 0, blk_fw);
+ }
+
+ return;
+ }
+
+ // Left
+ neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS;
+ neighbors_second = LUMA_MIDDLE_COLUMN_NEIGHBORS;
+ vp9_apply_temporal_filter_luma_16(
+ y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength,
+ use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
+ u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
+ neighbors_second, top_weight, bottom_weight, NULL);
+
+ blk_col += blk_col_step;
+ uv_blk_col += uv_blk_col_step;
+
+ // Middle First
+ neighbors_first = LUMA_MIDDLE_COLUMN_NEIGHBORS;
+ for (; blk_col < mid_width;
+ blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+ vp9_apply_temporal_filter_luma_16(
+ y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength,
+ use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
+ u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
+ neighbors_second, top_weight, bottom_weight, NULL);
+ }
+
+ if (!use_whole_blk) {
+ top_weight = blk_fw[1];
+ bottom_weight = blk_fw[3];
+ }
+
+ // Middle Second
+ for (; blk_col < last_width;
+ blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+ vp9_apply_temporal_filter_luma_16(
+ y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength,
+ use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
+ u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
+ neighbors_second, top_weight, bottom_weight, NULL);
+ }
+
+ // Right
+ neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS;
+ vp9_apply_temporal_filter_luma_16(
+ y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength,
+ use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
+ u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
+ neighbors_second, top_weight, bottom_weight, NULL);
+}
+
+// Apply the temporal filter to the chroma components. This performs temporal
+// filtering on a chroma block of 8 x uv_height. If blk_fw is not NULL, use it
+// as an array of 4 weights, one for each of the 4 subblocks; otherwise use
+// top_weight for the top half and bottom_weight for the bottom half.
+static void vp9_apply_temporal_filter_chroma_8(
+ const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride,
+ unsigned int uv_block_height, int ss_x, int ss_y, int strength,
+ uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
+ const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist,
+ const int16_t *const *neighbors, int top_weight, int bottom_weight,
+ const int *blk_fw) {
+ const int rounding = (1 << strength) >> 1;
+
+ __m128i weight;
+
+ __m128i mul;
+
+ __m128i u_sum_row_1, u_sum_row_2, u_sum_row_3;
+ __m128i v_sum_row_1, v_sum_row_2, v_sum_row_3;
+
+ __m128i u_sum_row, v_sum_row;
+
+ // Loop variable
+ unsigned int h;
+
+ // Initialize weight
+ if (blk_fw) {
+ weight = _mm_setr_epi16(blk_fw[0], blk_fw[0], blk_fw[0], blk_fw[0],
+ blk_fw[1], blk_fw[1], blk_fw[1], blk_fw[1]);
+ } else {
+ weight = _mm_set1_epi16(top_weight);
+ }
+
+ // First row
+ mul = _mm_load_si128((const __m128i *)neighbors[0]);
+
+ // Add chroma values
+ get_sum_8(u_dist, &u_sum_row_2);
+ get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3);
+
+ u_sum_row = _mm_adds_epu16(u_sum_row_2, u_sum_row_3);
+
+ get_sum_8(v_dist, &v_sum_row_2);
+ get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3);
+
+ v_sum_row = _mm_adds_epu16(v_sum_row_2, v_sum_row_3);
+
+ // Add luma values
+ add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row);
+
+ // Get modifier and store result
+ u_sum_row = average_8(u_sum_row, &mul, strength, rounding, &weight);
+ v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight);
+
+ accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum);
+ accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum);
+
+ u_pre += uv_pre_stride;
+ u_dist += DIST_STRIDE;
+ v_pre += uv_pre_stride;
+ v_dist += DIST_STRIDE;
+ u_count += uv_pre_stride;
+ u_accum += uv_pre_stride;
+ v_count += uv_pre_stride;
+ v_accum += uv_pre_stride;
+
+ y_dist += DIST_STRIDE * (1 + ss_y);
+
+ // Then all the rows except the last one
+ mul = _mm_load_si128((const __m128i *)neighbors[1]);
+
+ for (h = 1; h < uv_block_height - 1; ++h) {
+ // Move the weight to the bottom half of the block
+ if (h == uv_block_height / 2) {
+ if (blk_fw) {
+ weight = _mm_setr_epi16(blk_fw[2], blk_fw[2], blk_fw[2], blk_fw[2],
+ blk_fw[3], blk_fw[3], blk_fw[3], blk_fw[3]);
+ } else {
+ weight = _mm_set1_epi16(bottom_weight);
+ }
+ }
+
+ // Shift the rows up
+ u_sum_row_1 = u_sum_row_2;
+ u_sum_row_2 = u_sum_row_3;
+
+ v_sum_row_1 = v_sum_row_2;
+ v_sum_row_2 = v_sum_row_3;
+
+ // Add chroma values
+ u_sum_row = _mm_adds_epu16(u_sum_row_1, u_sum_row_2);
+ get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3);
+ u_sum_row = _mm_adds_epu16(u_sum_row, u_sum_row_3);
+
+ v_sum_row = _mm_adds_epu16(v_sum_row_1, v_sum_row_2);
+ get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3);
+ v_sum_row = _mm_adds_epu16(v_sum_row, v_sum_row_3);
+
+ // Add luma values
+ add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row);
+
+ // Get modifier and store result
+ u_sum_row = average_8(u_sum_row, &mul, strength, rounding, &weight);
+ v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight);
+
+ accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum);
+ accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum);
+
+ u_pre += uv_pre_stride;
+ u_dist += DIST_STRIDE;
+ v_pre += uv_pre_stride;
+ v_dist += DIST_STRIDE;
+ u_count += uv_pre_stride;
+ u_accum += uv_pre_stride;
+ v_count += uv_pre_stride;
+ v_accum += uv_pre_stride;
+
+ y_dist += DIST_STRIDE * (1 + ss_y);
+ }
+
+ // The last row
+ mul = _mm_load_si128((const __m128i *)neighbors[0]);
+
+ // Shift the rows up
+ u_sum_row_1 = u_sum_row_2;
+ u_sum_row_2 = u_sum_row_3;
+
+ v_sum_row_1 = v_sum_row_2;
+ v_sum_row_2 = v_sum_row_3;
+
+ // Add chroma values
+ u_sum_row = _mm_adds_epu16(u_sum_row_1, u_sum_row_2);
+ v_sum_row = _mm_adds_epu16(v_sum_row_1, v_sum_row_2);
+
+ // Add luma values
+ add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row);
+
+ // Get modifier and store result
+ u_sum_row = average_8(u_sum_row, &mul, strength, rounding, &weight);
+ v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight);
+
+ accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum);
+ accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum);
+}
+
+// Perform temporal filter for the chroma components.
+static void vp9_apply_temporal_filter_chroma(
+ const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride,
+ unsigned int block_width, unsigned int block_height, int ss_x, int ss_y,
+ int strength, const int *blk_fw, int use_whole_blk, uint32_t *u_accum,
+ uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
+ const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist) {
+ const unsigned int uv_width = block_width >> ss_x,
+ uv_height = block_height >> ss_y;
+
+ unsigned int blk_col = 0, uv_blk_col = 0;
+ const unsigned int uv_blk_col_step = 8, blk_col_step = 8 << ss_x;
+ const unsigned int uv_mid_width = uv_width >> 1,
+ uv_last_width = uv_width - uv_blk_col_step;
+ int top_weight = blk_fw[0],
+ bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2];
+ const int16_t *const *neighbors;
+
+ if (uv_width == 8) {
+ // Special Case: We are subsampling in the x direction on a 16x16 block. Since
+ // we are operating on a row of 8 chroma pixels, we can't use the usual
+ // left-middle-right pattern.
+ assert(ss_x);
+
+ if (ss_y) {
+ neighbors = CHROMA_DOUBLE_SS_SINGLE_COLUMN_NEIGHBORS;
+ } else {
+ neighbors = CHROMA_SINGLE_SS_SINGLE_COLUMN_NEIGHBORS;
+ }
+
+ if (use_whole_blk) {
+ vp9_apply_temporal_filter_chroma_8(
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height,
+ ss_x, ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+ v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+ u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight,
+ bottom_weight, NULL);
+ } else {
+ vp9_apply_temporal_filter_chroma_8(
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height,
+ ss_x, ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+ v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+ u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, 0, 0, blk_fw);
+ }
+
+ return;
+ }
+
+ // Left
+ if (ss_x && ss_y) {
+ neighbors = CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS;
+ } else if (ss_x || ss_y) {
+ neighbors = CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS;
+ } else {
+ neighbors = CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS;
+ }
+
+ vp9_apply_temporal_filter_chroma_8(
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x,
+ ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+ v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+ u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight,
+ bottom_weight, NULL);
+
+ blk_col += blk_col_step;
+ uv_blk_col += uv_blk_col_step;
+
+ // Middle First
+ if (ss_x && ss_y) {
+ neighbors = CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+ } else if (ss_x || ss_y) {
+ neighbors = CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+ } else {
+ neighbors = CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS;
+ }
+
+ for (; uv_blk_col < uv_mid_width;
+ blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+ vp9_apply_temporal_filter_chroma_8(
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x,
+ ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+ v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+ u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight,
+ bottom_weight, NULL);
+ }
+
+ if (!use_whole_blk) {
+ top_weight = blk_fw[1];
+ bottom_weight = blk_fw[3];
+ }
+
+ // Middle Second
+ for (; uv_blk_col < uv_last_width;
+ blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+ vp9_apply_temporal_filter_chroma_8(
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x,
+ ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+ v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+ u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight,
+ bottom_weight, NULL);
+ }
+
+ // Right
+ if (ss_x && ss_y) {
+ neighbors = CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS;
+ } else if (ss_x || ss_y) {
+ neighbors = CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS;
+ } else {
+ neighbors = CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS;
+ }
+
+ vp9_apply_temporal_filter_chroma_8(
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x,
+ ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+ v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+ u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight,
+ bottom_weight, NULL);
+}
+
+void vp9_apply_temporal_filter_sse4_1(
+ const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
+ int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
+ int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
+ int uv_pre_stride, unsigned int block_width, unsigned int block_height,
+ int ss_x, int ss_y, int strength, const int *const blk_fw,
+ int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum,
+ uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count) {
+ const unsigned int chroma_height = block_height >> ss_y,
+ chroma_width = block_width >> ss_x;
+
+ DECLARE_ALIGNED(16, uint16_t, y_dist[BH * DIST_STRIDE]) = { 0 };
+ DECLARE_ALIGNED(16, uint16_t, u_dist[BH * DIST_STRIDE]) = { 0 };
+ DECLARE_ALIGNED(16, uint16_t, v_dist[BH * DIST_STRIDE]) = { 0 };
+ const int *blk_fw_ptr = blk_fw;
+
+ uint16_t *y_dist_ptr = y_dist + 1, *u_dist_ptr = u_dist + 1,
+ *v_dist_ptr = v_dist + 1;
+ const uint8_t *y_src_ptr = y_src, *u_src_ptr = u_src, *v_src_ptr = v_src;
+ const uint8_t *y_pre_ptr = y_pre, *u_pre_ptr = u_pre, *v_pre_ptr = v_pre;
+
+ // Loop variables
+ unsigned int row, blk_col;
+
+ assert(block_width <= BW && "block width too large");
+ assert(block_height <= BH && "block height too large");
+ assert(block_width % 16 == 0 && "block width must be multiple of 16");
+ assert(block_height % 2 == 0 && "block height must be even");
+ assert((ss_x == 0 || ss_x == 1) && (ss_y == 0 || ss_y == 1) &&
+ "invalid chroma subsampling");
+ assert(strength >= 0 && strength <= 6 && "invalid temporal filter strength");
+ assert(blk_fw[0] >= 0 && "filter weight must be non-negative");
+ assert(
+ (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) &&
+ "subblock filter weight must be non-negative");
+ assert(blk_fw[0] <= 2 && "filter weight must not exceed 2");
+ assert(
+ (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) &&
+ "subblock filter weight must not exceed 2");
+
+ // Precompute the difference squared
+ for (row = 0; row < block_height; row++) {
+ for (blk_col = 0; blk_col < block_width; blk_col += 16) {
+ store_dist_16(y_src_ptr + blk_col, y_pre_ptr + blk_col,
+ y_dist_ptr + blk_col);
+ }
+ y_src_ptr += y_src_stride;
+ y_pre_ptr += y_pre_stride;
+ y_dist_ptr += DIST_STRIDE;
+ }
+
+ for (row = 0; row < chroma_height; row++) {
+ for (blk_col = 0; blk_col < chroma_width; blk_col += 8) {
+ store_dist_8(u_src_ptr + blk_col, u_pre_ptr + blk_col,
+ u_dist_ptr + blk_col);
+ store_dist_8(v_src_ptr + blk_col, v_pre_ptr + blk_col,
+ v_dist_ptr + blk_col);
+ }
+
+ u_src_ptr += uv_src_stride;
+ u_pre_ptr += uv_pre_stride;
+ u_dist_ptr += DIST_STRIDE;
+ v_src_ptr += uv_src_stride;
+ v_pre_ptr += uv_pre_stride;
+ v_dist_ptr += DIST_STRIDE;
+ }
+
+ y_dist_ptr = y_dist + 1;
+ u_dist_ptr = u_dist + 1;
+ v_dist_ptr = v_dist + 1;
+
+ vp9_apply_temporal_filter_luma(y_pre, y_pre_stride, block_width, block_height,
+ ss_x, ss_y, strength, blk_fw_ptr,
+ use_whole_blk, y_accum, y_count, y_dist_ptr,
+ u_dist_ptr, v_dist_ptr);
+
+ vp9_apply_temporal_filter_chroma(
+ u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y,
+ strength, blk_fw_ptr, use_whole_blk, u_accum, u_count, v_accum, v_count,
+ y_dist_ptr, u_dist_ptr, v_dist_ptr);
+}
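+
+// Illustrative scalar sketch (not part of the file) of the distortion
+// precompute above: store_dist_8()/store_dist_16() write the squared pixel
+// difference (src - pre)^2 into rows of DIST_STRIDE entries. The dist buffers
+// are zero-initialized and the *_dist_ptr variables start at column 1, so the
+// dist[i - 1] and dist[i + 1] reads in get_sum_8() stay inside the padded row.
+static INLINE void store_dist_scalar_sketch(const uint8_t *src,
+                                            const uint8_t *pre, uint16_t *dist,
+                                            int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    const int d = (int)src[i] - (int)pre[i];
+    dist[i] = (uint16_t)(d * d);  // at most 255 * 255, fits in 16 bits
+  }
+}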
diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c
new file mode 100644
index 0000000000..e9943447fd
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c
@@ -0,0 +1,1537 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <emmintrin.h> // SSE2
+
+#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
+#include "vpx_dsp/x86/fwd_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+#include "vpx_ports/mem.h"
+
+static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
+ int stride) {
+ const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
+ const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
+ __m128i mask;
+
+ in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+ in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+ in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
+ in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
+
+ in[0] = _mm_slli_epi16(in[0], 4);
+ in[1] = _mm_slli_epi16(in[1], 4);
+ in[2] = _mm_slli_epi16(in[2], 4);
+ in[3] = _mm_slli_epi16(in[3], 4);
+
+ mask = _mm_cmpeq_epi16(in[0], k__nonzero_bias_a);
+ in[0] = _mm_add_epi16(in[0], mask);
+ in[0] = _mm_add_epi16(in[0], k__nonzero_bias_b);
+}
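+
+// Illustrative note (not part of the file): the mask/bias pair above touches
+// only element (0, 0) of the block, adding 1 to the up-shifted first sample
+// when it is nonzero, mirroring the scalar reference transform's conditional
+// +1 on the first input after the << 4. A scalar sketch of that step:
+static INLINE int16_t dc_bias_sketch(int16_t shifted_dc) {
+  return (int16_t)(shifted_dc + (shifted_dc != 0));
+}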
+
+static INLINE void write_buffer_4x4(tran_low_t *output, __m128i *res) {
+ const __m128i kOne = _mm_set1_epi16(1);
+ __m128i in01 = _mm_unpacklo_epi64(res[0], res[1]);
+ __m128i in23 = _mm_unpacklo_epi64(res[2], res[3]);
+ __m128i out01 = _mm_add_epi16(in01, kOne);
+ __m128i out23 = _mm_add_epi16(in23, kOne);
+ out01 = _mm_srai_epi16(out01, 2);
+ out23 = _mm_srai_epi16(out23, 2);
+ store_output(&out01, (output + 0 * 8));
+ store_output(&out23, (output + 1 * 8));
+}
+
+static INLINE void transpose_4x4(__m128i *res) {
+ // Combine and transpose
+ // 00 01 02 03 20 21 22 23
+ // 10 11 12 13 30 31 32 33
+ const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
+ const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
+
+ // 00 10 01 11 02 12 03 13
+ // 20 30 21 31 22 32 23 33
+ res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1);
+
+ // 00 10 20 30 01 11 21 31
+ // 02 12 22 32 03 13 23 33
+ // only use the first 4 16-bit integers
+ res[1] = _mm_unpackhi_epi64(res[0], res[0]);
+ res[3] = _mm_unpackhi_epi64(res[2], res[2]);
+}
+
+static void fdct4_sse2(__m128i *in) {
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+ __m128i u[4], v[4];
+ u[0] = _mm_unpacklo_epi16(in[0], in[1]);
+ u[1] = _mm_unpacklo_epi16(in[3], in[2]);
+
+ v[0] = _mm_add_epi16(u[0], u[1]);
+ v[1] = _mm_sub_epi16(u[0], u[1]);
+
+ u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); // 0
+ u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16); // 2
+ u[2] = _mm_madd_epi16(v[1], k__cospi_p08_p24); // 1
+ u[3] = _mm_madd_epi16(v[1], k__cospi_p24_m08); // 3
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+
+ in[0] = _mm_packs_epi32(u[0], u[1]);
+ in[1] = _mm_packs_epi32(u[2], u[3]);
+ transpose_4x4(in);
+}
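+
+// Illustrative sketch (not from the library) of the idiom used above: after
+// interleaving two 16-bit vectors with _mm_unpacklo_epi16(a, b), each 32-bit
+// lane of _mm_madd_epi16 against a pair_set_epi16(c0, c1) constant evaluates
+// a * c0 + b * c1, i.e. one butterfly multiply-accumulate, which is then
+// rounded and shifted as below (the vector code additionally saturates when
+// packing back to 16 bits with _mm_packs_epi32).
+static INLINE int16_t butterfly_round_shift_sketch(int16_t a, int16_t b,
+                                                   int c0, int c1) {
+  const int32_t sum = a * c0 + b * c1;  // what _mm_madd_epi16 yields per lane
+  return (int16_t)((sum + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
+}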
+
+static void fadst4_sse2(__m128i *in) {
+ const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9);
+ const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9);
+ const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9);
+ const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9);
+ const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
+ const __m128i kZero = _mm_setzero_si128();
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ __m128i u[8], v[8];
+ __m128i in7 = _mm_add_epi16(in[0], in[1]);
+
+ u[0] = _mm_unpacklo_epi16(in[0], in[1]);
+ u[1] = _mm_unpacklo_epi16(in[2], in[3]);
+ u[2] = _mm_unpacklo_epi16(in7, kZero);
+ u[3] = _mm_unpacklo_epi16(in[2], kZero);
+ u[4] = _mm_unpacklo_epi16(in[3], kZero);
+
+ v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02); // s0 + s2
+ v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04); // s4 + s5
+ v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x1
+ v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01); // s1 - s3
+ v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02); // -s4 + s6
+ v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s4
+ v[6] = _mm_madd_epi16(u[4], k__sinpi_p03_p03);
+
+ u[0] = _mm_add_epi32(v[0], v[1]);
+ u[1] = _mm_sub_epi32(v[2], v[6]);
+ u[2] = _mm_add_epi32(v[3], v[4]);
+ u[3] = _mm_sub_epi32(u[2], u[0]);
+ u[4] = _mm_slli_epi32(v[5], 2);
+ u[5] = _mm_sub_epi32(u[4], v[5]);
+ u[6] = _mm_add_epi32(u[3], u[5]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+
+ in[0] = _mm_packs_epi32(u[0], u[2]);
+ in[1] = _mm_packs_epi32(u[1], u[3]);
+ transpose_4x4(in);
+}
+
+void vp9_fht4x4_sse2(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ __m128i in[4];
+
+ switch (tx_type) {
+ case DCT_DCT: vpx_fdct4x4_sse2(input, output, stride); break;
+ case ADST_DCT:
+ load_buffer_4x4(input, in, stride);
+ fadst4_sse2(in);
+ fdct4_sse2(in);
+ write_buffer_4x4(output, in);
+ break;
+ case DCT_ADST:
+ load_buffer_4x4(input, in, stride);
+ fdct4_sse2(in);
+ fadst4_sse2(in);
+ write_buffer_4x4(output, in);
+ break;
+ default:
+ assert(tx_type == ADST_ADST);
+ load_buffer_4x4(input, in, stride);
+ fadst4_sse2(in);
+ fadst4_sse2(in);
+ write_buffer_4x4(output, in);
+ break;
+ }
+}
+
+// load 8x8 array
+static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,
+ int stride) {
+ in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));
+
+ in[0] = _mm_slli_epi16(in[0], 2);
+ in[1] = _mm_slli_epi16(in[1], 2);
+ in[2] = _mm_slli_epi16(in[2], 2);
+ in[3] = _mm_slli_epi16(in[3], 2);
+ in[4] = _mm_slli_epi16(in[4], 2);
+ in[5] = _mm_slli_epi16(in[5], 2);
+ in[6] = _mm_slli_epi16(in[6], 2);
+ in[7] = _mm_slli_epi16(in[7], 2);
+}
+
+// right shift and rounding
+static INLINE void right_shift_8x8(__m128i *res, const int bit) {
+ __m128i sign0 = _mm_srai_epi16(res[0], 15);
+ __m128i sign1 = _mm_srai_epi16(res[1], 15);
+ __m128i sign2 = _mm_srai_epi16(res[2], 15);
+ __m128i sign3 = _mm_srai_epi16(res[3], 15);
+ __m128i sign4 = _mm_srai_epi16(res[4], 15);
+ __m128i sign5 = _mm_srai_epi16(res[5], 15);
+ __m128i sign6 = _mm_srai_epi16(res[6], 15);
+ __m128i sign7 = _mm_srai_epi16(res[7], 15);
+
+ if (bit == 2) {
+ const __m128i const_rounding = _mm_set1_epi16(1);
+ res[0] = _mm_add_epi16(res[0], const_rounding);
+ res[1] = _mm_add_epi16(res[1], const_rounding);
+ res[2] = _mm_add_epi16(res[2], const_rounding);
+ res[3] = _mm_add_epi16(res[3], const_rounding);
+ res[4] = _mm_add_epi16(res[4], const_rounding);
+ res[5] = _mm_add_epi16(res[5], const_rounding);
+ res[6] = _mm_add_epi16(res[6], const_rounding);
+ res[7] = _mm_add_epi16(res[7], const_rounding);
+ }
+
+ res[0] = _mm_sub_epi16(res[0], sign0);
+ res[1] = _mm_sub_epi16(res[1], sign1);
+ res[2] = _mm_sub_epi16(res[2], sign2);
+ res[3] = _mm_sub_epi16(res[3], sign3);
+ res[4] = _mm_sub_epi16(res[4], sign4);
+ res[5] = _mm_sub_epi16(res[5], sign5);
+ res[6] = _mm_sub_epi16(res[6], sign6);
+ res[7] = _mm_sub_epi16(res[7], sign7);
+
+ if (bit == 1) {
+ res[0] = _mm_srai_epi16(res[0], 1);
+ res[1] = _mm_srai_epi16(res[1], 1);
+ res[2] = _mm_srai_epi16(res[2], 1);
+ res[3] = _mm_srai_epi16(res[3], 1);
+ res[4] = _mm_srai_epi16(res[4], 1);
+ res[5] = _mm_srai_epi16(res[5], 1);
+ res[6] = _mm_srai_epi16(res[6], 1);
+ res[7] = _mm_srai_epi16(res[7], 1);
+ } else {
+ res[0] = _mm_srai_epi16(res[0], 2);
+ res[1] = _mm_srai_epi16(res[1], 2);
+ res[2] = _mm_srai_epi16(res[2], 2);
+ res[3] = _mm_srai_epi16(res[3], 2);
+ res[4] = _mm_srai_epi16(res[4], 2);
+ res[5] = _mm_srai_epi16(res[5], 2);
+ res[6] = _mm_srai_epi16(res[6], 2);
+ res[7] = _mm_srai_epi16(res[7], 2);
+ }
+}
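+
+// Scalar sketch (illustrative only) of right_shift_8x8(): subtracting the
+// arithmetic sign mask adds 1 to negative values, so the result is
+// (x + (x < 0)) >> 1 for bit == 1 and (x + 1 + (x < 0)) >> 2 for bit == 2,
+// i.e. the rounding used by the scalar forward hybrid transforms.
+static INLINE int16_t right_shift_scalar_sketch(int16_t x, int bit) {
+  return (int16_t)((x + (bit == 2) + (x < 0)) >> bit);
+}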
+
+// write 8x8 array
+static INLINE void write_buffer_8x8(tran_low_t *output, __m128i *res,
+ int stride) {
+ store_output(&res[0], (output + 0 * stride));
+ store_output(&res[1], (output + 1 * stride));
+ store_output(&res[2], (output + 2 * stride));
+ store_output(&res[3], (output + 3 * stride));
+ store_output(&res[4], (output + 4 * stride));
+ store_output(&res[5], (output + 5 * stride));
+ store_output(&res[6], (output + 6 * stride));
+ store_output(&res[7], (output + 7 * stride));
+}
+
+static void fdct8_sse2(__m128i *in) {
+ // constants
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+ __m128i s0, s1, s2, s3, s4, s5, s6, s7;
+
+ // stage 1
+ s0 = _mm_add_epi16(in[0], in[7]);
+ s1 = _mm_add_epi16(in[1], in[6]);
+ s2 = _mm_add_epi16(in[2], in[5]);
+ s3 = _mm_add_epi16(in[3], in[4]);
+ s4 = _mm_sub_epi16(in[3], in[4]);
+ s5 = _mm_sub_epi16(in[2], in[5]);
+ s6 = _mm_sub_epi16(in[1], in[6]);
+ s7 = _mm_sub_epi16(in[0], in[7]);
+
+ u0 = _mm_add_epi16(s0, s3);
+ u1 = _mm_add_epi16(s1, s2);
+ u2 = _mm_sub_epi16(s1, s2);
+ u3 = _mm_sub_epi16(s0, s3);
+ // interleave and perform butterfly multiplication/addition
+ v0 = _mm_unpacklo_epi16(u0, u1);
+ v1 = _mm_unpackhi_epi16(u0, u1);
+ v2 = _mm_unpacklo_epi16(u2, u3);
+ v3 = _mm_unpackhi_epi16(u2, u3);
+
+ u0 = _mm_madd_epi16(v0, k__cospi_p16_p16);
+ u1 = _mm_madd_epi16(v1, k__cospi_p16_p16);
+ u2 = _mm_madd_epi16(v0, k__cospi_p16_m16);
+ u3 = _mm_madd_epi16(v1, k__cospi_p16_m16);
+ u4 = _mm_madd_epi16(v2, k__cospi_p24_p08);
+ u5 = _mm_madd_epi16(v3, k__cospi_p24_p08);
+ u6 = _mm_madd_epi16(v2, k__cospi_m08_p24);
+ u7 = _mm_madd_epi16(v3, k__cospi_m08_p24);
+
+ // shift and rounding
+ v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+ v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+
+ u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+
+ in[0] = _mm_packs_epi32(u0, u1);
+ in[2] = _mm_packs_epi32(u4, u5);
+ in[4] = _mm_packs_epi32(u2, u3);
+ in[6] = _mm_packs_epi32(u6, u7);
+
+ // stage 2
+ // interleave and perform butterfly multiplication/addition
+ u0 = _mm_unpacklo_epi16(s6, s5);
+ u1 = _mm_unpackhi_epi16(s6, s5);
+ v0 = _mm_madd_epi16(u0, k__cospi_p16_m16);
+ v1 = _mm_madd_epi16(u1, k__cospi_p16_m16);
+ v2 = _mm_madd_epi16(u0, k__cospi_p16_p16);
+ v3 = _mm_madd_epi16(u1, k__cospi_p16_p16);
+
+ // shift and rounding
+ u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
+ u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
+ u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
+ u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
+
+ v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
+ v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
+ v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
+ v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
+
+ u0 = _mm_packs_epi32(v0, v1);
+ u1 = _mm_packs_epi32(v2, v3);
+
+ // stage 3
+ s0 = _mm_add_epi16(s4, u0);
+ s1 = _mm_sub_epi16(s4, u0);
+ s2 = _mm_sub_epi16(s7, u1);
+ s3 = _mm_add_epi16(s7, u1);
+
+ // stage 4
+ u0 = _mm_unpacklo_epi16(s0, s3);
+ u1 = _mm_unpackhi_epi16(s0, s3);
+ u2 = _mm_unpacklo_epi16(s1, s2);
+ u3 = _mm_unpackhi_epi16(s1, s2);
+
+ v0 = _mm_madd_epi16(u0, k__cospi_p28_p04);
+ v1 = _mm_madd_epi16(u1, k__cospi_p28_p04);
+ v2 = _mm_madd_epi16(u2, k__cospi_p12_p20);
+ v3 = _mm_madd_epi16(u3, k__cospi_p12_p20);
+ v4 = _mm_madd_epi16(u2, k__cospi_m20_p12);
+ v5 = _mm_madd_epi16(u3, k__cospi_m20_p12);
+ v6 = _mm_madd_epi16(u0, k__cospi_m04_p28);
+ v7 = _mm_madd_epi16(u1, k__cospi_m04_p28);
+
+ // shift and rounding
+ u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
+ u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
+ u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
+ u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
+ u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
+ u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
+ u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
+ u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
+
+ v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
+ v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
+ v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
+ v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
+ v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
+ v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
+ v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
+ v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
+
+ in[1] = _mm_packs_epi32(v0, v1);
+ in[3] = _mm_packs_epi32(v4, v5);
+ in[5] = _mm_packs_epi32(v2, v3);
+ in[7] = _mm_packs_epi32(v6, v7);
+
+ // transpose
+ transpose_16bit_8x8(in, in);
+}
+
+static void fadst8_sse2(__m128i *in) {
+ // Constants
+ const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
+ const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+ const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
+ const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+ const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
+ const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+ const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
+ const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+ const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__const_0 = _mm_setzero_si128();
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
+ __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
+ __m128i s0, s1, s2, s3, s4, s5, s6, s7;
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+
+ // properly aligned for butterfly input
+ in0 = in[7];
+ in1 = in[0];
+ in2 = in[5];
+ in3 = in[2];
+ in4 = in[3];
+ in5 = in[4];
+ in6 = in[1];
+ in7 = in[6];
+
+ // column transformation
+ // stage 1
+ // interleave and multiply/add into 32-bit integer
+ s0 = _mm_unpacklo_epi16(in0, in1);
+ s1 = _mm_unpackhi_epi16(in0, in1);
+ s2 = _mm_unpacklo_epi16(in2, in3);
+ s3 = _mm_unpackhi_epi16(in2, in3);
+ s4 = _mm_unpacklo_epi16(in4, in5);
+ s5 = _mm_unpackhi_epi16(in4, in5);
+ s6 = _mm_unpacklo_epi16(in6, in7);
+ s7 = _mm_unpackhi_epi16(in6, in7);
+
+ u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
+ u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
+ u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
+ u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
+ u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
+ u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
+ u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
+ u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
+ u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
+ u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
+ u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
+ u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
+ u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
+ u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
+ u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
+ u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
+
+ // addition
+ w0 = _mm_add_epi32(u0, u8);
+ w1 = _mm_add_epi32(u1, u9);
+ w2 = _mm_add_epi32(u2, u10);
+ w3 = _mm_add_epi32(u3, u11);
+ w4 = _mm_add_epi32(u4, u12);
+ w5 = _mm_add_epi32(u5, u13);
+ w6 = _mm_add_epi32(u6, u14);
+ w7 = _mm_add_epi32(u7, u15);
+ w8 = _mm_sub_epi32(u0, u8);
+ w9 = _mm_sub_epi32(u1, u9);
+ w10 = _mm_sub_epi32(u2, u10);
+ w11 = _mm_sub_epi32(u3, u11);
+ w12 = _mm_sub_epi32(u4, u12);
+ w13 = _mm_sub_epi32(u5, u13);
+ w14 = _mm_sub_epi32(u6, u14);
+ w15 = _mm_sub_epi32(u7, u15);
+
+ // shift and rounding
+ v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
+ v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
+ v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
+ v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
+ v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
+ v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
+ v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
+ v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
+ v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
+ v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
+ v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
+ v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
+ v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
+ v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
+ v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
+ v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
+
+ u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+ u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
+ u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
+ u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
+ u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
+ u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
+ u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
+ u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
+ u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
+
+ // back to 16-bit and pack 8 integers into __m128i
+ in[0] = _mm_packs_epi32(u0, u1);
+ in[1] = _mm_packs_epi32(u2, u3);
+ in[2] = _mm_packs_epi32(u4, u5);
+ in[3] = _mm_packs_epi32(u6, u7);
+ in[4] = _mm_packs_epi32(u8, u9);
+ in[5] = _mm_packs_epi32(u10, u11);
+ in[6] = _mm_packs_epi32(u12, u13);
+ in[7] = _mm_packs_epi32(u14, u15);
+
+ // stage 2
+ s0 = _mm_add_epi16(in[0], in[2]);
+ s1 = _mm_add_epi16(in[1], in[3]);
+ s2 = _mm_sub_epi16(in[0], in[2]);
+ s3 = _mm_sub_epi16(in[1], in[3]);
+ u0 = _mm_unpacklo_epi16(in[4], in[5]);
+ u1 = _mm_unpackhi_epi16(in[4], in[5]);
+ u2 = _mm_unpacklo_epi16(in[6], in[7]);
+ u3 = _mm_unpackhi_epi16(in[6], in[7]);
+
+ v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
+ v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
+ v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
+ v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
+ v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
+ v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
+ v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
+ v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
+
+ w0 = _mm_add_epi32(v0, v4);
+ w1 = _mm_add_epi32(v1, v5);
+ w2 = _mm_add_epi32(v2, v6);
+ w3 = _mm_add_epi32(v3, v7);
+ w4 = _mm_sub_epi32(v0, v4);
+ w5 = _mm_sub_epi32(v1, v5);
+ w6 = _mm_sub_epi32(v2, v6);
+ w7 = _mm_sub_epi32(v3, v7);
+
+ v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
+ v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
+ v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
+ v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
+ v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
+ v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
+ v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
+ v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
+
+ u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+
+ // back to 16-bit integers
+ s4 = _mm_packs_epi32(u0, u1);
+ s5 = _mm_packs_epi32(u2, u3);
+ s6 = _mm_packs_epi32(u4, u5);
+ s7 = _mm_packs_epi32(u6, u7);
+
+ // stage 3
+ u0 = _mm_unpacklo_epi16(s2, s3);
+ u1 = _mm_unpackhi_epi16(s2, s3);
+ u2 = _mm_unpacklo_epi16(s6, s7);
+ u3 = _mm_unpackhi_epi16(s6, s7);
+
+ v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
+ v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
+ v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
+ v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
+ v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
+ v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
+ v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
+ v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
+
+ u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
+ u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
+ u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
+ u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
+ u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
+ u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
+ u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
+ u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
+
+ v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
+ v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
+ v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
+ v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
+ v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
+ v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
+ v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
+ v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
+
+ s2 = _mm_packs_epi32(v0, v1);
+ s3 = _mm_packs_epi32(v2, v3);
+ s6 = _mm_packs_epi32(v4, v5);
+ s7 = _mm_packs_epi32(v6, v7);
+
+ // FIXME(jingning): do subtract using bit inversion?
+ in[0] = s0;
+ in[1] = _mm_sub_epi16(k__const_0, s4);
+ in[2] = s6;
+ in[3] = _mm_sub_epi16(k__const_0, s2);
+ in[4] = s3;
+ in[5] = _mm_sub_epi16(k__const_0, s7);
+ in[6] = s5;
+ in[7] = _mm_sub_epi16(k__const_0, s1);
+
+ // transpose
+ transpose_16bit_8x8(in, in);
+}
+
+void vp9_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ __m128i in[8];
+
+ switch (tx_type) {
+ case DCT_DCT: vpx_fdct8x8_sse2(input, output, stride); break;
+ case ADST_DCT:
+ load_buffer_8x8(input, in, stride);
+ fadst8_sse2(in);
+ fdct8_sse2(in);
+ right_shift_8x8(in, 1);
+ write_buffer_8x8(output, in, 8);
+ break;
+ case DCT_ADST:
+ load_buffer_8x8(input, in, stride);
+ fdct8_sse2(in);
+ fadst8_sse2(in);
+ right_shift_8x8(in, 1);
+ write_buffer_8x8(output, in, 8);
+ break;
+ default:
+ assert(tx_type == ADST_ADST);
+ load_buffer_8x8(input, in, stride);
+ fadst8_sse2(in);
+ fadst8_sse2(in);
+ right_shift_8x8(in, 1);
+ write_buffer_8x8(output, in, 8);
+ break;
+ }
+}
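+
+// Minimal usage sketch (illustrative, not part of the file): the forward
+// hybrid transforms take a row-major 16-bit residual block and write
+// tran_low_t coefficients; stride is the input pitch in samples and the
+// caller chooses the tx_type.
+static INLINE void fht8x8_usage_sketch(const int16_t *residual, int stride,
+                                       tran_low_t *coeffs /* 64 entries */) {
+  vp9_fht8x8_sse2(residual, coeffs, stride, ADST_ADST);  // ADST in both dims
+}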
+
+static INLINE void load_buffer_16x16(const int16_t *input, __m128i *in0,
+ __m128i *in1, int stride) {
+ // load first 8 columns
+ load_buffer_8x8(input, in0, stride);
+ load_buffer_8x8(input + 8 * stride, in0 + 8, stride);
+
+ input += 8;
+ // load second 8 columns
+ load_buffer_8x8(input, in1, stride);
+ load_buffer_8x8(input + 8 * stride, in1 + 8, stride);
+}
+
+static INLINE void write_buffer_16x16(tran_low_t *output, __m128i *in0,
+ __m128i *in1, int stride) {
+ // write first 8 columns
+ write_buffer_8x8(output, in0, stride);
+ write_buffer_8x8(output + 8 * stride, in0 + 8, stride);
+ // write second 8 columns
+ output += 8;
+ write_buffer_8x8(output, in1, stride);
+ write_buffer_8x8(output + 8 * stride, in1 + 8, stride);
+}
+
+static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) {
+ // perform rounding operations
+ right_shift_8x8(res0, 2);
+ right_shift_8x8(res0 + 8, 2);
+ right_shift_8x8(res1, 2);
+ right_shift_8x8(res1 + 8, 2);
+}
+
+static void fdct16_8col(__m128i *in) {
+ // perform a 16-point 1-D DCT on each of 8 columns
+ __m128i i[8], s[8], p[8], t[8], u[16], v[16];
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
+ const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
+ const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
+ const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
+ const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
+ const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
+ const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
+ const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+ // stage 1
+ i[0] = _mm_add_epi16(in[0], in[15]);
+ i[1] = _mm_add_epi16(in[1], in[14]);
+ i[2] = _mm_add_epi16(in[2], in[13]);
+ i[3] = _mm_add_epi16(in[3], in[12]);
+ i[4] = _mm_add_epi16(in[4], in[11]);
+ i[5] = _mm_add_epi16(in[5], in[10]);
+ i[6] = _mm_add_epi16(in[6], in[9]);
+ i[7] = _mm_add_epi16(in[7], in[8]);
+
+ s[0] = _mm_sub_epi16(in[7], in[8]);
+ s[1] = _mm_sub_epi16(in[6], in[9]);
+ s[2] = _mm_sub_epi16(in[5], in[10]);
+ s[3] = _mm_sub_epi16(in[4], in[11]);
+ s[4] = _mm_sub_epi16(in[3], in[12]);
+ s[5] = _mm_sub_epi16(in[2], in[13]);
+ s[6] = _mm_sub_epi16(in[1], in[14]);
+ s[7] = _mm_sub_epi16(in[0], in[15]);
+
+ p[0] = _mm_add_epi16(i[0], i[7]);
+ p[1] = _mm_add_epi16(i[1], i[6]);
+ p[2] = _mm_add_epi16(i[2], i[5]);
+ p[3] = _mm_add_epi16(i[3], i[4]);
+ p[4] = _mm_sub_epi16(i[3], i[4]);
+ p[5] = _mm_sub_epi16(i[2], i[5]);
+ p[6] = _mm_sub_epi16(i[1], i[6]);
+ p[7] = _mm_sub_epi16(i[0], i[7]);
+
+ u[0] = _mm_add_epi16(p[0], p[3]);
+ u[1] = _mm_add_epi16(p[1], p[2]);
+ u[2] = _mm_sub_epi16(p[1], p[2]);
+ u[3] = _mm_sub_epi16(p[0], p[3]);
+
+ v[0] = _mm_unpacklo_epi16(u[0], u[1]);
+ v[1] = _mm_unpackhi_epi16(u[0], u[1]);
+ v[2] = _mm_unpacklo_epi16(u[2], u[3]);
+ v[3] = _mm_unpackhi_epi16(u[2], u[3]);
+
+ u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16);
+ u[1] = _mm_madd_epi16(v[1], k__cospi_p16_p16);
+ u[2] = _mm_madd_epi16(v[0], k__cospi_p16_m16);
+ u[3] = _mm_madd_epi16(v[1], k__cospi_p16_m16);
+ u[4] = _mm_madd_epi16(v[2], k__cospi_p24_p08);
+ u[5] = _mm_madd_epi16(v[3], k__cospi_p24_p08);
+ u[6] = _mm_madd_epi16(v[2], k__cospi_m08_p24);
+ u[7] = _mm_madd_epi16(v[3], k__cospi_m08_p24);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+
+ in[0] = _mm_packs_epi32(u[0], u[1]);
+ in[4] = _mm_packs_epi32(u[4], u[5]);
+ in[8] = _mm_packs_epi32(u[2], u[3]);
+ in[12] = _mm_packs_epi32(u[6], u[7]);
+
+ u[0] = _mm_unpacklo_epi16(p[5], p[6]);
+ u[1] = _mm_unpackhi_epi16(p[5], p[6]);
+ v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+
+ v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+
+ u[0] = _mm_packs_epi32(v[0], v[1]);
+ u[1] = _mm_packs_epi32(v[2], v[3]);
+
+ t[0] = _mm_add_epi16(p[4], u[0]);
+ t[1] = _mm_sub_epi16(p[4], u[0]);
+ t[2] = _mm_sub_epi16(p[7], u[1]);
+ t[3] = _mm_add_epi16(p[7], u[1]);
+
+ u[0] = _mm_unpacklo_epi16(t[0], t[3]);
+ u[1] = _mm_unpackhi_epi16(t[0], t[3]);
+ u[2] = _mm_unpacklo_epi16(t[1], t[2]);
+ u[3] = _mm_unpackhi_epi16(t[1], t[2]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p28_p04);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p28_p04);
+ v[2] = _mm_madd_epi16(u[2], k__cospi_p12_p20);
+ v[3] = _mm_madd_epi16(u[3], k__cospi_p12_p20);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_m20_p12);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_m20_p12);
+ v[6] = _mm_madd_epi16(u[0], k__cospi_m04_p28);
+ v[7] = _mm_madd_epi16(u[1], k__cospi_m04_p28);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+ v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+
+ in[2] = _mm_packs_epi32(v[0], v[1]);
+ in[6] = _mm_packs_epi32(v[4], v[5]);
+ in[10] = _mm_packs_epi32(v[2], v[3]);
+ in[14] = _mm_packs_epi32(v[6], v[7]);
+
+ // stage 2
+ u[0] = _mm_unpacklo_epi16(s[2], s[5]);
+ u[1] = _mm_unpackhi_epi16(s[2], s[5]);
+ u[2] = _mm_unpacklo_epi16(s[3], s[4]);
+ u[3] = _mm_unpackhi_epi16(s[3], s[4]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
+ v[2] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
+ v[3] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
+ v[6] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+ v[7] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+ v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+
+ t[2] = _mm_packs_epi32(v[0], v[1]);
+ t[3] = _mm_packs_epi32(v[2], v[3]);
+ t[4] = _mm_packs_epi32(v[4], v[5]);
+ t[5] = _mm_packs_epi32(v[6], v[7]);
+
+ // stage 3
+ p[0] = _mm_add_epi16(s[0], t[3]);
+ p[1] = _mm_add_epi16(s[1], t[2]);
+ p[2] = _mm_sub_epi16(s[1], t[2]);
+ p[3] = _mm_sub_epi16(s[0], t[3]);
+ p[4] = _mm_sub_epi16(s[7], t[4]);
+ p[5] = _mm_sub_epi16(s[6], t[5]);
+ p[6] = _mm_add_epi16(s[6], t[5]);
+ p[7] = _mm_add_epi16(s[7], t[4]);
+
+ // stage 4
+ u[0] = _mm_unpacklo_epi16(p[1], p[6]);
+ u[1] = _mm_unpackhi_epi16(p[1], p[6]);
+ u[2] = _mm_unpacklo_epi16(p[2], p[5]);
+ u[3] = _mm_unpackhi_epi16(p[2], p[5]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24);
+ v[2] = _mm_madd_epi16(u[2], k__cospi_p24_p08);
+ v[3] = _mm_madd_epi16(u[3], k__cospi_p24_p08);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_p08_m24);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_p08_m24);
+ v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08);
+ v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+ v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+
+ t[1] = _mm_packs_epi32(v[0], v[1]);
+ t[2] = _mm_packs_epi32(v[2], v[3]);
+ t[5] = _mm_packs_epi32(v[4], v[5]);
+ t[6] = _mm_packs_epi32(v[6], v[7]);
+
+ // stage 5
+ s[0] = _mm_add_epi16(p[0], t[1]);
+ s[1] = _mm_sub_epi16(p[0], t[1]);
+ s[2] = _mm_add_epi16(p[3], t[2]);
+ s[3] = _mm_sub_epi16(p[3], t[2]);
+ s[4] = _mm_sub_epi16(p[4], t[5]);
+ s[5] = _mm_add_epi16(p[4], t[5]);
+ s[6] = _mm_sub_epi16(p[7], t[6]);
+ s[7] = _mm_add_epi16(p[7], t[6]);
+
+ // stage 6
+ u[0] = _mm_unpacklo_epi16(s[0], s[7]);
+ u[1] = _mm_unpackhi_epi16(s[0], s[7]);
+ u[2] = _mm_unpacklo_epi16(s[1], s[6]);
+ u[3] = _mm_unpackhi_epi16(s[1], s[6]);
+ u[4] = _mm_unpacklo_epi16(s[2], s[5]);
+ u[5] = _mm_unpackhi_epi16(s[2], s[5]);
+ u[6] = _mm_unpacklo_epi16(s[3], s[4]);
+ u[7] = _mm_unpackhi_epi16(s[3], s[4]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p30_p02);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p30_p02);
+ v[2] = _mm_madd_epi16(u[2], k__cospi_p14_p18);
+ v[3] = _mm_madd_epi16(u[3], k__cospi_p14_p18);
+ v[4] = _mm_madd_epi16(u[4], k__cospi_p22_p10);
+ v[5] = _mm_madd_epi16(u[5], k__cospi_p22_p10);
+ v[6] = _mm_madd_epi16(u[6], k__cospi_p06_p26);
+ v[7] = _mm_madd_epi16(u[7], k__cospi_p06_p26);
+ v[8] = _mm_madd_epi16(u[6], k__cospi_m26_p06);
+ v[9] = _mm_madd_epi16(u[7], k__cospi_m26_p06);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_m10_p22);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_m10_p22);
+ v[12] = _mm_madd_epi16(u[2], k__cospi_m18_p14);
+ v[13] = _mm_madd_epi16(u[3], k__cospi_m18_p14);
+ v[14] = _mm_madd_epi16(u[0], k__cospi_m02_p30);
+ v[15] = _mm_madd_epi16(u[1], k__cospi_m02_p30);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+ u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
+ u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
+ u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
+ u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
+ u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
+ u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
+ u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
+ u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
+
+ v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+ v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+ v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+ v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+ v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+ v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+ v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+ v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+ v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+ in[1] = _mm_packs_epi32(v[0], v[1]);
+ in[9] = _mm_packs_epi32(v[2], v[3]);
+ in[5] = _mm_packs_epi32(v[4], v[5]);
+ in[13] = _mm_packs_epi32(v[6], v[7]);
+ in[3] = _mm_packs_epi32(v[8], v[9]);
+ in[11] = _mm_packs_epi32(v[10], v[11]);
+ in[7] = _mm_packs_epi32(v[12], v[13]);
+ in[15] = _mm_packs_epi32(v[14], v[15]);
+}
+
+static void fadst16_8col(__m128i *in) {
+ // perform 16x16 1-D ADST for 8 columns
+ __m128i s[16], x[16], u[32], v[32];
+ const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
+ const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
+ const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
+ const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
+ const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
+ const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
+ const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
+ const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
+ const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
+ const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
+ const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
+ const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
+ const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
+ const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
+ const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
+ const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
+ const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
+ const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+ const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i kZero = _mm_setzero_si128();
+
+ u[0] = _mm_unpacklo_epi16(in[15], in[0]);
+ u[1] = _mm_unpackhi_epi16(in[15], in[0]);
+ u[2] = _mm_unpacklo_epi16(in[13], in[2]);
+ u[3] = _mm_unpackhi_epi16(in[13], in[2]);
+ u[4] = _mm_unpacklo_epi16(in[11], in[4]);
+ u[5] = _mm_unpackhi_epi16(in[11], in[4]);
+ u[6] = _mm_unpacklo_epi16(in[9], in[6]);
+ u[7] = _mm_unpackhi_epi16(in[9], in[6]);
+ u[8] = _mm_unpacklo_epi16(in[7], in[8]);
+ u[9] = _mm_unpackhi_epi16(in[7], in[8]);
+ u[10] = _mm_unpacklo_epi16(in[5], in[10]);
+ u[11] = _mm_unpackhi_epi16(in[5], in[10]);
+ u[12] = _mm_unpacklo_epi16(in[3], in[12]);
+ u[13] = _mm_unpackhi_epi16(in[3], in[12]);
+ u[14] = _mm_unpacklo_epi16(in[1], in[14]);
+ u[15] = _mm_unpackhi_epi16(in[1], in[14]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
+ v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
+ v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
+ v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
+ v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
+ v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
+ v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
+ v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
+ v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
+ v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
+ v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
+ v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
+ v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
+ v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
+ v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
+ v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
+ v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
+ v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
+ v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
+ v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
+ v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
+ v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
+ v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
+
+ u[0] = _mm_add_epi32(v[0], v[16]);
+ u[1] = _mm_add_epi32(v[1], v[17]);
+ u[2] = _mm_add_epi32(v[2], v[18]);
+ u[3] = _mm_add_epi32(v[3], v[19]);
+ u[4] = _mm_add_epi32(v[4], v[20]);
+ u[5] = _mm_add_epi32(v[5], v[21]);
+ u[6] = _mm_add_epi32(v[6], v[22]);
+ u[7] = _mm_add_epi32(v[7], v[23]);
+ u[8] = _mm_add_epi32(v[8], v[24]);
+ u[9] = _mm_add_epi32(v[9], v[25]);
+ u[10] = _mm_add_epi32(v[10], v[26]);
+ u[11] = _mm_add_epi32(v[11], v[27]);
+ u[12] = _mm_add_epi32(v[12], v[28]);
+ u[13] = _mm_add_epi32(v[13], v[29]);
+ u[14] = _mm_add_epi32(v[14], v[30]);
+ u[15] = _mm_add_epi32(v[15], v[31]);
+ u[16] = _mm_sub_epi32(v[0], v[16]);
+ u[17] = _mm_sub_epi32(v[1], v[17]);
+ u[18] = _mm_sub_epi32(v[2], v[18]);
+ u[19] = _mm_sub_epi32(v[3], v[19]);
+ u[20] = _mm_sub_epi32(v[4], v[20]);
+ u[21] = _mm_sub_epi32(v[5], v[21]);
+ u[22] = _mm_sub_epi32(v[6], v[22]);
+ u[23] = _mm_sub_epi32(v[7], v[23]);
+ u[24] = _mm_sub_epi32(v[8], v[24]);
+ u[25] = _mm_sub_epi32(v[9], v[25]);
+ u[26] = _mm_sub_epi32(v[10], v[26]);
+ u[27] = _mm_sub_epi32(v[11], v[27]);
+ u[28] = _mm_sub_epi32(v[12], v[28]);
+ u[29] = _mm_sub_epi32(v[13], v[29]);
+ u[30] = _mm_sub_epi32(v[14], v[30]);
+ u[31] = _mm_sub_epi32(v[15], v[31]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+ v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
+ v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
+ v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
+ v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
+ v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
+ v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
+ v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
+ v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
+ v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
+ v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
+ v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
+ v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
+ v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
+ v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
+ v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
+ v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+ u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+ u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+ u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+ u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+ u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+ u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+ u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+ u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+ u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
+ u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
+ u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
+ u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
+ u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
+ u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
+ u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
+ u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
+ u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
+ u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
+ u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
+ u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
+ u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
+ u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
+ u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
+ u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
+
+ s[0] = _mm_packs_epi32(u[0], u[1]);
+ s[1] = _mm_packs_epi32(u[2], u[3]);
+ s[2] = _mm_packs_epi32(u[4], u[5]);
+ s[3] = _mm_packs_epi32(u[6], u[7]);
+ s[4] = _mm_packs_epi32(u[8], u[9]);
+ s[5] = _mm_packs_epi32(u[10], u[11]);
+ s[6] = _mm_packs_epi32(u[12], u[13]);
+ s[7] = _mm_packs_epi32(u[14], u[15]);
+ s[8] = _mm_packs_epi32(u[16], u[17]);
+ s[9] = _mm_packs_epi32(u[18], u[19]);
+ s[10] = _mm_packs_epi32(u[20], u[21]);
+ s[11] = _mm_packs_epi32(u[22], u[23]);
+ s[12] = _mm_packs_epi32(u[24], u[25]);
+ s[13] = _mm_packs_epi32(u[26], u[27]);
+ s[14] = _mm_packs_epi32(u[28], u[29]);
+ s[15] = _mm_packs_epi32(u[30], u[31]);
+
+ // stage 2
+ u[0] = _mm_unpacklo_epi16(s[8], s[9]);
+ u[1] = _mm_unpackhi_epi16(s[8], s[9]);
+ u[2] = _mm_unpacklo_epi16(s[10], s[11]);
+ u[3] = _mm_unpackhi_epi16(s[10], s[11]);
+ u[4] = _mm_unpacklo_epi16(s[12], s[13]);
+ u[5] = _mm_unpackhi_epi16(s[12], s[13]);
+ u[6] = _mm_unpacklo_epi16(s[14], s[15]);
+ u[7] = _mm_unpackhi_epi16(s[14], s[15]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
+ v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
+ v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
+ v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
+ v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
+ v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
+ v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
+
+ u[0] = _mm_add_epi32(v[0], v[8]);
+ u[1] = _mm_add_epi32(v[1], v[9]);
+ u[2] = _mm_add_epi32(v[2], v[10]);
+ u[3] = _mm_add_epi32(v[3], v[11]);
+ u[4] = _mm_add_epi32(v[4], v[12]);
+ u[5] = _mm_add_epi32(v[5], v[13]);
+ u[6] = _mm_add_epi32(v[6], v[14]);
+ u[7] = _mm_add_epi32(v[7], v[15]);
+ u[8] = _mm_sub_epi32(v[0], v[8]);
+ u[9] = _mm_sub_epi32(v[1], v[9]);
+ u[10] = _mm_sub_epi32(v[2], v[10]);
+ u[11] = _mm_sub_epi32(v[3], v[11]);
+ u[12] = _mm_sub_epi32(v[4], v[12]);
+ u[13] = _mm_sub_epi32(v[5], v[13]);
+ u[14] = _mm_sub_epi32(v[6], v[14]);
+ u[15] = _mm_sub_epi32(v[7], v[15]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+ u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+ u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+ u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+ u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+ u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+ u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+ u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+ u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+
+ x[0] = _mm_add_epi16(s[0], s[4]);
+ x[1] = _mm_add_epi16(s[1], s[5]);
+ x[2] = _mm_add_epi16(s[2], s[6]);
+ x[3] = _mm_add_epi16(s[3], s[7]);
+ x[4] = _mm_sub_epi16(s[0], s[4]);
+ x[5] = _mm_sub_epi16(s[1], s[5]);
+ x[6] = _mm_sub_epi16(s[2], s[6]);
+ x[7] = _mm_sub_epi16(s[3], s[7]);
+ x[8] = _mm_packs_epi32(u[0], u[1]);
+ x[9] = _mm_packs_epi32(u[2], u[3]);
+ x[10] = _mm_packs_epi32(u[4], u[5]);
+ x[11] = _mm_packs_epi32(u[6], u[7]);
+ x[12] = _mm_packs_epi32(u[8], u[9]);
+ x[13] = _mm_packs_epi32(u[10], u[11]);
+ x[14] = _mm_packs_epi32(u[12], u[13]);
+ x[15] = _mm_packs_epi32(u[14], u[15]);
+
+ // stage 3
+ u[0] = _mm_unpacklo_epi16(x[4], x[5]);
+ u[1] = _mm_unpackhi_epi16(x[4], x[5]);
+ u[2] = _mm_unpacklo_epi16(x[6], x[7]);
+ u[3] = _mm_unpackhi_epi16(x[6], x[7]);
+ u[4] = _mm_unpacklo_epi16(x[12], x[13]);
+ u[5] = _mm_unpackhi_epi16(x[12], x[13]);
+ u[6] = _mm_unpacklo_epi16(x[14], x[15]);
+ u[7] = _mm_unpackhi_epi16(x[14], x[15]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
+ v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
+ v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
+ v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
+ v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
+ v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
+ v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
+
+ u[0] = _mm_add_epi32(v[0], v[4]);
+ u[1] = _mm_add_epi32(v[1], v[5]);
+ u[2] = _mm_add_epi32(v[2], v[6]);
+ u[3] = _mm_add_epi32(v[3], v[7]);
+ u[4] = _mm_sub_epi32(v[0], v[4]);
+ u[5] = _mm_sub_epi32(v[1], v[5]);
+ u[6] = _mm_sub_epi32(v[2], v[6]);
+ u[7] = _mm_sub_epi32(v[3], v[7]);
+ u[8] = _mm_add_epi32(v[8], v[12]);
+ u[9] = _mm_add_epi32(v[9], v[13]);
+ u[10] = _mm_add_epi32(v[10], v[14]);
+ u[11] = _mm_add_epi32(v[11], v[15]);
+ u[12] = _mm_sub_epi32(v[8], v[12]);
+ u[13] = _mm_sub_epi32(v[9], v[13]);
+ u[14] = _mm_sub_epi32(v[10], v[14]);
+ u[15] = _mm_sub_epi32(v[11], v[15]);
+
+ u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+ u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+ v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+ v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+ v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+ v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+ v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+ v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+ v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+ v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+ s[0] = _mm_add_epi16(x[0], x[2]);
+ s[1] = _mm_add_epi16(x[1], x[3]);
+ s[2] = _mm_sub_epi16(x[0], x[2]);
+ s[3] = _mm_sub_epi16(x[1], x[3]);
+ s[4] = _mm_packs_epi32(v[0], v[1]);
+ s[5] = _mm_packs_epi32(v[2], v[3]);
+ s[6] = _mm_packs_epi32(v[4], v[5]);
+ s[7] = _mm_packs_epi32(v[6], v[7]);
+ s[8] = _mm_add_epi16(x[8], x[10]);
+ s[9] = _mm_add_epi16(x[9], x[11]);
+ s[10] = _mm_sub_epi16(x[8], x[10]);
+ s[11] = _mm_sub_epi16(x[9], x[11]);
+ s[12] = _mm_packs_epi32(v[8], v[9]);
+ s[13] = _mm_packs_epi32(v[10], v[11]);
+ s[14] = _mm_packs_epi32(v[12], v[13]);
+ s[15] = _mm_packs_epi32(v[14], v[15]);
+
+ // stage 4
+ u[0] = _mm_unpacklo_epi16(s[2], s[3]);
+ u[1] = _mm_unpackhi_epi16(s[2], s[3]);
+ u[2] = _mm_unpacklo_epi16(s[6], s[7]);
+ u[3] = _mm_unpackhi_epi16(s[6], s[7]);
+ u[4] = _mm_unpacklo_epi16(s[10], s[11]);
+ u[5] = _mm_unpackhi_epi16(s[10], s[11]);
+ u[6] = _mm_unpacklo_epi16(s[14], s[15]);
+ u[7] = _mm_unpackhi_epi16(s[14], s[15]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
+ v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
+ v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
+ v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
+ v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
+ v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
+ v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+ u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
+ u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
+ u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
+ u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
+ u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
+ u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
+ u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
+ u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
+
+ v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+ v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+ v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+ v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+ v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+ v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+ v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+ v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+ v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+ in[0] = s[0];
+ in[1] = _mm_sub_epi16(kZero, s[8]);
+ in[2] = s[12];
+ in[3] = _mm_sub_epi16(kZero, s[4]);
+ in[4] = _mm_packs_epi32(v[4], v[5]);
+ in[5] = _mm_packs_epi32(v[12], v[13]);
+ in[6] = _mm_packs_epi32(v[8], v[9]);
+ in[7] = _mm_packs_epi32(v[0], v[1]);
+ in[8] = _mm_packs_epi32(v[2], v[3]);
+ in[9] = _mm_packs_epi32(v[10], v[11]);
+ in[10] = _mm_packs_epi32(v[14], v[15]);
+ in[11] = _mm_packs_epi32(v[6], v[7]);
+ in[12] = s[5];
+ in[13] = _mm_sub_epi16(kZero, s[13]);
+ in[14] = s[9];
+ in[15] = _mm_sub_epi16(kZero, s[1]);
+}
+
+static void fdct16_sse2(__m128i *in0, __m128i *in1) {
+ fdct16_8col(in0);
+ fdct16_8col(in1);
+ transpose_16bit_16x16(in0, in1);
+}
+
+static void fadst16_sse2(__m128i *in0, __m128i *in1) {
+ fadst16_8col(in0);
+ fadst16_8col(in1);
+ transpose_16bit_16x16(in0, in1);
+}
+
+void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ __m128i in0[16], in1[16];
+
+ switch (tx_type) {
+ case DCT_DCT: vpx_fdct16x16_sse2(input, output, stride); break;
+ case ADST_DCT:
+ load_buffer_16x16(input, in0, in1, stride);
+ fadst16_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fdct16_sse2(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
+ case DCT_ADST:
+ load_buffer_16x16(input, in0, in1, stride);
+ fdct16_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fadst16_sse2(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
+ default:
+ assert(tx_type == ADST_ADST);
+ load_buffer_16x16(input, in0, in1, stride);
+ fadst16_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fadst16_sse2(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
+ }
+}
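The intrinsics above repeat one butterfly pattern throughout fdct16_8col and fadst16_8col: interleave two 16-bit lanes, multiply-add against a pair of cosine constants with _mm_madd_epi16, add the rounding constant, shift right by DCT_CONST_BITS, and pack back to 16 bits. For example, in[1] is produced from s[0] and s[7] with the pair (cospi_30_64, cospi_2_64). A scalar sketch of that butterfly, not part of the patch (DCT_CONST_BITS is 14 in libvpx; the helper name is illustrative):

#include <stdint.h>

#define SKETCH_DCT_CONST_BITS 14
#define SKETCH_DCT_CONST_ROUNDING (1 << (SKETCH_DCT_CONST_BITS - 1))

// One butterfly output: a*c0 + b*c1, rounded and narrowed back to 16 bits.
// _mm_madd_epi16 computes the 32-bit dot product for each interleaved pair;
// the add/srai/packs sequence above is the rounding shift and narrowing.
static int16_t butterfly_round_shift(int16_t a, int16_t b, int16_t c0,
                                     int16_t c1) {
  const int32_t t = (int32_t)a * c0 + (int32_t)b * c1;
  return (int16_t)((t + SKETCH_DCT_CONST_ROUNDING) >> SKETCH_DCT_CONST_BITS);
}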
diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_dct_sse2.asm b/media/libvpx/libvpx/vp9/encoder/x86/vp9_dct_sse2.asm
new file mode 100644
index 0000000000..8152dce864
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_dct_sse2.asm
@@ -0,0 +1,69 @@
+;
+; Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%define private_prefix vp9
+
+%include "third_party/x86inc/x86inc.asm"
+%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"
+
+SECTION .text
+
+%macro TRANSFORM_COLS 0
+ paddw m0, m1
+ movq m4, m0
+ psubw m3, m2
+ psubw m4, m3
+ psraw m4, 1
+ movq m5, m4
+ psubw m5, m1 ;b1
+ psubw m4, m2 ;c1
+ psubw m0, m4
+ paddw m3, m5
+ ; m0 a0
+ SWAP 1, 4 ; m1 c1
+ SWAP 2, 3 ; m2 d1
+ SWAP 3, 5 ; m3 b1
+%endmacro
+
+%macro TRANSPOSE_4X4 0
+ ; 00 01 02 03
+ ; 10 11 12 13
+ ; 20 21 22 23
+ ; 30 31 32 33
+ punpcklwd m0, m1 ; 00 10 01 11 02 12 03 13
+ punpcklwd m2, m3 ; 20 30 21 31 22 32 23 33
+ mova m1, m0
+ punpckldq m0, m2 ; 00 10 20 30 01 11 21 31
+ punpckhdq m1, m2 ; 02 12 22 32 03 13 23 33
+%endmacro
+
+INIT_XMM sse2
+cglobal fwht4x4, 3, 4, 8, input, output, stride
+ lea r3q, [inputq + strideq*4]
+ movq m0, [inputq] ;a1
+ movq m1, [inputq + strideq*2] ;b1
+ movq m2, [r3q] ;c1
+ movq m3, [r3q + strideq*2] ;d1
+
+ TRANSFORM_COLS
+ TRANSPOSE_4X4
+ SWAP 1, 2
+ psrldq m1, m0, 8
+ psrldq m3, m2, 8
+ TRANSFORM_COLS
+ TRANSPOSE_4X4
+
+ psllw m0, 2
+ psllw m1, 2
+
+ STORE_TRAN_LOW 0, outputq, 0, 2, 3
+ STORE_TRAN_LOW 1, outputq, 8, 2, 3
+
+ RET
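The TRANSFORM_COLS macro in the assembly above is the column pass of the 4x4 Walsh-Hadamard transform, applied to four packed rows at once; the trailing psllw by 2 scales the outputs by 4. A rough scalar sketch of one column, not part of the patch (the helper name is illustrative):

#include <stdint.h>

// One Walsh-Hadamard column, mirroring TRANSFORM_COLS; the outputs land in
// the order a, c, d, b, matching the register SWAPs at the end of the macro.
static void wht4_col_sketch(int16_t a, int16_t b, int16_t c, int16_t d,
                            int16_t out[4]) {
  int16_t e;
  a = (int16_t)(a + b);         // paddw m0, m1
  d = (int16_t)(d - c);         // psubw m3, m2
  e = (int16_t)((a - d) >> 1);  // psubw m4, m3 / psraw m4, 1
  b = (int16_t)(e - b);         // psubw m5, m1
  c = (int16_t)(e - c);         // psubw m4, m2
  a = (int16_t)(a - c);         // psubw m0, m4
  d = (int16_t)(d + b);         // paddw m3, m5
  out[0] = a;
  out[1] = c;
  out[2] = d;
  out[3] = b;
}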
diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c
new file mode 100644
index 0000000000..5930bf491e
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c
@@ -0,0 +1,327 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+
+#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/encoder/vp9_context_tree.h"
+#include "vp9/encoder/vp9_denoiser.h"
+#include "vpx_mem/vpx_mem.h"
+
+// Compute the sum of all pixel differences of this MB.
+static INLINE int sum_diff_16x1(__m128i acc_diff) {
+ const __m128i k_1 = _mm_set1_epi16(1);
+ const __m128i acc_diff_lo =
+ _mm_srai_epi16(_mm_unpacklo_epi8(acc_diff, acc_diff), 8);
+ const __m128i acc_diff_hi =
+ _mm_srai_epi16(_mm_unpackhi_epi8(acc_diff, acc_diff), 8);
+ const __m128i acc_diff_16 = _mm_add_epi16(acc_diff_lo, acc_diff_hi);
+ const __m128i hg_fe_dc_ba = _mm_madd_epi16(acc_diff_16, k_1);
+ const __m128i hgfe_dcba =
+ _mm_add_epi32(hg_fe_dc_ba, _mm_srli_si128(hg_fe_dc_ba, 8));
+ const __m128i hgfedcba =
+ _mm_add_epi32(hgfe_dcba, _mm_srli_si128(hgfe_dcba, 4));
+ return _mm_cvtsi128_si32(hgfedcba);
+}
+
+// Denoise a 16x1 vector.
+static INLINE __m128i vp9_denoiser_16x1_sse2(
+ const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y,
+ const __m128i *k_0, const __m128i *k_4, const __m128i *k_8,
+ const __m128i *k_16, const __m128i *l3, const __m128i *l32,
+ const __m128i *l21, __m128i acc_diff) {
+ // Calculate differences
+ const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0]));
+ const __m128i v_mc_running_avg_y =
+ _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0]));
+ __m128i v_running_avg_y;
+ const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
+ const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
+ // Obtain the sign. FF if diff is negative.
+ const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, *k_0);
+ // Clamp absolute difference to 16 to be used to get mask. Doing this
+ // allows us to use _mm_cmpgt_epi8, which operates on signed byte.
+ const __m128i clamped_absdiff =
+ _mm_min_epu8(_mm_or_si128(pdiff, ndiff), *k_16);
+ // Get masks for l2 l1 and l0 adjustments.
+ const __m128i mask2 = _mm_cmpgt_epi8(*k_16, clamped_absdiff);
+ const __m128i mask1 = _mm_cmpgt_epi8(*k_8, clamped_absdiff);
+ const __m128i mask0 = _mm_cmpgt_epi8(*k_4, clamped_absdiff);
+ // Get adjustments for l2, l1, and l0.
+ __m128i adj2 = _mm_and_si128(mask2, *l32);
+ const __m128i adj1 = _mm_and_si128(mask1, *l21);
+ const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff);
+ __m128i adj, padj, nadj;
+
+ // Combine the adjustments and get absolute adjustments.
+ adj2 = _mm_add_epi8(adj2, adj1);
+ adj = _mm_sub_epi8(*l3, adj2);
+ adj = _mm_andnot_si128(mask0, adj);
+ adj = _mm_or_si128(adj, adj0);
+
+ // Restore the sign and get positive and negative adjustments.
+ padj = _mm_andnot_si128(diff_sign, adj);
+ nadj = _mm_and_si128(diff_sign, adj);
+
+ // Calculate filtered value.
+ v_running_avg_y = _mm_adds_epu8(v_sig, padj);
+ v_running_avg_y = _mm_subs_epu8(v_running_avg_y, nadj);
+ _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);
+
+ // Adjustments <=7, and each element in acc_diff can fit in signed
+ // char.
+ acc_diff = _mm_adds_epi8(acc_diff, padj);
+ acc_diff = _mm_subs_epi8(acc_diff, nadj);
+ return acc_diff;
+}
+
+// Denoise a 16x1 vector with a weaker filter.
+static INLINE __m128i vp9_denoiser_adj_16x1_sse2(
+ const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y,
+ const __m128i k_0, const __m128i k_delta, __m128i acc_diff) {
+ __m128i v_running_avg_y = _mm_loadu_si128((__m128i *)(&running_avg_y[0]));
+ // Calculate differences.
+ const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0]));
+ const __m128i v_mc_running_avg_y =
+ _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0]));
+ const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
+ const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
+ // Obtain the sign. FF if diff is negative.
+ const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
+ // Clamp absolute difference to delta to get the adjustment.
+ const __m128i adj = _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta);
+ // Restore the sign and get positive and negative adjustments.
+ __m128i padj, nadj;
+ padj = _mm_andnot_si128(diff_sign, adj);
+ nadj = _mm_and_si128(diff_sign, adj);
+ // Calculate filtered value.
+ v_running_avg_y = _mm_subs_epu8(v_running_avg_y, padj);
+ v_running_avg_y = _mm_adds_epu8(v_running_avg_y, nadj);
+ _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);
+
+ // Accumulate the adjustments.
+ acc_diff = _mm_subs_epi8(acc_diff, padj);
+ acc_diff = _mm_adds_epi8(acc_diff, nadj);
+ return acc_diff;
+}
+
+// Denoise 8x8 and 8x16 blocks.
+static int vp9_denoiser_NxM_sse2_small(const uint8_t *sig, int sig_stride,
+ const uint8_t *mc_running_avg_y,
+ int mc_avg_y_stride,
+ uint8_t *running_avg_y, int avg_y_stride,
+ int increase_denoising, BLOCK_SIZE bs,
+ int motion_magnitude, int width) {
+ int sum_diff_thresh, r, sum_diff = 0;
+ const int shift_inc =
+ (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
+ ? 1
+ : 0;
+ uint8_t sig_buffer[8][16], mc_running_buffer[8][16], running_buffer[8][16];
+ __m128i acc_diff = _mm_setzero_si128();
+ const __m128i k_0 = _mm_setzero_si128();
+ const __m128i k_4 = _mm_set1_epi8(4 + shift_inc);
+ const __m128i k_8 = _mm_set1_epi8(8);
+ const __m128i k_16 = _mm_set1_epi8(16);
+ // Modify each level's adjustment according to motion_magnitude.
+ const __m128i l3 = _mm_set1_epi8(
+ (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 + shift_inc : 6);
+ // Difference between level 3 and level 2 is 2.
+ const __m128i l32 = _mm_set1_epi8(2);
+ // Difference between level 2 and level 1 is 1.
+ const __m128i l21 = _mm_set1_epi8(1);
+ const int b_height = (4 << b_height_log2_lookup[bs]) >> 1;
+
+ for (r = 0; r < b_height; ++r) {
+ memcpy(sig_buffer[r], sig, width);
+ memcpy(sig_buffer[r] + width, sig + sig_stride, width);
+ memcpy(mc_running_buffer[r], mc_running_avg_y, width);
+ memcpy(mc_running_buffer[r] + width, mc_running_avg_y + mc_avg_y_stride,
+ width);
+ memcpy(running_buffer[r], running_avg_y, width);
+ memcpy(running_buffer[r] + width, running_avg_y + avg_y_stride, width);
+ acc_diff = vp9_denoiser_16x1_sse2(sig_buffer[r], mc_running_buffer[r],
+ running_buffer[r], &k_0, &k_4, &k_8,
+ &k_16, &l3, &l32, &l21, acc_diff);
+ memcpy(running_avg_y, running_buffer[r], width);
+ memcpy(running_avg_y + avg_y_stride, running_buffer[r] + width, width);
+ // Update pointers for next iteration.
+ sig += (sig_stride << 1);
+ mc_running_avg_y += (mc_avg_y_stride << 1);
+ running_avg_y += (avg_y_stride << 1);
+ }
+
+ {
+ sum_diff = sum_diff_16x1(acc_diff);
+ sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
+ if (abs(sum_diff) > sum_diff_thresh) {
+ // Before returning to copy the block (i.e., apply no denoising),
+ // check if we can still apply some (weaker) temporal filtering to
+ // this block, that would otherwise not be denoised at all. Simplest
+ // is to apply an additional adjustment to running_avg_y to bring it
+ // closer to sig. The adjustment is capped by a maximum delta, and
+ // chosen such that in most cases the resulting sum_diff will be
+ // within the acceptable range given by sum_diff_thresh.
+
+ // The delta is set by the excess of absolute pixel diff over the
+ // threshold.
+ const int delta =
+ ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1;
+ // Only apply the adjustment for max delta up to 3.
+ if (delta < 4) {
+ const __m128i k_delta = _mm_set1_epi8(delta);
+ running_avg_y -= avg_y_stride * (b_height << 1);
+ for (r = 0; r < b_height; ++r) {
+ acc_diff = vp9_denoiser_adj_16x1_sse2(
+ sig_buffer[r], mc_running_buffer[r], running_buffer[r], k_0,
+ k_delta, acc_diff);
+ memcpy(running_avg_y, running_buffer[r], width);
+ memcpy(running_avg_y + avg_y_stride, running_buffer[r] + width,
+ width);
+ // Update pointers for next iteration.
+ running_avg_y += (avg_y_stride << 1);
+ }
+ sum_diff = sum_diff_16x1(acc_diff);
+ if (abs(sum_diff) > sum_diff_thresh) {
+ return COPY_BLOCK;
+ }
+ } else {
+ return COPY_BLOCK;
+ }
+ }
+ }
+ return FILTER_BLOCK;
+}
+
+// Denoise 16x8, 16x16, 16x32, 32x16, 32x32, 32x64, 64x32 and 64x64 blocks.
+static int vp9_denoiser_NxM_sse2_big(const uint8_t *sig, int sig_stride,
+ const uint8_t *mc_running_avg_y,
+ int mc_avg_y_stride,
+ uint8_t *running_avg_y, int avg_y_stride,
+ int increase_denoising, BLOCK_SIZE bs,
+ int motion_magnitude) {
+ int sum_diff_thresh, r, c, sum_diff = 0;
+ const int shift_inc =
+ (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
+ ? 1
+ : 0;
+ __m128i acc_diff[4][4];
+ const __m128i k_0 = _mm_setzero_si128();
+ const __m128i k_4 = _mm_set1_epi8(4 + shift_inc);
+ const __m128i k_8 = _mm_set1_epi8(8);
+ const __m128i k_16 = _mm_set1_epi8(16);
+ // Modify each level's adjustment according to motion_magnitude.
+ const __m128i l3 = _mm_set1_epi8(
+ (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 + shift_inc : 6);
+ // Difference between level 3 and level 2 is 2.
+ const __m128i l32 = _mm_set1_epi8(2);
+ // Difference between level 2 and level 1 is 1.
+ const __m128i l21 = _mm_set1_epi8(1);
+ const int b_width = (4 << b_width_log2_lookup[bs]);
+ const int b_height = (4 << b_height_log2_lookup[bs]);
+ const int b_width_shift4 = b_width >> 4;
+
+ for (r = 0; r < 4; ++r) {
+ for (c = 0; c < b_width_shift4; ++c) {
+ acc_diff[c][r] = _mm_setzero_si128();
+ }
+ }
+
+ for (r = 0; r < b_height; ++r) {
+ for (c = 0; c < b_width_shift4; ++c) {
+ acc_diff[c][r >> 4] = vp9_denoiser_16x1_sse2(
+ sig, mc_running_avg_y, running_avg_y, &k_0, &k_4, &k_8, &k_16, &l3,
+ &l32, &l21, acc_diff[c][r >> 4]);
+ // Update pointers for next iteration.
+ sig += 16;
+ mc_running_avg_y += 16;
+ running_avg_y += 16;
+ }
+
+ if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) {
+ for (c = 0; c < b_width_shift4; ++c) {
+ sum_diff += sum_diff_16x1(acc_diff[c][r >> 4]);
+ }
+ }
+
+ // Update pointers for next iteration.
+ sig = sig - b_width + sig_stride;
+ mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride;
+ running_avg_y = running_avg_y - b_width + avg_y_stride;
+ }
+
+ {
+ sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
+ if (abs(sum_diff) > sum_diff_thresh) {
+ const int delta =
+ ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1;
+
+ // Only apply the adjustment for max delta up to 3.
+ if (delta < 4) {
+ const __m128i k_delta = _mm_set1_epi8(delta);
+ sig -= sig_stride * b_height;
+ mc_running_avg_y -= mc_avg_y_stride * b_height;
+ running_avg_y -= avg_y_stride * b_height;
+ sum_diff = 0;
+ for (r = 0; r < b_height; ++r) {
+ for (c = 0; c < b_width_shift4; ++c) {
+ acc_diff[c][r >> 4] =
+ vp9_denoiser_adj_16x1_sse2(sig, mc_running_avg_y, running_avg_y,
+ k_0, k_delta, acc_diff[c][r >> 4]);
+ // Update pointers for next iteration.
+ sig += 16;
+ mc_running_avg_y += 16;
+ running_avg_y += 16;
+ }
+
+ if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) {
+ for (c = 0; c < b_width_shift4; ++c) {
+ sum_diff += sum_diff_16x1(acc_diff[c][r >> 4]);
+ }
+ }
+ sig = sig - b_width + sig_stride;
+ mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride;
+ running_avg_y = running_avg_y - b_width + avg_y_stride;
+ }
+ if (abs(sum_diff) > sum_diff_thresh) {
+ return COPY_BLOCK;
+ }
+ } else {
+ return COPY_BLOCK;
+ }
+ }
+ }
+ return FILTER_BLOCK;
+}
+
+int vp9_denoiser_filter_sse2(const uint8_t *sig, int sig_stride,
+ const uint8_t *mc_avg, int mc_avg_stride,
+ uint8_t *avg, int avg_stride,
+ int increase_denoising, BLOCK_SIZE bs,
+ int motion_magnitude) {
+ // Rank by frequency of the block type to have an early termination.
+ if (bs == BLOCK_16X16 || bs == BLOCK_32X32 || bs == BLOCK_64X64 ||
+ bs == BLOCK_16X32 || bs == BLOCK_16X8 || bs == BLOCK_32X16 ||
+ bs == BLOCK_32X64 || bs == BLOCK_64X32) {
+ return vp9_denoiser_NxM_sse2_big(sig, sig_stride, mc_avg, mc_avg_stride,
+ avg, avg_stride, increase_denoising, bs,
+ motion_magnitude);
+ } else if (bs == BLOCK_8X8 || bs == BLOCK_8X16) {
+ return vp9_denoiser_NxM_sse2_small(sig, sig_stride, mc_avg, mc_avg_stride,
+ avg, avg_stride, increase_denoising, bs,
+ motion_magnitude, 8);
+ } else {
+ return COPY_BLOCK;
+ }
+}
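Per pixel, vp9_denoiser_16x1_sse2 above moves the source toward the motion-compensated average by a capped step: the full difference when it is small, then progressively weaker steps for larger differences, all applied with saturating byte arithmetic. A scalar sketch of the strong-filter adjustment, not part of the patch (the helper name is illustrative; k4 and l3 correspond to the vector constants k_4 and l3):

// Signed adjustment added to sig to form running_avg_y for one pixel.
static int denoiser_adjust_sketch(int sig, int mc_avg, int k4, int l3) {
  const int diff = mc_avg - sig;
  const int ad = diff < 0 ? -diff : diff;
  const int absdiff = ad > 16 ? 16 : ad;  // clamp, as with k_16 above
  int adj;
  if (absdiff < k4) {
    adj = absdiff;     // level 0: take the whole difference
  } else if (absdiff < 8) {
    adj = l3 - 3;      // level 1
  } else if (absdiff < 16) {
    adj = l3 - 2;      // level 2
  } else {
    adj = l3;          // level 3
  }
  return diff >= 0 ? adj : -adj;  // restore the sign, as with diff_sign
}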
diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_diamond_search_sad_avx.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_diamond_search_sad_avx.c
new file mode 100644
index 0000000000..80442e3594
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_diamond_search_sad_avx.c
@@ -0,0 +1,291 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#if defined(_MSC_VER)
+#include <intrin.h>
+#endif
+#include <emmintrin.h>
+#include <smmintrin.h>
+
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vpx_ports/mem.h"
+
+#ifdef __GNUC__
+#define LIKELY(v) __builtin_expect(v, 1)
+#define UNLIKELY(v) __builtin_expect(v, 0)
+#else
+#define LIKELY(v) (v)
+#define UNLIKELY(v) (v)
+#endif
+
+static INLINE int_mv pack_int_mv(int16_t row, int16_t col) {
+ int_mv result;
+ result.as_mv.row = row;
+ result.as_mv.col = col;
+ return result;
+}
+/*****************************************************************************
+ * This function utilizes 3 properties of the cost function lookup tables, *
+ * constructed using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in          *
+ * vp9_encoder.c. *
+ * For the joint cost: *
+ * - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3] *
+ * For the component costs: *
+ * - For all i: mvsadcost[0][i] == mvsadcost[1][i] *
+ * (Equal costs for both components) *
+ * - For all i: mvsadcost[0][i] == mvsadcost[0][-i] *
+ * (Cost function is even) *
+ * If these do not hold, then this function cannot be used without *
+ * modification, in which case you can revert to using the C implementation, *
+ * which does not rely on these properties. *
+ *****************************************************************************/
+int vp9_diamond_search_sad_avx(const MACROBLOCK *x,
+ const search_site_config *cfg, MV *ref_mv,
+ uint32_t start_mv_sad, MV *best_mv,
+ int search_param, int sad_per_bit, int *num00,
+ const vp9_sad_fn_ptr_t *sad_fn_ptr,
+ const MV *center_mv) {
+ const int_mv maxmv = pack_int_mv(x->mv_limits.row_max, x->mv_limits.col_max);
+ const __m128i v_max_mv_w = _mm_set1_epi32((int)maxmv.as_int);
+ const int_mv minmv = pack_int_mv(x->mv_limits.row_min, x->mv_limits.col_min);
+ const __m128i v_min_mv_w = _mm_set1_epi32((int)minmv.as_int);
+
+ const __m128i v_spb_d = _mm_set1_epi32(sad_per_bit);
+
+ const __m128i v_joint_cost_0_d = _mm_set1_epi32(x->nmvjointsadcost[0]);
+ const __m128i v_joint_cost_1_d = _mm_set1_epi32(x->nmvjointsadcost[1]);
+
+ // search_param determines the length of the initial step and hence the number
+ // of iterations.
+ // 0 = initial step (MAX_FIRST_STEP) pel
+ // 1 = (MAX_FIRST_STEP/2) pel,
+ // 2 = (MAX_FIRST_STEP/4) pel...
+ const MV *ss_mv = &cfg->ss_mv[cfg->searches_per_step * search_param];
+ const intptr_t *ss_os = &cfg->ss_os[cfg->searches_per_step * search_param];
+ const int tot_steps = cfg->total_steps - search_param;
+
+ const int_mv fcenter_mv =
+ pack_int_mv(center_mv->row >> 3, center_mv->col >> 3);
+ const __m128i vfcmv = _mm_set1_epi32((int)fcenter_mv.as_int);
+
+ const int ref_row = ref_mv->row;
+ const int ref_col = ref_mv->col;
+
+ int_mv bmv = pack_int_mv(ref_row, ref_col);
+ int_mv new_bmv = bmv;
+ __m128i v_bmv_w = _mm_set1_epi32((int)bmv.as_int);
+
+ const int what_stride = x->plane[0].src.stride;
+ const int in_what_stride = x->e_mbd.plane[0].pre[0].stride;
+ const uint8_t *const what = x->plane[0].src.buf;
+ const uint8_t *const in_what =
+ x->e_mbd.plane[0].pre[0].buf + ref_row * in_what_stride + ref_col;
+
+ // Work out the start point for the search
+ const uint8_t *best_address = in_what;
+ const uint8_t *new_best_address = best_address;
+#if VPX_ARCH_X86_64
+ __m128i v_ba_q = _mm_set1_epi64x((intptr_t)best_address);
+#else
+ __m128i v_ba_d = _mm_set1_epi32((intptr_t)best_address);
+#endif
+ // Starting position
+ unsigned int best_sad = start_mv_sad;
+ int i, j, step;
+
+ // Check the prerequisite cost function properties that are easy to check
+ // in an assert. See the function-level documentation for details on all
+ // prerequisites.
+ assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]);
+ assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]);
+
+ *num00 = 0;
+
+ for (i = 0, step = 0; step < tot_steps; step++) {
+ for (j = 0; j < cfg->searches_per_step; j += 4, i += 4) {
+ __m128i v_sad_d, v_cost_d, v_outside_d, v_inside_d, v_diff_mv_w;
+#if VPX_ARCH_X86_64
+ __m128i v_blocka[2];
+#else
+ __m128i v_blocka[1];
+#endif
+
+ // Compute the candidate motion vectors
+ const __m128i v_ss_mv_w = _mm_loadu_si128((const __m128i *)&ss_mv[i]);
+ const __m128i v_these_mv_w = _mm_add_epi16(v_bmv_w, v_ss_mv_w);
+ // Clamp them to the search bounds
+ __m128i v_these_mv_clamp_w = v_these_mv_w;
+ v_these_mv_clamp_w = _mm_min_epi16(v_these_mv_clamp_w, v_max_mv_w);
+ v_these_mv_clamp_w = _mm_max_epi16(v_these_mv_clamp_w, v_min_mv_w);
+ // The ones that did not change are inside the search area
+ v_inside_d = _mm_cmpeq_epi32(v_these_mv_clamp_w, v_these_mv_w);
+
+ // If none of them are inside, then move on
+ if (LIKELY(_mm_test_all_zeros(v_inside_d, v_inside_d))) {
+ continue;
+ }
+
+ // The inverse mask indicates which of the MVs are outside
+ v_outside_d = _mm_xor_si128(v_inside_d, _mm_set1_epi8((int8_t)0xff));
+      // Shift right to keep the sign bit clear; we will use this later
+ // to set the cost to the maximum value.
+ v_outside_d = _mm_srli_epi32(v_outside_d, 1);
+
+ // Compute the difference MV
+ v_diff_mv_w = _mm_sub_epi16(v_these_mv_clamp_w, vfcmv);
+ // We utilise the fact that the cost function is even, and use the
+ // absolute difference. This allows us to use unsigned indexes later
+ // and reduces cache pressure somewhat as only a half of the table
+ // is ever referenced.
+ v_diff_mv_w = _mm_abs_epi16(v_diff_mv_w);
+
+ // Compute the SIMD pointer offsets.
+ {
+#if VPX_ARCH_X86_64 // sizeof(intptr_t) == 8
+ // Load the offsets
+ __m128i v_bo10_q = _mm_loadu_si128((const __m128i *)&ss_os[i + 0]);
+ __m128i v_bo32_q = _mm_loadu_si128((const __m128i *)&ss_os[i + 2]);
+ // Set the ones falling outside to zero
+ v_bo10_q = _mm_and_si128(v_bo10_q, _mm_cvtepi32_epi64(v_inside_d));
+ v_bo32_q =
+ _mm_and_si128(v_bo32_q, _mm_unpackhi_epi32(v_inside_d, v_inside_d));
+ // Compute the candidate addresses
+ v_blocka[0] = _mm_add_epi64(v_ba_q, v_bo10_q);
+ v_blocka[1] = _mm_add_epi64(v_ba_q, v_bo32_q);
+#else // VPX_ARCH_X86 // sizeof(intptr_t) == 4
+ __m128i v_bo_d = _mm_loadu_si128((const __m128i *)&ss_os[i]);
+ v_bo_d = _mm_and_si128(v_bo_d, v_inside_d);
+ v_blocka[0] = _mm_add_epi32(v_ba_d, v_bo_d);
+#endif
+ }
+
+ sad_fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0],
+ in_what_stride, (uint32_t *)&v_sad_d);
+
+ // Look up the component cost of the residual motion vector
+ {
+ const int32_t row0 = _mm_extract_epi16(v_diff_mv_w, 0);
+ const int32_t col0 = _mm_extract_epi16(v_diff_mv_w, 1);
+ const int32_t row1 = _mm_extract_epi16(v_diff_mv_w, 2);
+ const int32_t col1 = _mm_extract_epi16(v_diff_mv_w, 3);
+ const int32_t row2 = _mm_extract_epi16(v_diff_mv_w, 4);
+ const int32_t col2 = _mm_extract_epi16(v_diff_mv_w, 5);
+ const int32_t row3 = _mm_extract_epi16(v_diff_mv_w, 6);
+ const int32_t col3 = _mm_extract_epi16(v_diff_mv_w, 7);
+
+ // Note: This is a use case for vpgather in AVX2
+ const uint32_t cost0 = x->nmvsadcost[0][row0] + x->nmvsadcost[0][col0];
+ const uint32_t cost1 = x->nmvsadcost[0][row1] + x->nmvsadcost[0][col1];
+ const uint32_t cost2 = x->nmvsadcost[0][row2] + x->nmvsadcost[0][col2];
+ const uint32_t cost3 = x->nmvsadcost[0][row3] + x->nmvsadcost[0][col3];
+
+ __m128i v_cost_10_d, v_cost_32_d;
+ v_cost_10_d = _mm_cvtsi32_si128(cost0);
+ v_cost_10_d = _mm_insert_epi32(v_cost_10_d, cost1, 1);
+ v_cost_32_d = _mm_cvtsi32_si128(cost2);
+ v_cost_32_d = _mm_insert_epi32(v_cost_32_d, cost3, 1);
+ v_cost_d = _mm_unpacklo_epi64(v_cost_10_d, v_cost_32_d);
+ }
+
+ // Now add in the joint cost
+ {
+ const __m128i v_sel_d =
+ _mm_cmpeq_epi32(v_diff_mv_w, _mm_setzero_si128());
+ const __m128i v_joint_cost_d =
+ _mm_blendv_epi8(v_joint_cost_1_d, v_joint_cost_0_d, v_sel_d);
+ v_cost_d = _mm_add_epi32(v_cost_d, v_joint_cost_d);
+ }
+
+ // Multiply by sad_per_bit
+ v_cost_d = _mm_mullo_epi32(v_cost_d, v_spb_d);
+ // ROUND_POWER_OF_TWO(v_cost_d, VP9_PROB_COST_SHIFT)
+ v_cost_d = _mm_add_epi32(v_cost_d,
+ _mm_set1_epi32(1 << (VP9_PROB_COST_SHIFT - 1)));
+ v_cost_d = _mm_srai_epi32(v_cost_d, VP9_PROB_COST_SHIFT);
+ // Add the cost to the sad
+ v_sad_d = _mm_add_epi32(v_sad_d, v_cost_d);
+
+ // Make the motion vectors outside the search area have max cost
+      // by or'ing in the comparison mask; this way the minimum search won't
+ // pick them.
+ v_sad_d = _mm_or_si128(v_sad_d, v_outside_d);
+
+ // Find the minimum value and index horizontally in v_sad_d
+ {
+ // Try speculatively on 16 bits, so we can use the minpos intrinsic
+ const __m128i v_sad_w = _mm_packus_epi32(v_sad_d, v_sad_d);
+ const __m128i v_minp_w = _mm_minpos_epu16(v_sad_w);
+
+ uint32_t local_best_sad = _mm_extract_epi16(v_minp_w, 0);
+ uint32_t local_best_idx = _mm_extract_epi16(v_minp_w, 1);
+
+        // If the local best value is not saturated, just use it; otherwise
+ // find the horizontal minimum again the hard way on 32 bits.
+ // This is executed rarely.
+ if (UNLIKELY(local_best_sad == 0xffff)) {
+ __m128i v_loval_d, v_hival_d, v_loidx_d, v_hiidx_d, v_sel_d;
+
+ v_loval_d = v_sad_d;
+ v_loidx_d = _mm_set_epi32(3, 2, 1, 0);
+ v_hival_d = _mm_srli_si128(v_loval_d, 8);
+ v_hiidx_d = _mm_srli_si128(v_loidx_d, 8);
+
+ v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d);
+
+ v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d);
+ v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d);
+ v_hival_d = _mm_srli_si128(v_loval_d, 4);
+ v_hiidx_d = _mm_srli_si128(v_loidx_d, 4);
+
+ v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d);
+
+ v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d);
+ v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d);
+
+ local_best_sad = _mm_extract_epi32(v_loval_d, 0);
+ local_best_idx = _mm_extract_epi32(v_loidx_d, 0);
+ }
+
+ // Update the global minimum if the local minimum is smaller
+ if (LIKELY(local_best_sad < best_sad)) {
+#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#endif
+ new_bmv = ((const int_mv *)&v_these_mv_w)[local_best_idx];
+#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
+ new_best_address = ((const uint8_t **)v_blocka)[local_best_idx];
+
+ best_sad = local_best_sad;
+ }
+ }
+ }
+
+ bmv = new_bmv;
+ best_address = new_best_address;
+
+ v_bmv_w = _mm_set1_epi32((int)bmv.as_int);
+#if VPX_ARCH_X86_64
+ v_ba_q = _mm_set1_epi64x((intptr_t)best_address);
+#else
+ v_ba_d = _mm_set1_epi32((intptr_t)best_address);
+#endif
+
+ if (UNLIKELY(best_address == in_what)) {
+ (*num00)++;
+ }
+ }
+
+ *best_mv = bmv.as_mv;
+ return best_sad;
+}
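The cost that the vector code folds into each SAD above reduces, per candidate, to two table lookups on the absolute residual MV plus a zero/non-zero joint cost, scaled by sad_per_bit with a rounding shift. A scalar sketch, not part of the patch (it reuses MACROBLOCK, ROUND_POWER_OF_TWO and VP9_PROB_COST_SHIFT already referenced in this file; the helper name is illustrative):

// drow/dcol are the clamped candidate MV minus fcenter_mv, as in the loop above.
static unsigned int mv_sad_cost_sketch(const MACROBLOCK *x, int drow, int dcol,
                                       int sad_per_bit) {
  const int arow = drow < 0 ? -drow : drow;  // the table is even, so |d| suffices
  const int acol = dcol < 0 ? -dcol : dcol;
  const unsigned int comp = x->nmvsadcost[0][arow] + x->nmvsadcost[0][acol];
  const unsigned int joint = x->nmvjointsadcost[(drow | dcol) == 0 ? 0 : 1];
  return ROUND_POWER_OF_TWO((comp + joint) * sad_per_bit, VP9_PROB_COST_SHIFT);
}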
diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_error_avx2.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_error_avx2.c
new file mode 100644
index 0000000000..99fef31d16
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_error_avx2.c
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "./vp9_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/x86/bitdepth_conversion_avx2.h"
+
+int64_t vp9_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff,
+ intptr_t block_size, int64_t *ssz) {
+ __m256i sse_256, ssz_256;
+ __m256i exp_dqcoeff_lo, exp_dqcoeff_hi, exp_coeff_lo, exp_coeff_hi;
+ __m256i sse_hi, ssz_hi;
+ __m128i sse_128, ssz_128;
+ int64_t sse;
+ const __m256i zero = _mm256_setzero_si256();
+
+ // If the block size is 16 then the results will fit in 32 bits.
+ if (block_size == 16) {
+ __m256i coeff_256, dqcoeff_256, coeff_hi, dqcoeff_hi;
+ // Load 16 elements for coeff and dqcoeff.
+ coeff_256 = load_tran_low(coeff);
+ dqcoeff_256 = load_tran_low(dqcoeff);
+ // dqcoeff - coeff
+ dqcoeff_256 = _mm256_sub_epi16(dqcoeff_256, coeff_256);
+ // madd (dqcoeff - coeff)
+ dqcoeff_256 = _mm256_madd_epi16(dqcoeff_256, dqcoeff_256);
+ // madd coeff
+ coeff_256 = _mm256_madd_epi16(coeff_256, coeff_256);
+ // Save the higher 64 bit of each 128 bit lane.
+ dqcoeff_hi = _mm256_srli_si256(dqcoeff_256, 8);
+ coeff_hi = _mm256_srli_si256(coeff_256, 8);
+ // Add the higher 64 bit to the low 64 bit.
+ dqcoeff_256 = _mm256_add_epi32(dqcoeff_256, dqcoeff_hi);
+ coeff_256 = _mm256_add_epi32(coeff_256, coeff_hi);
+ // Expand each double word in the lower 64 bits to quad word.
+ sse_256 = _mm256_unpacklo_epi32(dqcoeff_256, zero);
+ ssz_256 = _mm256_unpacklo_epi32(coeff_256, zero);
+ } else {
+ int i;
+ assert(block_size % 32 == 0);
+ sse_256 = zero;
+ ssz_256 = zero;
+
+ for (i = 0; i < block_size; i += 32) {
+ __m256i coeff_0, coeff_1, dqcoeff_0, dqcoeff_1;
+ // Load 32 elements for coeff and dqcoeff.
+ coeff_0 = load_tran_low(coeff + i);
+ dqcoeff_0 = load_tran_low(dqcoeff + i);
+ coeff_1 = load_tran_low(coeff + i + 16);
+ dqcoeff_1 = load_tran_low(dqcoeff + i + 16);
+ // dqcoeff - coeff
+ dqcoeff_0 = _mm256_sub_epi16(dqcoeff_0, coeff_0);
+ dqcoeff_1 = _mm256_sub_epi16(dqcoeff_1, coeff_1);
+ // madd (dqcoeff - coeff)
+ dqcoeff_0 = _mm256_madd_epi16(dqcoeff_0, dqcoeff_0);
+ dqcoeff_1 = _mm256_madd_epi16(dqcoeff_1, dqcoeff_1);
+ // madd coeff
+ coeff_0 = _mm256_madd_epi16(coeff_0, coeff_0);
+ coeff_1 = _mm256_madd_epi16(coeff_1, coeff_1);
+ // Add the first madd (dqcoeff - coeff) with the second.
+ dqcoeff_0 = _mm256_add_epi32(dqcoeff_0, dqcoeff_1);
+ // Add the first madd (coeff) with the second.
+ coeff_0 = _mm256_add_epi32(coeff_0, coeff_1);
+ // Expand each double word of madd (dqcoeff - coeff) to quad word.
+ exp_dqcoeff_lo = _mm256_unpacklo_epi32(dqcoeff_0, zero);
+ exp_dqcoeff_hi = _mm256_unpackhi_epi32(dqcoeff_0, zero);
+      // Expand each double word of madd (coeff) to quad word.
+ exp_coeff_lo = _mm256_unpacklo_epi32(coeff_0, zero);
+ exp_coeff_hi = _mm256_unpackhi_epi32(coeff_0, zero);
+ // Add each quad word of madd (dqcoeff - coeff) and madd (coeff).
+ sse_256 = _mm256_add_epi64(sse_256, exp_dqcoeff_lo);
+ ssz_256 = _mm256_add_epi64(ssz_256, exp_coeff_lo);
+ sse_256 = _mm256_add_epi64(sse_256, exp_dqcoeff_hi);
+ ssz_256 = _mm256_add_epi64(ssz_256, exp_coeff_hi);
+ }
+ }
+ // Save the higher 64 bit of each 128 bit lane.
+ sse_hi = _mm256_srli_si256(sse_256, 8);
+ ssz_hi = _mm256_srli_si256(ssz_256, 8);
+ // Add the higher 64 bit to the low 64 bit.
+ sse_256 = _mm256_add_epi64(sse_256, sse_hi);
+ ssz_256 = _mm256_add_epi64(ssz_256, ssz_hi);
+
+ // Add each 64 bit from each of the 128 bit lane of the 256 bit.
+ sse_128 = _mm_add_epi64(_mm256_castsi256_si128(sse_256),
+ _mm256_extractf128_si256(sse_256, 1));
+
+ ssz_128 = _mm_add_epi64(_mm256_castsi256_si128(ssz_256),
+ _mm256_extractf128_si256(ssz_256, 1));
+
+ // Store the results.
+ _mm_storel_epi64((__m128i *)(&sse), sse_128);
+
+ _mm_storel_epi64((__m128i *)(ssz), ssz_128);
+ return sse;
+}
+
+int64_t vp9_block_error_fp_avx2(const tran_low_t *coeff,
+ const tran_low_t *dqcoeff, int block_size) {
+ int i;
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i sse_256 = zero;
+ __m256i sse_hi;
+ __m128i sse_128;
+ int64_t sse;
+
+ if (block_size == 16) {
+ // Load 16 elements for coeff and dqcoeff.
+ const __m256i _coeff = load_tran_low(coeff);
+ const __m256i _dqcoeff = load_tran_low(dqcoeff);
+ // dqcoeff - coeff
+ const __m256i diff = _mm256_sub_epi16(_dqcoeff, _coeff);
+ // madd (dqcoeff - coeff)
+ const __m256i error_lo = _mm256_madd_epi16(diff, diff);
+ // Save the higher 64 bit of each 128 bit lane.
+ const __m256i error_hi = _mm256_srli_si256(error_lo, 8);
+ // Add the higher 64 bit to the low 64 bit.
+ const __m256i error = _mm256_add_epi32(error_lo, error_hi);
+ // Expand each double word in the lower 64 bits to quad word.
+ sse_256 = _mm256_unpacklo_epi32(error, zero);
+ } else {
+ for (i = 0; i < block_size; i += 16) {
+ // Load 16 elements for coeff and dqcoeff.
+ const __m256i _coeff = load_tran_low(coeff);
+ const __m256i _dqcoeff = load_tran_low(dqcoeff);
+ const __m256i diff = _mm256_sub_epi16(_dqcoeff, _coeff);
+ const __m256i error = _mm256_madd_epi16(diff, diff);
+ // Expand each double word of madd (dqcoeff - coeff) to quad word.
+ const __m256i exp_error_lo = _mm256_unpacklo_epi32(error, zero);
+ const __m256i exp_error_hi = _mm256_unpackhi_epi32(error, zero);
+ // Add each quad word of madd (dqcoeff - coeff).
+ sse_256 = _mm256_add_epi64(sse_256, exp_error_lo);
+ sse_256 = _mm256_add_epi64(sse_256, exp_error_hi);
+ coeff += 16;
+ dqcoeff += 16;
+ }
+ }
+ // Save the higher 64 bit of each 128 bit lane.
+ sse_hi = _mm256_srli_si256(sse_256, 8);
+ // Add the higher 64 bit to the low 64 bit.
+ sse_256 = _mm256_add_epi64(sse_256, sse_hi);
+
+ // Add each 64 bit from each of the 128 bit lane of the 256 bit.
+ sse_128 = _mm_add_epi64(_mm256_castsi256_si128(sse_256),
+ _mm256_extractf128_si256(sse_256, 1));
+
+ // Store the results.
+ _mm_storel_epi64((__m128i *)&sse, sse_128);
+ return sse;
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_error_sse2.asm b/media/libvpx/libvpx/vp9/encoder/x86/vp9_error_sse2.asm
new file mode 100644
index 0000000000..7beec130ab
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_error_sse2.asm
@@ -0,0 +1,115 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%define private_prefix vp9
+
+%include "third_party/x86inc/x86inc.asm"
+%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"
+
+SECTION .text
+
+; int64_t vp9_block_error(const tran_low_t *coeff, const tran_low_t *dqcoeff,
+;                         intptr_t block_size, int64_t *ssz)
+
+INIT_XMM sse2
+cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
+ pxor m4, m4 ; sse accumulator
+ pxor m6, m6 ; ssz accumulator
+ pxor m5, m5 ; dedicated zero register
+.loop:
+ LOAD_TRAN_LOW 2, uqcq, 0
+ LOAD_TRAN_LOW 0, dqcq, 0
+ LOAD_TRAN_LOW 3, uqcq, 8
+ LOAD_TRAN_LOW 1, dqcq, 8
+ INCREMENT_ELEMENTS_TRAN_LOW uqcq, 16
+ INCREMENT_ELEMENTS_TRAN_LOW dqcq, 16
+ sub sizeq, 16
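+  ; this subtraction sets the flags tested by jg at the bottom of the loop;
+  ; the SSE instructions below do not modify EFLAGS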
+ psubw m0, m2
+ psubw m1, m3
+ ; individual errors are max. 15bit+sign, so squares are 30bit, and
+ ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
+ pmaddwd m0, m0
+ pmaddwd m1, m1
+ pmaddwd m2, m2
+ pmaddwd m3, m3
+ ; the sum of 2 31bit integers will fit in a 32bit unsigned integer
+ paddd m0, m1
+ paddd m2, m3
+ ; accumulate in 64bit
+ punpckldq m7, m0, m5
+ punpckhdq m0, m5
+ paddq m4, m7
+ punpckldq m7, m2, m5
+ paddq m4, m0
+ punpckhdq m2, m5
+ paddq m6, m7
+ paddq m6, m2
+ jg .loop
+
+ ; accumulate horizontally and store in return value
+ movhlps m5, m4
+ movhlps m7, m6
+ paddq m4, m5
+ paddq m6, m7
+%if VPX_ARCH_X86_64
+ movq rax, m4
+ movq [sszq], m6
+%else
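+  ; 32-bit: return the 64-bit sum in edx:eax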
+ mov eax, sszm
+ pshufd m5, m4, 0x1
+ movq [eax], m6
+ movd eax, m4
+ movd edx, m5
+%endif
+ RET
+
+; Compute the sum of squared difference between two tran_low_t vectors.
+; Vectors are converted (if necessary) to int16_t for calculations.
+; int64_t vp9_block_error_fp(tran_low_t *coeff, tran_low_t *dqcoeff,
+; intptr_t block_size)
+
+INIT_XMM sse2
+cglobal block_error_fp, 3, 3, 6, uqc, dqc, size
+ pxor m4, m4 ; sse accumulator
+ pxor m5, m5 ; dedicated zero register
+.loop:
+ LOAD_TRAN_LOW 2, uqcq, 0
+ LOAD_TRAN_LOW 0, dqcq, 0
+ LOAD_TRAN_LOW 3, uqcq, 8
+ LOAD_TRAN_LOW 1, dqcq, 8
+ INCREMENT_ELEMENTS_TRAN_LOW uqcq, 16
+ INCREMENT_ELEMENTS_TRAN_LOW dqcq, 16
+ sub sizeq, 16
+ psubw m0, m2
+ psubw m1, m3
+ ; individual errors are max. 15bit+sign, so squares are 30bit, and
+ ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
+ pmaddwd m0, m0
+ pmaddwd m1, m1
+ ; the sum of 2 31bit integers will fit in a 32bit unsigned integer
+ paddd m0, m1
+ ; accumulate in 64bit
+ punpckldq m3, m0, m5
+ punpckhdq m0, m5
+ paddq m4, m3
+ paddq m4, m0
+ jnz .loop
+
+ ; accumulate horizontally and store in return value
+ movhlps m5, m4
+ paddq m4, m5
+%if VPX_ARCH_X86_64
+ movq rax, m4
+%else
+ pshufd m5, m4, 0x1
+ movd eax, m4
+ movd edx, m5
+%endif
+ RET
diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c
new file mode 100644
index 0000000000..94506aad0f
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c
@@ -0,0 +1,907 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <tmmintrin.h> // SSSE3
+
+#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_scale_rtcd.h"
+#include "vpx_dsp/x86/convolve_ssse3.h"
+#include "vpx_dsp/x86/mem_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_scale/yv12config.h"
+
+static INLINE __m128i scale_plane_2_to_1_phase_0_kernel(
+ const uint8_t *const src, const __m128i *const mask) {
+ const __m128i a = _mm_loadu_si128((const __m128i *)(&src[0]));
+ const __m128i b = _mm_loadu_si128((const __m128i *)(&src[16]));
+ const __m128i a_and = _mm_and_si128(a, *mask);
+ const __m128i b_and = _mm_and_si128(b, *mask);
+ return _mm_packus_epi16(a_and, b_and);
+}
+
+static void scale_plane_2_to_1_phase_0(const uint8_t *src,
+ const ptrdiff_t src_stride, uint8_t *dst,
+ const ptrdiff_t dst_stride,
+ const int dst_w, const int dst_h) {
+ const int max_width = (dst_w + 15) & ~15;
+ const __m128i mask = _mm_set1_epi16(0x00FF);
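+  // The mask keeps only the even-indexed source pixels (low byte of each
+  // 16-bit pair); the kernel then packs them down to bytes.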
+ int y = dst_h;
+
+ do {
+ int x = max_width;
+ do {
+ const __m128i d = scale_plane_2_to_1_phase_0_kernel(src, &mask);
+ _mm_storeu_si128((__m128i *)dst, d);
+ src += 32;
+ dst += 16;
+ x -= 16;
+ } while (x);
+ src += 2 * (src_stride - max_width);
+ dst += dst_stride - max_width;
+ } while (--y);
+}
+
+static void scale_plane_4_to_1_phase_0(const uint8_t *src,
+ const ptrdiff_t src_stride, uint8_t *dst,
+ const ptrdiff_t dst_stride,
+ const int dst_w, const int dst_h) {
+ const int max_width = (dst_w + 15) & ~15;
+ const __m128i mask = _mm_set1_epi32(0x000000FF);
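+  // The mask keeps every fourth source pixel; the kernel plus the extra pack
+  // below narrow them down to bytes.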
+ int y = dst_h;
+
+ do {
+ int x = max_width;
+ do {
+ const __m128i d0 = scale_plane_2_to_1_phase_0_kernel(&src[0], &mask);
+ const __m128i d1 = scale_plane_2_to_1_phase_0_kernel(&src[32], &mask);
+ const __m128i d2 = _mm_packus_epi16(d0, d1);
+ _mm_storeu_si128((__m128i *)dst, d2);
+ src += 64;
+ dst += 16;
+ x -= 16;
+ } while (x);
+ src += 4 * (src_stride - max_width);
+ dst += dst_stride - max_width;
+ } while (--y);
+}
+
+static INLINE __m128i scale_plane_bilinear_kernel(const __m128i *const s,
+ const __m128i c0c1) {
+ const __m128i k_64 = _mm_set1_epi16(1 << 6);
+ const __m128i t0 = _mm_maddubs_epi16(s[0], c0c1);
+ const __m128i t1 = _mm_maddubs_epi16(s[1], c0c1);
+ // round and shift by 7 bit each 16 bit
+ const __m128i t2 = _mm_adds_epi16(t0, k_64);
+ const __m128i t3 = _mm_adds_epi16(t1, k_64);
+ const __m128i t4 = _mm_srai_epi16(t2, 7);
+ const __m128i t5 = _mm_srai_epi16(t3, 7);
+ return _mm_packus_epi16(t4, t5);
+}
+
+static void scale_plane_2_to_1_bilinear(const uint8_t *src,
+ const ptrdiff_t src_stride,
+ uint8_t *dst,
+ const ptrdiff_t dst_stride,
+ const int dst_w, const int dst_h,
+ const __m128i c0c1) {
+ const int max_width = (dst_w + 15) & ~15;
+ int y = dst_h;
+
+ do {
+ int x = max_width;
+ do {
+ __m128i s[2], d[2];
+
+ // Horizontal
+ // Even rows
+ s[0] = _mm_loadu_si128((const __m128i *)(src + 0));
+ s[1] = _mm_loadu_si128((const __m128i *)(src + 16));
+ d[0] = scale_plane_bilinear_kernel(s, c0c1);
+
+ // odd rows
+ s[0] = _mm_loadu_si128((const __m128i *)(src + src_stride + 0));
+ s[1] = _mm_loadu_si128((const __m128i *)(src + src_stride + 16));
+ d[1] = scale_plane_bilinear_kernel(s, c0c1);
+
+ // Vertical
+ s[0] = _mm_unpacklo_epi8(d[0], d[1]);
+ s[1] = _mm_unpackhi_epi8(d[0], d[1]);
+ d[0] = scale_plane_bilinear_kernel(s, c0c1);
+
+ _mm_storeu_si128((__m128i *)dst, d[0]);
+ src += 32;
+ dst += 16;
+ x -= 16;
+ } while (x);
+ src += 2 * (src_stride - max_width);
+ dst += dst_stride - max_width;
+ } while (--y);
+}
+
+static void scale_plane_4_to_1_bilinear(const uint8_t *src,
+ const ptrdiff_t src_stride,
+ uint8_t *dst,
+ const ptrdiff_t dst_stride,
+ const int dst_w, const int dst_h,
+ const __m128i c0c1) {
+ const int max_width = (dst_w + 15) & ~15;
+ int y = dst_h;
+
+ do {
+ int x = max_width;
+ do {
+ __m128i s[8], d[8];
+
+      // Note: Using _mm_packus_epi32() from SSE4.1 could be faster.
+      // Here we avoid shuffle instructions, which are slow on some x86 CPUs.
+
+ // Horizontal
+ // 000 001 xx xx 004 005 xx xx 008 009 xx xx 00C 00D xx xx
+ // 010 011 xx xx 014 015 xx xx 018 019 xx xx 01C 01D xx xx
+ // 020 021 xx xx 024 025 xx xx 028 029 xx xx 02C 02D xx xx
+ // 030 031 xx xx 034 035 xx xx 038 039 xx xx 03C 03D xx xx
+ // 100 101 xx xx 104 105 xx xx 108 109 xx xx 10C 10D xx xx
+ // 110 111 xx xx 114 115 xx xx 118 119 xx xx 11C 11D xx xx
+ // 120 121 xx xx 124 125 xx xx 128 129 xx xx 12C 12D xx xx
+ // 130 131 xx xx 134 135 xx xx 138 139 xx xx 13C 13D xx xx
+ s[0] = _mm_loadu_si128((const __m128i *)(&src[0]));
+ s[1] = _mm_loadu_si128((const __m128i *)(&src[16]));
+ s[2] = _mm_loadu_si128((const __m128i *)(&src[32]));
+ s[3] = _mm_loadu_si128((const __m128i *)(&src[48]));
+ s[4] = _mm_loadu_si128((const __m128i *)(src + src_stride + 0));
+ s[5] = _mm_loadu_si128((const __m128i *)(src + src_stride + 16));
+ s[6] = _mm_loadu_si128((const __m128i *)(src + src_stride + 32));
+ s[7] = _mm_loadu_si128((const __m128i *)(src + src_stride + 48));
+
+ // 000 001 100 101 xx xx xx xx 004 005 104 105 xx xx xx xx
+ // 008 009 108 109 xx xx xx xx 00C 00D 10C 10D xx xx xx xx
+ // 010 011 110 111 xx xx xx xx 014 015 114 115 xx xx xx xx
+ // 018 019 118 119 xx xx xx xx 01C 01D 11C 11D xx xx xx xx
+ // 020 021 120 121 xx xx xx xx 024 025 124 125 xx xx xx xx
+ // 028 029 128 129 xx xx xx xx 02C 02D 12C 12D xx xx xx xx
+ // 030 031 130 131 xx xx xx xx 034 035 134 135 xx xx xx xx
+ // 038 039 138 139 xx xx xx xx 03C 03D 13C 13D xx xx xx xx
+ d[0] = _mm_unpacklo_epi16(s[0], s[4]);
+ d[1] = _mm_unpackhi_epi16(s[0], s[4]);
+ d[2] = _mm_unpacklo_epi16(s[1], s[5]);
+ d[3] = _mm_unpackhi_epi16(s[1], s[5]);
+ d[4] = _mm_unpacklo_epi16(s[2], s[6]);
+ d[5] = _mm_unpackhi_epi16(s[2], s[6]);
+ d[6] = _mm_unpacklo_epi16(s[3], s[7]);
+ d[7] = _mm_unpackhi_epi16(s[3], s[7]);
+
+ // 000 001 100 101 008 009 108 109 xx xx xx xx xx xx xx xx
+ // 004 005 104 105 00C 00D 10C 10D xx xx xx xx xx xx xx xx
+ // 010 011 110 111 018 019 118 119 xx xx xx xx xx xx xx xx
+ // 014 015 114 115 01C 01D 11C 11D xx xx xx xx xx xx xx xx
+ // 020 021 120 121 028 029 128 129 xx xx xx xx xx xx xx xx
+ // 024 025 124 125 02C 02D 12C 12D xx xx xx xx xx xx xx xx
+ // 030 031 130 131 038 039 138 139 xx xx xx xx xx xx xx xx
+ // 034 035 134 135 03C 03D 13C 13D xx xx xx xx xx xx xx xx
+ s[0] = _mm_unpacklo_epi32(d[0], d[1]);
+ s[1] = _mm_unpackhi_epi32(d[0], d[1]);
+ s[2] = _mm_unpacklo_epi32(d[2], d[3]);
+ s[3] = _mm_unpackhi_epi32(d[2], d[3]);
+ s[4] = _mm_unpacklo_epi32(d[4], d[5]);
+ s[5] = _mm_unpackhi_epi32(d[4], d[5]);
+ s[6] = _mm_unpacklo_epi32(d[6], d[7]);
+ s[7] = _mm_unpackhi_epi32(d[6], d[7]);
+
+ // 000 001 100 101 004 005 104 105 008 009 108 109 00C 00D 10C 10D
+ // 010 011 110 111 014 015 114 115 018 019 118 119 01C 01D 11C 11D
+ // 020 021 120 121 024 025 124 125 028 029 128 129 02C 02D 12C 12D
+ // 030 031 130 131 034 035 134 135 038 039 138 139 03C 03D 13C 13D
+ d[0] = _mm_unpacklo_epi32(s[0], s[1]);
+ d[1] = _mm_unpacklo_epi32(s[2], s[3]);
+ d[2] = _mm_unpacklo_epi32(s[4], s[5]);
+ d[3] = _mm_unpacklo_epi32(s[6], s[7]);
+
+ d[0] = scale_plane_bilinear_kernel(&d[0], c0c1);
+ d[1] = scale_plane_bilinear_kernel(&d[2], c0c1);
+
+ // Vertical
+ d[0] = scale_plane_bilinear_kernel(d, c0c1);
+
+ _mm_storeu_si128((__m128i *)dst, d[0]);
+ src += 64;
+ dst += 16;
+ x -= 16;
+ } while (x);
+ src += 4 * (src_stride - max_width);
+ dst += dst_stride - max_width;
+ } while (--y);
+}
+
+static void scale_plane_2_to_1_general(const uint8_t *src, const int src_stride,
+ uint8_t *dst, const int dst_stride,
+ const int w, const int h,
+ const int16_t *const coef,
+ uint8_t *const temp_buffer) {
+ const int width_hor = (w + 3) & ~3;
+ const int width_ver = (w + 7) & ~7;
+ const int height_hor = (2 * h + SUBPEL_TAPS - 2 + 7) & ~7;
+ const int height_ver = (h + 3) & ~3;
+ int x, y = height_hor;
+ uint8_t *t = temp_buffer;
+ __m128i s[11], d[4];
+ __m128i f[4];
+
+ assert(w && h);
+
+ shuffle_filter_ssse3(coef, f);
+ src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 1;
+
+ // horizontal 4x8
+ do {
+ load_8bit_8x8(src + 2, src_stride, s);
+ // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71
+ // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73
+ // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75
+ // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 (overlapped)
+ transpose_16bit_4x8(s, s);
+ x = width_hor;
+
+ do {
+ src += 8;
+ load_8bit_8x8(src, src_stride, &s[3]);
+ // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77
+ // 08 09 18 19 28 29 38 39 48 49 58 59 68 69 78 79
+ // 0A 0B 1A 1B 2A 2B 3A 3B 4A 4B 5A 5B 6A 6B 7A 7B
+ // 0C 0D 1C 1D 2C 2D 3C 3D 4C 4D 5C 5D 6C 6D 7C 7D
+ transpose_16bit_4x8(&s[3], &s[3]);
+
+ d[0] = convolve8_8_ssse3(&s[0], f); // 00 10 20 30 40 50 60 70
+ d[1] = convolve8_8_ssse3(&s[1], f); // 01 11 21 31 41 51 61 71
+ d[2] = convolve8_8_ssse3(&s[2], f); // 02 12 22 32 42 52 62 72
+ d[3] = convolve8_8_ssse3(&s[3], f); // 03 13 23 33 43 53 63 73
+
+ // 00 10 20 30 40 50 60 70 02 12 22 32 42 52 62 72
+ // 01 11 21 31 41 51 61 71 03 13 23 33 43 53 63 73
+ d[0] = _mm_packus_epi16(d[0], d[2]);
+ d[1] = _mm_packus_epi16(d[1], d[3]);
+ // 00 10 01 11 20 30 21 31 40 50 41 51 60 70 61 71
+ // 02 12 03 13 22 32 23 33 42 52 43 53 62 72 63 73
+ d[2] = _mm_unpacklo_epi16(d[0], d[1]);
+ d[3] = _mm_unpackhi_epi16(d[0], d[1]);
+ // 00 10 01 11 02 12 03 13 20 30 21 31 22 32 23 33
+ // 40 50 41 51 42 52 43 53 60 70 61 71 62 72 63 73
+ d[0] = _mm_unpacklo_epi32(d[2], d[3]);
+ d[1] = _mm_unpackhi_epi32(d[2], d[3]);
+ store_8bit_8x4_from_16x2(d, t, 2 * width_hor);
+
+ s[0] = s[4];
+ s[1] = s[5];
+ s[2] = s[6];
+
+ t += 8;
+ x -= 4;
+ } while (x);
+ src += 8 * src_stride - 2 * width_hor;
+ t += 6 * width_hor;
+ y -= 8;
+ } while (y);
+
+ // vertical 8x4
+ x = width_ver;
+ t = temp_buffer;
+ do {
+ // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+ s[0] = _mm_loadu_si128((const __m128i *)(t + 0 * width_hor));
+ s[1] = _mm_loadu_si128((const __m128i *)(t + 2 * width_hor));
+ s[2] = _mm_loadu_si128((const __m128i *)(t + 4 * width_hor));
+ t += 6 * width_hor;
+ y = height_ver;
+
+ do {
+ // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+      // 80 90 81 91 82 92 83 93 84 94 85 95 86 96 87 97
+      // A0 B0 A1 B1 A2 B2 A3 B3 A4 B4 A5 B5 A6 B6 A7 B7
+      // C0 D0 C1 D1 C2 D2 C3 D3 C4 D4 C5 D5 C6 D6 C7 D7
+ loadu_8bit_16x4(t, 2 * width_hor, &s[3]);
+ t += 8 * width_hor;
+
+ d[0] = convolve8_8_ssse3(&s[0], f); // 00 01 02 03 04 05 06 07
+ d[1] = convolve8_8_ssse3(&s[1], f); // 10 11 12 13 14 15 16 17
+ d[2] = convolve8_8_ssse3(&s[2], f); // 20 21 22 23 24 25 26 27
+ d[3] = convolve8_8_ssse3(&s[3], f); // 30 31 32 33 34 35 36 37
+
+ // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37
+ d[0] = _mm_packus_epi16(d[0], d[1]);
+ d[1] = _mm_packus_epi16(d[2], d[3]);
+ store_8bit_8x4_from_16x2(d, dst, dst_stride);
+
+ s[0] = s[4];
+ s[1] = s[5];
+ s[2] = s[6];
+
+ dst += 4 * dst_stride;
+ y -= 4;
+ } while (y);
+ t -= width_hor * (2 * height_ver + 6);
+ t += 16;
+ dst -= height_ver * dst_stride;
+ dst += 8;
+ x -= 8;
+ } while (x);
+}
+
+static void scale_plane_4_to_1_general(const uint8_t *src, const int src_stride,
+ uint8_t *dst, const int dst_stride,
+ const int w, const int h,
+ const int16_t *const coef,
+ uint8_t *const temp_buffer) {
+ const int width_hor = (w + 1) & ~1;
+ const int width_ver = (w + 7) & ~7;
+ const int height_hor = (4 * h + SUBPEL_TAPS - 2 + 7) & ~7;
+ const int height_ver = (h + 1) & ~1;
+ int x, y = height_hor;
+ uint8_t *t = temp_buffer;
+ __m128i s[11], d[4];
+ __m128i f[4];
+
+ assert(w && h);
+
+ shuffle_filter_ssse3(coef, f);
+ src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 3;
+
+ // horizontal 2x8
+ do {
+ load_8bit_8x8(src + 4, src_stride, s);
+ // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71
+ // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73
+ // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75 (overlapped)
+ // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 (overlapped)
+ transpose_16bit_4x8(s, s);
+ x = width_hor;
+
+ do {
+ src += 8;
+ load_8bit_8x8(src, src_stride, &s[2]);
+ // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75
+ // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77
+ // 08 09 18 19 28 29 38 39 48 49 58 59 68 69 78 79
+ // 0A 0B 1A 1B 2A 2B 3A 3B 4A 4B 5A 5B 6A 6B 7A 7B
+ transpose_16bit_4x8(&s[2], &s[2]);
+
+ d[0] = convolve8_8_ssse3(&s[0], f); // 00 10 20 30 40 50 60 70
+ d[1] = convolve8_8_ssse3(&s[2], f); // 01 11 21 31 41 51 61 71
+
+ // 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx xx
+ // 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx
+ d[0] = _mm_packus_epi16(d[0], d[0]);
+ d[1] = _mm_packus_epi16(d[1], d[1]);
+ // 00 10 01 11 20 30 21 31 40 50 41 51 60 70 61 71
+ d[0] = _mm_unpacklo_epi16(d[0], d[1]);
+ store_8bit_4x4_sse2(d[0], t, 2 * width_hor);
+
+ s[0] = s[4];
+ s[1] = s[5];
+
+ t += 4;
+ x -= 2;
+ } while (x);
+ src += 8 * src_stride - 4 * width_hor;
+ t += 6 * width_hor;
+ y -= 8;
+ } while (y);
+
+ // vertical 8x2
+ x = width_ver;
+ t = temp_buffer;
+ do {
+ // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ s[0] = _mm_loadu_si128((const __m128i *)(t + 0 * width_hor));
+ s[1] = _mm_loadu_si128((const __m128i *)(t + 2 * width_hor));
+ t += 4 * width_hor;
+ y = height_ver;
+
+ do {
+ // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+ // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+      // 80 90 81 91 82 92 83 93 84 94 85 95 86 96 87 97
+      // A0 B0 A1 B1 A2 B2 A3 B3 A4 B4 A5 B5 A6 B6 A7 B7
+ loadu_8bit_16x4(t, 2 * width_hor, &s[2]);
+ t += 8 * width_hor;
+
+ d[0] = convolve8_8_ssse3(&s[0], f); // 00 01 02 03 04 05 06 07
+ d[1] = convolve8_8_ssse3(&s[2], f); // 10 11 12 13 14 15 16 17
+
+ // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
+ d[0] = _mm_packus_epi16(d[0], d[1]);
+ _mm_storel_epi64((__m128i *)(dst + 0 * dst_stride), d[0]);
+ _mm_storeh_epi64((__m128i *)(dst + 1 * dst_stride), d[0]);
+
+ s[0] = s[4];
+ s[1] = s[5];
+
+ dst += 2 * dst_stride;
+ y -= 2;
+ } while (y);
+ t -= width_hor * (4 * height_ver + 4);
+ t += 16;
+ dst -= height_ver * dst_stride;
+ dst += 8;
+ x -= 8;
+ } while (x);
+}
+
+typedef void (*shuffle_filter_funcs)(const int16_t *const filter,
+ __m128i *const f);
+
+typedef __m128i (*convolve8_funcs)(const __m128i *const s,
+ const __m128i *const f);
+
+static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride,
+ uint8_t *dst, const int dst_stride,
+ const int w, const int h,
+ const InterpKernel *const coef,
+ const int phase_scaler,
+ uint8_t *const temp_buffer) {
+ static const int step_q4 = 16 * 4 / 3;
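+  // Source step per output pixel in q4 (1/16-pel) units: 4/3 of a pixel.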
+ const int width_hor = (w + 5) - ((w + 5) % 6);
+ const int stride_hor = 2 * width_hor + 4; // store 4 extra pixels
+ const int width_ver = (w + 7) & ~7;
+ // We need (SUBPEL_TAPS - 1) extra rows: (SUBPEL_TAPS / 2 - 1) extra rows
+ // above and (SUBPEL_TAPS / 2) extra rows below.
+ const int height_hor = (4 * h / 3 + SUBPEL_TAPS - 1 + 7) & ~7;
+ const int height_ver = (h + 5) - ((h + 5) % 6);
+ int x, y = height_hor;
+ uint8_t *t = temp_buffer;
+ __m128i s[12], d[6], dd[4];
+ __m128i f0[4], f1[5], f2[5];
+ // The offset of the first row is always less than 1 pixel.
+ const int offset1_q4 = phase_scaler + 1 * step_q4;
+ const int offset2_q4 = phase_scaler + 2 * step_q4;
+  // offset_idx1 and offset_idx2 indicate whether the corresponding pixel
+  // offset is even (0) or odd (1). They are used to choose the src offset and
+  // the filter coefficient offset.
+ const int offset_idx1 = (offset1_q4 >> 4) & 1;
+ const int offset_idx2 = (offset2_q4 >> 4) & 1;
+ static const shuffle_filter_funcs kShuffleFilterFuncs[2] = {
+ shuffle_filter_ssse3, shuffle_filter_odd_ssse3
+ };
+ static const convolve8_funcs kConvolve8Funcs[2] = {
+ convolve8_8_even_offset_ssse3, convolve8_8_odd_offset_ssse3
+ };
+
+ assert(w && h);
+
+ shuffle_filter_ssse3(coef[(phase_scaler + 0 * step_q4) & SUBPEL_MASK], f0);
+ kShuffleFilterFuncs[offset_idx1](coef[offset1_q4 & SUBPEL_MASK], f1);
+ kShuffleFilterFuncs[offset_idx2](coef[offset2_q4 & SUBPEL_MASK], f2);
+
+ // Sub 64 to avoid overflow.
+ // Coef 128 would be treated as -128 in PMADDUBSW. Sub 64 here.
+ // Coef 128 is in either fx[1] or fx[2] depending on the phase idx.
+  // When the filter phase index is 1, the two largest coefficients are
+  // shuffled together, and their sum is always at least 128. Sub 64 here.
+  // After the subtraction, as long as the sum of all positive coefficients is
+  // no larger than 128 and the sum of all negative coefficients is no less
+  // than -128, there is no overflow in the convolve8 functions.
+ f0[1] = _mm_sub_epi8(f0[1], _mm_set1_epi8(64));
+ f1[1 + offset_idx1] = _mm_sub_epi8(f1[1 + offset_idx1], _mm_set1_epi8(64));
+ f2[1 + offset_idx2] = _mm_sub_epi8(f2[1 + offset_idx2], _mm_set1_epi8(64));
+
+ src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 - 1;
+
+ // horizontal 6x8
+ do {
+ load_8bit_8x8(src, src_stride, s);
+ // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71
+ // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73
+ // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75
+ // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77
+ transpose_16bit_4x8(s, s);
+ x = width_hor;
+
+ do {
+ src += 8;
+ load_8bit_8x8(src, src_stride, &s[4]);
+ // 08 09 18 19 28 29 38 39 48 49 58 59 68 69 78 79
+ // 0A 0B 1A 1B 2A 2B 3A 3B 4A 4B 5A 5B 6A 6B 7A 7B
+      // 0C 0D 1C 1D 2C 2D 3C 3D 4C 4D 5C 5D 6C 6D 7C 7D
+ // 0E 0F 1E 1F 2E 2F 3E 3F 4E 4F 5E 5F 6E 6F 7E 7F
+ transpose_16bit_4x8(&s[4], &s[4]);
+
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ d[0] = convolve8_8_even_offset_ssse3(&s[0], f0);
+ d[1] = kConvolve8Funcs[offset_idx1](&s[offset1_q4 >> 5], f1);
+ d[2] = kConvolve8Funcs[offset_idx2](&s[offset2_q4 >> 5], f2);
+ d[3] = convolve8_8_even_offset_ssse3(&s[2], f0);
+ d[4] = kConvolve8Funcs[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1);
+ d[5] = kConvolve8Funcs[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2);
+
+ // 00 10 20 30 40 50 60 70 02 12 22 32 42 52 62 72
+ // 01 11 21 31 41 51 61 71 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74 xx xx xx xx xx xx xx xx
+ // 05 15 25 35 45 55 65 75 xx xx xx xx xx xx xx xx
+ dd[0] = _mm_packus_epi16(d[0], d[2]);
+ dd[1] = _mm_packus_epi16(d[1], d[3]);
+ dd[2] = _mm_packus_epi16(d[4], d[4]);
+ dd[3] = _mm_packus_epi16(d[5], d[5]);
+
+ // 00 10 01 11 20 30 21 31 40 50 41 51 60 70 61 71
+ // 02 12 03 13 22 32 23 33 42 52 43 53 62 72 63 73
+ // 04 14 05 15 24 34 25 35 44 54 45 55 64 74 65 75
+ d[0] = _mm_unpacklo_epi16(dd[0], dd[1]);
+ d[1] = _mm_unpackhi_epi16(dd[0], dd[1]);
+ d[2] = _mm_unpacklo_epi16(dd[2], dd[3]);
+
+ // 00 10 01 11 02 12 03 13 20 30 21 31 22 32 23 33
+ // 40 50 41 51 42 52 43 53 60 70 61 71 62 72 63 73
+ // 04 14 05 15 xx xx xx xx 24 34 25 35 xx xx xx xx
+ // 44 54 45 55 xx xx xx xx 64 74 65 75 xx xx xx xx
+ dd[0] = _mm_unpacklo_epi32(d[0], d[1]);
+ dd[1] = _mm_unpackhi_epi32(d[0], d[1]);
+ dd[2] = _mm_unpacklo_epi32(d[2], d[2]);
+ dd[3] = _mm_unpackhi_epi32(d[2], d[2]);
+
+ // 00 10 01 11 02 12 03 13 04 14 05 15 xx xx xx xx
+ // 20 30 21 31 22 32 23 33 24 34 25 35 xx xx xx xx
+ // 40 50 41 51 42 52 43 53 44 54 45 55 xx xx xx xx
+ // 60 70 61 71 62 72 63 73 64 74 65 75 xx xx xx xx
+ d[0] = _mm_unpacklo_epi64(dd[0], dd[2]);
+ d[1] = _mm_unpackhi_epi64(dd[0], dd[2]);
+ d[2] = _mm_unpacklo_epi64(dd[1], dd[3]);
+ d[3] = _mm_unpackhi_epi64(dd[1], dd[3]);
+
+ // store 4 extra pixels
+ storeu_8bit_16x4(d, t, stride_hor);
+
+ s[0] = s[4];
+ s[1] = s[5];
+ s[2] = s[6];
+ s[3] = s[7];
+
+ t += 12;
+ x -= 6;
+ } while (x);
+ src += 8 * src_stride - 4 * width_hor / 3;
+ t += 3 * stride_hor + 4;
+ y -= 8;
+ } while (y);
+
+ // vertical 8x6
+ x = width_ver;
+ t = temp_buffer;
+ do {
+ // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+ // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+ loadu_8bit_16x4(t, stride_hor, s);
+ y = height_ver;
+
+ do {
+ // 80 90 81 91 82 92 83 93 84 94 85 95 86 96 87 97
+ // A0 B0 A1 B1 A2 B2 A3 B3 A4 B4 A5 B5 A6 B6 A7 B7
+ // C0 D0 C1 D1 C2 D2 C3 D3 C4 D4 C5 D5 C6 D6 C7 D7
+ // E0 F0 E1 F1 E2 F2 E3 F3 E4 F4 E5 F5 E6 F6 E7 F7
+ t += 4 * stride_hor;
+ loadu_8bit_16x4(t, stride_hor, &s[4]);
+
+ d[0] = convolve8_8_even_offset_ssse3(&s[0], f0);
+ d[1] = kConvolve8Funcs[offset_idx1](&s[offset1_q4 >> 5], f1);
+ d[2] = kConvolve8Funcs[offset_idx2](&s[offset2_q4 >> 5], f2);
+ d[3] = convolve8_8_even_offset_ssse3(&s[2], f0);
+ d[4] = kConvolve8Funcs[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1);
+ d[5] = kConvolve8Funcs[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2);
+
+ // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37
+ // 40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57
+ d[0] = _mm_packus_epi16(d[0], d[1]);
+ d[2] = _mm_packus_epi16(d[2], d[3]);
+ d[4] = _mm_packus_epi16(d[4], d[5]);
+
+ _mm_storel_epi64((__m128i *)(dst + 0 * dst_stride), d[0]);
+ _mm_storeh_epi64((__m128i *)(dst + 1 * dst_stride), d[0]);
+ _mm_storel_epi64((__m128i *)(dst + 2 * dst_stride), d[2]);
+ _mm_storeh_epi64((__m128i *)(dst + 3 * dst_stride), d[2]);
+ _mm_storel_epi64((__m128i *)(dst + 4 * dst_stride), d[4]);
+ _mm_storeh_epi64((__m128i *)(dst + 5 * dst_stride), d[4]);
+
+ s[0] = s[4];
+ s[1] = s[5];
+ s[2] = s[6];
+ s[3] = s[7];
+
+ dst += 6 * dst_stride;
+ y -= 6;
+ } while (y);
+ t -= stride_hor * 2 * height_ver / 3;
+ t += 16;
+ dst -= height_ver * dst_stride;
+ dst += 8;
+ x -= 8;
+ } while (x);
+}
+
+static INLINE __m128i scale_1_to_2_phase_0_kernel(const __m128i *const s,
+ const __m128i *const f) {
+ __m128i ss[4], temp;
+
+ ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
+ ss[1] = _mm_unpacklo_epi8(s[2], s[3]);
+ ss[2] = _mm_unpacklo_epi8(s[4], s[5]);
+ ss[3] = _mm_unpacklo_epi8(s[6], s[7]);
+ temp = convolve8_8_ssse3(ss, f);
+ return _mm_packus_epi16(temp, temp);
+}
+
+// Only calculate the odd columns, since the even columns are just copies of
+// the src pixels.
+static void scale_1_to_2_phase_0_row(const uint8_t *src, uint8_t *dst,
+ const int w, const __m128i *const f) {
+ int x = w;
+
+ do {
+ __m128i s[8], temp;
+ s[0] = _mm_loadl_epi64((const __m128i *)(src + 0));
+ s[1] = _mm_loadl_epi64((const __m128i *)(src + 1));
+ s[2] = _mm_loadl_epi64((const __m128i *)(src + 2));
+ s[3] = _mm_loadl_epi64((const __m128i *)(src + 3));
+ s[4] = _mm_loadl_epi64((const __m128i *)(src + 4));
+ s[5] = _mm_loadl_epi64((const __m128i *)(src + 5));
+ s[6] = _mm_loadl_epi64((const __m128i *)(src + 6));
+ s[7] = _mm_loadl_epi64((const __m128i *)(src + 7));
+ temp = scale_1_to_2_phase_0_kernel(s, f);
+ _mm_storel_epi64((__m128i *)dst, temp);
+ src += 8;
+ dst += 8;
+ x -= 8;
+ } while (x);
+}
+
+static void scale_plane_1_to_2_phase_0(const uint8_t *src,
+ const ptrdiff_t src_stride, uint8_t *dst,
+ const ptrdiff_t dst_stride,
+ const int src_w, const int src_h,
+ const int16_t *const coef,
+ uint8_t *const temp_buffer) {
+ int max_width;
+ int y;
+ uint8_t *tmp[9];
+ __m128i f[4];
+
+ max_width = (src_w + 7) & ~7;
+ tmp[0] = temp_buffer + 0 * max_width;
+ tmp[1] = temp_buffer + 1 * max_width;
+ tmp[2] = temp_buffer + 2 * max_width;
+ tmp[3] = temp_buffer + 3 * max_width;
+ tmp[4] = temp_buffer + 4 * max_width;
+ tmp[5] = temp_buffer + 5 * max_width;
+ tmp[6] = temp_buffer + 6 * max_width;
+ tmp[7] = temp_buffer + 7 * max_width;
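+  // tmp[0..7] cache the horizontally filtered (odd-column) rows that feed the
+  // 8-tap vertical filter.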
+
+ shuffle_filter_ssse3(coef, f);
+
+ scale_1_to_2_phase_0_row(src - 3 * src_stride - 3, tmp[0], max_width, f);
+ scale_1_to_2_phase_0_row(src - 2 * src_stride - 3, tmp[1], max_width, f);
+ scale_1_to_2_phase_0_row(src - 1 * src_stride - 3, tmp[2], max_width, f);
+ scale_1_to_2_phase_0_row(src + 0 * src_stride - 3, tmp[3], max_width, f);
+ scale_1_to_2_phase_0_row(src + 1 * src_stride - 3, tmp[4], max_width, f);
+ scale_1_to_2_phase_0_row(src + 2 * src_stride - 3, tmp[5], max_width, f);
+ scale_1_to_2_phase_0_row(src + 3 * src_stride - 3, tmp[6], max_width, f);
+
+ y = src_h;
+ do {
+ int x;
+ scale_1_to_2_phase_0_row(src + 4 * src_stride - 3, tmp[7], max_width, f);
+ for (x = 0; x < max_width; x += 8) {
+ __m128i s[8], C, D, CD;
+
+ // Even rows
+ const __m128i a = _mm_loadl_epi64((const __m128i *)(src + x));
+ const __m128i b = _mm_loadl_epi64((const __m128i *)(tmp[3] + x));
+ const __m128i ab = _mm_unpacklo_epi8(a, b);
+ _mm_storeu_si128((__m128i *)(dst + 2 * x), ab);
+
+ // Odd rows
+ // Even columns
+ load_8bit_8x8(src + x - 3 * src_stride, src_stride, s);
+ C = scale_1_to_2_phase_0_kernel(s, f);
+
+ // Odd columns
+ s[0] = _mm_loadl_epi64((const __m128i *)(tmp[0] + x));
+ s[1] = _mm_loadl_epi64((const __m128i *)(tmp[1] + x));
+ s[2] = _mm_loadl_epi64((const __m128i *)(tmp[2] + x));
+ s[3] = _mm_loadl_epi64((const __m128i *)(tmp[3] + x));
+ s[4] = _mm_loadl_epi64((const __m128i *)(tmp[4] + x));
+ s[5] = _mm_loadl_epi64((const __m128i *)(tmp[5] + x));
+ s[6] = _mm_loadl_epi64((const __m128i *)(tmp[6] + x));
+ s[7] = _mm_loadl_epi64((const __m128i *)(tmp[7] + x));
+ D = scale_1_to_2_phase_0_kernel(s, f);
+
+ CD = _mm_unpacklo_epi8(C, D);
+ _mm_storeu_si128((__m128i *)(dst + dst_stride + 2 * x), CD);
+ }
+
+ src += src_stride;
+ dst += 2 * dst_stride;
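+    // Rotate the row buffers so the oldest filtered row is overwritten on the
+    // next iteration.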
+ tmp[8] = tmp[0];
+ tmp[0] = tmp[1];
+ tmp[1] = tmp[2];
+ tmp[2] = tmp[3];
+ tmp[3] = tmp[4];
+ tmp[4] = tmp[5];
+ tmp[5] = tmp[6];
+ tmp[6] = tmp[7];
+ tmp[7] = tmp[8];
+ } while (--y);
+}
+
+void vp9_scale_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst,
+ uint8_t filter_type, int phase_scaler) {
+ const int src_w = src->y_crop_width;
+ const int src_h = src->y_crop_height;
+ const int dst_w = dst->y_crop_width;
+ const int dst_h = dst->y_crop_height;
+ const int dst_uv_w = dst->uv_crop_width;
+ const int dst_uv_h = dst->uv_crop_height;
+ int scaled = 0;
+
+ // phase_scaler is usually 0 or 8.
+ assert(phase_scaler >= 0 && phase_scaler < 16);
+
+ if (dst_w * 2 == src_w && dst_h * 2 == src_h) {
+ // 2 to 1
+ scaled = 1;
+
+ if (phase_scaler == 0) {
+ scale_plane_2_to_1_phase_0(src->y_buffer, src->y_stride, dst->y_buffer,
+ dst->y_stride, dst_w, dst_h);
+ scale_plane_2_to_1_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer,
+ dst->uv_stride, dst_uv_w, dst_uv_h);
+ scale_plane_2_to_1_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer,
+ dst->uv_stride, dst_uv_w, dst_uv_h);
+ } else if (filter_type == BILINEAR) {
+ const int16_t c0 = vp9_filter_kernels[BILINEAR][phase_scaler][3];
+ const int16_t c1 = vp9_filter_kernels[BILINEAR][phase_scaler][4];
+ const __m128i c0c1 = _mm_set1_epi16(c0 | (c1 << 8)); // c0 and c1 >= 0
+ scale_plane_2_to_1_bilinear(src->y_buffer, src->y_stride, dst->y_buffer,
+ dst->y_stride, dst_w, dst_h, c0c1);
+ scale_plane_2_to_1_bilinear(src->u_buffer, src->uv_stride, dst->u_buffer,
+ dst->uv_stride, dst_uv_w, dst_uv_h, c0c1);
+ scale_plane_2_to_1_bilinear(src->v_buffer, src->uv_stride, dst->v_buffer,
+ dst->uv_stride, dst_uv_w, dst_uv_h, c0c1);
+ } else {
+ const int buffer_stride = (dst_w + 3) & ~3;
+ const int buffer_height = (2 * dst_h + SUBPEL_TAPS - 2 + 7) & ~7;
+ uint8_t *const temp_buffer =
+ (uint8_t *)malloc(buffer_stride * buffer_height);
+ if (temp_buffer) {
+ scale_plane_2_to_1_general(
+ src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w,
+ dst_h, vp9_filter_kernels[filter_type][phase_scaler], temp_buffer);
+ scale_plane_2_to_1_general(
+ src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride,
+ dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler],
+ temp_buffer);
+ scale_plane_2_to_1_general(
+ src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride,
+ dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler],
+ temp_buffer);
+ free(temp_buffer);
+ } else {
+ scaled = 0;
+ }
+ }
+ } else if (4 * dst_w == src_w && 4 * dst_h == src_h) {
+ // 4 to 1
+ scaled = 1;
+ if (phase_scaler == 0) {
+ scale_plane_4_to_1_phase_0(src->y_buffer, src->y_stride, dst->y_buffer,
+ dst->y_stride, dst_w, dst_h);
+ scale_plane_4_to_1_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer,
+ dst->uv_stride, dst_uv_w, dst_uv_h);
+ scale_plane_4_to_1_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer,
+ dst->uv_stride, dst_uv_w, dst_uv_h);
+ } else if (filter_type == BILINEAR) {
+ const int16_t c0 = vp9_filter_kernels[BILINEAR][phase_scaler][3];
+ const int16_t c1 = vp9_filter_kernels[BILINEAR][phase_scaler][4];
+ const __m128i c0c1 = _mm_set1_epi16(c0 | (c1 << 8)); // c0 and c1 >= 0
+ scale_plane_4_to_1_bilinear(src->y_buffer, src->y_stride, dst->y_buffer,
+ dst->y_stride, dst_w, dst_h, c0c1);
+ scale_plane_4_to_1_bilinear(src->u_buffer, src->uv_stride, dst->u_buffer,
+ dst->uv_stride, dst_uv_w, dst_uv_h, c0c1);
+ scale_plane_4_to_1_bilinear(src->v_buffer, src->uv_stride, dst->v_buffer,
+ dst->uv_stride, dst_uv_w, dst_uv_h, c0c1);
+ } else {
+ const int buffer_stride = (dst_w + 1) & ~1;
+ const int buffer_height = (4 * dst_h + SUBPEL_TAPS - 2 + 7) & ~7;
+ // When dst_w is 1 or 2, we need extra padding to avoid heap read overflow
+ const int extra_padding = 16;
+ uint8_t *const temp_buffer =
+ (uint8_t *)malloc(buffer_stride * buffer_height + extra_padding);
+ if (temp_buffer) {
+ scale_plane_4_to_1_general(
+ src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w,
+ dst_h, vp9_filter_kernels[filter_type][phase_scaler], temp_buffer);
+ scale_plane_4_to_1_general(
+ src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride,
+ dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler],
+ temp_buffer);
+ scale_plane_4_to_1_general(
+ src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride,
+ dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler],
+ temp_buffer);
+ free(temp_buffer);
+ } else {
+ scaled = 0;
+ }
+ }
+ } else if (4 * dst_w == 3 * src_w && 4 * dst_h == 3 * src_h) {
+ // 4 to 3
+ const int buffer_stride_hor = (dst_w + 5) - ((dst_w + 5) % 6) + 2;
+ const int buffer_stride_ver = (dst_w + 7) & ~7;
+ const int buffer_height = (4 * dst_h / 3 + SUBPEL_TAPS - 1 + 7) & ~7;
+ // When the vertical filter reads more pixels than the horizontal filter
+ // generated in each row, we need extra padding to avoid heap read overflow.
+ // For example, the horizontal filter generates 18 pixels but the vertical
+ // filter reads 24 pixels in a row. The difference is multiplied by 2 since
+ // two rows are interlaced together in the optimization.
+ const int extra_padding = (buffer_stride_ver > buffer_stride_hor)
+ ? 2 * (buffer_stride_ver - buffer_stride_hor)
+ : 0;
+ const int buffer_size = buffer_stride_hor * buffer_height + extra_padding;
+ uint8_t *const temp_buffer = (uint8_t *)malloc(buffer_size);
+ if (temp_buffer) {
+ scaled = 1;
+ scale_plane_4_to_3_general(
+ src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w,
+ dst_h, vp9_filter_kernels[filter_type], phase_scaler, temp_buffer);
+ scale_plane_4_to_3_general(src->u_buffer, src->uv_stride, dst->u_buffer,
+ dst->uv_stride, dst_uv_w, dst_uv_h,
+ vp9_filter_kernels[filter_type], phase_scaler,
+ temp_buffer);
+ scale_plane_4_to_3_general(src->v_buffer, src->uv_stride, dst->v_buffer,
+ dst->uv_stride, dst_uv_w, dst_uv_h,
+ vp9_filter_kernels[filter_type], phase_scaler,
+ temp_buffer);
+ free(temp_buffer);
+ }
+ } else if (dst_w == src_w * 2 && dst_h == src_h * 2 && phase_scaler == 0) {
+ // 1 to 2
+ uint8_t *const temp_buffer = (uint8_t *)malloc(8 * ((src_w + 7) & ~7));
+ if (temp_buffer) {
+ scaled = 1;
+ scale_plane_1_to_2_phase_0(
+ src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, src_w,
+ src_h, vp9_filter_kernels[filter_type][8], temp_buffer);
+ scale_plane_1_to_2_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer,
+ dst->uv_stride, src_w / 2, src_h / 2,
+ vp9_filter_kernels[filter_type][8],
+ temp_buffer);
+ scale_plane_1_to_2_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer,
+ dst->uv_stride, src_w / 2, src_h / 2,
+ vp9_filter_kernels[filter_type][8],
+ temp_buffer);
+ free(temp_buffer);
+ }
+ }
+
+ if (scaled) {
+ vpx_extend_frame_borders(dst);
+ } else {
+ // Call c version for all other scaling ratios.
+ vp9_scale_and_extend_frame_c(src, dst, filter_type, phase_scaler);
+ }
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c
new file mode 100644
index 0000000000..d7aafe7b01
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>
+#include <stdio.h>
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+
+int64_t vp9_highbd_block_error_sse2(const tran_low_t *coeff,
+ const tran_low_t *dqcoeff,
+ intptr_t block_size, int64_t *ssz, int bd) {
+ int i, j, test;
+ uint32_t temp[4];
+ __m128i max, min, cmp0, cmp1, cmp2, cmp3;
+ int64_t error = 0, sqcoeff = 0;
+ const int shift = 2 * (bd - 8);
+ const int rounding = shift > 0 ? 1 << (shift - 1) : 0;
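+  // The accumulated sums are scaled back down by 2 * (bd - 8) bits with
+  // round-to-nearest.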
+
+ for (i = 0; i < block_size; i += 8) {
+ // Load the data into xmm registers
+ __m128i mm_coeff = _mm_load_si128((const __m128i *)(coeff + i));
+ __m128i mm_coeff2 = _mm_load_si128((const __m128i *)(coeff + i + 4));
+ __m128i mm_dqcoeff = _mm_load_si128((const __m128i *)(dqcoeff + i));
+ __m128i mm_dqcoeff2 = _mm_load_si128((const __m128i *)(dqcoeff + i + 4));
+ // Check if any values require more than 15 bit
+ max = _mm_set1_epi32(0x3fff);
+ min = _mm_set1_epi32((int32_t)0xffffc000);
+ cmp0 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff, max),
+ _mm_cmplt_epi32(mm_coeff, min));
+ cmp1 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff2, max),
+ _mm_cmplt_epi32(mm_coeff2, min));
+ cmp2 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff, max),
+ _mm_cmplt_epi32(mm_dqcoeff, min));
+ cmp3 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff2, max),
+ _mm_cmplt_epi32(mm_dqcoeff2, min));
+ test = _mm_movemask_epi8(
+ _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3)));
+
+ if (!test) {
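+      // All values fit in 16 bits: pack to int16 and use one madd per vector.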
+ __m128i mm_diff, error_sse2, sqcoeff_sse2;
+ mm_coeff = _mm_packs_epi32(mm_coeff, mm_coeff2);
+ mm_dqcoeff = _mm_packs_epi32(mm_dqcoeff, mm_dqcoeff2);
+ mm_diff = _mm_sub_epi16(mm_coeff, mm_dqcoeff);
+ error_sse2 = _mm_madd_epi16(mm_diff, mm_diff);
+ sqcoeff_sse2 = _mm_madd_epi16(mm_coeff, mm_coeff);
+ _mm_storeu_si128((__m128i *)temp, error_sse2);
+ error = error + temp[0] + temp[1] + temp[2] + temp[3];
+ _mm_storeu_si128((__m128i *)temp, sqcoeff_sse2);
+ sqcoeff += temp[0] + temp[1] + temp[2] + temp[3];
+ } else {
+ for (j = 0; j < 8; j++) {
+ const int64_t diff = coeff[i + j] - dqcoeff[i + j];
+ error += diff * diff;
+ sqcoeff += (int64_t)coeff[i + j] * (int64_t)coeff[i + j];
+ }
+ }
+ }
+ assert(error >= 0 && sqcoeff >= 0);
+ error = (error + rounding) >> shift;
+ sqcoeff = (sqcoeff + rounding) >> shift;
+
+ *ssz = sqcoeff;
+ return error;
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_avx2.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_avx2.c
new file mode 100644
index 0000000000..e6aa71d58a
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_avx2.c
@@ -0,0 +1,441 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <immintrin.h> // AVX2
+
+#include "./vp9_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/x86/bitdepth_conversion_avx2.h"
+#include "vpx_dsp/x86/quantize_sse2.h"
+
+// Zero fill 8 positions in the output buffer.
+static VPX_FORCE_INLINE void store_zero_tran_low(tran_low_t *a) {
+ const __m256i zero = _mm256_setzero_si256();
+#if CONFIG_VP9_HIGHBITDEPTH
+ _mm256_storeu_si256((__m256i *)(a), zero);
+ _mm256_storeu_si256((__m256i *)(a + 8), zero);
+#else
+ _mm256_storeu_si256((__m256i *)(a), zero);
+#endif
+}
+
+static VPX_FORCE_INLINE void load_fp_values_avx2(
+ const int16_t *round_ptr, __m256i *round, const int16_t *quant_ptr,
+ __m256i *quant, const int16_t *dequant_ptr, __m256i *dequant) {
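+  // Each 128-bit load holds DC followed by AC values. The 0x54 permute keeps
+  // that order in the low 128-bit lane and fills the high lane with AC values
+  // only.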
+ *round = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)round_ptr));
+ *round = _mm256_permute4x64_epi64(*round, 0x54);
+ *quant = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)quant_ptr));
+ *quant = _mm256_permute4x64_epi64(*quant, 0x54);
+ *dequant =
+ _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr));
+ *dequant = _mm256_permute4x64_epi64(*dequant, 0x54);
+}
+
+static VPX_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan,
+ __m256i v_eobmax,
+ __m256i v_mask) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const __m256i v_iscan = _mm256_permute4x64_epi64(
+ _mm256_loadu_si256((const __m256i *)iscan), 0xD8);
+#else
+ const __m256i v_iscan = _mm256_loadu_si256((const __m256i *)iscan);
+#endif
+ const __m256i v_nz_iscan = _mm256_and_si256(v_iscan, v_mask);
+ return _mm256_max_epi16(v_eobmax, v_nz_iscan);
+}
+
+static VPX_FORCE_INLINE uint16_t get_max_eob(__m256i eob256) {
+ const __m256i eob_lo = eob256;
+ // Copy upper 128 to lower 128
+ const __m256i eob_hi = _mm256_permute2x128_si256(eob256, eob256, 0X81);
+ __m256i eob = _mm256_max_epi16(eob_lo, eob_hi);
+ __m256i eob_s = _mm256_shuffle_epi32(eob, 0xe);
+ eob = _mm256_max_epi16(eob, eob_s);
+ eob_s = _mm256_shufflelo_epi16(eob, 0xe);
+ eob = _mm256_max_epi16(eob, eob_s);
+ eob_s = _mm256_shufflelo_epi16(eob, 1);
+ eob = _mm256_max_epi16(eob, eob_s);
+#if defined(_MSC_VER) && (_MSC_VER < 1910)
+ return _mm_cvtsi128_si32(_mm256_extracti128_si256(eob, 0)) & 0xffff;
+#else
+ return (uint16_t)_mm256_extract_epi16(eob, 0);
+#endif
+}
+
+static VPX_FORCE_INLINE void quantize_fp_16(
+ const __m256i *round, const __m256i *quant, const __m256i *dequant,
+ const __m256i *thr, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob_max) {
+ const __m256i coeff = load_tran_low(coeff_ptr);
+ const __m256i abs_coeff = _mm256_abs_epi16(coeff);
+ const int32_t nzflag =
+ _mm256_movemask_epi8(_mm256_cmpgt_epi16(abs_coeff, *thr));
+
+ if (nzflag) {
+ const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, *round);
+ const __m256i abs_qcoeff = _mm256_mulhi_epi16(tmp_rnd, *quant);
+ const __m256i qcoeff = _mm256_sign_epi16(abs_qcoeff, coeff);
+ const __m256i dqcoeff = _mm256_mullo_epi16(qcoeff, *dequant);
+ const __m256i nz_mask =
+ _mm256_cmpgt_epi16(abs_qcoeff, _mm256_setzero_si256());
+ store_tran_low(qcoeff, qcoeff_ptr);
+ store_tran_low(dqcoeff, dqcoeff_ptr);
+
+ *eob_max = get_max_lane_eob(iscan_ptr, *eob_max, nz_mask);
+ } else {
+ store_zero_tran_low(qcoeff_ptr);
+ store_zero_tran_low(dqcoeff_ptr);
+ }
+}
+
+void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ __m256i round, quant, dequant, thr;
+ __m256i eob_max = _mm256_setzero_si256();
+ (void)scan;
+
+ coeff_ptr += n_coeffs;
+ iscan += n_coeffs;
+ qcoeff_ptr += n_coeffs;
+ dqcoeff_ptr += n_coeffs;
+ n_coeffs = -n_coeffs;
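+  // Index from the ends of the buffers with a negative offset so the loops
+  // can count up toward zero.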
+
+ // Setup global values
+ load_fp_values_avx2(round_ptr, &round, quant_ptr, &quant, dequant_ptr,
+ &dequant);
+ thr = _mm256_setzero_si256();
+
+ quantize_fp_16(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs,
+ iscan + n_coeffs, qcoeff_ptr + n_coeffs,
+ dqcoeff_ptr + n_coeffs, &eob_max);
+
+ n_coeffs += 8 * 2;
+
+ // remove dc constants
+ dequant = _mm256_permute2x128_si256(dequant, dequant, 0x31);
+ quant = _mm256_permute2x128_si256(quant, quant, 0x31);
+ round = _mm256_permute2x128_si256(round, round, 0x31);
+ thr = _mm256_srai_epi16(dequant, 1);
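+  // Groups whose coefficients are all at or below dequant / 2 are written out
+  // as zero without quantizing.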
+
+ // AC only loop
+ while (n_coeffs < 0) {
+ quantize_fp_16(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs,
+ iscan + n_coeffs, qcoeff_ptr + n_coeffs,
+ dqcoeff_ptr + n_coeffs, &eob_max);
+ n_coeffs += 8 * 2;
+ }
+
+ *eob_ptr = get_max_eob(eob_max);
+}
+
+// Enable this flag to make the optimized code match vp9_quantize_fp_32x32_c().
+// When disabled, the optimized code matches the existing ssse3 code and
+// quantize_fp_32x32_nz_c().
+//
+// #define MATCH_VP9_QUANTIZE_FP_32X32_C
+
+#ifndef MATCH_VP9_QUANTIZE_FP_32X32_C
+static VPX_FORCE_INLINE void quantize_fp_32x32_16_no_nzflag(
+ const __m256i *round, const __m256i *quant, const __m256i *dequant,
+ const __m256i *thr, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob_max) {
+ const __m256i coeff = load_tran_low(coeff_ptr);
+ const __m256i abs_coeff = _mm256_abs_epi16(coeff);
+ const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, *round);
+ const __m256i abs_qcoeff = _mm256_mulhi_epi16(tmp_rnd, *quant);
+ const __m256i qcoeff = _mm256_sign_epi16(abs_qcoeff, coeff);
+ const __m256i abs_dqcoeff =
+ _mm256_srli_epi16(_mm256_mullo_epi16(abs_qcoeff, *dequant), 1);
+ const __m256i dqcoeff = _mm256_sign_epi16(abs_dqcoeff, coeff);
+ const __m256i nz_mask =
+ _mm256_cmpgt_epi16(abs_qcoeff, _mm256_setzero_si256());
+ store_tran_low(qcoeff, qcoeff_ptr);
+ store_tran_low(dqcoeff, dqcoeff_ptr);
+
+ *eob_max = get_max_lane_eob(iscan_ptr, *eob_max, nz_mask);
+ (void)thr;
+}
+#endif
+
+static VPX_FORCE_INLINE void quantize_fp_32x32_16(
+ const __m256i *round, const __m256i *quant, const __m256i *dequant,
+ const __m256i *thr, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob_max) {
+ const __m256i coeff = load_tran_low(coeff_ptr);
+ const __m256i abs_coeff = _mm256_abs_epi16(coeff);
+ const __m256i thr_mask = _mm256_cmpgt_epi16(abs_coeff, *thr);
+ const int32_t nzflag = _mm256_movemask_epi8(thr_mask);
+
+ if (nzflag) {
+#ifdef MATCH_VP9_QUANTIZE_FP_32X32_C
+ const __m256i tmp_rnd =
+ _mm256_and_si256(_mm256_adds_epi16(abs_coeff, *round), thr_mask);
+#else
+ const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, *round);
+#endif
+ const __m256i abs_qcoeff = _mm256_mulhi_epi16(tmp_rnd, *quant);
+ const __m256i qcoeff = _mm256_sign_epi16(abs_qcoeff, coeff);
+ const __m256i abs_dqcoeff =
+ _mm256_srli_epi16(_mm256_mullo_epi16(abs_qcoeff, *dequant), 1);
+ const __m256i dqcoeff = _mm256_sign_epi16(abs_dqcoeff, coeff);
+ const __m256i nz_mask =
+ _mm256_cmpgt_epi16(abs_qcoeff, _mm256_setzero_si256());
+ store_tran_low(qcoeff, qcoeff_ptr);
+ store_tran_low(dqcoeff, dqcoeff_ptr);
+
+ *eob_max = get_max_lane_eob(iscan_ptr, *eob_max, nz_mask);
+ } else {
+ store_zero_tran_low(qcoeff_ptr);
+ store_zero_tran_low(dqcoeff_ptr);
+ }
+}
+
+void vp9_quantize_fp_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ __m256i round, quant, dequant, thr;
+ __m256i eob_max = _mm256_setzero_si256();
+ (void)scan;
+
+ coeff_ptr += n_coeffs;
+ iscan += n_coeffs;
+ qcoeff_ptr += n_coeffs;
+ dqcoeff_ptr += n_coeffs;
+ n_coeffs = -n_coeffs;
+
+ // Setup global values
+ load_fp_values_avx2(round_ptr, &round, quant_ptr, &quant, dequant_ptr,
+ &dequant);
+ thr = _mm256_srli_epi16(dequant, 2);
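+  // dequant / 4 is the 32x32 skip threshold. Below, round is halved and quant
+  // doubled so the 16-bit mulhi implements the quantizer's >> 15; the kernels
+  // halve the dequantized values.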
+ quant = _mm256_slli_epi16(quant, 1);
+ {
+ const __m256i rnd = _mm256_set1_epi16((int16_t)1);
+ round = _mm256_add_epi16(round, rnd);
+ round = _mm256_srai_epi16(round, 1);
+ }
+
+#ifdef MATCH_VP9_QUANTIZE_FP_32X32_C
+ // Subtracting 1 here eliminates a _mm256_cmpeq_epi16() instruction when
+ // calculating the zbin mask.
+ thr = _mm256_sub_epi16(thr, _mm256_set1_epi16(1));
+ quantize_fp_32x32_16(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs,
+ iscan + n_coeffs, qcoeff_ptr + n_coeffs,
+ dqcoeff_ptr + n_coeffs, &eob_max);
+#else
+ quantize_fp_32x32_16_no_nzflag(
+ &round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, iscan + n_coeffs,
+ qcoeff_ptr + n_coeffs, dqcoeff_ptr + n_coeffs, &eob_max);
+#endif
+
+ n_coeffs += 8 * 2;
+
+ // remove dc constants
+ dequant = _mm256_permute2x128_si256(dequant, dequant, 0x31);
+ quant = _mm256_permute2x128_si256(quant, quant, 0x31);
+ round = _mm256_permute2x128_si256(round, round, 0x31);
+ thr = _mm256_permute2x128_si256(thr, thr, 0x31);
+
+ // AC only loop
+ while (n_coeffs < 0) {
+ quantize_fp_32x32_16(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs,
+ iscan + n_coeffs, qcoeff_ptr + n_coeffs,
+ dqcoeff_ptr + n_coeffs, &eob_max);
+ n_coeffs += 8 * 2;
+ }
+
+ *eob_ptr = get_max_eob(eob_max);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static VPX_FORCE_INLINE __m256i mm256_mul_shift_epi32_logscale(const __m256i *x,
+ const __m256i *y,
+ int log_scale) {
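+  // Multiply the even and odd 32-bit lanes into 64-bit products, shift each
+  // product right by (16 - log_scale), and re-interleave the low 32 bits of
+  // the results.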
+ __m256i prod_lo = _mm256_mul_epi32(*x, *y);
+ __m256i prod_hi = _mm256_srli_epi64(*x, 32);
+ const __m256i mult_hi = _mm256_srli_epi64(*y, 32);
+ const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1);
+ prod_hi = _mm256_mul_epi32(prod_hi, mult_hi);
+ prod_lo = _mm256_srli_epi64(prod_lo, 16 - log_scale);
+ prod_lo = _mm256_and_si256(prod_lo, mask);
+ prod_hi = _mm256_srli_epi64(prod_hi, 16 - log_scale);
+ prod_hi = _mm256_slli_epi64(prod_hi, 32);
+ return _mm256_or_si256(prod_lo, prod_hi);
+}
+
+static VPX_FORCE_INLINE __m256i highbd_init_256(const int16_t *val_ptr) {
+ const __m128i v = _mm_load_si128((const __m128i *)val_ptr);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i dc = _mm_unpacklo_epi16(v, zero);
+ const __m128i ac = _mm_unpackhi_epi16(v, zero);
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1);
+}
+
+static VPX_FORCE_INLINE void highbd_load_fp_values(
+ const int16_t *round_ptr, __m256i *round, const int16_t *quant_ptr,
+ __m256i *quant, const int16_t *dequant_ptr, __m256i *dequant) {
+ *round = highbd_init_256(round_ptr);
+ *quant = highbd_init_256(quant_ptr);
+ *dequant = highbd_init_256(dequant_ptr);
+}
+
+static VPX_FORCE_INLINE __m256i highbd_get_max_lane_eob(
+ const int16_t *iscan_ptr, __m256i eobmax, __m256i nz_mask) {
+ const __m256i packed_nz_mask =
+ _mm256_packs_epi32(nz_mask, _mm256_setzero_si256());
+ const __m256i packed_nz_mask_perm =
+ _mm256_permute4x64_epi64(packed_nz_mask, 0xD8);
+ const __m256i iscan =
+ _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)iscan_ptr));
+ const __m256i nz_iscan = _mm256_and_si256(iscan, packed_nz_mask_perm);
+ return _mm256_max_epi16(eobmax, nz_iscan);
+}
+
+static VPX_FORCE_INLINE void highbd_quantize_fp(
+ const __m256i *round, const __m256i *quant, const __m256i *dequant,
+ const tran_low_t *coeff_ptr, const int16_t *iscan_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob) {
+ const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+ const __m256i abs_coeff = _mm256_abs_epi32(coeff);
+ const __m256i tmp_rnd = _mm256_add_epi32(abs_coeff, *round);
+ const __m256i abs_q = mm256_mul_shift_epi32_logscale(&tmp_rnd, quant, 0);
+ const __m256i abs_dq = _mm256_mullo_epi32(abs_q, *dequant);
+ const __m256i q = _mm256_sign_epi32(abs_q, coeff);
+ const __m256i dq = _mm256_sign_epi32(abs_dq, coeff);
+ const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256());
+
+ _mm256_storeu_si256((__m256i *)qcoeff_ptr, q);
+ _mm256_storeu_si256((__m256i *)dqcoeff_ptr, dq);
+
+ *eob = highbd_get_max_lane_eob(iscan_ptr, *eob, nz_mask);
+}
+
+void vp9_highbd_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ const int step = 8;
+ __m256i round, quant, dequant;
+ __m256i eob_max = _mm256_setzero_si256();
+ (void)scan;
+
+ coeff_ptr += n_coeffs;
+ iscan += n_coeffs;
+ qcoeff_ptr += n_coeffs;
+ dqcoeff_ptr += n_coeffs;
+ n_coeffs = -n_coeffs;
+
+ // Setup global values
+ highbd_load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr,
+ &dequant);
+
+ highbd_quantize_fp(&round, &quant, &dequant, coeff_ptr + n_coeffs,
+ iscan + n_coeffs, qcoeff_ptr + n_coeffs,
+ dqcoeff_ptr + n_coeffs, &eob_max);
+
+ n_coeffs += step;
+
+ // remove dc constants
+ dequant = _mm256_permute2x128_si256(dequant, dequant, 0x31);
+ quant = _mm256_permute2x128_si256(quant, quant, 0x31);
+ round = _mm256_permute2x128_si256(round, round, 0x31);
+
+ // AC only loop
+ while (n_coeffs < 0) {
+ highbd_quantize_fp(&round, &quant, &dequant, coeff_ptr + n_coeffs,
+ iscan + n_coeffs, qcoeff_ptr + n_coeffs,
+ dqcoeff_ptr + n_coeffs, &eob_max);
+ n_coeffs += step;
+ }
+
+ *eob_ptr = get_max_eob(eob_max);
+}
+
+static VPX_FORCE_INLINE void highbd_quantize_fp_32x32(
+ const __m256i *round, const __m256i *quant, const __m256i *dequant,
+ const __m256i *thr, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob) {
+ const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+ const __m256i abs_coeff = _mm256_abs_epi32(coeff);
+ const __m256i thr_mask = _mm256_cmpgt_epi32(abs_coeff, *thr);
+ const __m256i tmp_rnd =
+ _mm256_and_si256(_mm256_add_epi32(abs_coeff, *round), thr_mask);
+ const __m256i abs_q = mm256_mul_shift_epi32_logscale(&tmp_rnd, quant, 0);
+ const __m256i abs_dq =
+ _mm256_srli_epi32(_mm256_mullo_epi32(abs_q, *dequant), 1);
+ const __m256i q = _mm256_sign_epi32(abs_q, coeff);
+ const __m256i dq = _mm256_sign_epi32(abs_dq, coeff);
+ const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256());
+
+ _mm256_storeu_si256((__m256i *)qcoeff_ptr, q);
+ _mm256_storeu_si256((__m256i *)dqcoeff_ptr, dq);
+
+ *eob = highbd_get_max_lane_eob(iscan_ptr, *eob, nz_mask);
+}
+
+void vp9_highbd_quantize_fp_32x32_avx2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr,
+ const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan,
+ const int16_t *iscan) {
+ const int step = 8;
+ __m256i round, quant, dequant, thr;
+ __m256i eob_max = _mm256_setzero_si256();
+ (void)scan;
+
+ coeff_ptr += n_coeffs;
+ iscan += n_coeffs;
+ qcoeff_ptr += n_coeffs;
+ dqcoeff_ptr += n_coeffs;
+ n_coeffs = -n_coeffs;
+
+ // Setup global values
+ highbd_load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr,
+ &dequant);
+ thr = _mm256_srli_epi32(dequant, 2);
+ // Subtracting 1 here eliminates a _mm256_cmpeq_epi32() instruction when
+ // calculating the zbin mask.
+ thr = _mm256_sub_epi32(thr, _mm256_set1_epi32(1));
+ quant = _mm256_slli_epi32(quant, 1);
+ round = _mm256_srai_epi32(_mm256_add_epi32(round, _mm256_set1_epi32(1)), 1);
+
+ highbd_quantize_fp_32x32(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs,
+ iscan + n_coeffs, qcoeff_ptr + n_coeffs,
+ dqcoeff_ptr + n_coeffs, &eob_max);
+
+ n_coeffs += step;
+
+ // remove dc constants
+ dequant = _mm256_permute2x128_si256(dequant, dequant, 0x31);
+ quant = _mm256_permute2x128_si256(quant, quant, 0x31);
+ round = _mm256_permute2x128_si256(round, round, 0x31);
+ thr = _mm256_permute2x128_si256(thr, thr, 0x31);
+
+ // AC only loop
+ while (n_coeffs < 0) {
+ highbd_quantize_fp_32x32(
+ &round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, iscan + n_coeffs,
+ qcoeff_ptr + n_coeffs, dqcoeff_ptr + n_coeffs, &eob_max);
+ n_coeffs += step;
+ }
+
+ *eob_ptr = get_max_eob(eob_max);
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c
new file mode 100644
index 0000000000..c877234436
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+#include <xmmintrin.h>
+
+#include "./vp9_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
+#include "vpx_dsp/x86/quantize_sse2.h"
+
+void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i thr;
+ int nzflag;
+ int index = 16;
+ __m128i round, quant, dequant;
+ __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i eob;
+
+ (void)scan;
+
+ // Setup global values.
+ load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, &dequant);
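+ // A rough scalar sketch of what the vector code below computes (the C
+ // reference is vp9_quantize_fp_c()):
+ //   abs_q      = ((abs(coeff[i]) + round[i != 0]) * quant[i != 0]) >> 16
+ //   qcoeff[i]  = sign(coeff[i]) * abs_q
+ //   dqcoeff[i] = qcoeff[i] * dequant[i != 0]
+ // with the adds saturating to 16 bits and *eob_ptr taken from the largest
+ // iscan position holding a nonzero qcoeff.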
+
+ // Do DC and first 15 AC.
+ coeff0 = load_tran_low(coeff_ptr);
+ coeff1 = load_tran_low(coeff_ptr + 8);
+
+ // Poor man's abs().
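+ // With s = x >> 15 (arithmetic shift), (x ^ s) - s gives abs(x) for int16
+ // without SSSE3's pabsw; invert_sign_sse2() applies the xor/subtract pair.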
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+
+ qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+ qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant);
+
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+
+ qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+ qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+ // Reinsert signs.
+ qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+ store_tran_low(qcoeff0, qcoeff_ptr);
+ store_tran_low(qcoeff1, qcoeff_ptr + 8);
+
+ qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+ store_tran_low(qcoeff0, dqcoeff_ptr);
+ store_tran_low(qcoeff1, dqcoeff_ptr + 8);
+
+ eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero);
+
+ thr = _mm_srai_epi16(dequant, 1);
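+ // Speed shortcut: if no abs(coeff) in a group of 16 exceeds dequant/2, the
+ // group is assumed to quantize to all zeros and is stored without quantizing.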
+
+ // AC only loop.
+ while (index < n_coeffs) {
+ coeff0 = load_tran_low(coeff_ptr + index);
+ coeff1 = load_tran_low(coeff_ptr + index + 8);
+
+ // Poor man's abs().
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+
+ nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
+ _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
+
+ if (nzflag) {
+ __m128i eob0;
+ qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+ qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+ qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant);
+ qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+ // Reinsert signs.
+ qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+ store_tran_low(qcoeff0, qcoeff_ptr + index);
+ store_tran_low(qcoeff1, qcoeff_ptr + index + 8);
+
+ qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+ qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+ store_tran_low(qcoeff0, dqcoeff_ptr + index);
+ store_tran_low(qcoeff1, dqcoeff_ptr + index + 8);
+
+ eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero);
+ eob = _mm_max_epi16(eob, eob0);
+ } else {
+ store_zero_tran_low(qcoeff_ptr + index);
+ store_zero_tran_low(qcoeff_ptr + index + 8);
+
+ store_zero_tran_low(dqcoeff_ptr + index);
+ store_zero_tran_low(dqcoeff_ptr + index + 8);
+ }
+
+ index += 16;
+ }
+
+ *eob_ptr = accumulate_eob(eob);
+}
diff --git a/media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.c b/media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.c
new file mode 100644
index 0000000000..d35004e370
--- /dev/null
+++ b/media/libvpx/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.c
@@ -0,0 +1,253 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <tmmintrin.h>
+
+#include "./vp9_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
+#include "vpx_dsp/x86/quantize_sse2.h"
+#include "vpx_dsp/x86/quantize_ssse3.h"
+
+void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i thr;
+ int nzflag;
+ int index = 16;
+ __m128i round, quant, dequant;
+ __m128i coeff0, coeff1;
+ __m128i qcoeff0, qcoeff1;
+ __m128i eob;
+
+ (void)scan;
+
+ // Setup global values.
+ load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, &dequant);
+
+ // Do DC and first 15 AC.
+ coeff0 = load_tran_low(coeff_ptr);
+ coeff1 = load_tran_low(coeff_ptr + 8);
+
+ qcoeff0 = _mm_abs_epi16(coeff0);
+ qcoeff1 = _mm_abs_epi16(coeff1);
+
+ qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+ qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant);
+
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+
+ qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+ qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+ // Reinsert signs.
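+ // _mm_sign_epi16 copies the sign of coeff onto the quantized magnitude (and
+ // zeroes lanes where coeff is 0), replacing the xor/subtract pair used in
+ // the SSE2 version.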
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+ store_tran_low(qcoeff0, qcoeff_ptr);
+ store_tran_low(qcoeff1, qcoeff_ptr + 8);
+
+ qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+ store_tran_low(qcoeff0, dqcoeff_ptr);
+ store_tran_low(qcoeff1, dqcoeff_ptr + 8);
+
+ eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero);
+
+ thr = _mm_srai_epi16(dequant, 1);
+
+ // AC only loop.
+ while (index < n_coeffs) {
+ coeff0 = load_tran_low(coeff_ptr + index);
+ coeff1 = load_tran_low(coeff_ptr + index + 8);
+
+ qcoeff0 = _mm_abs_epi16(coeff0);
+ qcoeff1 = _mm_abs_epi16(coeff1);
+
+ nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
+ _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
+
+ if (nzflag) {
+ __m128i eob0;
+ qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+ qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+ qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant);
+ qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+ // Reinsert signs.
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+ store_tran_low(qcoeff0, qcoeff_ptr + index);
+ store_tran_low(qcoeff1, qcoeff_ptr + index + 8);
+
+ qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+ qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+ store_tran_low(qcoeff0, dqcoeff_ptr + index);
+ store_tran_low(qcoeff1, dqcoeff_ptr + index + 8);
+
+ eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero);
+ eob = _mm_max_epi16(eob, eob0);
+ } else {
+ store_zero_tran_low(qcoeff_ptr + index);
+ store_zero_tran_low(qcoeff_ptr + index + 8);
+
+ store_zero_tran_low(dqcoeff_ptr + index);
+ store_zero_tran_low(dqcoeff_ptr + index + 8);
+ }
+
+ index += 16;
+ }
+
+ *eob_ptr = accumulate_eob(eob);
+}
+
+void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one_s16 = _mm_set1_epi16(1);
+ __m128i thr;
+ int nzflag;
+ int index = 16;
+ __m128i round, quant, dequant;
+ __m128i coeff0, coeff1;
+ __m128i qcoeff0, qcoeff1;
+ __m128i eob;
+
+ (void)scan;
+
+ // Setup global values.
+ load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, &dequant);
+ // The 32x32 quantizer uses half of the round value.
+ round = _mm_add_epi16(round, one_s16);
+ round = _mm_srli_epi16(round, 1);
+
+ // The 16x16 quantizer shifts by 16 while the 32x32 shifts by 15. pmulhw
+ // always shifts by 16, so double quant to account for the difference.
+ quant = _mm_slli_epi16(quant, 1);
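+ // A rough scalar sketch of the 32x32 path (the C reference is
+ // vp9_quantize_fp_32x32_c()): for abs(coeff) above roughly dequant/4,
+ //   abs_q   = ((abs(coeff) + ((round + 1) >> 1)) * quant) >> 15
+ //   dqcoeff = sign(coeff) * ((abs_q * dequant) >> 1)
+ // and everything below that threshold quantizes to zero. The vector loop
+ // applies the threshold per group of 16 coefficients rather than per lane.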
+
+ // Do DC and first 15 AC.
+ coeff0 = load_tran_low(coeff_ptr);
+ coeff1 = load_tran_low(coeff_ptr + 8);
+
+ qcoeff0 = _mm_abs_epi16(coeff0);
+ qcoeff1 = _mm_abs_epi16(coeff1);
+
+ qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+ qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant);
+
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+
+ qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+ qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+ // Reinsert signs.
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+ store_tran_low(qcoeff0, qcoeff_ptr);
+ store_tran_low(qcoeff1, qcoeff_ptr + 8);
+
+ // Get the abs value of qcoeff again so we can use shifts for division.
+ qcoeff0 = _mm_abs_epi16(qcoeff0);
+ qcoeff1 = _mm_abs_epi16(qcoeff1);
+
+ qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+ // Divide by 2.
+ qcoeff0 = _mm_srli_epi16(qcoeff0, 1);
+ qcoeff1 = _mm_srli_epi16(qcoeff1, 1);
+
+ // Reinsert signs.
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+ store_tran_low(qcoeff0, dqcoeff_ptr);
+ store_tran_low(qcoeff1, dqcoeff_ptr + 8);
+
+ eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero);
+
+ thr = _mm_srai_epi16(dequant, 2);
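+ // Speed shortcut for 32x32: groups of 16 where no abs(coeff) exceeds
+ // dequant/4 are assumed to quantize to all zeros and are skipped.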
+
+ // AC only loop.
+ while (index < n_coeffs) {
+ coeff0 = load_tran_low(coeff_ptr + index);
+ coeff1 = load_tran_low(coeff_ptr + index + 8);
+
+ qcoeff0 = _mm_abs_epi16(coeff0);
+ qcoeff1 = _mm_abs_epi16(coeff1);
+
+ nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
+ _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
+
+ if (nzflag) {
+ qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+ qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+ qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant);
+ qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+ // Reinsert signs.
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+ store_tran_low(qcoeff0, qcoeff_ptr + index);
+ store_tran_low(qcoeff1, qcoeff_ptr + index + 8);
+
+ // Get the abs value of qcoeff again so we can use shifts for division.
+ qcoeff0 = _mm_abs_epi16(qcoeff0);
+ qcoeff1 = _mm_abs_epi16(qcoeff1);
+
+ qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+ qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+ // Divide by 2.
+ qcoeff0 = _mm_srli_epi16(qcoeff0, 1);
+ qcoeff1 = _mm_srli_epi16(qcoeff1, 1);
+
+ // Reinsert signs.
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+ store_tran_low(qcoeff0, dqcoeff_ptr + index);
+ store_tran_low(qcoeff1, dqcoeff_ptr + index + 8);
+ } else {
+ store_zero_tran_low(qcoeff_ptr + index);
+ store_zero_tran_low(qcoeff_ptr + index + 8);
+
+ store_zero_tran_low(dqcoeff_ptr + index);
+ store_zero_tran_low(dqcoeff_ptr + index + 8);
+ }
+
+ if (nzflag) {
+ const __m128i eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero);
+ eob = _mm_max_epi16(eob, eob0);
+ }
+ index += 16;
+ }
+
+ *eob_ptr = accumulate_eob(eob);
+}