Merge tag 'v3.2.0' into HEAD Create the signed v3.2.0 tag BUG=aomedia:3165 Change-Id: I619563ba6d72d994d749242f66ab9befcc71a200

commit: df61427b1bcf6f6beff7d1293895633929af6a76 [log] [tgz]
author: Wan-Teh Chang <wtc@google.com> Thu Oct 14 09:29:48 2021 -0700
committer: Wan-Teh Chang <wtc@google.com> Thu Oct 14 09:29:48 2021 -0700
tree: fdca9950ba0c1171fc702326f51e0f18377470e3
parent: af43e2b3aab5d9f6de554698eb62aa1998cbaf95 [diff]
parent: 287164de79516c25c8c84fd544f67752c170082a [diff]
diff --git a/aom/aom_external_partition.h b/aom/aom_external_partition.h
index 1bb31c4..34503b5 100644
--- a/aom/aom_external_partition.h
+++ b/aom/aom_external_partition.h

@@ -30,7 +30,7 @@
  * types, removing or reassigning enums, adding/removing/rearranging
  * fields to structures.
  */
-#define AOM_EXT_PART_ABI_VERSION 3
+#define AOM_EXT_PART_ABI_VERSION 5
 
 #ifdef __cplusplus
 extern "C" {
@@ -240,6 +240,18 @@
   int frame_width;                ///< Frame width
   int frame_height;               ///< Frame height
   int block_size;                 ///< As "BLOCK_SIZE" in av1/common/enums.h
+  /*!
+   * Valid partition types. A bitmask is used.  "1" represents the
+   * corresponding type is vaild. The bitmask follows the enum order for
+   * PARTITION_TYPE in "enums.h" to represent one partition type at a bit.
+   * For example, 0x01 stands for only PARTITION_NONE is valid,
+   * 0x09 (00...001001) stands for PARTITION_NONE and PARTITION_SPLIT are valid.
+   */
+  int valid_partition_types;
+  int update_type;    ///< Frame update type, defined in ratectrl.h
+  int qindex;         ///< Quantization index, range: [0, 255]
+  int rdmult;         ///< Rate-distortion multiplier
+  int pyramid_level;  ///< The level of this frame in the hierarchical structure
 } aom_partition_features_t;
 
 /*!\brief Partition decisions received from the external model.

diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index ba6027c..b39bfaa 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl

@@ -1113,7 +1113,7 @@
   specialize qw/aom_hadamard_lp_8x8 sse2 neon/;
 
   add_proto qw/void aom_hadamard_lp_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff";
-  specialize qw/aom_hadamard_lp_16x16 avx2 neon/;
+  specialize qw/aom_hadamard_lp_16x16 sse2 avx2 neon/;
 
 
   if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
@@ -1127,10 +1127,10 @@
     specialize qw/aom_highbd_hadamard_32x32 avx2/;
   }
   add_proto qw/int aom_satd/, "const tran_low_t *coeff, int length";
-  specialize qw/aom_satd neon avx2/;
+  specialize qw/aom_satd neon sse2 avx2/;
 
   add_proto qw/int aom_satd_lp/, "const int16_t *coeff, int length";
-  specialize qw/aom_satd_lp avx2 neon/;
+  specialize qw/aom_satd_lp sse2 avx2 neon/;
 
 
   #

diff --git a/aom_dsp/x86/avg_intrin_sse2.c b/aom_dsp/x86/avg_intrin_sse2.c
index 260ca2a..a52abd0 100644
--- a/aom_dsp/x86/avg_intrin_sse2.c
+++ b/aom_dsp/x86/avg_intrin_sse2.c

@@ -272,8 +272,8 @@
   hadamard_8x8_sse2(src_diff, src_stride, coeff, 1);
 }
 
-void aom_hadamard_lp_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
-                              int16_t *coeff) {
+static INLINE void hadamard_lp_8x8_sse2(const int16_t *src_diff,
+                                        ptrdiff_t src_stride, int16_t *coeff) {
   __m128i src[8];
   src[0] = _mm_load_si128((const __m128i *)src_diff);
   src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
@@ -304,6 +304,50 @@
   _mm_store_si128((__m128i *)coeff, src[7]);
 }
 
+void aom_hadamard_lp_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
+                              int16_t *coeff) {
+  hadamard_lp_8x8_sse2(src_diff, src_stride, coeff);
+}
+
+void aom_hadamard_lp_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
+                                int16_t *coeff) {
+  for (int idx = 0; idx < 4; ++idx) {
+    const int16_t *src_ptr =
+        src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
+    hadamard_lp_8x8_sse2(src_ptr, src_stride, coeff + idx * 64);
+  }
+
+  int16_t *t_coeff = coeff;
+  for (int idx = 0; idx < 64; idx += 8) {
+    __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff);
+    __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 64));
+    __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 128));
+    __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 192));
+
+    __m128i b0 = _mm_add_epi16(coeff0, coeff1);
+    __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
+    __m128i b2 = _mm_add_epi16(coeff2, coeff3);
+    __m128i b3 = _mm_sub_epi16(coeff2, coeff3);
+
+    b0 = _mm_srai_epi16(b0, 1);
+    b1 = _mm_srai_epi16(b1, 1);
+    b2 = _mm_srai_epi16(b2, 1);
+    b3 = _mm_srai_epi16(b3, 1);
+
+    coeff0 = _mm_add_epi16(b0, b2);
+    coeff1 = _mm_add_epi16(b1, b3);
+    coeff2 = _mm_sub_epi16(b0, b2);
+    coeff3 = _mm_sub_epi16(b1, b3);
+
+    _mm_store_si128((__m128i *)t_coeff, coeff0);
+    _mm_store_si128((__m128i *)(t_coeff + 64), coeff1);
+    _mm_store_si128((__m128i *)(t_coeff + 128), coeff2);
+    _mm_store_si128((__m128i *)(t_coeff + 192), coeff3);
+
+    t_coeff += 8;
+  }
+}
+
 static INLINE void hadamard_16x16_sse2(const int16_t *src_diff,
                                        ptrdiff_t src_stride, tran_low_t *coeff,
                                        int is_final) {
@@ -416,17 +460,50 @@
 int aom_satd_sse2(const tran_low_t *coeff, int length) {
   int i;
   const __m128i zero = _mm_setzero_si128();
+  const __m128i one = _mm_set1_epi16(1);
   __m128i accum = zero;
 
-  for (i = 0; i < length; i += 8) {
-    const __m128i src_line = load_tran_low(coeff);
-    const __m128i inv = _mm_sub_epi16(zero, src_line);
-    const __m128i abs = _mm_max_epi16(src_line, inv);  // abs(src_line)
-    const __m128i abs_lo = _mm_unpacklo_epi16(abs, zero);
-    const __m128i abs_hi = _mm_unpackhi_epi16(abs, zero);
-    const __m128i sum = _mm_add_epi32(abs_lo, abs_hi);
-    accum = _mm_add_epi32(accum, sum);
-    coeff += 8;
+  for (i = 0; i < length; i += 16) {
+    const __m128i src_line0 = load_tran_low(coeff);
+    const __m128i src_line1 = load_tran_low(coeff + 8);
+    const __m128i inv0 = _mm_sub_epi16(zero, src_line0);
+    const __m128i inv1 = _mm_sub_epi16(zero, src_line1);
+    const __m128i abs0 = _mm_max_epi16(src_line0, inv0);  // abs(src_line)
+    const __m128i abs1 = _mm_max_epi16(src_line1, inv1);  // abs(src_line)
+    const __m128i sum0 = _mm_madd_epi16(abs0, one);
+    const __m128i sum1 = _mm_madd_epi16(abs1, one);
+    accum = _mm_add_epi32(accum, sum0);
+    accum = _mm_add_epi32(accum, sum1);
+    coeff += 16;
+  }
+
+  {  // cascading summation of accum
+    __m128i hi = _mm_srli_si128(accum, 8);
+    accum = _mm_add_epi32(accum, hi);
+    hi = _mm_srli_epi64(accum, 32);
+    accum = _mm_add_epi32(accum, hi);
+  }
+
+  return _mm_cvtsi128_si32(accum);
+}
+
+int aom_satd_lp_sse2(const int16_t *coeff, int length) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i one = _mm_set1_epi16(1);
+  __m128i accum = zero;
+
+  for (int i = 0; i < length; i += 16) {
+    const __m128i src_line0 = _mm_loadu_si128((const __m128i *)coeff);
+    const __m128i src_line1 = _mm_loadu_si128((const __m128i *)(coeff + 8));
+    const __m128i inv0 = _mm_sub_epi16(zero, src_line0);
+    const __m128i inv1 = _mm_sub_epi16(zero, src_line1);
+    const __m128i abs0 = _mm_max_epi16(src_line0, inv0);  // abs(src_line)
+    const __m128i abs1 = _mm_max_epi16(src_line1, inv1);  // abs(src_line)
+    const __m128i sum0 = _mm_madd_epi16(abs0, one);
+    const __m128i sum1 = _mm_madd_epi16(abs1, one);
+    accum = _mm_add_epi32(accum, sum0);
+    accum = _mm_add_epi32(accum, sum1);
+    coeff += 16;
   }
 
   {  // cascading summation of accum

diff --git a/aom_scale/yv12config.h b/aom_scale/yv12config.h
index 376cb74..c0e0361 100644
--- a/aom_scale/yv12config.h
+++ b/aom_scale/yv12config.h

@@ -29,6 +29,7 @@
 #define AOM_INTERP_EXTEND 4
 #define AOM_BORDER_IN_PIXELS 288
 #define AOM_ENC_NO_SCALE_BORDER 160
+#define AOM_ENC_ALLINTRA_BORDER 64
 #define AOM_DEC_BORDER_IN_PIXELS 64
 
 /*!\endcond */

diff --git a/av1/av1.cmake b/av1/av1.cmake
index 089dea4..bcffa76 100644
--- a/av1/av1.cmake
+++ b/av1/av1.cmake

@@ -392,13 +392,14 @@
 list(APPEND AOM_AV1_ENCODER_INTRIN_SSE2
             "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm_sse2.c"
             "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm_sse2.h"
+            "${AOM_ROOT}/av1/encoder/x86/av1_k_means_sse2.c"
             "${AOM_ROOT}/av1/encoder/x86/av1_quantize_sse2.c"
             "${AOM_ROOT}/av1/encoder/x86/encodetxb_sse2.c"
+            "${AOM_ROOT}/av1/encoder/x86/error_intrin_sse2.c"
             "${AOM_ROOT}/av1/encoder/x86/highbd_block_error_intrin_sse2.c"
-            "${AOM_ROOT}/av1/encoder/x86/temporal_filter_sse2.c"
-            "${AOM_ROOT}/av1/encoder/x86/av1_k_means_sse2.c"
             "${AOM_ROOT}/av1/encoder/x86/highbd_temporal_filter_sse2.c"
             "${AOM_ROOT}/av1/encoder/x86/reconinter_enc_sse2.c"
+            "${AOM_ROOT}/av1/encoder/x86/temporal_filter_sse2.c"
             "${AOM_ROOT}/av1/encoder/x86/wedge_utils_sse2.c")
 
 if(CONFIG_AV1_TEMPORAL_DENOISING)

diff --git a/av1/av1_cx_iface.c b/av1/av1_cx_iface.c
index 5e01eef..4764ed5 100644
--- a/av1/av1_cx_iface.c
+++ b/av1/av1_cx_iface.c

@@ -1110,6 +1110,10 @@
   q_cfg->deltaq_mode = extra_cfg->deltaq_mode;
   q_cfg->use_fixed_qp_offsets =
       cfg->use_fixed_qp_offsets && (rc_cfg->mode == AOM_Q);
+  q_cfg->enable_hdr_deltaq =
+      (q_cfg->deltaq_mode == DELTA_Q_HDR) &&
+      (cfg->g_bit_depth == AOM_BITS_10) &&
+      (extra_cfg->color_primaries == AOM_CICP_CP_BT_2020);
   for (int i = 0; i < FIXED_QP_OFFSET_COUNT; ++i) {
     if (q_cfg->use_fixed_qp_offsets) {
       if (cfg->fixed_qp_offsets[i] >= 0) {  // user-provided qp offset
@@ -1130,7 +1134,10 @@
   // Set cost update frequency configuration.
   oxcf->cost_upd_freq.coeff = (COST_UPDATE_TYPE)extra_cfg->coeff_cost_upd_freq;
   oxcf->cost_upd_freq.mode = (COST_UPDATE_TYPE)extra_cfg->mode_cost_upd_freq;
-  oxcf->cost_upd_freq.mv = (COST_UPDATE_TYPE)extra_cfg->mv_cost_upd_freq;
+  // Avoid MV cost update for allintra encoding mode.
+  oxcf->cost_upd_freq.mv = (cfg->kf_max_dist != 0)
+                               ? (COST_UPDATE_TYPE)extra_cfg->mv_cost_upd_freq
+                               : COST_UPD_OFF;
   oxcf->cost_upd_freq.dv = (COST_UPDATE_TYPE)extra_cfg->dv_cost_upd_freq;
 
   // Set frame resize mode configuration.
@@ -1364,10 +1371,14 @@
   oxcf->unit_test_cfg.sb_multipass_unit_test =
       extra_cfg->sb_multipass_unit_test;
 
+  // For allintra encoding mode, inter-frame motion search is not applicable and
+  // the intraBC motion vectors are restricted within the tile boundaries. Hence
+  // a smaller frame border size (AOM_ENC_ALLINTRA_BORDER) is used in this case.
   oxcf->border_in_pixels =
       (resize_cfg->resize_mode || superres_cfg->superres_mode)
           ? AOM_BORDER_IN_PIXELS
-          : AOM_ENC_NO_SCALE_BORDER;
+          : (oxcf->kf_cfg.key_freq_max == 0) ? AOM_ENC_ALLINTRA_BORDER
+                                             : AOM_ENC_NO_SCALE_BORDER;
   memcpy(oxcf->target_seq_level_idx, extra_cfg->target_seq_level_idx,
          sizeof(oxcf->target_seq_level_idx));
   oxcf->tier_mask = extra_cfg->tier_mask;
@@ -2701,7 +2712,7 @@
 #endif
 
   // Handle fixed keyframe intervals
-  if (is_stat_generation_stage(ppi->cpi)) {
+  if (is_stat_generation_stage(ppi->cpi) || is_one_pass_rt_params(ppi->cpi)) {
     if (ctx->cfg.kf_mode == AOM_KF_AUTO &&
         ctx->cfg.kf_min_dist == ctx->cfg.kf_max_dist) {
       if (ppi->cpi->common.spatial_layer_id == 0 &&

diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 6ea67e1..a4e1bbf 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl

@@ -325,13 +325,13 @@
   specialize qw/av1_block_error sse2 avx2 neon/;
 
   add_proto qw/int64_t av1_block_error_lp/, "const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size";
-  specialize qw/av1_block_error_lp avx2 neon/;
+  specialize qw/av1_block_error_lp sse2 avx2 neon/;
 
   add_proto qw/void av1_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/av1_quantize_fp sse2 avx2 neon/;
 
   add_proto qw/void av1_quantize_lp/, "const int16_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/av1_quantize_lp avx2 neon/;
+  specialize qw/av1_quantize_lp sse2 avx2 neon/;
 
   add_proto qw/void av1_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/av1_quantize_fp_32x32 neon avx2/;

diff --git a/av1/encoder/allintra_vis.c b/av1/encoder/allintra_vis.c
index 780ef1c..84fa763 100644
--- a/av1/encoder/allintra_vis.c
+++ b/av1/encoder/allintra_vis.c

@@ -357,20 +357,6 @@
       weber_stats->distortion -= (dist_mean * dist_mean) / pix_num;
       weber_stats->satd = best_intra_cost;
 
-      double reg = sqrt((double)weber_stats->distortion) *
-                   sqrt((double)weber_stats->src_pix_max) * 0.1;
-      double alpha_den = fabs(weber_stats->rec_pix_max *
-                                  sqrt((double)weber_stats->src_variance) -
-                              weber_stats->src_pix_max *
-                                  sqrt((double)weber_stats->rec_variance)) +
-                         reg;
-      double alpha_num = ((double)weber_stats->distortion) *
-                             sqrt((double)weber_stats->src_variance) *
-                             weber_stats->rec_pix_max +
-                         reg;
-
-      weber_stats->alpha = AOMMAX(alpha_num, 1.0) / AOMMAX(alpha_den, 1.0);
-
       qcoeff[0] = 0;
       for (idx = 1; idx < coeff_count; ++idx) qcoeff[idx] = abs(qcoeff[idx]);
       qsort(qcoeff, coeff_count, sizeof(*coeff), qsort_comp);
@@ -408,8 +394,8 @@
             get_var_perceptual_ai(cpi, cm->seq_params->sb_size, mi_row, mi_col);
 
         double beta = (double)cpi->norm_wiener_variance / sb_wiener_var;
-        double min_max_scale =
-            AOMMAX(1.0, get_max_scale(cpi, bsize, mi_row, mi_col));
+        double min_max_scale = AOMMAX(
+            1.0, get_max_scale(cpi, cm->seq_params->sb_size, mi_row, mi_col));
         beta = 1.0 / AOMMIN(1.0 / beta, min_max_scale);
         beta = AOMMIN(beta, 4);
         beta = AOMMAX(beta, 0.25);
@@ -468,7 +454,8 @@
 }
 
 void av1_set_mb_ur_variance(AV1_COMP *cpi) {
-  const CommonModeInfoParams *const mi_params = &cpi->common.mi_params;
+  const AV1_COMMON *cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
   ThreadData *td = &cpi->td;
   MACROBLOCK *x = &td->mb;
   MACROBLOCKD *xd = &x->e_mbd;
@@ -482,8 +469,21 @@
   const int num_rows = (mi_params->mi_rows + num_mi_h - 1) / num_mi_h;
   const int use_hbd = cpi->source->flags & YV12_FLAG_HIGHBITDEPTH;
 
-  double a = -23.06 * 4.0, b = 0.004065, c = 30.516 * 4.0;
-  int delta_q_avg = 0;
+  int *mb_delta_q[2];
+  CHECK_MEM_ERROR(cm, mb_delta_q[0],
+                  aom_calloc(num_rows * num_cols, sizeof(*mb_delta_q[0])));
+  CHECK_MEM_ERROR(cm, mb_delta_q[1],
+                  aom_calloc(num_rows * num_cols, sizeof(*mb_delta_q[1])));
+
+  // Approximates the model change between current version (Spet 2021) and the
+  // baseline (July 2021).
+  const double model_change[] = { 3.0, 3.0 };
+  // The following parameters are fitted from user labeled data.
+  const double a[] = { -24.50 * 4.0, -17.20 * 4.0 };
+  const double b[] = { 0.004898, 0.003093 };
+  const double c[] = { (29.932 + model_change[0]) * 4.0,
+                       (42.100 + model_change[1]) * 4.0 };
+  int delta_q_avg[2] = { 0, 0 };
   // Loop through each SB block.
   for (int row = 0; row < num_rows; ++row) {
     for (int col = 0; col < num_cols; ++col) {
@@ -504,7 +504,7 @@
           buf.buf = y_buffer + row_offset_y * y_stride + col_offset_y;
           buf.stride = y_stride;
 
-          double block_variance;
+          unsigned int block_variance;
           if (use_hbd) {
             block_variance = av1_high_get_sby_perpixel_variance(
                 cpi, &buf, BLOCK_8X8, xd->bd);
@@ -513,25 +513,57 @@
                 av1_get_sby_perpixel_variance(cpi, &buf, BLOCK_8X8);
           }
 
-          block_variance = block_variance < 1.0 ? 1.0 : block_variance;
-          var += log(block_variance);
+          block_variance = AOMMAX(block_variance, 1);
+          var += log((double)block_variance);
           num_of_var += 1.0;
         }
       }
       var = exp(var / num_of_var);
-      cpi->mb_delta_q[index] = (int)(a * exp(-b * var) + c + 0.5);
-      delta_q_avg += cpi->mb_delta_q[index];
+      mb_delta_q[0][index] = (int)(a[0] * exp(-b[0] * var) + c[0] + 0.5);
+      mb_delta_q[1][index] = (int)(a[1] * exp(-b[1] * var) + c[1] + 0.5);
+      delta_q_avg[0] += mb_delta_q[0][index];
+      delta_q_avg[1] += mb_delta_q[1][index];
     }
   }
 
-  delta_q_avg = (int)((double)delta_q_avg / (num_rows * num_cols) + 0.5);
+  delta_q_avg[0] = RINT((double)delta_q_avg[0] / (num_rows * num_cols));
+  delta_q_avg[1] = RINT((double)delta_q_avg[1] / (num_rows * num_cols));
 
+  int model_idx;
+  double scaling_factor;
+  const int cq_level = cpi->oxcf.rc_cfg.cq_level;
+  if (cq_level < delta_q_avg[0]) {
+    model_idx = 0;
+    scaling_factor = (double)cq_level / delta_q_avg[0];
+  } else if (cq_level < delta_q_avg[1]) {
+    model_idx = 2;
+    scaling_factor =
+        (double)(cq_level - delta_q_avg[0]) / (delta_q_avg[1] - delta_q_avg[0]);
+  } else {
+    model_idx = 1;
+    scaling_factor = (double)(MAXQ - cq_level) / (MAXQ - delta_q_avg[1]);
+  }
+
+  const double new_delta_q_avg =
+      delta_q_avg[0] + scaling_factor * (delta_q_avg[1] - delta_q_avg[0]);
   for (int row = 0; row < num_rows; ++row) {
     for (int col = 0; col < num_cols; ++col) {
       const int index = row * num_cols + col;
-      cpi->mb_delta_q[index] -= delta_q_avg;
+      if (model_idx == 2) {
+        const double delta_q =
+            mb_delta_q[0][index] +
+            scaling_factor * (mb_delta_q[1][index] - mb_delta_q[0][index]);
+        cpi->mb_delta_q[index] = RINT(delta_q - new_delta_q_avg);
+      } else {
+        cpi->mb_delta_q[index] =
+            RINT(scaling_factor *
+                 (mb_delta_q[model_idx][index] - delta_q_avg[model_idx]));
+      }
     }
   }
+
+  aom_free(mb_delta_q[0]);
+  aom_free(mb_delta_q[1]);
 }
 
 int av1_get_sbq_user_rating_based(AV1_COMP *const cpi, int mi_row, int mi_col) {

diff --git a/av1/encoder/aq_complexity.c b/av1/encoder/aq_complexity.c
index 3573b2a..37bc309 100644
--- a/av1/encoder/aq_complexity.c
+++ b/av1/encoder/aq_complexity.c

@@ -46,11 +46,11 @@
 
 static bool is_frame_aq_enabled(const AV1_COMP *const cpi) {
   const AV1_COMMON *const cm = &cpi->common;
-  const RefreshFrameFlagsInfo *const refresh_frame_flags = &cpi->refresh_frame;
+  const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
 
   return frame_is_intra_only(cm) || cm->features.error_resilient_mode ||
-         refresh_frame_flags->alt_ref_frame ||
-         (refresh_frame_flags->golden_frame && !cpi->rc.is_src_frame_alt_ref);
+         refresh_frame->alt_ref_frame ||
+         (refresh_frame->golden_frame && !cpi->rc.is_src_frame_alt_ref);
 }
 
 // Segmentation only makes sense if the target bits per SB is above a threshold.

diff --git a/av1/encoder/aq_cyclicrefresh.c b/av1/encoder/aq_cyclicrefresh.c
index 04df183..cc97802 100644
--- a/av1/encoder/aq_cyclicrefresh.c
+++ b/av1/encoder/aq_cyclicrefresh.c

@@ -21,7 +21,6 @@
 #include "aom_dsp/aom_dsp_common.h"
 
 CYCLIC_REFRESH *av1_cyclic_refresh_alloc(int mi_rows, int mi_cols) {
-  size_t last_coded_q_map_size;
   CYCLIC_REFRESH *const cr = aom_calloc(1, sizeof(*cr));
   if (cr == NULL) return NULL;
 
@@ -30,21 +29,12 @@
     av1_cyclic_refresh_free(cr);
     return NULL;
   }
-  last_coded_q_map_size = mi_rows * mi_cols * sizeof(*cr->last_coded_q_map);
-  cr->last_coded_q_map = aom_malloc(last_coded_q_map_size);
-  if (cr->last_coded_q_map == NULL) {
-    av1_cyclic_refresh_free(cr);
-    return NULL;
-  }
-  assert(MAXQ <= 255);
-  memset(cr->last_coded_q_map, MAXQ, last_coded_q_map_size);
   return cr;
 }
 
 void av1_cyclic_refresh_free(CYCLIC_REFRESH *cr) {
   if (cr != NULL) {
     aom_free(cr->map);
-    aom_free(cr->last_coded_q_map);
     aom_free(cr);
   }
 }
@@ -155,6 +145,7 @@
   const AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
+  int sh = cpi->cyclic_refresh->skip_over4x4 ? 2 : 1;
   const int prev_segment_id = mbmi->segment_id;
   mbmi->segment_id = av1_get_spatial_seg_pred(cm, xd, &cdf_num);
   if (prev_segment_id != mbmi->segment_id) {
@@ -164,8 +155,8 @@
     const int xmis = AOMMIN(cm->mi_params.mi_cols - mi_col, bw);
     const int ymis = AOMMIN(cm->mi_params.mi_rows - mi_row, bh);
     const int block_index = mi_row * cm->mi_params.mi_cols + mi_col;
-    for (int mi_y = 0; mi_y < ymis; mi_y++) {
-      for (int mi_x = 0; mi_x < xmis; mi_x++) {
+    for (int mi_y = 0; mi_y < ymis; mi_y += sh) {
+      for (int mi_x = 0; mi_x < xmis; mi_x += sh) {
         const int map_offset =
             block_index + mi_y * cm->mi_params.mi_cols + mi_x;
         cr->map[map_offset] = 0;
@@ -200,6 +191,7 @@
   const int block_index = mi_row * cm->mi_params.mi_cols + mi_col;
   const int refresh_this_block =
       candidate_refresh_aq(cr, mbmi, rate, dist, bsize);
+  int sh = cpi->cyclic_refresh->skip_over4x4 ? 2 : 1;
   // Default is to not update the refresh map.
   int new_map_value = cr->map[block_index];
 
@@ -229,8 +221,8 @@
 
   // Update entries in the cyclic refresh map with new_map_value, and
   // copy mbmi->segment_id into global segmentation map.
-  for (int mi_y = 0; mi_y < ymis; mi_y++) {
-    for (int mi_x = 0; mi_x < xmis; mi_x++) {
+  for (int mi_y = 0; mi_y < ymis; mi_y += sh) {
+    for (int mi_x = 0; mi_x < xmis; mi_x += sh) {
       const int map_offset = block_index + mi_y * cm->mi_params.mi_cols + mi_x;
       cr->map[map_offset] = new_map_value;
       cpi->enc_seg.map[map_offset] = mbmi->segment_id;
@@ -276,7 +268,9 @@
        !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame &&
        cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)) {
     rc->avg_frame_low_motion =
-        (3 * rc->avg_frame_low_motion + avg_cnt_zeromv) / 4;
+        (rc->avg_frame_low_motion == 0)
+            ? avg_cnt_zeromv
+            : (3 * rc->avg_frame_low_motion + avg_cnt_zeromv) / 4;
     // For SVC: set avg_frame_low_motion (only computed on top spatial layer)
     // to all lower spatial layers.
     if (cpi->ppi->use_svc &&
@@ -299,11 +293,16 @@
   // Set minimum gf_interval for GF update to a multiple of the refresh period,
   // with some max limit. Depending on past encoding stats, GF flag may be
   // reset and update may not occur until next baseline_gf_interval.
+  const int gf_length_mult[2] = { 8, 4 };
   if (cr->percent_refresh > 0)
-    p_rc->baseline_gf_interval = AOMMIN(2 * (100 / cr->percent_refresh), 40);
+    p_rc->baseline_gf_interval =
+        AOMMIN(gf_length_mult[cpi->sf.rt_sf.gf_length_lvl] *
+                   (100 / cr->percent_refresh),
+               MAX_GF_INTERVAL_RT);
   else
-    p_rc->baseline_gf_interval = 20;
-  if (rc->avg_frame_low_motion < 40) p_rc->baseline_gf_interval = 8;
+    p_rc->baseline_gf_interval = FIXED_GF_INTERVAL_RT;
+  if (rc->avg_frame_low_motion && rc->avg_frame_low_motion < 40)
+    p_rc->baseline_gf_interval = 16;
 }
 
 // Update the segmentation map, and related quantities: cyclic refresh map,
@@ -342,13 +341,6 @@
     int sb_col_index = i - sb_row_index * sb_cols;
     int mi_row = sb_row_index * cm->seq_params->mib_size;
     int mi_col = sb_col_index * cm->seq_params->mib_size;
-    // TODO(any): Ensure the population of
-    // cpi->common.features.allow_screen_content_tools and use the same instead
-    // of cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN
-    int qindex_thresh = cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN
-                            ? av1_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST2,
-                                             cm->quant_params.base_qindex)
-                            : 0;
     assert(mi_row >= 0 && mi_row < mi_params->mi_rows);
     assert(mi_col >= 0 && mi_col < mi_params->mi_cols);
     bl_index = mi_row * mi_params->mi_cols + mi_col;
@@ -363,7 +355,7 @@
         // for possible boost/refresh (segment 1). The segment id may get
         // reset to 0 later if block gets coded anything other than GLOBALMV.
         if (cr->map[bl_index2] == 0) {
-          if (cr->last_coded_q_map[bl_index2] > qindex_thresh) sum_map += 4;
+          sum_map += 4;
         } else if (cr->map[bl_index2] < 0) {
           cr->map[bl_index2]++;
         }
@@ -399,10 +391,21 @@
   double weight_segment = 0;
   int qp_thresh = AOMMIN(20, rc->best_quality << 1);
   int qp_max_thresh = 118 * MAXQ >> 7;
+  // Although this segment feature for RTC is only used for
+  // blocks >= 8X8, for more efficient coding of the seg map
+  // cur_frame->seg_map needs to set at 4x4 along with the
+  // function av1_cyclic_reset_segment_skip(). Skipping over
+  // 4x4 will therefore have small bdrate loss (~0.2%), so
+  // we use it only for speed > 9 for now.
+  // Also if loop-filter deltas is applied via segment, then
+  // we need to set cr->skip_over4x4 = 1.
+  cr->skip_over4x4 = (cpi->oxcf.speed > 9) ? 1 : 0;
   cr->apply_cyclic_refresh = 1;
   if (frame_is_intra_only(cm) || is_lossless_requested(&cpi->oxcf.rc_cfg) ||
       cpi->svc.temporal_layer_id > 0 ||
       p_rc->avg_frame_qindex[INTER_FRAME] < qp_thresh ||
+      (cpi->svc.number_spatial_layers > 1 &&
+       cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame) ||
       (rc->frames_since_key > 20 &&
        p_rc->avg_frame_qindex[INTER_FRAME] > qp_max_thresh) ||
       (rc->avg_frame_low_motion < 45 && rc->frames_since_key > 40)) {
@@ -410,6 +413,7 @@
     return;
   }
   cr->percent_refresh = 10;
+  if (cpi->svc.number_temporal_layers > 2) cr->percent_refresh = 15;
   cr->max_qdelta_perc = 60;
   cr->time_for_refresh = 0;
   cr->motion_thresh = 32;
@@ -418,7 +422,8 @@
   // periods of the refresh cycle, after a key frame.
   // Account for larger interval on base layer for temporal layers.
   if (cr->percent_refresh > 0 &&
-      rc->frames_since_key < 400 / cr->percent_refresh) {
+      rc->frames_since_key <
+          (4 * cpi->svc.number_temporal_layers) * (100 / cr->percent_refresh)) {
     cr->rate_ratio_qdelta = 3.0;
   } else {
     cr->rate_ratio_qdelta = 2.0;
@@ -477,9 +482,6 @@
     memset(seg_map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols);
     av1_disable_segmentation(&cm->seg);
     if (cm->current_frame.frame_type == KEY_FRAME) {
-      memset(cr->last_coded_q_map, MAXQ,
-             cm->mi_params.mi_rows * cm->mi_params.mi_cols *
-                 sizeof(*cr->last_coded_q_map));
       cr->sb_index = 0;
     }
     return;

diff --git a/av1/encoder/aq_cyclicrefresh.h b/av1/encoder/aq_cyclicrefresh.h
index 1c0d5cb..4e4e1f2 100644
--- a/av1/encoder/aq_cyclicrefresh.h
+++ b/av1/encoder/aq_cyclicrefresh.h

@@ -80,10 +80,6 @@
    */
   int8_t *map;
   /*!
-   * Map of the last q a block was coded at.
-   */
-  uint8_t *last_coded_q_map;
-  /*!
    * Threshold applied to the projected rate of the coding block,
    * when deciding whether block should be refreshed.
    */
@@ -111,6 +107,7 @@
   int qindex_delta[3];
   double weight_segment;
   int apply_cyclic_refresh;
+  int skip_over4x4;
   /*!\endcond */
 };
 

diff --git a/av1/encoder/aq_variance.c b/av1/encoder/aq_variance.c
index c2d0a07..3273ef8 100644
--- a/av1/encoder/aq_variance.c
+++ b/av1/encoder/aq_variance.c

@@ -43,7 +43,7 @@
 
 void av1_vaq_frame_setup(AV1_COMP *cpi) {
   AV1_COMMON *cm = &cpi->common;
-  const RefreshFrameFlagsInfo *const refresh_frame_flags = &cpi->refresh_frame;
+  const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
   const int base_qindex = cm->quant_params.base_qindex;
   struct segmentation *seg = &cm->seg;
   int i;
@@ -64,8 +64,8 @@
     return;
   }
   if (frame_is_intra_only(cm) || cm->features.error_resilient_mode ||
-      refresh_frame_flags->alt_ref_frame ||
-      (refresh_frame_flags->golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
+      refresh_frame->alt_ref_frame ||
+      (refresh_frame->golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
     cpi->vaq_refresh = 1;
 
     av1_enable_segmentation(seg);
@@ -124,13 +124,13 @@
                           x->plane[0].src.buf + i * x->plane[0].src.stride + j,
                           x->plane[0].src.stride,
                           CONVERT_TO_BYTEPTR(av1_highbd_all_zeros), 0, &sse) /
-                          16);
+                          16.0);
       } else {
         var +=
             log(1.0 + cpi->ppi->fn_ptr[BLOCK_4X4].vf(
                           x->plane[0].src.buf + i * x->plane[0].src.stride + j,
                           x->plane[0].src.stride, av1_all_zeros, 0, &sse) /
-                          16);
+                          16.0);
       }
     }
   }
@@ -141,6 +141,34 @@
   return (int)(var);
 }
 
+int av1_log_block_avg(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs,
+                      int mi_row, int mi_col) {
+  // This functions returns the block average of luma block
+  unsigned int sum, avg, num_pix;
+  int r, c;
+  const int pic_w = cpi->common.width;
+  const int pic_h = cpi->common.height;
+  const int bw = MI_SIZE * mi_size_wide[bs];
+  const int bh = MI_SIZE * mi_size_high[bs];
+  const uint16_t *x16 = CONVERT_TO_SHORTPTR(x->plane[0].src.buf);
+
+  sum = 0;
+  num_pix = 0;
+  avg = 0;
+  int row = mi_row << MI_SIZE_LOG2;
+  int col = mi_col << MI_SIZE_LOG2;
+  for (r = row; (r < (row + bh)) && (r < pic_h); r++) {
+    for (c = col; (c < (col + bw)) && (c < pic_w); c++) {
+      sum += *(x16 + r * x->plane[0].src.stride + c);
+      num_pix++;
+    }
+  }
+  if (num_pix != 0) {
+    avg = sum / num_pix;
+  }
+  return avg;
+}
+
 #define DEFAULT_E_MIDPOINT 10.0
 
 static unsigned int haar_ac_energy(MACROBLOCK *x, BLOCK_SIZE bs) {

diff --git a/av1/encoder/aq_variance.h b/av1/encoder/aq_variance.h
index 543eb0b..aa0535a 100644
--- a/av1/encoder/aq_variance.h
+++ b/av1/encoder/aq_variance.h

@@ -21,6 +21,8 @@
 void av1_vaq_frame_setup(AV1_COMP *cpi);
 
 int av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs);
+int av1_log_block_avg(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs,
+                      int mi_row, int mi_col);
 int av1_compute_q_from_energy_level_deltaq_mode(const AV1_COMP *const cpi,
                                                 int block_var_level);
 int av1_block_wavelet_energy_level(const AV1_COMP *cpi, MACROBLOCK *x,

diff --git a/av1/encoder/av1_noise_estimate.c b/av1/encoder/av1_noise_estimate.c
index bfa0a74..4419085 100644
--- a/av1/encoder/av1_noise_estimate.c
+++ b/av1/encoder/av1_noise_estimate.c

@@ -53,12 +53,7 @@
 }
 
 static int enable_noise_estimation(AV1_COMP *const cpi) {
-  ResizePendingParams *const resize_pending_params =
-      &cpi->resize_pending_params;
-  const int resize_pending =
-      (resize_pending_params->width && resize_pending_params->height &&
-       (cpi->common.width != resize_pending_params->width ||
-        cpi->common.height != resize_pending_params->height));
+  const int resize_pending = is_frame_resize_pending(cpi);
 
 #if CONFIG_AV1_HIGHBITDEPTH
   if (cpi->common.seq_params->use_highbitdepth) return 0;

diff --git a/av1/encoder/av1_quantize.c b/av1/encoder/av1_quantize.c
index 66be3b6..105897e 100644
--- a/av1/encoder/av1_quantize.c
+++ b/av1/encoder/av1_quantize.c

@@ -764,14 +764,34 @@
   av1_init_plane_quantizers(cpi, x, xd->mi[0]->segment_id);
 }
 
+static int adjust_hdr_cb_deltaq(int base_qindex) {
+  double baseQp = base_qindex / QP_SCALE_FACTOR;
+  const double chromaQp = CHROMA_QP_SCALE * baseQp + CHROMA_QP_OFFSET;
+  const double dcbQP = CHROMA_CB_QP_SCALE * chromaQp * QP_SCALE_FACTOR;
+  int dqpCb = (int)(dcbQP + (dcbQP < 0 ? -0.5 : 0.5));
+  dqpCb = AOMMIN(0, dqpCb);
+  dqpCb = (int)CLIP(dqpCb, -12 * QP_SCALE_FACTOR, 12 * QP_SCALE_FACTOR);
+  return dqpCb;
+}
+
+static int adjust_hdr_cr_deltaq(int base_qindex) {
+  double baseQp = base_qindex / QP_SCALE_FACTOR;
+  const double chromaQp = CHROMA_QP_SCALE * baseQp + CHROMA_QP_OFFSET;
+  const double dcrQP = CHROMA_CR_QP_SCALE * chromaQp * QP_SCALE_FACTOR;
+  int dqpCr = (int)(dcrQP + (dcrQP < 0 ? -0.5 : 0.5));
+  dqpCr = AOMMIN(0, dqpCr);
+  dqpCr = (int)CLIP(dqpCr, -12 * QP_SCALE_FACTOR, 12 * QP_SCALE_FACTOR);
+  return dqpCr;
+}
+
 void av1_set_quantizer(AV1_COMMON *const cm, int min_qmlevel, int max_qmlevel,
-                       int q, int enable_chroma_deltaq) {
+                       int q, int enable_chroma_deltaq, int enable_hdr_deltaq) {
   // quantizer has to be reinitialized with av1_init_quantizer() if any
   // delta_q changes.
   CommonQuantParams *quant_params = &cm->quant_params;
   quant_params->base_qindex = AOMMAX(cm->delta_q_info.delta_q_present_flag, q);
-
   quant_params->y_dc_delta_q = 0;
+
   if (enable_chroma_deltaq) {
     // TODO(aomedia:2717): need to design better delta
     quant_params->u_dc_delta_q = 2;
@@ -785,6 +805,18 @@
     quant_params->v_ac_delta_q = 0;
   }
 
+  // following section 8.3.2 in T-REC-H.Sup15 document
+  // to apply to AV1 qindex in the range of [0, 255]
+  if (enable_hdr_deltaq) {
+    int dqpCb = adjust_hdr_cb_deltaq(quant_params->base_qindex);
+    int dqpCr = adjust_hdr_cr_deltaq(quant_params->base_qindex);
+    quant_params->u_dc_delta_q = quant_params->u_ac_delta_q = dqpCb;
+    quant_params->v_dc_delta_q = quant_params->v_ac_delta_q = dqpCr;
+    if (dqpCb != dqpCr) {
+      cm->seq_params->separate_uv_delta_q = 1;
+    }
+  }
+
   quant_params->qmatrix_level_y =
       aom_get_qmlevel(quant_params->base_qindex, min_qmlevel, max_qmlevel);
   quant_params->qmatrix_level_u =

diff --git a/av1/encoder/av1_quantize.h b/av1/encoder/av1_quantize.h
index 62b53b8..085ab17 100644
--- a/av1/encoder/av1_quantize.h
+++ b/av1/encoder/av1_quantize.h

@@ -106,7 +106,8 @@
                         aom_bit_depth_t bit_depth);
 
 void av1_set_quantizer(struct AV1Common *const cm, int min_qmlevel,
-                       int max_qmlevel, int q, int enable_chroma_deltaq);
+                       int max_qmlevel, int q, int enable_chroma_deltaq,
+                       int enable_hdr_deltaq);
 
 int av1_quantizer_to_qindex(int quantizer);
 

diff --git a/av1/encoder/av1_temporal_denoiser.c b/av1/encoder/av1_temporal_denoiser.c
index 96f3d7d..26e0eda7 100644
--- a/av1/encoder/av1_temporal_denoiser.c
+++ b/av1/encoder/av1_temporal_denoiser.c

@@ -718,11 +718,7 @@
                                 ? KEY_FRAME
                                 : cm->current_frame.frame_type;
     cpi->denoiser.current_denoiser_frame++;
-    const int resize_pending =
-        (cpi->resize_pending_params.width &&
-         cpi->resize_pending_params.height &&
-         (cpi->common.width != cpi->resize_pending_params.width ||
-          cpi->common.height != cpi->resize_pending_params.height));
+    const int resize_pending = is_frame_resize_pending(cpi);
 
     if (cpi->ppi->use_svc) {
 // TODO(kyslov) Enable when SVC temporal denosing is implemented

diff --git a/av1/encoder/block.h b/av1/encoder/block.h
index 3198c80..9696859 100644
--- a/av1/encoder/block.h
+++ b/av1/encoder/block.h

@@ -252,8 +252,8 @@
  */
 typedef struct {
   //! Circular buffer that stores the txfm search results.
-  MB_RD_INFO tx_rd_info[RD_RECORD_BUFFER_LEN];  // Circular buffer.
-  //! Index to insert the newest \ref TXB_RD_INFO.
+  MB_RD_INFO tx_rd_info[RD_RECORD_BUFFER_LEN];
+  //! Index to insert the newest rd record.
   int index_start;
   //! Number of info stored in this record.
   int num;
@@ -261,44 +261,6 @@
   CRC32C crc_calculator;
 } MB_RD_RECORD;
 
-/*! \brief Txfm search results for a tx block.
- */
-typedef struct {
-  //! Distortion after the txfm process
-  int64_t dist;
-  //! SSE of the prediction before the txfm process
-  int64_t sse;
-  //! Rate used to encode the txfm.
-  int rate;
-  //! Location of the end of non-zero entries.
-  uint16_t eob;
-  //! Transform type used on the current block.
-  TX_TYPE tx_type;
-  //! Unknown usage
-  uint16_t entropy_context;
-  //! Context used to code the coefficients.
-  uint8_t txb_entropy_ctx;
-  //! Whether the current info block contains  valid info
-  uint8_t valid;
-  //! Unused
-  uint8_t fast;
-  //! Whether trellis optimization is done.
-  uint8_t perform_block_coeff_opt;
-} TXB_RD_INFO;
-
-/*! \brief Hash records of txfm search result for each tx block.
- */
-typedef struct {
-  //! The hash values.
-  uint32_t hash_vals[TX_SIZE_RD_RECORD_BUFFER_LEN];
-  //! The txfm search results
-  TXB_RD_INFO tx_rd_info[TX_SIZE_RD_RECORD_BUFFER_LEN];
-  //! Index to insert the newest \ref TXB_RD_INFO.
-  int index_start;
-  //! Number of info stored in this record.
-  int num;
-} TXB_RD_RECORD;
-
 //! Number of compound rd stats
 #define MAX_COMP_RD_STATS 64
 /*! \brief Rdcost stats in compound mode.
@@ -499,16 +461,6 @@
   //! Txfm hash record for the whole coding block.
   MB_RD_RECORD mb_rd_record;
 
-  //! Inter mode txfm hash record for TX_8X8 blocks.
-  TXB_RD_RECORD txb_rd_record_8X8[MAX_NUM_8X8_TXBS];
-  //! Inter mode txfm hash record for TX_16X16 blocks.
-  TXB_RD_RECORD txb_rd_record_16X16[MAX_NUM_16X16_TXBS];
-  //! Inter mode txfm hash record for TX_32X32 blocks.
-  TXB_RD_RECORD txb_rd_record_32X32[MAX_NUM_32X32_TXBS];
-  //! Inter mode txfm hash record for TX_64X64 blocks.
-  TXB_RD_RECORD txb_rd_record_64X64[MAX_NUM_64X64_TXBS];
-  //! Intra mode txfm hash record for square tx blocks.
-  TXB_RD_RECORD txb_rd_record_intra;
   /**@}*/
 } TxbRdRecords;
 

diff --git a/av1/encoder/encode_strategy.c b/av1/encoder/encode_strategy.c
index b38bd3e..fe2b9ae 100644
--- a/av1/encoder/encode_strategy.c
+++ b/av1/encoder/encode_strategy.c

@@ -40,17 +40,18 @@
 #define TEMPORAL_FILTER_KEY_FRAME (CONFIG_REALTIME_ONLY ? 0 : 1)
 
 static INLINE void set_refresh_frame_flags(
-    RefreshFrameFlagsInfo *const refresh_frame_flags, bool refresh_gf,
-    bool refresh_bwdref, bool refresh_arf) {
-  refresh_frame_flags->golden_frame = refresh_gf;
-  refresh_frame_flags->bwd_ref_frame = refresh_bwdref;
-  refresh_frame_flags->alt_ref_frame = refresh_arf;
+    RefreshFrameInfo *const refresh_frame, bool refresh_gf, bool refresh_bwdref,
+    bool refresh_arf) {
+  refresh_frame->golden_frame = refresh_gf;
+  refresh_frame->bwd_ref_frame = refresh_bwdref;
+  refresh_frame->alt_ref_frame = refresh_arf;
 }
 
-void av1_configure_buffer_updates(
-    AV1_COMP *const cpi, RefreshFrameFlagsInfo *const refresh_frame_flags,
-    const FRAME_UPDATE_TYPE type, const REFBUF_STATE refbuf_state,
-    int force_refresh_all) {
+void av1_configure_buffer_updates(AV1_COMP *const cpi,
+                                  RefreshFrameInfo *const refresh_frame,
+                                  const FRAME_UPDATE_TYPE type,
+                                  const REFBUF_STATE refbuf_state,
+                                  int force_refresh_all) {
   // NOTE(weitinglin): Should we define another function to take care of
   // cpi->rc.is_$Source_Type to make this function as it is in the comment?
   const ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags =
@@ -59,22 +60,22 @@
 
   switch (type) {
     case KF_UPDATE:
-      set_refresh_frame_flags(refresh_frame_flags, true, true, true);
+      set_refresh_frame_flags(refresh_frame, true, true, true);
       break;
 
     case LF_UPDATE:
-      set_refresh_frame_flags(refresh_frame_flags, false, false, false);
+      set_refresh_frame_flags(refresh_frame, false, false, false);
       break;
 
     case GF_UPDATE:
-      set_refresh_frame_flags(refresh_frame_flags, true, false, false);
+      set_refresh_frame_flags(refresh_frame, true, false, false);
       break;
 
     case OVERLAY_UPDATE:
       if (refbuf_state == REFBUF_RESET)
-        set_refresh_frame_flags(refresh_frame_flags, true, true, true);
+        set_refresh_frame_flags(refresh_frame, true, true, true);
       else
-        set_refresh_frame_flags(refresh_frame_flags, true, false, false);
+        set_refresh_frame_flags(refresh_frame, true, false, false);
 
       cpi->rc.is_src_frame_alt_ref = 1;
       break;
@@ -82,19 +83,19 @@
     case ARF_UPDATE:
       // NOTE: BWDREF does not get updated along with ALTREF_FRAME.
       if (refbuf_state == REFBUF_RESET)
-        set_refresh_frame_flags(refresh_frame_flags, true, true, true);
+        set_refresh_frame_flags(refresh_frame, true, true, true);
       else
-        set_refresh_frame_flags(refresh_frame_flags, false, false, true);
+        set_refresh_frame_flags(refresh_frame, false, false, true);
 
       break;
 
     case INTNL_OVERLAY_UPDATE:
-      set_refresh_frame_flags(refresh_frame_flags, false, false, false);
+      set_refresh_frame_flags(refresh_frame, false, false, false);
       cpi->rc.is_src_frame_alt_ref = 1;
       break;
 
     case INTNL_ARF_UPDATE:
-      set_refresh_frame_flags(refresh_frame_flags, false, true, false);
+      set_refresh_frame_flags(refresh_frame, false, true, false);
       break;
 
     default: assert(0); break;
@@ -102,7 +103,7 @@
 
   if (ext_refresh_frame_flags->update_pending &&
       (!is_stat_generation_stage(cpi))) {
-    set_refresh_frame_flags(refresh_frame_flags,
+    set_refresh_frame_flags(refresh_frame,
                             ext_refresh_frame_flags->golden_frame,
                             ext_refresh_frame_flags->bwd_ref_frame,
                             ext_refresh_frame_flags->alt_ref_frame);
@@ -116,7 +117,7 @@
   }
 
   if (force_refresh_all)
-    set_refresh_frame_flags(refresh_frame_flags, true, true, true);
+    set_refresh_frame_flags(refresh_frame, true, true, true);
 }
 
 static void set_additional_frame_flags(const AV1_COMMON *const cm,
@@ -222,8 +223,12 @@
 
   if (this_duration) {
     if (step) {
+#if CONFIG_FRAME_PARALLEL_ENCODE
+      cpi->new_framerate = 10000000.0 / this_duration;
+#endif
       av1_new_framerate(cpi, 10000000.0 / this_duration);
     } else {
+      double framerate;
       // Average this frame's rate into the last second's average
       // frame rate. If we haven't seen 1 second yet, then average
       // over the whole interval seen.
@@ -232,10 +237,21 @@
       double avg_duration = 10000000.0 / cpi->framerate;
       avg_duration *= (interval - avg_duration + this_duration);
       avg_duration /= interval;
-
-      av1_new_framerate(cpi, 10000000.0 / avg_duration);
+#if CONFIG_FRAME_PARALLEL_ENCODE
+      cpi->new_framerate = (10000000.0 / avg_duration);
+      // For parallel frames update cpi->framerate with new_framerate
+      // during av1_post_encode_updates()
+      framerate =
+          (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0)
+              ? cpi->framerate
+              : cpi->new_framerate;
+#else
+      framerate = (10000000.0 / avg_duration);
+#endif
+      av1_new_framerate(cpi, framerate);
     }
   }
+
   time_stamps->prev_ts_start = ts_start;
   time_stamps->prev_ts_end = ts_end;
 }
@@ -362,10 +378,9 @@
 
 // Update frame_flags to tell the encoder's caller what sort of frame was
 // encoded.
-static void update_frame_flags(
-    const AV1_COMMON *const cm,
-    const RefreshFrameFlagsInfo *const refresh_frame_flags,
-    unsigned int *frame_flags) {
+static void update_frame_flags(const AV1_COMMON *const cm,
+                               const RefreshFrameInfo *const refresh_frame,
+                               unsigned int *frame_flags) {
   if (encode_show_existing_frame(cm)) {
     *frame_flags &= ~FRAMEFLAGS_GOLDEN;
     *frame_flags &= ~FRAMEFLAGS_BWDREF;
@@ -374,19 +389,19 @@
     return;
   }
 
-  if (refresh_frame_flags->golden_frame) {
+  if (refresh_frame->golden_frame) {
     *frame_flags |= FRAMEFLAGS_GOLDEN;
   } else {
     *frame_flags &= ~FRAMEFLAGS_GOLDEN;
   }
 
-  if (refresh_frame_flags->alt_ref_frame) {
+  if (refresh_frame->alt_ref_frame) {
     *frame_flags |= FRAMEFLAGS_ALTREF;
   } else {
     *frame_flags &= ~FRAMEFLAGS_ALTREF;
   }
 
-  if (refresh_frame_flags->bwd_ref_frame) {
+  if (refresh_frame->bwd_ref_frame) {
     *frame_flags |= FRAMEFLAGS_BWDREF;
   } else {
     *frame_flags &= ~FRAMEFLAGS_BWDREF;
@@ -969,9 +984,9 @@
         gf_group->refbuf_state[cpi->gf_frame_index] == REFBUF_UPDATE)
       is_forward_keyframe = 1;
 
-    const int code_arf =
-        av1_temporal_filter(cpi, arf_src_index, update_type,
-                            is_forward_keyframe, &show_existing_alt_ref);
+    const int code_arf = av1_temporal_filter(
+        cpi, arf_src_index, update_type, is_forward_keyframe,
+        &show_existing_alt_ref, &cpi->ppi->alt_ref_buffer);
     if (code_arf) {
       aom_extend_frame_borders(&cpi->ppi->alt_ref_buffer, av1_num_planes(cm));
       frame_input->source = &cpi->ppi->alt_ref_buffer;
@@ -1229,9 +1244,7 @@
         gf_group->frame_parallel_level[gf_index - 1] == 1 &&
         gf_group->update_type[gf_index - 1] == INTNL_ARF_UPDATE) {
       assert(gf_group->update_type[gf_index] == INTNL_ARF_UPDATE);
-      // TODO(Remya): Use original value of is_parallel_encode when FPMT is
-      // enabled.
-      is_parallel_encode = 0;
+
       // If parallel cpis are active, use ref_idx_to_skip, else, use display
       // index.
       assert(IMPLIES(is_parallel_encode, cpi->ref_idx_to_skip != INVALID_IDX));
@@ -1827,6 +1840,9 @@
   }
 #endif  // CONFIG_REALTIME_ONLY
 
+  // As the frame_update_type can get modified as part of
+  // av1_adjust_gf_refresh_qp_one_pass_rt
+  frame_update_type = get_frame_update_type(gf_group, cpi->gf_frame_index);
   if (!is_stat_generation_stage(cpi)) {
     // First pass doesn't modify reference buffer assignment or produce frame
     // flags

diff --git a/av1/encoder/encode_strategy.h b/av1/encoder/encode_strategy.h
index 0ae4c05..15681c3 100644
--- a/av1/encoder/encode_strategy.h
+++ b/av1/encoder/encode_strategy.h

@@ -62,10 +62,11 @@
 // Set individual buffer update flags based on frame reference type.
 // force_refresh_all is used when we have a KEY_FRAME or S_FRAME.  It forces all
 // refresh_*_frame flags to be set, because we refresh all buffers in this case.
-void av1_configure_buffer_updates(
-    AV1_COMP *const cpi, RefreshFrameFlagsInfo *const refresh_frame_flags,
-    const FRAME_UPDATE_TYPE type, const REFBUF_STATE refbuf_state,
-    int force_refresh_all);
+void av1_configure_buffer_updates(AV1_COMP *const cpi,
+                                  RefreshFrameInfo *const refresh_frame,
+                                  const FRAME_UPDATE_TYPE type,
+                                  const REFBUF_STATE refbuf_state,
+                                  int force_refresh_all);
 
 int av1_get_refresh_frame_flags(const AV1_COMP *const cpi,
                                 const EncodeFrameParams *const frame_params,

diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index 04e7e5e..ff9b8e0 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c

@@ -241,6 +241,8 @@
     current_qindex = av1_get_sbq_perceptual_ai(cpi, sb_size, mi_row, mi_col);
   } else if (cpi->oxcf.q_cfg.deltaq_mode == DELTA_Q_USER_RATING_BASED) {
     current_qindex = av1_get_sbq_user_rating_based(cpi, mi_row, mi_col);
+  } else if (cpi->oxcf.q_cfg.enable_hdr_deltaq) {
+    current_qindex = av1_get_q_for_hdr(cpi, x, sb_size, mi_row, mi_col);
   }
 
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -565,13 +567,8 @@
   }
 
 #if !CONFIG_REALTIME_ONLY
-  if (has_no_stats_stage(cpi) && cpi->oxcf.mode == REALTIME &&
-      cpi->oxcf.gf_cfg.lag_in_frames == 0) {
-    (void)tile_info;
-    (void)mi_row;
-    (void)mi_col;
-    (void)gather_tpl_data;
-  } else {
+  if (!(has_no_stats_stage(cpi) && cpi->oxcf.mode == REALTIME &&
+        cpi->oxcf.gf_cfg.lag_in_frames == 0)) {
     init_ref_frame_space(cpi, td, mi_row, mi_col);
     x->sb_energy_level = 0;
     x->part_search_info.cnn_output_valid = 0;
@@ -594,8 +591,7 @@
   (void)gather_tpl_data;
 #endif
 
-  // Reset hash state for transform/mode rd hash information
-  reset_hash_records(&x->txfm_search_info, cpi->sf.tx_sf.use_inter_txb_hash);
+  reset_hash_records(&x->txfm_search_info);
   av1_zero(x->picked_ref_frames_mask);
   av1_invalid_rd_stats(rd_cost);
 }
@@ -1367,7 +1363,8 @@
       cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_PERCEPTUAL;
     else if (deltaq_mode == DELTA_Q_USER_RATING_BASED)
       cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_PERCEPTUAL;
-
+    else if (deltaq_mode == DELTA_Q_HDR)
+      cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_PERCEPTUAL;
     // Set delta_q_present_flag before it is used for the first time
     cm->delta_q_info.delta_lf_res = DEFAULT_DELTA_LF_RES;
     cm->delta_q_info.delta_q_present_flag = deltaq_mode != NO_DELTA_Q;

diff --git a/av1/encoder/encodeframe_utils.c b/av1/encoder/encodeframe_utils.c
index 89c2322..9fbd68c 100644
--- a/av1/encoder/encodeframe_utils.c
+++ b/av1/encoder/encodeframe_utils.c

@@ -15,6 +15,7 @@
 #include "av1/encoder/encodeframe_utils.h"
 #include "av1/encoder/partition_strategy.h"
 #include "av1/encoder/rdopt.h"
+#include "av1/encoder/aq_variance.h"
 
 void av1_set_ssim_rdmult(const AV1_COMP *const cpi, int *errorperbit,
                          const BLOCK_SIZE bsize, const int mi_row,
@@ -957,6 +958,49 @@
 
   return qindex;
 }
+
+#if !DISABLE_HDR_LUMA_DELTAQ
+// offset table defined in Table3 of T-REC-H.Sup15 document.
+static const int hdr_thres[HDR_QP_LEVELS + 1] = { 0,   301, 367, 434, 501, 567,
+                                                  634, 701, 767, 834, 1024 };
+
+static const int hdr10_qp_offset[HDR_QP_LEVELS] = { 3,  2,  1,  0,  -1,
+                                                    -2, -3, -4, -5, -6 };
+#endif
+
+int av1_get_q_for_hdr(AV1_COMP *const cpi, MACROBLOCK *const x,
+                      BLOCK_SIZE bsize, int mi_row, int mi_col) {
+  AV1_COMMON *const cm = &cpi->common;
+  assert(cm->seq_params->bit_depth == AOM_BITS_10);
+
+#if DISABLE_HDR_LUMA_DELTAQ
+  (void)x;
+  (void)bsize;
+  (void)mi_row;
+  (void)mi_col;
+  return cm->quant_params.base_qindex;
+#else
+  // calculate pixel average
+  const int block_luma_avg = av1_log_block_avg(cpi, x, bsize, mi_row, mi_col);
+  // adjust offset based on average of the pixel block
+  int offset = 0;
+  for (int i = 0; i < HDR_QP_LEVELS; i++) {
+    if (block_luma_avg >= hdr_thres[i] && block_luma_avg < hdr_thres[i + 1]) {
+      offset = (int)(hdr10_qp_offset[i] * QP_SCALE_FACTOR);
+      break;
+    }
+  }
+
+  const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
+  offset = AOMMIN(offset, delta_q_info->delta_q_res * 9 - 1);
+  offset = AOMMAX(offset, -delta_q_info->delta_q_res * 9 + 1);
+  int qindex = cm->quant_params.base_qindex + offset;
+  qindex = AOMMIN(qindex, MAXQ);
+  qindex = AOMMAX(qindex, MINQ);
+
+  return qindex;
+#endif
+}
 #endif  // !CONFIG_REALTIME_ONLY
 
 void av1_reset_simple_motion_tree_partition(SIMPLE_MOTION_DATA_TREE *sms_tree,

diff --git a/av1/encoder/encodeframe_utils.h b/av1/encoder/encodeframe_utils.h
index 94abc42..3604616 100644
--- a/av1/encoder/encodeframe_utils.h
+++ b/av1/encoder/encodeframe_utils.h

@@ -337,6 +337,9 @@
 int av1_get_q_for_deltaq_objective(AV1_COMP *const cpi, BLOCK_SIZE bsize,
                                    int mi_row, int mi_col);
 
+int av1_get_q_for_hdr(AV1_COMP *const cpi, MACROBLOCK *const x,
+                      BLOCK_SIZE bsize, int mi_row, int mi_col);
+
 int av1_get_hier_tpl_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x,
                             const BLOCK_SIZE bsize, const int mi_row,
                             const int mi_col, int orig_rdmult);

diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index 6b40ea0..4b657c8 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c

@@ -682,7 +682,7 @@
   MACROBLOCK *const x = &cpi->td.mb;
   AV1LevelParams *const level_params = &cpi->ppi->level_params;
   InitialDimensions *const initial_dimensions = &cpi->initial_dimensions;
-  RefreshFrameFlagsInfo *const refresh_frame_flags = &cpi->refresh_frame;
+  RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
   const FrameDimensionCfg *const frm_dim_cfg = &cpi->oxcf.frm_dim_cfg;
   const RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
 
@@ -735,8 +735,8 @@
     p_rc->baseline_gf_interval = (MIN_GF_INTERVAL + MAX_GF_INTERVAL) / 2;
   }
 
-  refresh_frame_flags->golden_frame = false;
-  refresh_frame_flags->bwd_ref_frame = false;
+  refresh_frame->golden_frame = false;
+  refresh_frame->bwd_ref_frame = false;
 
   cm->features.refresh_frame_context =
       (oxcf->tool_cfg.frame_parallel_decoding_mode)
@@ -760,12 +760,17 @@
         aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE * sizeof(*x->tmp_conv_dst)));
     x->e_mbd.tmp_conv_dst = x->tmp_conv_dst;
   }
-  for (int i = 0; i < 2; ++i) {
-    if (x->tmp_pred_bufs[i] == NULL) {
-      CHECK_MEM_ERROR(cm, x->tmp_pred_bufs[i],
-                      aom_memalign(32, 2 * MAX_MB_PLANE * MAX_SB_SQUARE *
-                                           sizeof(*x->tmp_pred_bufs[i])));
-      x->e_mbd.tmp_obmc_bufs[i] = x->tmp_pred_bufs[i];
+  // The buffers 'tmp_pred_bufs[]' are used in inter frames to store temporary
+  // prediction results. Hence, the memory allocation is avoided for allintra
+  // encode.
+  if (cpi->oxcf.kf_cfg.key_freq_max != 0) {
+    for (int i = 0; i < 2; ++i) {
+      if (x->tmp_pred_bufs[i] == NULL) {
+        CHECK_MEM_ERROR(cm, x->tmp_pred_bufs[i],
+                        aom_memalign(32, 2 * MAX_MB_PLANE * MAX_SB_SQUARE *
+                                             sizeof(*x->tmp_pred_bufs[i])));
+        x->e_mbd.tmp_obmc_bufs[i] = x->tmp_pred_bufs[i];
+      }
     }
   }
 
@@ -2074,10 +2079,7 @@
   if (!is_stat_generation_stage(cpi)) av1_init_cdef_worker(cpi);
 
 #if !CONFIG_REALTIME_ONLY
-  const int use_restoration = cm->seq_params->enable_restoration &&
-                              !cm->features.all_lossless &&
-                              !cm->tiles.large_scale;
-  if (use_restoration) {
+  if (is_restoration_used(cm)) {
     const int frame_width = cm->superres_upscaled_width;
     const int frame_height = cm->superres_upscaled_height;
     set_restoration_unit_size(frame_width, frame_height,
@@ -2209,9 +2211,7 @@
       !cm->features.coded_lossless && !cm->tiles.large_scale;
   const int use_cdef = cm->seq_params->enable_cdef &&
                        !cm->features.coded_lossless && !cm->tiles.large_scale;
-  const int use_restoration = cm->seq_params->enable_restoration &&
-                              !cm->features.all_lossless &&
-                              !cm->tiles.large_scale;
+  const int use_restoration = is_restoration_used(cm);
   const int cur_width = cm->cur_frame->width;
   const int cur_height = cm->cur_frame->height;
   const int cur_width_mib = cm->mi_params.mi_cols * MI_SIZE;
@@ -2261,12 +2261,7 @@
   AV1_COMMON *const cm = &cpi->common;
   const QuantizationCfg *const q_cfg = &cpi->oxcf.q_cfg;
   SVC *const svc = &cpi->svc;
-  ResizePendingParams *const resize_pending_params =
-      &cpi->resize_pending_params;
-  const int resize_pending =
-      (resize_pending_params->width && resize_pending_params->height &&
-       (cpi->common.width != resize_pending_params->width ||
-        cpi->common.height != resize_pending_params->height));
+  const int resize_pending = is_frame_resize_pending(cpi);
 
   int top_index = 0, bottom_index = 0, q = 0;
   YV12_BUFFER_CONFIG *unscaled = cpi->unscaled_source;
@@ -2309,7 +2304,9 @@
   if (sf->part_sf.partition_search_type == VAR_BASED_PARTITION)
     variance_partition_alloc(cpi);
 
-  if (cm->current_frame.frame_type == KEY_FRAME) copy_frame_prob_info(cpi);
+  if (cm->current_frame.frame_type == KEY_FRAME ||
+      ((sf->inter_sf.extra_prune_warped && cpi->refresh_frame.golden_frame)))
+    copy_frame_prob_info(cpi);
 
 #if CONFIG_COLLECT_COMPONENT_TIMING
   printf("\n Encoding a frame:");
@@ -2379,7 +2376,7 @@
   }
 
   av1_set_quantizer(cm, q_cfg->qm_minlevel, q_cfg->qm_maxlevel, q,
-                    q_cfg->enable_chroma_deltaq);
+                    q_cfg->enable_chroma_deltaq, q_cfg->enable_hdr_deltaq);
   av1_set_speed_features_qindex_dependent(cpi, cpi->oxcf.speed);
   if ((q_cfg->deltaq_mode != NO_DELTA_Q) || q_cfg->enable_chroma_deltaq)
     av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params,
@@ -2394,7 +2391,7 @@
       cpi->rc.high_source_sad) {
     if (av1_encodedframe_overshoot_cbr(cpi, &q)) {
       av1_set_quantizer(cm, q_cfg->qm_minlevel, q_cfg->qm_maxlevel, q,
-                        q_cfg->enable_chroma_deltaq);
+                        q_cfg->enable_chroma_deltaq, q_cfg->enable_hdr_deltaq);
       av1_set_speed_features_qindex_dependent(cpi, cpi->oxcf.speed);
       if (q_cfg->deltaq_mode != NO_DELTA_Q || q_cfg->enable_chroma_deltaq)
         av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params,
@@ -2438,6 +2435,13 @@
   if (q_cfg->aq_mode == CYCLIC_REFRESH_AQ && !frame_is_intra_only(cm))
     av1_cyclic_refresh_postencode(cpi);
 
+  // Adjust the refresh of the golden (longer-term) reference based on QP
+  // selected for this frame. This is for CBR with 1 layer/non-svc RTC mode.
+  if (!frame_is_intra_only(cm) && cpi->oxcf.rc_cfg.mode == AOM_CBR &&
+      cpi->oxcf.mode == REALTIME && svc->number_spatial_layers == 1 &&
+      svc->number_temporal_layers == 1)
+    av1_adjust_gf_refresh_qp_one_pass_rt(cpi);
+
 #if CONFIG_COLLECT_COMPONENT_TIMING
   end_timing(cpi, av1_encode_frame_time);
 #endif
@@ -2613,7 +2617,7 @@
     }
 #endif
     av1_set_quantizer(cm, q_cfg->qm_minlevel, q_cfg->qm_maxlevel, q,
-                      q_cfg->enable_chroma_deltaq);
+                      q_cfg->enable_chroma_deltaq, q_cfg->enable_hdr_deltaq);
     av1_set_speed_features_qindex_dependent(cpi, oxcf->speed);
 
     if (q_cfg->deltaq_mode != NO_DELTA_Q || q_cfg->enable_chroma_deltaq)
@@ -2750,7 +2754,7 @@
 
 #if 0
       vbr_rc_info_log(&cpi->vbr_rc_info, cpi->gf_frame_index,
-                      cpi->ppi->gf_group.size, &cpi->ppi->gf_group.update_type);
+                      cpi->ppi->gf_group.size, cpi->ppi->gf_group.update_type);
 #endif
 
 #endif
@@ -3139,6 +3143,33 @@
   return err;
 }
 
+// Conditions to disable cdf_update mode in selective mode for real-time.
+// Handle case for layers, scene change, and resizing.
+static int selective_disable_cdf_rtc(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  // For single layer.
+  if (cpi->svc.number_spatial_layers == 1 &&
+      cpi->svc.number_temporal_layers == 1) {
+    // Don't disable on intra_only, scene change (high_source_sad = 1),
+    // or resized frame. Don't disable for some consecutive frames after
+    // key, or for some consecutive frames before the golden_refresh
+    // (cpi->rc.frames_till_gf_update_due < 6).
+    // To avoid quality loss for now, force enable at every x frames.
+    if (frame_is_intra_only(cm) || is_frame_resize_pending(cpi) ||
+        rc->high_source_sad || rc->frames_since_key < 10 ||
+        rc->frames_till_gf_update_due < 5 ||
+        cm->current_frame.frame_number % 10 == 0)
+      return 0;
+    else
+      return 1;
+  } else if (cpi->svc.number_temporal_layers > 1) {
+    // Disable only on top temporal enhancement layer for now.
+    return cpi->svc.temporal_layer_id == cpi->svc.number_temporal_layers - 1;
+  }
+  return 1;
+}
+
 #if !CONFIG_REALTIME_ONLY
 static void subtract_stats(FIRSTPASS_STATS *section,
                            const FIRSTPASS_STATS *frame) {
@@ -3356,7 +3387,7 @@
   // Never drop on key frame.
   if (has_no_stats_stage(cpi) && oxcf->rc_cfg.mode == AOM_CBR &&
       current_frame->frame_type != KEY_FRAME) {
-    if (av1_rc_drop_frame(cpi)) {
+    if (cpi->oxcf.rc_cfg.target_bandwidth == 0 || av1_rc_drop_frame(cpi)) {
       av1_setup_frame_size(cpi);
       av1_rc_postencode_update_drop_frame(cpi);
       release_scaled_references(cpi);
@@ -3431,21 +3462,12 @@
     case 2:
       // Strategically determine at which frames to do CDF update.
       // Currently only enable CDF update for all-intra and no-show frames(1.5%
-      // compression loss).
-      // TODO(huisu@google.com): design schemes for various trade-offs between
-      // compression quality and decoding speed.
+      // compression loss) for good qualiy or allintra mode.
       if (oxcf->mode == GOOD || oxcf->mode == ALLINTRA) {
         features->disable_cdf_update =
             (frame_is_intra_only(cm) || !cm->show_frame) ? 0 : 1;
       } else {
-        if (cpi->svc.number_spatial_layers == 1 &&
-            cpi->svc.number_temporal_layers == 1)
-          features->disable_cdf_update =
-              !((cm->current_frame.frame_number % 2) == 0);
-        else if (cpi->svc.number_temporal_layers > 1)
-          // Disable only on top temporal enhancement layer for now.
-          features->disable_cdf_update = (cpi->svc.temporal_layer_id ==
-                                          cpi->svc.number_temporal_layers - 1);
+        features->disable_cdf_update = selective_disable_cdf_rtc(cpi);
       }
       break;
   }
@@ -4552,12 +4574,18 @@
       AV1_COMP_DATA *cur_cpi_data =
           &ppi->parallel_frames_data[parallel_frame_count - 1];
       cur_cpi->gf_frame_index = i;
+      cur_cpi->framerate = first_cpi->framerate;
       cur_cpi->common.current_frame.frame_number = cur_frame_num;
       cur_cpi->frame_index_set.show_frame_count = show_frame_count;
       cur_cpi->rc.frames_since_key = frames_since_key;
       cur_cpi->rc.frames_to_key = frames_to_key;
       cur_cpi->rc.frames_to_fwd_kf = frames_to_fwd_kf;
       cur_cpi->rc.active_worst_quality = first_cpi->rc.active_worst_quality;
+      cur_cpi->rc.avg_frame_bandwidth = first_cpi->rc.avg_frame_bandwidth;
+      cur_cpi->rc.max_frame_bandwidth = first_cpi->rc.max_frame_bandwidth;
+      cur_cpi->rc.min_frame_bandwidth = first_cpi->rc.min_frame_bandwidth;
+      cur_cpi->rc.intervals_till_gf_calculate_due =
+          first_cpi->rc.intervals_till_gf_calculate_due;
       cur_cpi->mv_search_params.max_mv_magnitude =
           first_cpi->mv_search_params.max_mv_magnitude;
       if (gf_group->update_type[cur_cpi->gf_frame_index] == INTNL_ARF_UPDATE) {

diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index 1bb2ace..26a66db 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h

@@ -87,6 +87,14 @@
 // Lookahead index threshold to enable temporal filtering for second arf.
 #define TF_LOOKAHEAD_IDX_THR 7
 
+#define HDR_QP_LEVELS 10
+#define CHROMA_CB_QP_SCALE 1.04
+#define CHROMA_CR_QP_SCALE 1.04
+#define CHROMA_QP_SCALE -0.46
+#define CHROMA_QP_OFFSET 9.26
+#define QP_SCALE_FACTOR 2.0
+#define DISABLE_HDR_LUMA_DELTAQ 1
+
 // Rational number with an int64 numerator
 // This structure holds a fractional value
 typedef struct aom_rational64 {
@@ -159,6 +167,7 @@
   DELTA_Q_PERCEPTUAL = 2,     // Modulation to improve video perceptual quality
   DELTA_Q_PERCEPTUAL_AI = 3,  // Perceptual quality opt for all intra mode
   DELTA_Q_USER_RATING_BASED = 4,  // User rating based delta q mode
+  DELTA_Q_HDR = 5,    // QP adjustment based on HDR block pixel average
   DELTA_Q_MODE_COUNT  // This should always be the last member of the enum
 } UENUM1BYTE(DELTAQ_MODE);
 
@@ -752,6 +761,8 @@
   DELTAQ_MODE deltaq_mode;
   // Indicates if delta quantization should be enabled in chroma planes.
   bool enable_chroma_deltaq;
+  // Indicates if delta quantization should be enabled for hdr video
+  bool enable_hdr_deltaq;
   // Indicates if encoding with quantization matrices should be enabled.
   bool using_qm;
 } QuantizationCfg;
@@ -1933,7 +1944,7 @@
   bool golden_frame;  /*!< Refresh flag for golden frame */
   bool bwd_ref_frame; /*!< Refresh flag for bwd-ref frame */
   bool alt_ref_frame; /*!< Refresh flag for alt-ref frame */
-} RefreshFrameFlagsInfo;
+} RefreshFrameInfo;
 
 /*!
  * \brief Desired dimensions for an externally triggered resize.
@@ -2139,7 +2150,6 @@
   int16_t rec_pix_max;
   int64_t distortion;
   int64_t satd;
-  double alpha;
   double max_scale;
 } WeberStats;
 
@@ -2682,7 +2692,7 @@
   /*!
    * Refresh frame flags for golden, bwd-ref and alt-ref frames.
    */
-  RefreshFrameFlagsInfo refresh_frame;
+  RefreshFrameInfo refresh_frame;
 
   /*!
    * Flags signalled by the external interface at frame level.
@@ -2904,6 +2914,13 @@
    * Retain condition for fast_extra_bits calculation.
    */
   int do_update_vbr_bits_off_target_fast;
+
+  /*!
+   * Updated framerate for the current parallel frame.
+   * cpi->framerate is updated with new_framerate during
+   * post encode updates for parallel frames.
+   */
+  double new_framerate;
 #endif
   /*!
    * Multi-threading parameters.
@@ -3238,7 +3255,7 @@
    *  Flags which determine which reference buffers are refreshed by this
    *  frame.
    */
-  RefreshFrameFlagsInfo refresh_frame;
+  RefreshFrameInfo refresh_frame;
 
   /*!
    *  Speed level to use for this frame: Bigger number means faster.
@@ -3439,7 +3456,7 @@
       // Once the keyframe is coded, the slots in ref_frame_map will all
       // point to the same frame. In that case, all subsequent pointers
       // matching the current are considered "free" slots. This will find
-      // the next occurance of the current pointer if ref_count indicates
+      // the next occurrence of the current pointer if ref_count indicates
       // there are multiple instances of it and mark it as free.
       for (int idx2 = map_idx + 1; idx2 < REF_FRAMES; ++idx2) {
         const RefCntBuffer *const buf2 = cpi->common.ref_frame_map[idx2];
@@ -3453,30 +3470,6 @@
     ref_frame_map_pairs[map_idx].pyr_level = buf->pyramid_level;
   }
 }
-
-static AOM_INLINE void calc_frame_data_update_flag(
-    GF_GROUP *const gf_group, int gf_frame_index,
-    bool *const do_frame_data_update) {
-  *do_frame_data_update = true;
-  // Set the flag to false for all frames in a given parallel encode set except
-  // the last frame in the set with frame_parallel_level = 2.
-  if (gf_group->frame_parallel_level[gf_frame_index] == 1) {
-    *do_frame_data_update = false;
-  } else if (gf_group->frame_parallel_level[gf_frame_index] == 2) {
-    // Check if this is the last frame in the set with frame_parallel_level = 2.
-    for (int i = gf_frame_index + 1; i < gf_group->size; i++) {
-      if ((gf_group->frame_parallel_level[i] == 0 &&
-           (gf_group->update_type[i] == ARF_UPDATE ||
-            gf_group->update_type[i] == INTNL_ARF_UPDATE)) ||
-          gf_group->frame_parallel_level[i] == 1) {
-        break;
-      } else if (gf_group->frame_parallel_level[i] == 2) {
-        *do_frame_data_update = false;
-        break;
-      }
-    }
-  }
-}
 #endif  // CONFIG_FRAME_PARALLEL_ENCODE
 
 // TODO(jingning): Move these functions as primitive members for the new cpi
@@ -3842,6 +3835,20 @@
          cm->show_frame;
 }
 
+static INLINE int is_frame_resize_pending(AV1_COMP *const cpi) {
+  ResizePendingParams *const resize_pending_params =
+      &cpi->resize_pending_params;
+  return (resize_pending_params->width && resize_pending_params->height &&
+          (cpi->common.width != resize_pending_params->width ||
+           cpi->common.height != resize_pending_params->height));
+}
+
+// Check if loop restoration filter is used.
+static INLINE int is_restoration_used(const AV1_COMMON *const cm) {
+  return cm->seq_params->enable_restoration && !cm->features.all_lossless &&
+         !cm->tiles.large_scale;
+}
+
 #if CONFIG_AV1_TEMPORAL_DENOISING
 static INLINE int denoise_svc(const struct AV1_COMP *const cpi) {
   return (!cpi->ppi->use_svc ||

diff --git a/av1/encoder/encoder_alloc.h b/av1/encoder/encoder_alloc.h
index 01719c5..22406d9 100644
--- a/av1/encoder/encoder_alloc.h
+++ b/av1/encoder/encoder_alloc.h

@@ -77,8 +77,11 @@
     aom_free(cpi->td.mb.mv_costs);
     cpi->td.mb.mv_costs = NULL;
   }
-  CHECK_MEM_ERROR(cm, cpi->td.mb.mv_costs,
-                  (MvCosts *)aom_calloc(1, sizeof(MvCosts)));
+  // Avoid the memory allocation of 'mv_costs' for allintra encoding mode.
+  if (cpi->oxcf.kf_cfg.key_freq_max != 0) {
+    CHECK_MEM_ERROR(cm, cpi->td.mb.mv_costs,
+                    (MvCosts *)aom_calloc(1, sizeof(MvCosts)));
+  }
 
   if (cpi->td.mb.dv_costs) {
     aom_free(cpi->td.mb.dv_costs);
@@ -354,19 +357,10 @@
   AV1_COMMON *const cm = &cpi->common;
   const SequenceHeader *const seq_params = cm->seq_params;
   const int byte_alignment = cm->features.byte_alignment;
-  if (aom_realloc_frame_buffer(
-          &cpi->last_frame_uf, cm->width, cm->height, seq_params->subsampling_x,
-          seq_params->subsampling_y, seq_params->use_highbitdepth,
-          cpi->oxcf.border_in_pixels, byte_alignment, NULL, NULL, NULL, 0))
-    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
-                       "Failed to allocate last frame buffer");
 
   // The frame buffer trial_frame_rst is used during loop restoration filter
   // search. Hence it is allocated only when loop restoration is used.
-  const int use_restoration = cm->seq_params->enable_restoration &&
-                              !cm->features.all_lossless &&
-                              !cm->tiles.large_scale;
-  if (use_restoration) {
+  if (is_restoration_used(cm)) {
     if (aom_realloc_frame_buffer(
             &cpi->trial_frame_rst, cm->superres_upscaled_width,
             cm->superres_upscaled_height, seq_params->subsampling_x,

diff --git a/av1/encoder/encoder_utils.c b/av1/encoder/encoder_utils.c
index 633125d..66cd272 100644
--- a/av1/encoder/encoder_utils.c
+++ b/av1/encoder/encoder_utils.c

@@ -1009,7 +1009,7 @@
     set_encoding_params_for_screen_content(cpi, pass);
     av1_set_quantizer(cm, q_cfg->qm_minlevel, q_cfg->qm_maxlevel,
                       q_for_screen_content_quick_run,
-                      q_cfg->enable_chroma_deltaq);
+                      q_cfg->enable_chroma_deltaq, q_cfg->enable_hdr_deltaq);
     av1_set_speed_features_qindex_dependent(cpi, oxcf->speed);
     if (q_cfg->deltaq_mode != NO_DELTA_Q || q_cfg->enable_chroma_deltaq)
       av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params,

diff --git a/av1/encoder/ethread.c b/av1/encoder/ethread.c
index 25ecee9..98638bc 100644
--- a/av1/encoder/ethread.c
+++ b/av1/encoder/ethread.c

@@ -638,21 +638,23 @@
     }
 
 #if !CONFIG_REALTIME_ONLY
-    // Initialize loop restoration MT object.
-    AV1LrSync *lr_sync = &mt_info->lr_row_sync;
-    int rst_unit_size;
-    if (cm->width * cm->height > 352 * 288)
-      rst_unit_size = RESTORATION_UNITSIZE_MAX;
-    else
-      rst_unit_size = (RESTORATION_UNITSIZE_MAX >> 1);
-    int num_rows_lr = av1_lr_count_units_in_tile(rst_unit_size, cm->height);
-    int num_lr_workers = av1_get_num_mod_workers_for_alloc(p_mt_info, MOD_LR);
-    if (!lr_sync->sync_range || num_rows_lr > lr_sync->rows ||
-        num_lr_workers > lr_sync->num_workers ||
-        MAX_MB_PLANE > lr_sync->num_planes) {
-      av1_loop_restoration_dealloc(lr_sync, num_lr_workers);
-      av1_loop_restoration_alloc(lr_sync, cm, num_lr_workers, num_rows_lr,
-                                 MAX_MB_PLANE, cm->width);
+    if (is_restoration_used(cm)) {
+      // Initialize loop restoration MT object.
+      AV1LrSync *lr_sync = &mt_info->lr_row_sync;
+      int rst_unit_size;
+      if (cm->width * cm->height > 352 * 288)
+        rst_unit_size = RESTORATION_UNITSIZE_MAX;
+      else
+        rst_unit_size = (RESTORATION_UNITSIZE_MAX >> 1);
+      int num_rows_lr = av1_lr_count_units_in_tile(rst_unit_size, cm->height);
+      int num_lr_workers = av1_get_num_mod_workers_for_alloc(p_mt_info, MOD_LR);
+      if (!lr_sync->sync_range || num_rows_lr > lr_sync->rows ||
+          num_lr_workers > lr_sync->num_workers ||
+          MAX_MB_PLANE > lr_sync->num_planes) {
+        av1_loop_restoration_dealloc(lr_sync, num_lr_workers);
+        av1_loop_restoration_alloc(lr_sync, cm, num_lr_workers, num_rows_lr,
+                                   MAX_MB_PLANE, cm->width);
+      }
     }
 #endif
 
@@ -742,11 +744,17 @@
         alloc_compound_type_rd_buffers(&ppi->error,
                                        &thread_data->td->comp_rd_buffer);
 
-        for (int j = 0; j < 2; ++j) {
-          AOM_CHECK_MEM_ERROR(
-              &ppi->error, thread_data->td->tmp_pred_bufs[j],
-              aom_memalign(32, 2 * MAX_MB_PLANE * MAX_SB_SQUARE *
-                                   sizeof(*thread_data->td->tmp_pred_bufs[j])));
+        // The buffers 'tmp_pred_bufs[]' are used in inter frames to store
+        // temporary prediction results. Hence, the memory allocation is avoided
+        // for allintra encode.
+        if (ppi->cpi->oxcf.kf_cfg.key_freq_max != 0) {
+          for (int j = 0; j < 2; ++j) {
+            AOM_CHECK_MEM_ERROR(
+                &ppi->error, thread_data->td->tmp_pred_bufs[j],
+                aom_memalign(32,
+                             2 * MAX_MB_PLANE * MAX_SB_SQUARE *
+                                 sizeof(*thread_data->td->tmp_pred_bufs[j])));
+          }
         }
 
         const SPEED_FEATURES *sf = &ppi->cpi->sf;
@@ -828,7 +836,7 @@
 #if CONFIG_FRAME_PARALLEL_ENCODE
 // This function returns 1 if frame parallel encode is supported for
 // the current configuration. Returns 0 otherwise.
-static AOM_INLINE int is_fp_config(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf) {
+static AOM_INLINE int is_fpmt_config(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf) {
   // FPMT is enabled for AOM_Q and AOM_VBR.
   // TODO(Mufaddal, Aasaipriya): Test and enable multi-tile and resize config.
   if (oxcf->rc_cfg.mode == AOM_CBR || oxcf->rc_cfg.mode == AOM_CQ) {
@@ -837,7 +845,7 @@
   if (ppi->use_svc) {
     return 0;
   }
-  if (oxcf->tile_cfg.tile_columns > 0 || oxcf->tile_cfg.tile_rows > 0) {
+  if (oxcf->tile_cfg.enable_large_scale_tile) {
     return 0;
   }
   if (oxcf->dec_model_cfg.timing_info_present) {
@@ -867,7 +875,7 @@
 // based on the number of max_enc_workers.
 int av1_compute_num_fp_contexts(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf) {
   ppi->p_mt_info.num_mod_workers[MOD_FRAME_ENC] = 0;
-  if (!is_fp_config(ppi, oxcf)) {
+  if (!is_fpmt_config(ppi, oxcf)) {
     return 1;
   }
   int max_num_enc_workers =
@@ -919,7 +927,8 @@
         &p_mt_info->workers[i];
     AV1_COMP *cur_cpi = ppi->parallel_cpi[frame_idx];
     MultiThreadInfo *mt_info = &cur_cpi->mt_info;
-    const int num_planes = av1_num_planes(&cur_cpi->common);
+    AV1_COMMON *const cm = &cur_cpi->common;
+    const int num_planes = av1_num_planes(cm);
 
     // Assign start of level 2 worker pool
     mt_info->workers = &p_mt_info->workers[i];
@@ -941,24 +950,25 @@
             mt_info->cdef_worker->colbuf[plane];
     }
 #if !CONFIG_REALTIME_ONLY
-    // Back up the original LR buffers before update.
-    int idx = i + mt_info->num_workers - 1;
-    mt_info->restore_state_buf.rst_tmpbuf =
-        mt_info->lr_row_sync.lrworkerdata[idx].rst_tmpbuf;
-    mt_info->restore_state_buf.rlbs =
-        mt_info->lr_row_sync.lrworkerdata[idx].rlbs;
+    if (is_restoration_used(cm)) {
+      // Back up the original LR buffers before update.
+      int idx = i + mt_info->num_workers - 1;
+      mt_info->restore_state_buf.rst_tmpbuf =
+          mt_info->lr_row_sync.lrworkerdata[idx].rst_tmpbuf;
+      mt_info->restore_state_buf.rlbs =
+          mt_info->lr_row_sync.lrworkerdata[idx].rlbs;
 
-    // Update LR buffers.
-    mt_info->lr_row_sync.lrworkerdata[idx].rst_tmpbuf =
-        cur_cpi->common.rst_tmpbuf;
-    mt_info->lr_row_sync.lrworkerdata[idx].rlbs = cur_cpi->common.rlbs;
+      // Update LR buffers.
+      mt_info->lr_row_sync.lrworkerdata[idx].rst_tmpbuf = cm->rst_tmpbuf;
+      mt_info->lr_row_sync.lrworkerdata[idx].rlbs = cm->rlbs;
+    }
 #endif
 
     // At this stage, the thread specific CDEF buffers for the current frame's
     // 'common' and 'cdef_sync' only need to be allocated. 'cdef_worker' has
     // already been allocated across parallel frames.
-    av1_alloc_cdef_buffers(&cur_cpi->common, &p_mt_info->cdef_worker,
-                           &mt_info->cdef_sync, p_mt_info->num_workers, 0);
+    av1_alloc_cdef_buffers(cm, &p_mt_info->cdef_worker, &mt_info->cdef_sync,
+                           p_mt_info->num_workers, 0);
 
     frame_worker->hook = hook;
     frame_worker->data1 = cur_cpi;
@@ -1021,7 +1031,8 @@
   while (i < num_workers) {
     AV1_COMP *cur_cpi = ppi->parallel_cpi[frame_idx];
     MultiThreadInfo *mt_info = &cur_cpi->mt_info;
-    const int num_planes = av1_num_planes(&cur_cpi->common);
+    const AV1_COMMON *const cm = &cur_cpi->common;
+    const int num_planes = av1_num_planes(cm);
 
     // Restore the original cdef_worker pointers.
     if (ppi->p_mt_info.cdef_worker != NULL) {
@@ -1031,12 +1042,14 @@
             mt_info->restore_state_buf.cdef_colbuf[plane];
     }
 #if !CONFIG_REALTIME_ONLY
-    // Restore the original LR buffers.
-    int idx = i + mt_info->num_workers - 1;
-    mt_info->lr_row_sync.lrworkerdata[idx].rst_tmpbuf =
-        mt_info->restore_state_buf.rst_tmpbuf;
-    mt_info->lr_row_sync.lrworkerdata[idx].rlbs =
-        mt_info->restore_state_buf.rlbs;
+    if (is_restoration_used(cm)) {
+      // Restore the original LR buffers.
+      int idx = i + mt_info->num_workers - 1;
+      mt_info->lr_row_sync.lrworkerdata[idx].rst_tmpbuf =
+          mt_info->restore_state_buf.rst_tmpbuf;
+      mt_info->lr_row_sync.lrworkerdata[idx].rlbs =
+          mt_info->restore_state_buf.rlbs;
+    }
 #endif
 
     frame_idx++;

diff --git a/av1/encoder/ethread.h b/av1/encoder/ethread.h
index bf73a60..bd75664 100644
--- a/av1/encoder/ethread.h
+++ b/av1/encoder/ethread.h

@@ -100,8 +100,6 @@
 
 void av1_init_tile_thread_data(AV1_PRIMARY *ppi, int is_first_pass);
 
-int av1_compute_num_fp_contexts(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf);
-
 void av1_cdef_mse_calc_frame_mt(AV1_COMMON *cm, MultiThreadInfo *mt_info,
                                 CdefSearchCtx *cdef_search_ctx);
 
@@ -117,6 +115,8 @@
 int av1_compute_num_enc_workers(AV1_COMP *cpi, int max_workers);
 
 #if CONFIG_FRAME_PARALLEL_ENCODE
+int av1_compute_num_fp_contexts(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf);
+
 int av1_compress_parallel_frames(AV1_PRIMARY *const ppi,
                                  AV1_COMP_DATA *const first_cpi_data);
 #endif

diff --git a/av1/encoder/firstpass.c b/av1/encoder/firstpass.c
index 28d527a..af1e8c1 100644
--- a/av1/encoder/firstpass.c
+++ b/av1/encoder/firstpass.c

@@ -1271,9 +1271,9 @@
   // Do not use periodic key frames.
   cpi->rc.frames_to_key = INT_MAX;
 
-  av1_set_quantizer(cm, cpi->oxcf.q_cfg.qm_minlevel,
-                    cpi->oxcf.q_cfg.qm_maxlevel, qindex,
-                    cpi->oxcf.q_cfg.enable_chroma_deltaq);
+  av1_set_quantizer(
+      cm, cpi->oxcf.q_cfg.qm_minlevel, cpi->oxcf.q_cfg.qm_maxlevel, qindex,
+      cpi->oxcf.q_cfg.enable_chroma_deltaq, cpi->oxcf.q_cfg.enable_hdr_deltaq);
 
   av1_setup_block_planes(xd, seq_params->subsampling_x,
                          seq_params->subsampling_y, num_planes);

diff --git a/av1/encoder/gop_structure.c b/av1/encoder/gop_structure.c
index c9198b4..2e53c70 100644
--- a/av1/encoder/gop_structure.c
+++ b/av1/encoder/gop_structure.c

@@ -61,10 +61,12 @@
 #if CONFIG_FRAME_PARALLEL_ENCODE_2
 // Sets the GF_GROUP params for LF_UPDATE frames.
 static AOM_INLINE void set_params_for_leaf_frames(
+    const TWO_PASS *twopass, const TWO_PASS_FRAME *twopass_frame,
+    const PRIMARY_RATE_CONTROL *p_rc, FRAME_INFO *frame_info,
     GF_GROUP *const gf_group, int *cur_frame_idx, int *frame_ind,
     int *parallel_frame_count, int max_parallel_frames,
     int do_frame_parallel_encode, int *first_frame_index, int *cur_disp_index,
-    int layer_depth) {
+    int layer_depth, int start, int end) {
   gf_group->update_type[*frame_ind] = LF_UPDATE;
   gf_group->arf_src_offset[*frame_ind] = 0;
   gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx;
@@ -73,6 +75,9 @@
   gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE;
   gf_group->max_layer_depth = AOMMAX(gf_group->max_layer_depth, layer_depth);
   gf_group->display_idx[*frame_ind] = (*cur_disp_index);
+  gf_group->arf_boost[*frame_ind] =
+      av1_calc_arf_boost(twopass, twopass_frame, p_rc, frame_info, start,
+                         end - start, 0, NULL, NULL, 0);
   ++(*cur_disp_index);
 
   // Set the level of parallelism for the LF_UPDATE frame.
@@ -108,10 +113,13 @@
 
 // Sets the GF_GROUP params for INTNL_ARF_UPDATE frames.
 static AOM_INLINE void set_params_for_internal_arfs(
+    const TWO_PASS *twopass, const TWO_PASS_FRAME *twopass_frame,
+    const PRIMARY_RATE_CONTROL *p_rc, FRAME_INFO *frame_info,
     GF_GROUP *const gf_group, int *cur_frame_idx, int *frame_ind,
     int *parallel_frame_count, int max_parallel_frames,
     int do_frame_parallel_encode, int *first_frame_index, int depth_thr,
-    int *cur_disp_idx, int layer_depth, int arf_src_offset) {
+    int *cur_disp_idx, int layer_depth, int arf_src_offset, int offset,
+    int f_frames, int b_frames) {
   gf_group->update_type[*frame_ind] = INTNL_ARF_UPDATE;
   gf_group->arf_src_offset[*frame_ind] = arf_src_offset;
   gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx;
@@ -120,6 +128,9 @@
   gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE;
   gf_group->display_idx[*frame_ind] =
       (*cur_disp_idx) + gf_group->arf_src_offset[*frame_ind];
+  gf_group->arf_boost[*frame_ind] =
+      av1_calc_arf_boost(twopass, twopass_frame, p_rc, frame_info, offset,
+                         f_frames, b_frames, NULL, NULL, 0);
 
   if (do_frame_parallel_encode) {
     if (depth_thr != INT_MAX) {
@@ -162,12 +173,12 @@
 
 // Set parameters for frames between 'start' and 'end' (excluding both).
 static void set_multi_layer_params_for_fp(
-    const TWO_PASS *twopass, GF_GROUP *const gf_group,
-    const PRIMARY_RATE_CONTROL *p_rc, RATE_CONTROL *rc, FRAME_INFO *frame_info,
-    int start, int end, int *cur_frame_idx, int *frame_ind,
-    int *parallel_frame_count, int max_parallel_frames,
-    int do_frame_parallel_encode, int *first_frame_index, int depth_thr,
-    int *cur_disp_idx, int layer_depth) {
+    const TWO_PASS *twopass, const TWO_PASS_FRAME *twopass_frame,
+    GF_GROUP *const gf_group, const PRIMARY_RATE_CONTROL *p_rc,
+    RATE_CONTROL *rc, FRAME_INFO *frame_info, int start, int end,
+    int *cur_frame_idx, int *frame_ind, int *parallel_frame_count,
+    int max_parallel_frames, int do_frame_parallel_encode,
+    int *first_frame_index, int depth_thr, int *cur_disp_idx, int layer_depth) {
   const int num_frames_to_process = end - start;
 
   // Either we are at the last level of the pyramid, or we don't have enough
@@ -176,10 +187,11 @@
       num_frames_to_process < 3) {
     // Leaf nodes.
     while (start < end) {
-      set_params_for_leaf_frames(gf_group, cur_frame_idx, frame_ind,
+      set_params_for_leaf_frames(twopass, twopass_frame, p_rc, frame_info,
+                                 gf_group, cur_frame_idx, frame_ind,
                                  parallel_frame_count, max_parallel_frames,
                                  do_frame_parallel_encode, first_frame_index,
-                                 cur_disp_idx, layer_depth);
+                                 cur_disp_idx, layer_depth, start, end);
       ++start;
     }
   } else {
@@ -188,9 +200,10 @@
     // Internal ARF.
     int arf_src_offset = m - start;
     set_params_for_internal_arfs(
-        gf_group, cur_frame_idx, frame_ind, parallel_frame_count,
-        max_parallel_frames, do_frame_parallel_encode, first_frame_index,
-        INT_MAX, cur_disp_idx, layer_depth, arf_src_offset);
+        twopass, twopass_frame, p_rc, frame_info, gf_group, cur_frame_idx,
+        frame_ind, parallel_frame_count, max_parallel_frames,
+        do_frame_parallel_encode, first_frame_index, INT_MAX, cur_disp_idx,
+        layer_depth, arf_src_offset, m, end - m, m - start);
 
     // If encode reordering is enabled, configure the multi-layers accordingly
     // and return. For e.g., the encode order for gf-interval 16 after
@@ -200,13 +213,19 @@
       int m1 = (m + start - 1) / 2;
       int m2 = (m + 1 + end) / 2;
       int arf_src_offsets[2] = { m1 - start, m2 - start };
+      // Parameters to compute arf_boost.
+      int offset[2] = { m1, m2 };
+      int f_frames[2] = { m - m1, end - m2 };
+      int b_frames[2] = { m1 - start, m2 - (m + 1) };
 
       // Set GF_GROUP params for INTNL_ARF_UPDATE frames which are reordered.
       for (int i = 0; i < 2; i++) {
         set_params_for_internal_arfs(
-            gf_group, cur_frame_idx, frame_ind, parallel_frame_count,
-            max_parallel_frames, do_frame_parallel_encode, first_frame_index,
-            depth_thr, cur_disp_idx, layer_depth + 1, arf_src_offsets[i]);
+            twopass, twopass_frame, p_rc, frame_info, gf_group, cur_frame_idx,
+            frame_ind, parallel_frame_count, max_parallel_frames,
+            do_frame_parallel_encode, first_frame_index, depth_thr,
+            cur_disp_idx, layer_depth + 1, arf_src_offsets[i], offset[i],
+            f_frames[i], b_frames[i]);
       }
 
       // Initialize the start and end indices to configure LF_UPDATE frames.
@@ -219,10 +238,10 @@
       // frames after reordering.
       for (int i = 0; i < 4; i++) {
         set_multi_layer_params_for_fp(
-            twopass, gf_group, p_rc, rc, frame_info, start_idx[i], end_idx[i],
-            cur_frame_idx, frame_ind, parallel_frame_count, max_parallel_frames,
-            do_frame_parallel_encode, first_frame_index, depth_thr,
-            cur_disp_idx, layer_depth + 2);
+            twopass, twopass_frame, gf_group, p_rc, rc, frame_info,
+            start_idx[i], end_idx[i], cur_frame_idx, frame_ind,
+            parallel_frame_count, max_parallel_frames, do_frame_parallel_encode,
+            first_frame_index, depth_thr, cur_disp_idx, layer_depth + 2);
         if (layer_depth_for_intnl_overlay[i] != INVALID_IDX)
           set_params_for_intnl_overlay_frames(
               gf_group, cur_frame_idx, frame_ind, first_frame_index,
@@ -232,11 +251,11 @@
     }
 
     // Frames displayed before this internal ARF.
-    set_multi_layer_params_for_fp(twopass, gf_group, p_rc, rc, frame_info,
-                                  start, m, cur_frame_idx, frame_ind,
-                                  parallel_frame_count, max_parallel_frames,
-                                  do_frame_parallel_encode, first_frame_index,
-                                  depth_thr, cur_disp_idx, layer_depth + 1);
+    set_multi_layer_params_for_fp(
+        twopass, twopass_frame, gf_group, p_rc, rc, frame_info, start, m,
+        cur_frame_idx, frame_ind, parallel_frame_count, max_parallel_frames,
+        do_frame_parallel_encode, first_frame_index, depth_thr, cur_disp_idx,
+        layer_depth + 1);
 
     // Overlay for internal ARF.
     set_params_for_intnl_overlay_frames(gf_group, cur_frame_idx, frame_ind,
@@ -244,11 +263,11 @@
                                         layer_depth);
 
     // Frames displayed after this internal ARF.
-    set_multi_layer_params_for_fp(twopass, gf_group, p_rc, rc, frame_info,
-                                  m + 1, end, cur_frame_idx, frame_ind,
-                                  parallel_frame_count, max_parallel_frames,
-                                  do_frame_parallel_encode, first_frame_index,
-                                  depth_thr, cur_disp_idx, layer_depth + 1);
+    set_multi_layer_params_for_fp(
+        twopass, twopass_frame, gf_group, p_rc, rc, frame_info, m + 1, end,
+        cur_frame_idx, frame_ind, parallel_frame_count, max_parallel_frames,
+        do_frame_parallel_encode, first_frame_index, depth_thr, cur_disp_idx,
+        layer_depth + 1);
   }
 }
 
@@ -356,6 +375,8 @@
 // Configures multi-layers of the GF_GROUP when consecutive encode of frames in
 // the same layer depth is enbaled.
 static AOM_INLINE void set_multi_layer_params_for_gf14(
+    const TWO_PASS *twopass, const TWO_PASS_FRAME *twopass_frame,
+    const PRIMARY_RATE_CONTROL *p_rc, FRAME_INFO *frame_info,
     GF_GROUP *const gf_group, FRAME_REORDER_INFO *arf_frame_stats,
     int *cur_frame_idx, int *frame_ind, int *count_arf_frames,
     int *doh_gf_index_map, int *parallel_frame_count, int *first_frame_index,
@@ -391,9 +412,13 @@
     // LF_UPDATE frame.
     if (doh_gf_index_map[i] == INVALID_IDX) {
       // LF_UPDATE frames.
-      set_params_for_leaf_frames(gf_group, cur_frame_idx, frame_ind,
-                                 parallel_frame_count, max_parallel_frames, 1,
-                                 first_frame_index, cur_disp_index, layer);
+      // TODO(Remya): Correct start and end parameters passed to
+      // set_params_for_leaf_frames() once encode reordering for gf-interval 14
+      // is enbaled for parallel encode of lower layer frames.
+      set_params_for_leaf_frames(
+          twopass, twopass_frame, p_rc, frame_info, gf_group, cur_frame_idx,
+          frame_ind, parallel_frame_count, max_parallel_frames, 1,
+          first_frame_index, cur_disp_index, layer, 0, 0);
     } else {
       // In order to obtain the layer depths of INTNL_OVERLAY_UPDATE frames, get
       // the gf index of corresponding INTNL_ARF_UPDATE frames.
@@ -529,7 +554,9 @@
 #if CONFIG_FRAME_PARALLEL_ENCODE
 #if CONFIG_FRAME_PARALLEL_ENCODE_2
   // Set the display order hint for the first frame in the GF_GROUP.
-  int cur_disp_index = cpi->common.current_frame.frame_number;
+  int cur_disp_index = (first_frame_update_type == KF_UPDATE)
+                           ? 0
+                           : cpi->common.current_frame.frame_number;
 #endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
 #endif  // CONFIG_FRAME_PARALLEL_ENCODE
 
@@ -686,9 +713,10 @@
       // 7-> 3-> 10-> 5-> 12-> 1-> 2-> 4-> 6-> 8-> 9-> 11-> 13.
       // TODO(Remya): Set GF_GROUP param 'arf_boost' for all frames.
       set_multi_layer_params_for_gf14(
-          gf_group, arf_frame_stats, &cur_frame_index, &frame_index,
-          &count_arf_frames, doh_gf_index_map, &parallel_frame_count,
-          &first_frame_index, &cur_disp_index, actual_gf_length, use_altref + 1,
+          twopass, &cpi->twopass_frame, p_rc, frame_info, gf_group,
+          arf_frame_stats, &cur_frame_index, &frame_index, &count_arf_frames,
+          doh_gf_index_map, &parallel_frame_count, &first_frame_index,
+          &cur_disp_index, actual_gf_length, use_altref + 1,
           cpi->ppi->num_fp_contexts);
 
       // Set gf_group->skip_frame_refresh.
@@ -711,12 +739,12 @@
       int depth_thr =
           (actual_gf_length == 16) ? 3 : (actual_gf_length == 32) ? 4 : INT_MAX;
 
-      // TODO(Remya): Set GF_GROUP param 'arf_boost' for all frames.
       set_multi_layer_params_for_fp(
-          twopass, gf_group, p_rc, rc, frame_info, cur_frame_index, gf_interval,
-          &cur_frame_index, &frame_index, &parallel_frame_count,
-          cpi->ppi->num_fp_contexts, do_frame_parallel_encode,
-          &first_frame_index, depth_thr, &cur_disp_index, use_altref + 1);
+          twopass, &cpi->twopass_frame, gf_group, p_rc, rc, frame_info,
+          cur_frame_index, gf_interval, &cur_frame_index, &frame_index,
+          &parallel_frame_count, cpi->ppi->num_fp_contexts,
+          do_frame_parallel_encode, &first_frame_index, depth_thr,
+          &cur_disp_index, use_altref + 1);
     }
     is_multi_layer_configured = 1;
   }

diff --git a/av1/encoder/interp_search.h b/av1/encoder/interp_search.h
index 902b699..e25d44d 100644
--- a/av1/encoder/interp_search.h
+++ b/av1/encoder/interp_search.h

@@ -149,6 +149,11 @@
    *    MACROBLOCK::pred_sse due to different interpolation filter used.
    */
   unsigned int best_single_sse_in_refs[REF_FRAMES];
+  /*!
+   * Holds the sse of best mode so far in the mode evaluation process. This is
+   * used in intermediate termination of NEWMV mode evaluation.
+   */
+  unsigned int best_pred_sse;
 } HandleInterModeArgs;
 
 /*!\cond */

diff --git a/av1/encoder/intra_mode_search.c b/av1/encoder/intra_mode_search.c
index 25a6e2c..4e86b77 100644
--- a/av1/encoder/intra_mode_search.c
+++ b/av1/encoder/intra_mode_search.c

@@ -1118,49 +1118,49 @@
                           x->plane[0].src.buf + i * x->plane[0].src.stride + j,
                           x->plane[0].src.stride,
                           CONVERT_TO_BYTEPTR(highbd_all_zeros), 0, &sse) /
-                          16);
+                          16.0);
         rec_var += log(
             1.0 + cpi->ppi->fn_ptr[BLOCK_4X4].vf(
                       xd->plane[0].dst.buf + i * xd->plane[0].dst.stride + j,
                       xd->plane[0].dst.stride,
                       CONVERT_TO_BYTEPTR(highbd_all_zeros), 0, &sse) /
-                      16);
+                      16.0);
       } else {
         src_var +=
             log(1.0 + cpi->ppi->fn_ptr[BLOCK_4X4].vf(
                           x->plane[0].src.buf + i * x->plane[0].src.stride + j,
                           x->plane[0].src.stride, all_zeros, 0, &sse) /
-                          16);
+                          16.0);
         rec_var += log(
             1.0 + cpi->ppi->fn_ptr[BLOCK_4X4].vf(
                       xd->plane[0].dst.buf + i * xd->plane[0].dst.stride + j,
                       xd->plane[0].dst.stride, all_zeros, 0, &sse) /
-                      16);
+                      16.0);
       }
     }
   }
   src_var /= (double)blocks;
   rec_var /= (double)blocks;
 
-  // Only take action when the spatial complexity is low
-  if ((rec_var < threshold) || (src_var < threshold)) {
-    // Dont allow 0 to prevent / 0 below.
-    src_var += 0.000001;
-    rec_var += 0.000001;
+  // Dont allow 0 to prevent / 0 below.
+  src_var += 0.000001;
+  rec_var += 0.000001;
 
-    // Heavier weigth if the reconstruction has lower variance.
-    if (src_var >= rec_var) {
-      var_diff = (src_var - rec_var) * 2;
-      variance_rd_factor = 1.0 + (var_diff / src_var);
-    } else {
-      var_diff = (rec_var - src_var) / 2;
-      variance_rd_factor = 1.0 + (var_diff / src_var);
+  if (src_var >= rec_var) {
+    var_diff = (src_var - rec_var);
+    if ((var_diff > 0.5) && (rec_var < threshold)) {
+      variance_rd_factor = 1.0 + ((var_diff * 2) / src_var);
     }
-
-    // Limit adjustment;
-    variance_rd_factor = AOMMIN(3.0, variance_rd_factor);
+  } else {
+    var_diff = (rec_var - src_var);
+    if ((var_diff > 0.5) && (src_var < threshold)) {
+      variance_rd_factor = 1.0 + (var_diff / (2 * src_var));
+    }
   }
 
+  // Limit adjustment;
+  variance_rd_factor = AOMMIN(3.0, variance_rd_factor);
+
   return variance_rd_factor;
 }
 

diff --git a/av1/encoder/mcomp.c b/av1/encoder/mcomp.c
index 1a53c23..ef7cb1f 100644
--- a/av1/encoder/mcomp.c
+++ b/av1/encoder/mcomp.c

@@ -41,9 +41,14 @@
   mv_cost_params->mv_cost_type = MV_COST_ENTROPY;
   mv_cost_params->error_per_bit = errorperbit;
   mv_cost_params->sad_per_bit = sadperbit;
-  mv_cost_params->mvjcost = mv_costs->nmv_joint_cost;
-  mv_cost_params->mvcost[0] = mv_costs->mv_cost_stack[0];
-  mv_cost_params->mvcost[1] = mv_costs->mv_cost_stack[1];
+  // For allintra encoding mode, 'mv_costs' is not allocated. Hence, the
+  // population of mvjcost and mvcost are avoided. In case of IntraBC, these
+  // values are populated from 'dv_costs' in av1_set_ms_to_intra_mode().
+  if (mv_costs != NULL) {
+    mv_cost_params->mvjcost = mv_costs->nmv_joint_cost;
+    mv_cost_params->mvcost[0] = mv_costs->mv_cost_stack[0];
+    mv_cost_params->mvcost[1] = mv_costs->mv_cost_stack[1];
+  }
 }
 
 static INLINE void init_ms_buffers(MSBuffers *ms_buffers, const MACROBLOCK *x) {

diff --git a/av1/encoder/mv_prec.h b/av1/encoder/mv_prec.h
index 11dcdd8..55108b6 100644
--- a/av1/encoder/mv_prec.h
+++ b/av1/encoder/mv_prec.h

@@ -33,6 +33,9 @@
     AV1_COMP *cpi, int allow_high_precision_mv,
     int cur_frame_force_integer_mv) {
   MvCosts *const mv_costs = cpi->td.mb.mv_costs;
+  // Avoid accessing 'mv_costs' when it is not allocated.
+  if (mv_costs == NULL) return;
+
   const int copy_hp = cpi->common.features.allow_high_precision_mv =
       allow_high_precision_mv && !cur_frame_force_integer_mv;
 

diff --git a/av1/encoder/nonrd_pickmode.c b/av1/encoder/nonrd_pickmode.c
index 3d710dd..5ee5761 100644
--- a/av1/encoder/nonrd_pickmode.c
+++ b/av1/encoder/nonrd_pickmode.c

@@ -757,13 +757,14 @@
   int eob_cost = 0;
   const int bw = 4 * num_4x4_w;
   const int bh = 4 * num_4x4_h;
+  const int use_hbd = is_cur_buf_hbd(xd);
 
   (void)mi_row;
   (void)mi_col;
   (void)cpi;
 
 #if CONFIG_AV1_HIGHBITDEPTH
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+  if (use_hbd) {
     aom_highbd_subtract_block(bh, bw, p->src_diff, bw, p->src.buf,
                               p->src.stride, pd->dst.buf, pd->dst.stride,
                               x->e_mbd.bd);
@@ -784,14 +785,15 @@
       if (c < max_blocks_wide) {
         const SCAN_ORDER *const scan_order = &av1_scan_orders[tx_size][DCT_DCT];
         const int block_offset = BLOCK_OFFSET(block);
+        int16_t *const low_coeff = (int16_t *)p->coeff + block_offset;
+        int16_t *const low_qcoeff = (int16_t *)p->qcoeff + block_offset;
+        int16_t *const low_dqcoeff = (int16_t *)p->dqcoeff + block_offset;
 #if CONFIG_AV1_HIGHBITDEPTH
         tran_low_t *const coeff = p->coeff + block_offset;
         tran_low_t *const qcoeff = p->qcoeff + block_offset;
         tran_low_t *const dqcoeff = p->dqcoeff + block_offset;
 #else
-        int16_t *const low_coeff = (int16_t *)p->coeff + block_offset;
-        int16_t *const low_qcoeff = (int16_t *)p->qcoeff + block_offset;
-        int16_t *const low_dqcoeff = (int16_t *)p->dqcoeff + block_offset;
+        (void)use_hbd;
 #endif
         uint16_t *const eob = &p->eobs[block];
         const int diff_stride = bw;
@@ -805,28 +807,53 @@
           case TX_32X32:
             assert(0);  // Not used
             break;
+
 #if CONFIG_AV1_HIGHBITDEPTH
           case TX_16X16:
-            aom_hadamard_16x16(src_diff, diff_stride, coeff);
-            av1_quantize_fp(coeff, 16 * 16, p->zbin_QTX, p->round_fp_QTX,
-                            p->quant_fp_QTX, p->quant_shift_QTX, qcoeff,
-                            dqcoeff, p->dequant_QTX, eob, scan_order->scan,
-                            scan_order->iscan);
+            if (use_hbd) {
+              aom_hadamard_16x16(src_diff, diff_stride, coeff);
+              av1_quantize_fp(coeff, 16 * 16, p->zbin_QTX, p->round_fp_QTX,
+                              p->quant_fp_QTX, p->quant_shift_QTX, qcoeff,
+                              dqcoeff, p->dequant_QTX, eob, scan_order->scan,
+                              scan_order->iscan);
+            } else {
+              aom_hadamard_lp_16x16(src_diff, diff_stride, low_coeff);
+              av1_quantize_lp(low_coeff, 16 * 16, p->round_fp_QTX,
+                              p->quant_fp_QTX, low_qcoeff, low_dqcoeff,
+                              p->dequant_QTX, eob, scan_order->scan,
+                              scan_order->iscan);
+            }
             break;
           case TX_8X8:
-            aom_hadamard_8x8(src_diff, diff_stride, coeff);
-            av1_quantize_fp(coeff, 8 * 8, p->zbin_QTX, p->round_fp_QTX,
-                            p->quant_fp_QTX, p->quant_shift_QTX, qcoeff,
-                            dqcoeff, p->dequant_QTX, eob, scan_order->scan,
-                            scan_order->iscan);
+            if (use_hbd) {
+              aom_hadamard_8x8(src_diff, diff_stride, coeff);
+              av1_quantize_fp(coeff, 8 * 8, p->zbin_QTX, p->round_fp_QTX,
+                              p->quant_fp_QTX, p->quant_shift_QTX, qcoeff,
+                              dqcoeff, p->dequant_QTX, eob, scan_order->scan,
+                              scan_order->iscan);
+            } else {
+              aom_hadamard_lp_8x8(src_diff, diff_stride, low_coeff);
+              av1_quantize_lp(low_coeff, 8 * 8, p->round_fp_QTX,
+                              p->quant_fp_QTX, low_qcoeff, low_dqcoeff,
+                              p->dequant_QTX, eob, scan_order->scan,
+                              scan_order->iscan);
+            }
             break;
           default:
             assert(tx_size == TX_4X4);
-            aom_fdct4x4(src_diff, coeff, diff_stride);
-            av1_quantize_fp(coeff, 4 * 4, p->zbin_QTX, p->round_fp_QTX,
-                            p->quant_fp_QTX, p->quant_shift_QTX, qcoeff,
-                            dqcoeff, p->dequant_QTX, eob, scan_order->scan,
-                            scan_order->iscan);
+            if (use_hbd) {
+              aom_fdct4x4(src_diff, coeff, diff_stride);
+              av1_quantize_fp(coeff, 4 * 4, p->zbin_QTX, p->round_fp_QTX,
+                              p->quant_fp_QTX, p->quant_shift_QTX, qcoeff,
+                              dqcoeff, p->dequant_QTX, eob, scan_order->scan,
+                              scan_order->iscan);
+            } else {
+              aom_fdct4x4_lp(src_diff, low_coeff, diff_stride);
+              av1_quantize_lp(low_coeff, 4 * 4, p->round_fp_QTX,
+                              p->quant_fp_QTX, low_qcoeff, low_dqcoeff,
+                              p->dequant_QTX, eob, scan_order->scan,
+                              scan_order->iscan);
+            }
             break;
 #else
           case TX_16X16:
@@ -876,18 +903,32 @@
         const int block_offset = BLOCK_OFFSET(block);
         uint16_t *const eob = &p->eobs[block];
 #if CONFIG_AV1_HIGHBITDEPTH
-        int64_t dummy;
-        tran_low_t *const coeff = p->coeff + block_offset;
-        tran_low_t *const qcoeff = p->qcoeff + block_offset;
-        tran_low_t *const dqcoeff = p->dqcoeff + block_offset;
+        if (use_hbd) {
+          int64_t dummy;
+          tran_low_t *const coeff = p->coeff + block_offset;
+          tran_low_t *const qcoeff = p->qcoeff + block_offset;
+          tran_low_t *const dqcoeff = p->dqcoeff + block_offset;
 
-        if (*eob == 1)
-          this_rdc->rate += (int)abs(qcoeff[0]);
-        else if (*eob > 1)
-          this_rdc->rate += aom_satd(qcoeff, step << 4);
+          if (*eob == 1)
+            this_rdc->rate += (int)abs(qcoeff[0]);
+          else if (*eob > 1)
+            this_rdc->rate += aom_satd(qcoeff, step << 4);
 
-        this_rdc->dist +=
-            av1_block_error(coeff, dqcoeff, step << 4, &dummy) >> 2;
+          this_rdc->dist +=
+              av1_block_error(coeff, dqcoeff, step << 4, &dummy) >> 2;
+        } else {
+          int16_t *const low_coeff = (int16_t *)p->coeff + block_offset;
+          int16_t *const low_qcoeff = (int16_t *)p->qcoeff + block_offset;
+          int16_t *const low_dqcoeff = (int16_t *)p->dqcoeff + block_offset;
+
+          if (*eob == 1)
+            this_rdc->rate += (int)abs(low_qcoeff[0]);
+          else if (*eob > 1)
+            this_rdc->rate += aom_satd_lp(low_qcoeff, step << 4);
+
+          this_rdc->dist +=
+              av1_block_error_lp(low_coeff, low_dqcoeff, step << 4) >> 2;
+        }
 #else
         int16_t *const low_coeff = (int16_t *)p->coeff + block_offset;
         int16_t *const low_qcoeff = (int16_t *)p->qcoeff + block_offset;
@@ -2367,11 +2408,7 @@
   AV1_PICKMODE_CTX_DEN ctx_den;
   int64_t zero_last_cost_orig = INT64_MAX;
   int denoise_svc_pickmode = 1;
-  const int resize_pending =
-      (cpi->resize_pending_params.width && cpi->resize_pending_params.height &&
-       (cpi->common.width != cpi->resize_pending_params.width ||
-        cpi->common.height != cpi->resize_pending_params.height));
-
+  const int resize_pending = is_frame_resize_pending(cpi);
 #endif
   x->color_sensitivity[0] = x->color_sensitivity_sb[0];
   x->color_sensitivity[1] = x->color_sensitivity_sb[1];
@@ -2728,8 +2765,11 @@
             mi->interp_filters = av1_broadcast_interp_filter(EIGHTTAP_SMOOTH);
         }
       }
-
-      av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0, 0);
+      if (!comp_pred)
+        av1_enc_build_inter_predictor_y(xd, mi_row, mi_col);
+      else
+        av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0,
+                                      0);
 
       if (use_model_yrd_large) {
         model_skip_for_sb_y_large(cpi, bsize, mi_row, mi_col, x, xd, &this_rdc,

diff --git a/av1/encoder/partition_search.c b/av1/encoder/partition_search.c
index 7582517..001c4c1 100644
--- a/av1/encoder/partition_search.c
+++ b/av1/encoder/partition_search.c

@@ -1660,11 +1660,11 @@
       part_sf->adjust_var_based_rd_partitioning > 3)
     return 0;
 
-  const int is_larger_qindex = cm->quant_params.base_qindex > 190;
   if (part_sf->adjust_var_based_rd_partitioning == 1) {
-    return !frame_is_intra_only(cm) && is_larger_qindex && bsize <= BLOCK_32X32;
+    return bsize <= BLOCK_32X32;
   } else {
     if (bsize <= BLOCK_32X32) return 1;
+    const int is_larger_qindex = cm->quant_params.base_qindex > 190;
     if (part_sf->adjust_var_based_rd_partitioning == 2) {
       const int is_360p_or_larger = AOMMIN(cm->width, cm->height) >= 360;
       return is_360p_or_larger && is_larger_qindex && bsize == BLOCK_64X64;
@@ -2083,7 +2083,8 @@
     }
     if (tile_data->allow_update_cdf) update_stats(&cpi->common, td);
   }
-  if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && mbmi->skip_txfm)
+  if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && mbmi->skip_txfm &&
+      !cpi->cyclic_refresh->skip_over4x4)
     av1_cyclic_reset_segment_skip(cpi, x, mi_row, mi_col, bsize);
   // TODO(Ravi/Remya): Move this copy function to a better logical place
   // This function will copy the best mode information from block
@@ -3939,6 +3940,7 @@
   fscanf(pfile, "%d,%d,%d", &read_bsize, &num_nodes, &num_configs);
   assert(read_bsize == cpi->common.seq_params->sb_size);
   BLOCK_SIZE bsize = (BLOCK_SIZE)read_bsize;
+  assert(bsize == pc_tree->block_size);
 
   PC_TREE *tree_node_queue[NUM_NODES] = { NULL };
   int last_idx = 1;
@@ -3950,7 +3952,10 @@
     assert(partitioning >= PARTITION_NONE &&
            partitioning < EXT_PARTITION_TYPES);
     PC_TREE *node = tree_node_queue[q_idx];
-    if (node != NULL) node->partitioning = partitioning;
+    if (node != NULL) {
+      node->partitioning = partitioning;
+      bsize = node->block_size;
+    }
     if (partitioning == PARTITION_SPLIT) {
       const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
       for (int i = 0; i < 4; ++i) {
@@ -3961,7 +3966,6 @@
           ++last_idx;
         }
       }
-      bsize = subsize;
     }
     --num_nodes;
     ++q_idx;
@@ -4102,10 +4106,10 @@
 }
 
 static void prepare_sb_features_before_search(
-    AV1_COMP *const cpi, ThreadData *td, int mi_row, int mi_col,
-    const BLOCK_SIZE bsize, aom_partition_features_t *features) {
-  av1_collect_motion_search_features_sb(cpi, td, mi_row, mi_col, bsize,
-                                        features);
+    AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, int mi_row,
+    int mi_col, const BLOCK_SIZE bsize, aom_partition_features_t *features) {
+  av1_collect_motion_search_features_sb(cpi, td, tile_data, mi_row, mi_col,
+                                        bsize, features);
   collect_tpl_stats_sb(cpi, bsize, mi_row, mi_col, features);
 }
 
@@ -4130,7 +4134,10 @@
     assert(partitioning >= PARTITION_NONE &&
            partitioning < EXT_PARTITION_TYPES);
     PC_TREE *node = tree_node_queue[q_idx];
-    if (node != NULL) node->partitioning = partitioning;
+    if (node != NULL) {
+      node->partitioning = partitioning;
+      bsize = node->block_size;
+    }
     if (partitioning == PARTITION_SPLIT) {
       const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
       for (int i = 0; i < 4; ++i) {
@@ -4141,7 +4148,6 @@
           ++last_idx;
         }
       }
-      bsize = subsize;
     }
     --num_nodes;
     ++q_idx;
@@ -4159,7 +4165,8 @@
   MACROBLOCK *const x = &td->mb;
   ExtPartController *const ext_part_controller = &cpi->ext_part_controller;
   aom_partition_features_t features;
-  prepare_sb_features_before_search(cpi, td, mi_row, mi_col, bsize, &features);
+  prepare_sb_features_before_search(cpi, td, tile_data, mi_row, mi_col, bsize,
+                                    &features);
   features.mi_row = mi_row;
   features.mi_col = mi_col;
   features.frame_width = cpi->frame_info.frame_width;
@@ -4201,6 +4208,63 @@
   return true;
 }
 
+// Use a bitmask to represent the valid partition types for the current
+// block. "1" represents the corresponding partition type is vaild.
+// The least significant bit represents "PARTITION_NONE", the
+// largest significant bit represents "PARTITION_VERT_4", follow
+// the enum order for PARTITION_TYPE in "enums.h"
+static int get_valid_partition_types(
+    const AV1_COMP *const cpi,
+    const PartitionSearchState *const part_search_state,
+    const BLOCK_SIZE bsize) {
+  const PartitionCfg *const part_cfg = &cpi->oxcf.part_cfg;
+  const PartitionBlkParams blk_params = part_search_state->part_blk_params;
+  int valid_types = 0;
+  // PARTITION_NONE
+  valid_types |= (part_search_state->partition_none_allowed << 0);
+  // PARTITION_HORZ
+  valid_types |= (part_search_state->partition_rect_allowed[HORZ] << 1);
+  // PARTITION_VERT
+  valid_types |= (part_search_state->partition_rect_allowed[VERT] << 2);
+  // PARTITION_SPLIT
+  valid_types |= (part_search_state->do_square_split << 3);
+  // PARTITION_HORZ_A
+  const int ext_partition_allowed = part_search_state->do_rectangular_split &&
+                                    av1_blk_has_rows_and_cols(&blk_params);
+  const int horzab_partition_allowed =
+      ext_partition_allowed && part_cfg->enable_ab_partitions &&
+      part_search_state->partition_rect_allowed[HORZ];
+  valid_types |= (horzab_partition_allowed << 4);
+  // PARTITION_HORZ_B
+  valid_types |= (horzab_partition_allowed << 5);
+  // PARTITION_VERT_A
+  const int vertab_partition_allowed =
+      ext_partition_allowed && part_cfg->enable_ab_partitions &&
+      part_search_state->partition_rect_allowed[VERT];
+  valid_types |= (vertab_partition_allowed << 6);
+  // PARTITION_VERT_B
+  valid_types |= (vertab_partition_allowed << 7);
+  // PARTITION_HORZ_4
+  const int partition4_allowed = part_cfg->enable_1to4_partitions &&
+                                 ext_partition_allowed &&
+                                 bsize != BLOCK_128X128;
+  const int horz4_allowed =
+      partition4_allowed && part_search_state->partition_rect_allowed[HORZ] &&
+      get_plane_block_size(get_partition_subsize(bsize, PARTITION_HORZ_4),
+                           part_search_state->ss_x,
+                           part_search_state->ss_y) != BLOCK_INVALID;
+  valid_types |= (horz4_allowed << 8);
+  // PARTITION_VERT_4
+  const int vert4_allowed =
+      partition4_allowed && part_search_state->partition_rect_allowed[HORZ] &&
+      get_plane_block_size(get_partition_subsize(bsize, PARTITION_VERT_4),
+                           part_search_state->ss_x,
+                           part_search_state->ss_y) != BLOCK_INVALID;
+  valid_types |= (vert4_allowed << 9);
+
+  return valid_types;
+}
+
 static bool recursive_partition(AV1_COMP *const cpi, ThreadData *td,
                                 TileDataEnc *tile_data, TokenExtra **tp,
                                 SIMPLE_MOTION_DATA_TREE *sms_root,
@@ -4209,24 +4273,13 @@
   const AV1_COMMON *const cm = &cpi->common;
   ExtPartController *const ext_part_controller = &cpi->ext_part_controller;
   MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
   if (mi_row >= cm->mi_params.mi_rows || mi_col >= cm->mi_params.mi_cols) {
     return false;
   }
   aom_partition_decision_t partition_decision;
   do {
-    aom_partition_features_t features;
-    features.mi_row = mi_row;
-    features.mi_col = mi_col;
-    features.frame_width = cpi->frame_info.frame_width;
-    features.frame_height = cpi->frame_info.frame_height;
-    features.block_size = bsize;
-    av1_ext_part_send_features(ext_part_controller, &features);
-    const bool valid_decision = av1_ext_part_get_partition_decision(
-        ext_part_controller, &partition_decision);
-    if (!valid_decision) return false;
-    pc_tree->partitioning = partition_decision.current_decision;
     PartitionSearchState part_search_state;
-
     // Initialization of state variables used in partition search.
     // TODO(chengchen): check if there is hidden conditions that don't allow
     // all possible partition types.
@@ -4237,6 +4290,37 @@
     PartitionBlkParams blk_params = part_search_state.part_blk_params;
     if (!av1_blk_has_rows_and_cols(&blk_params))
       set_partition_cost_for_edge_blk(cm, &part_search_state);
+    const int orig_rdmult = x->rdmult;
+    setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL);
+    const int valid_partition_types =
+        get_valid_partition_types(cpi, &part_search_state, bsize);
+    const FRAME_UPDATE_TYPE update_type =
+        get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+    const int qindex = av1_get_qindex(&cm->seg, xd->mi[0]->segment_id,
+                                      cm->quant_params.base_qindex);
+    // RD multiplier
+    const int rdmult = x->rdmult;
+    // pyramid level
+    const int pyramid_level =
+        cpi->ppi->gf_group.layer_depth[cpi->gf_frame_index];
+    x->rdmult = orig_rdmult;
+
+    aom_partition_features_t features;
+    features.mi_row = mi_row;
+    features.mi_col = mi_col;
+    features.frame_width = cpi->frame_info.frame_width;
+    features.frame_height = cpi->frame_info.frame_height;
+    features.block_size = bsize;
+    features.valid_partition_types = valid_partition_types;
+    features.update_type = update_type;
+    features.qindex = qindex;
+    features.rdmult = rdmult;
+    features.pyramid_level = pyramid_level;
+    av1_ext_part_send_features(ext_part_controller, &features);
+    const bool valid_decision = av1_ext_part_get_partition_decision(
+        ext_part_controller, &partition_decision);
+    if (!valid_decision) return false;
+    pc_tree->partitioning = partition_decision.current_decision;
 
     av1_init_rd_stats(this_rdcost);
     if (partition_decision.current_decision == PARTITION_SPLIT) {
@@ -4249,9 +4333,8 @@
           pc_tree->split[i] = av1_alloc_pc_tree_node(subsize);
         pc_tree->split[i]->index = i;
       }
-      const int orig_rdmult = x->rdmult;
+      const int orig_rdmult_tmp = x->rdmult;
       setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL);
-      (void)orig_rdmult;
       // TODO(chengchen): check boundary conditions
       // top-left
       recursive_partition(cpi, td, tile_data, tp, sms_root, pc_tree->split[0],
@@ -4276,7 +4359,7 @@
         this_rdcost->dist += split_rdc[i].dist;
         av1_rd_cost_update(x->rdmult, this_rdcost);
       }
-      x->rdmult = orig_rdmult;
+      x->rdmult = orig_rdmult_tmp;
     } else {
       *this_rdcost = rd_search_for_fixed_partition(
           cpi, td, tile_data, tp, sms_root, mi_row, mi_col, bsize, pc_tree);
@@ -4311,7 +4394,8 @@
   MACROBLOCK *const x = &td->mb;
   ExtPartController *const ext_part_controller = &cpi->ext_part_controller;
   aom_partition_features_t features;
-  prepare_sb_features_before_search(cpi, td, mi_row, mi_col, bsize, &features);
+  prepare_sb_features_before_search(cpi, td, tile_data, mi_row, mi_col, bsize,
+                                    &features);
   features.mi_row = mi_row;
   features.mi_col = mi_col;
   features.frame_width = cpi->frame_info.frame_width;
@@ -4437,7 +4521,7 @@
   const int bh = MI_SIZE * mi_size_high[bs] - bottom_overflow;
 
   // Initialize min to a large value and max to 0 at
-  *var_min = 10.0;
+  *var_min = 99.0;
   *var_max = 0.0;
 
   for (i = 0; i < bh; i += 4) {
@@ -4448,13 +4532,13 @@
                           x->plane[0].src.buf + i * x->plane[0].src.stride + j,
                           x->plane[0].src.stride,
                           CONVERT_TO_BYTEPTR(highbd_all_zeros), 0, &sse) /
-                          16);
+                          16.0);
       } else {
         var =
             log(1.0 + cpi->ppi->fn_ptr[BLOCK_4X4].vf(
                           x->plane[0].src.buf + i * x->plane[0].src.stride + j,
                           x->plane[0].src.stride, all_zeros, 0, &sse) /
-                          16);
+                          16.0);
       }
       *var_min = AOMMIN(*var_min, var);
       *var_max = AOMMAX(*var_max, var);
@@ -4579,8 +4663,8 @@
   // av1_get_max_min_partition_features().
   if (COLLECT_MOTION_SEARCH_FEATURE_SB && !frame_is_intra_only(cm) &&
       bsize == cm->seq_params->sb_size) {
-    av1_collect_motion_search_features_sb(cpi, td, mi_row, mi_col, bsize,
-                                          /*features=*/NULL);
+    av1_collect_motion_search_features_sb(cpi, td, tile_data, mi_row, mi_col,
+                                          bsize, /*features=*/NULL);
     collect_tpl_stats_sb(cpi, bsize, mi_row, mi_col, /*features=*/NULL);
   }
 
@@ -4648,7 +4732,7 @@
     double var_min, var_max;
     log_sub_block_var(cpi, x, bsize, &var_min, &var_max);
 
-    if ((var_min < 0.5) && ((var_max - var_min) > 3.0)) {
+    if ((var_min < 0.272) && ((var_max - var_min) > 3.0)) {
       part_search_state.partition_none_allowed = 0;
       part_search_state.terminate_partition_search = 0;
       part_search_state.do_square_split = 1;
@@ -4687,7 +4771,7 @@
   // newmv mode and is skippable.
   if ((cpi->sf.part_sf.skip_non_sq_part_based_on_none >= 2) &&
       (pc_tree->none != NULL)) {
-    if (is_inter_mode(pc_tree->none->mic.mode) &&
+    if (x->qindex <= 200 && is_inter_mode(pc_tree->none->mic.mode) &&
         !have_newmv_in_inter_mode(pc_tree->none->mic.mode) &&
         pc_tree->none->skippable && !x->must_find_valid_partition &&
         bsize >= BLOCK_16X16)

diff --git a/av1/encoder/partition_strategy.c b/av1/encoder/partition_strategy.c
index 9e9e8eb..91d1a74 100644
--- a/av1/encoder/partition_strategy.c
+++ b/av1/encoder/partition_strategy.c

@@ -115,6 +115,7 @@
   snprintf(filename, sizeof(filename), "%s/%s", path,
            get_feature_file_name(id));
   FILE *pfile = fopen(filename, "a");
+  if (pfile == NULL) return;
   if (!is_test_mode) {
     fprintf(pfile, "%d,%d,%d,%d,%d\n", id, bsize, mi_row, mi_col, feature_size);
   }
@@ -2296,6 +2297,7 @@
 }
 
 void av1_collect_motion_search_features_sb(AV1_COMP *const cpi, ThreadData *td,
+                                           TileDataEnc *tile_data,
                                            const int mi_row, const int mi_col,
                                            const BLOCK_SIZE bsize,
                                            aom_partition_features_t *features) {
@@ -2308,6 +2310,8 @@
   const int row_step = mi_size_high[fixed_block_size];
   SIMPLE_MOTION_DATA_TREE *sms_tree = NULL;
   SIMPLE_MOTION_DATA_TREE *sms_root = setup_sms_tree(cpi, sms_tree);
+  TileInfo *const tile_info = &tile_data->tile_info;
+  av1_set_offsets_without_segment_id(cpi, tile_info, x, mi_row, mi_col, bsize);
   av1_init_simple_motion_search_mvs_for_sb(cpi, NULL, x, sms_root, mi_row,
                                            mi_col);
   av1_reset_simple_motion_tree_partition(sms_root, bsize);

diff --git a/av1/encoder/partition_strategy.h b/av1/encoder/partition_strategy.h
index 1958abb..f7daf37 100644
--- a/av1/encoder/partition_strategy.h
+++ b/av1/encoder/partition_strategy.h

@@ -128,6 +128,7 @@
                              int *ab_partitions_allowed);
 
 void av1_collect_motion_search_features_sb(AV1_COMP *const cpi, ThreadData *td,
+                                           TileDataEnc *tile_data,
                                            const int mi_row, const int mi_col,
                                            const BLOCK_SIZE bsize,
                                            aom_partition_features_t *features);

diff --git a/av1/encoder/pass2_strategy.c b/av1/encoder/pass2_strategy.c
index b0125f2..134c6ce 100644
--- a/av1/encoder/pass2_strategy.c
+++ b/av1/encoder/pass2_strategy.c

@@ -1002,7 +1002,7 @@
           gf_group->update_type[gf_group->arf_index];
       int is_forward_keyframe = 0;
       av1_temporal_filter(cpi, arf_src_index, arf_update_type,
-                          is_forward_keyframe, NULL);
+                          is_forward_keyframe, NULL, &cpi->ppi->alt_ref_buffer);
       aom_extend_frame_borders(&cpi->ppi->alt_ref_buffer,
                                av1_num_planes(&cpi->common));
     }
@@ -4051,5 +4051,10 @@
       }
     }
   }
+
+  // Update framerate obtained from parallel encode frames
+  if (cpi->common.show_frame &&
+      cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0)
+    cpi->framerate = cpi->new_framerate;
 #endif
 }

diff --git a/av1/encoder/pickcdef.c b/av1/encoder/pickcdef.c
index 4346caf..676c55d 100644
--- a/av1/encoder/pickcdef.c
+++ b/av1/encoder/pickcdef.c

@@ -28,9 +28,11 @@
                                              int *pri_strength,
                                              int *sec_strength,
                                              int strength_idx) {
-  const int tot_sec_filter = (pick_method >= CDEF_FAST_SEARCH_LVL3)
-                                 ? REDUCED_SEC_STRENGTHS_LVL3
-                                 : CDEF_SEC_STRENGTHS;
+  const int tot_sec_filter =
+      (pick_method == CDEF_FAST_SEARCH_LVL5)
+          ? REDUCED_SEC_STRENGTHS_LVL5
+          : ((pick_method >= CDEF_FAST_SEARCH_LVL3) ? REDUCED_SEC_STRENGTHS_LVL3
+                                                    : CDEF_SEC_STRENGTHS);
   const int pri_idx = strength_idx / tot_sec_filter;
   const int sec_idx = strength_idx % tot_sec_filter;
   *pri_strength = pri_idx;
@@ -48,6 +50,10 @@
       *pri_strength = priconv_lvl4[pri_idx];
       *sec_strength = secconv_lvl3[sec_idx];
       break;
+    case CDEF_FAST_SEARCH_LVL5:
+      *pri_strength = priconv_lvl5[pri_idx];
+      *sec_strength = secconv_lvl5[sec_idx];
+      break;
     default: assert(0 && "Invalid CDEF search method");
   }
 }
@@ -153,7 +159,7 @@
                                       CDEF_PICK_METHOD pick_method) {
   uint64_t best_tot_mse;
   int fast = (pick_method >= CDEF_FAST_SEARCH_LVL1 &&
-              pick_method <= CDEF_FAST_SEARCH_LVL4);
+              pick_method <= CDEF_FAST_SEARCH_LVL5);
   int i;
   best_tot_mse = (uint64_t)1 << 63;
   /* Greedy search: add one strength options at a time. */
@@ -574,7 +580,7 @@
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
   const int damping = 3 + (cm->quant_params.base_qindex >> 6);
   const int fast = (pick_method >= CDEF_FAST_SEARCH_LVL1 &&
-                    pick_method <= CDEF_FAST_SEARCH_LVL4);
+                    pick_method <= CDEF_FAST_SEARCH_LVL5);
   const int num_planes = av1_num_planes(cm);
   CdefSearchCtx cdef_search_ctx;
   // Initialize parameters related to CDEF search context.
@@ -642,7 +648,6 @@
     mi_params->mi_grid_base[cdef_search_ctx.sb_index[i]]->cdef_strength =
         best_gi;
   }
-
   if (fast) {
     for (int j = 0; j < cdef_info->nb_cdef_strengths; j++) {
       const int luma_strength = cdef_info->cdef_strengths[j];

diff --git a/av1/encoder/pickcdef.h b/av1/encoder/pickcdef.h
index 6bea1b0..a287870 100644
--- a/av1/encoder/pickcdef.h
+++ b/av1/encoder/pickcdef.h

@@ -24,6 +24,7 @@
 #define REDUCED_PRI_STRENGTHS_LVL1 8
 #define REDUCED_PRI_STRENGTHS_LVL2 5
 #define REDUCED_SEC_STRENGTHS_LVL3 2
+#define REDUCED_SEC_STRENGTHS_LVL5 1
 #define REDUCED_PRI_STRENGTHS_LVL4 2
 
 #define REDUCED_TOTAL_STRENGTHS_LVL1 \
@@ -34,19 +35,24 @@
   (REDUCED_PRI_STRENGTHS_LVL2 * REDUCED_SEC_STRENGTHS_LVL3)
 #define REDUCED_TOTAL_STRENGTHS_LVL4 \
   (REDUCED_PRI_STRENGTHS_LVL4 * REDUCED_SEC_STRENGTHS_LVL3)
+#define REDUCED_TOTAL_STRENGTHS_LVL5 \
+  (REDUCED_PRI_STRENGTHS_LVL4 * REDUCED_SEC_STRENGTHS_LVL5)
 #define TOTAL_STRENGTHS (CDEF_PRI_STRENGTHS * CDEF_SEC_STRENGTHS)
 
 static const int priconv_lvl1[REDUCED_PRI_STRENGTHS_LVL1] = { 0, 1, 2,  3,
                                                               5, 7, 10, 13 };
 static const int priconv_lvl2[REDUCED_PRI_STRENGTHS_LVL2] = { 0, 2, 4, 8, 14 };
 static const int priconv_lvl4[REDUCED_PRI_STRENGTHS_LVL4] = { 0, 11 };
+static const int priconv_lvl5[REDUCED_PRI_STRENGTHS_LVL4] = { 0, 5 };
 static const int secconv_lvl3[REDUCED_SEC_STRENGTHS_LVL3] = { 0, 2 };
+static const int secconv_lvl5[REDUCED_SEC_STRENGTHS_LVL5] = { 0 };
 static const int nb_cdef_strengths[CDEF_PICK_METHODS] = {
   TOTAL_STRENGTHS,
   REDUCED_TOTAL_STRENGTHS_LVL1,
   REDUCED_TOTAL_STRENGTHS_LVL2,
   REDUCED_TOTAL_STRENGTHS_LVL3,
   REDUCED_TOTAL_STRENGTHS_LVL4,
+  REDUCED_TOTAL_STRENGTHS_LVL5,
   TOTAL_STRENGTHS
 };
 

diff --git a/av1/encoder/picklpf.c b/av1/encoder/picklpf.c
index 7608749..75c1f8d 100644
--- a/av1/encoder/picklpf.c
+++ b/av1/encoder/picklpf.c

@@ -84,15 +84,14 @@
 
 static int search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
                                int partial_frame,
-                               const int *last_frame_filter_level,
-                               double *best_cost_ret, int plane, int dir) {
+                               const int *last_frame_filter_level, int plane,
+                               int dir) {
   const AV1_COMMON *const cm = &cpi->common;
   const int min_filter_level = 0;
   const int max_filter_level = av1_get_max_filter_level(cpi);
   int filt_direction = 0;
   int64_t best_err;
   int filt_best;
-  MACROBLOCK *x = &cpi->td.mb;
 
   // Start the search at the previous frame filter level unless it is now out of
   // range.
@@ -187,18 +186,13 @@
     }
   }
 
-  // Update best error
-  best_err = ss_err[filt_best];
-
-  if (best_cost_ret)
-    *best_cost_ret = RDCOST_DBL_WITH_NATIVE_BD_DIST(
-        x->rdmult, 0, (best_err << 4), cm->seq_params->bit_depth);
   return filt_best;
 }
 
 void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
                            LPF_PICK_METHOD method) {
   AV1_COMMON *const cm = &cpi->common;
+  const SequenceHeader *const seq_params = cm->seq_params;
   const int num_planes = av1_num_planes(cm);
   struct loopfilter *const lf = &cm->lf;
   (void)sd;
@@ -213,7 +207,7 @@
     const int min_filter_level = 0;
     const int max_filter_level = av1_get_max_filter_level(cpi);
     const int q = av1_ac_quant_QTX(cm->quant_params.base_qindex, 0,
-                                   cm->seq_params->bit_depth);
+                                   seq_params->bit_depth);
     // based on tests result for rtc test set
     // 0.04590 boosted or 0.02295 non-booseted in 18-bit fixed point
     const int strength_boost_q_treshold = 0;
@@ -231,7 +225,7 @@
     // And high bit depth separately:
     // filt_guess = q * 0.316206 + 3.87252
     int filt_guess;
-    switch (cm->seq_params->bit_depth) {
+    switch (seq_params->bit_depth) {
       case AOM_BITS_8:
         filt_guess =
             (cm->current_frame.frame_type == KEY_FRAME)
@@ -250,7 +244,7 @@
                "or AOM_BITS_12");
         return;
     }
-    if (cm->seq_params->bit_depth != AOM_BITS_8 &&
+    if (seq_params->bit_depth != AOM_BITS_8 &&
         cm->current_frame.frame_type == KEY_FRAME)
       filt_guess -= 4;
     // TODO(chengchen): retrain the model for Y, U, V filter levels
@@ -273,26 +267,35 @@
       last_frame_filter_level[3] = lf->filter_level_v;
 #endif
     }
+    // The frame buffer last_frame_uf is used to store the non-loop filtered
+    // reconstructed frame in search_filter_level().
+    if (aom_realloc_frame_buffer(
+            &cpi->last_frame_uf, cm->width, cm->height,
+            seq_params->subsampling_x, seq_params->subsampling_y,
+            seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels,
+            cm->features.byte_alignment, NULL, NULL, NULL, 0))
+      aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+                         "Failed to allocate last frame buffer");
 
     lf->filter_level[0] = lf->filter_level[1] =
         search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE,
-                            last_frame_filter_level, NULL, 0, 2);
+                            last_frame_filter_level, 0, 2);
     if (method != LPF_PICK_FROM_FULL_IMAGE_NON_DUAL) {
       lf->filter_level[0] =
           search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE,
-                              last_frame_filter_level, NULL, 0, 0);
+                              last_frame_filter_level, 0, 0);
       lf->filter_level[1] =
           search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE,
-                              last_frame_filter_level, NULL, 0, 1);
+                              last_frame_filter_level, 0, 1);
     }
 
     if (num_planes > 1) {
       lf->filter_level_u =
           search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE,
-                              last_frame_filter_level, NULL, 1, 0);
+                              last_frame_filter_level, 1, 0);
       lf->filter_level_v =
           search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE,
-                              last_frame_filter_level, NULL, 2, 0);
+                              last_frame_filter_level, 2, 0);
     }
   }
 }

diff --git a/av1/encoder/ratectrl.c b/av1/encoder/ratectrl.c
index eebaf39..9ee0c3a 100644
--- a/av1/encoder/ratectrl.c
+++ b/av1/encoder/ratectrl.c

@@ -412,7 +412,7 @@
   const RATE_CONTROL *const rc = &cpi->rc;
   const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
   const AV1_COMMON *const cm = &cpi->common;
-  const RefreshFrameFlagsInfo *const refresh_frame_flags = &cpi->refresh_frame;
+  const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
   const int max_delta = 16;
   const int change_avg_frame_bandwidth =
       abs(rc->avg_frame_bandwidth - rc->prev_avg_frame_bandwidth) >
@@ -427,8 +427,7 @@
   if (cm->current_frame.frame_type != KEY_FRAME && !cpi->ppi->use_svc &&
       rc->frames_since_key > 1 && !change_target_bits_mb &&
       (!cpi->oxcf.rc_cfg.gf_cbr_boost_pct ||
-       !(refresh_frame_flags->alt_ref_frame ||
-         refresh_frame_flags->golden_frame))) {
+       !(refresh_frame->alt_ref_frame || refresh_frame->golden_frame))) {
     // Make sure q is between oscillating Qs to prevent resonance.
     if (rc->rc_1_frame * rc->rc_2_frame == -1 &&
         rc->q_1_frame != rc->q_2_frame) {
@@ -504,7 +503,7 @@
                                          int height) {
   const RATE_CONTROL *const rc = &cpi->rc;
   const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
-  const RefreshFrameFlagsInfo *const refresh_frame_flags = &cpi->refresh_frame;
+  const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
   double rcf;
   double rate_correction_factors_kfstd;
   double rate_correction_factors_gfarfstd;
@@ -545,8 +544,7 @@
 #endif
     rcf = rate_correction_factors_rflvl;
   } else {
-    if ((refresh_frame_flags->alt_ref_frame ||
-         refresh_frame_flags->golden_frame) &&
+    if ((refresh_frame->alt_ref_frame || refresh_frame->golden_frame) &&
         !rc->is_src_frame_alt_ref && !cpi->ppi->use_svc &&
         (cpi->oxcf.rc_cfg.mode != AOM_CBR ||
          cpi->oxcf.rc_cfg.gf_cbr_boost_pct > 20))
@@ -580,7 +578,7 @@
                                        double factor, int width, int height) {
   RATE_CONTROL *const rc = &cpi->rc;
   PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
-  const RefreshFrameFlagsInfo *const refresh_frame_flags = &cpi->refresh_frame;
+  const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
   int update_default_rcf = 1;
   // Normalize RCF to account for the size-dependent scaling factor.
   factor /= resize_rate_factor(&cpi->oxcf.frm_dim_cfg, width, height);
@@ -601,8 +599,7 @@
 #endif
     if (update_default_rcf) p_rc->rate_correction_factors[rf_lvl] = factor;
   } else {
-    if ((refresh_frame_flags->alt_ref_frame ||
-         refresh_frame_flags->golden_frame) &&
+    if ((refresh_frame->alt_ref_frame || refresh_frame->golden_frame) &&
         !rc->is_src_frame_alt_ref && !cpi->ppi->use_svc &&
         (cpi->oxcf.rc_cfg.mode != AOM_CBR ||
          cpi->oxcf.rc_cfg.gf_cbr_boost_pct > 20)) {
@@ -847,7 +844,7 @@
 static int calc_active_worst_quality_no_stats_vbr(const AV1_COMP *cpi) {
   const RATE_CONTROL *const rc = &cpi->rc;
   const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
-  const RefreshFrameFlagsInfo *const refresh_frame_flags = &cpi->refresh_frame;
+  const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
   const unsigned int curr_frame = cpi->common.current_frame.frame_number;
   int active_worst_quality;
   int last_q_key_frame;
@@ -859,9 +856,9 @@
     active_worst_quality =
         curr_frame == 0 ? rc->worst_quality : last_q_key_frame * 2;
   } else {
-    if (!rc->is_src_frame_alt_ref && (refresh_frame_flags->golden_frame ||
-                                      refresh_frame_flags->bwd_ref_frame ||
-                                      refresh_frame_flags->alt_ref_frame)) {
+    if (!rc->is_src_frame_alt_ref &&
+        (refresh_frame->golden_frame || refresh_frame->bwd_ref_frame ||
+         refresh_frame->alt_ref_frame)) {
       active_worst_quality =
           curr_frame == 1 ? last_q_key_frame * 5 / 4 : last_q_inter_frame;
     } else {
@@ -882,6 +879,8 @@
   const AV1_COMMON *const cm = &cpi->common;
   const RATE_CONTROL *rc = &cpi->rc;
   const PRIMARY_RATE_CONTROL *p_rc = &cpi->ppi->p_rc;
+  const SVC *const svc = &cpi->svc;
+  unsigned int num_frames_weight_key = 5 * cpi->svc.number_temporal_layers;
   // Buffer level below which we push active_worst to worst_quality.
   int64_t critical_level = p_rc->optimal_buffer_level >> 3;
   int64_t buff_lvl_step = 0;
@@ -893,10 +892,20 @@
   // for the first few frames following key frame. These are both initialized
   // to worst_quality and updated with (3/4, 1/4) average in postencode_update.
   // So for first few frames following key, the qp of that key frame is weighted
-  // into the active_worst_quality setting.
-  ambient_qp = (cm->current_frame.frame_number < 5)
-                   ? AOMMIN(p_rc->avg_frame_qindex[INTER_FRAME],
-                            p_rc->avg_frame_qindex[KEY_FRAME])
+  // into the active_worst_quality setting. For SVC the key frame should
+  // correspond to layer (0, 0), so use that for layer context.
+  int avg_qindex_key = p_rc->avg_frame_qindex[KEY_FRAME];
+  if (svc->number_temporal_layers > 1) {
+    int layer = LAYER_IDS_TO_IDX(0, 0, svc->number_temporal_layers);
+    const LAYER_CONTEXT *lc = &svc->layer_context[layer];
+    const PRIMARY_RATE_CONTROL *const lp_rc = &lc->p_rc;
+    avg_qindex_key = lp_rc->avg_frame_qindex[KEY_FRAME];
+    if (svc->temporal_layer_id == 0)
+      avg_qindex_key =
+          AOMMIN(lp_rc->avg_frame_qindex[KEY_FRAME], lp_rc->last_q[KEY_FRAME]);
+  }
+  ambient_qp = (cm->current_frame.frame_number < num_frames_weight_key)
+                   ? AOMMIN(p_rc->avg_frame_qindex[INTER_FRAME], avg_qindex_key)
                    : p_rc->avg_frame_qindex[INTER_FRAME];
   active_worst_quality = AOMMIN(rc->worst_quality, ambient_qp * 5 / 4);
   if (p_rc->buffer_level > p_rc->optimal_buffer_level) {
@@ -937,7 +946,7 @@
   const AV1_COMMON *const cm = &cpi->common;
   const RATE_CONTROL *const rc = &cpi->rc;
   const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
-  const RefreshFrameFlagsInfo *const refresh_frame_flags = &cpi->refresh_frame;
+  const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
   const CurrentFrame *const current_frame = &cm->current_frame;
   int *rtc_minq;
   const int bit_depth = cm->seq_params->bit_depth;
@@ -972,8 +981,7 @@
     }
   } else if (!rc->is_src_frame_alt_ref && !cpi->ppi->use_svc &&
              cpi->oxcf.rc_cfg.gf_cbr_boost_pct &&
-             (refresh_frame_flags->golden_frame ||
-              refresh_frame_flags->alt_ref_frame)) {
+             (refresh_frame->golden_frame || refresh_frame->alt_ref_frame)) {
     // Use the lower of active_worst_quality and recent
     // average Q as basis for GF/ARF best Q limit unless last frame was
     // a key frame.
@@ -1130,7 +1138,7 @@
   const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
   const CurrentFrame *const current_frame = &cm->current_frame;
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
-  const RefreshFrameFlagsInfo *const refresh_frame_flags = &cpi->refresh_frame;
+  const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
   const enum aom_rc_mode rc_mode = oxcf->rc_cfg.mode;
 
   assert(has_no_stats_stage(cpi));
@@ -1182,8 +1190,7 @@
       }
     }
   } else if (!rc->is_src_frame_alt_ref &&
-             (refresh_frame_flags->golden_frame ||
-              refresh_frame_flags->alt_ref_frame)) {
+             (refresh_frame->golden_frame || refresh_frame->alt_ref_frame)) {
     // Use the lower of active_worst_quality and recent
     // average Q as basis for GF/ARF best Q limit unless last frame was
     // a key frame.
@@ -1201,7 +1208,7 @@
       const int qindex = cq_level;
       const double q_val = av1_convert_qindex_to_q(qindex, bit_depth);
       const int delta_qindex =
-          (refresh_frame_flags->alt_ref_frame)
+          (refresh_frame->alt_ref_frame)
               ? av1_compute_qdelta(rc, q_val, q_val * 0.40, bit_depth)
               : av1_compute_qdelta(rc, q_val, q_val * 0.50, bit_depth);
       active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
@@ -1251,8 +1258,7 @@
           &cpi->rc, current_frame->frame_type, active_worst_quality, 2.0,
           cpi->is_screen_content_type, bit_depth);
     } else if (!rc->is_src_frame_alt_ref &&
-               (refresh_frame_flags->golden_frame ||
-                refresh_frame_flags->alt_ref_frame)) {
+               (refresh_frame->golden_frame || refresh_frame->alt_ref_frame)) {
       qdelta = av1_compute_qdelta_by_rate(
           &cpi->rc, current_frame->frame_type, active_worst_quality, 1.75,
           cpi->is_screen_content_type, bit_depth);
@@ -1332,7 +1338,7 @@
 #define STATIC_MOTION_THRESH 95
 static void get_intra_q_and_bounds(const AV1_COMP *cpi, int width, int height,
                                    int *active_best, int *active_worst,
-                                   int cq_level, int is_fwd_kf) {
+                                   int cq_level) {
   const AV1_COMMON *const cm = &cpi->common;
   const RATE_CONTROL *const rc = &cpi->rc;
   const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
@@ -1347,15 +1353,6 @@
     // as q.
     active_best_quality = cq_level;
     active_worst_quality = cq_level;
-  } else if (is_fwd_kf) {
-    // Handle the special case for forward reference key frames.
-    // Increase the boost because this keyframe is used as a forward and
-    // backward reference.
-    int qindex = p_rc->last_boosted_qindex;
-    const double last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
-    const int delta_qindex = av1_compute_qdelta(
-        rc, last_boosted_q, last_boosted_q * 0.25, bit_depth);
-    active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
   } else if (p_rc->this_key_frame_forced) {
     // Handle the special case for key frames forced when we have reached
     // the maximum key frame interval. Here force the Q to a range
@@ -1437,7 +1434,7 @@
   const AV1_COMMON *const cm = &cpi->common;
   const RATE_CONTROL *const rc = &cpi->rc;
   const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
-  const RefreshFrameFlagsInfo *const refresh_frame_flags = &cpi->refresh_frame;
+  const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
   const int bit_depth = cpi->common.seq_params->bit_depth;
   int active_best_quality = *active_best;
   int active_worst_quality = *active_worst;
@@ -1446,8 +1443,8 @@
   if (cpi->oxcf.rc_cfg.mode != AOM_Q) {
     if (frame_is_intra_only(cm) ||
         (!rc->is_src_frame_alt_ref &&
-         (refresh_frame_flags->golden_frame || is_intrl_arf_boost ||
-          refresh_frame_flags->alt_ref_frame))) {
+         (refresh_frame->golden_frame || is_intrl_arf_boost ||
+          refresh_frame->alt_ref_frame))) {
       active_best_quality -=
           (cpi->ppi->twopass.extend_minq + cpi->ppi->twopass.extend_minq_fast);
       active_worst_quality += (cpi->ppi->twopass.extend_maxq / 2);
@@ -1553,7 +1550,7 @@
   const RATE_CONTROL *const rc = &cpi->rc;
   const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
-  const RefreshFrameFlagsInfo *const refresh_frame_flags = &cpi->refresh_frame;
+  const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
   const GF_GROUP *gf_group = &cpi->ppi->gf_group;
   const enum aom_rc_mode rc_mode = oxcf->rc_cfg.mode;
   int *inter_minq;
@@ -1568,8 +1565,8 @@
   // TODO(jingning): Consider to rework this hack that covers issues incurred
   // in lightfield setting.
   if (cm->tiles.large_scale) {
-    is_leaf_frame = !(refresh_frame_flags->golden_frame ||
-                      refresh_frame_flags->alt_ref_frame || is_intrl_arf_boost);
+    is_leaf_frame = !(refresh_frame->golden_frame ||
+                      refresh_frame->alt_ref_frame || is_intrl_arf_boost);
   }
   const int is_overlay_frame = rc->is_src_frame_alt_ref;
 
@@ -1658,13 +1655,10 @@
   int active_best_quality = 0;
   int active_worst_quality = rc->active_worst_quality;
   int q;
-  GF_GROUP *gf_group = &cpi->ppi->gf_group;
 
   if (frame_is_intra_only(cm)) {
-    const int is_fwd_kf = gf_group->update_type[gf_index] == ARF_UPDATE &&
-                          gf_group->refbuf_state[gf_index] == REFBUF_UPDATE;
     get_intra_q_and_bounds(cpi, width, height, &active_best_quality,
-                           &active_worst_quality, cq_level, is_fwd_kf);
+                           &active_worst_quality, cq_level);
   } else {
     //  Active best quality limited by previous layer.
     active_best_quality =
@@ -1715,7 +1709,7 @@
   const RATE_CONTROL *const rc = &cpi->rc;
   const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
-  const RefreshFrameFlagsInfo *const refresh_frame_flags = &cpi->refresh_frame;
+  const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
   const GF_GROUP *gf_group = &cpi->ppi->gf_group;
   assert(IMPLIES(has_no_stats_stage(cpi),
                  cpi->oxcf.rc_cfg.mode == AOM_Q &&
@@ -1737,10 +1731,8 @@
       gf_group->update_type[gf_index] == INTNL_ARF_UPDATE;
 
   if (frame_is_intra_only(cm)) {
-    const int is_fwd_kf = gf_group->update_type[gf_index] == ARF_UPDATE &&
-                          gf_group->refbuf_state[gf_index] == REFBUF_UPDATE;
     get_intra_q_and_bounds(cpi, width, height, &active_best_quality,
-                           &active_worst_quality, cq_level, is_fwd_kf);
+                           &active_worst_quality, cq_level);
 #ifdef STRICT_RC
     active_best_quality = 0;
 #endif
@@ -1767,8 +1759,8 @@
     // leaf (non arf) frames. This is important to the TPL model which assumes
     // Q drops with each arf level.
     if (!(rc->is_src_frame_alt_ref) &&
-        (refresh_frame_flags->golden_frame ||
-         refresh_frame_flags->alt_ref_frame || is_intrl_arf_boost)) {
+        (refresh_frame->golden_frame || refresh_frame->alt_ref_frame ||
+         is_intrl_arf_boost)) {
       active_worst_quality =
           (active_best_quality + (3 * active_worst_quality) + 2) / 4;
     }
@@ -1885,7 +1877,7 @@
   RATE_CONTROL *const rc = &cpi->rc;
   PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
   const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
-  const RefreshFrameFlagsInfo *const refresh_frame_flags = &cpi->refresh_frame;
+  const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
 
   const int is_intrnl_arf =
       gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE;
@@ -1910,8 +1902,8 @@
   } else {
     if ((cpi->ppi->use_svc && cpi->oxcf.rc_cfg.mode == AOM_CBR) ||
         (!rc->is_src_frame_alt_ref &&
-         !(refresh_frame_flags->golden_frame || is_intrnl_arf ||
-           refresh_frame_flags->alt_ref_frame))) {
+         !(refresh_frame->golden_frame || is_intrnl_arf ||
+           refresh_frame->alt_ref_frame))) {
       p_rc->last_q[INTER_FRAME] = qindex;
       p_rc->avg_frame_qindex[INTER_FRAME] = ROUND_POWER_OF_TWO(
           3 * p_rc->avg_frame_qindex[INTER_FRAME] + qindex, 2);
@@ -1932,8 +1924,8 @@
   if ((qindex < p_rc->last_boosted_qindex) ||
       (current_frame->frame_type == KEY_FRAME) ||
       (!p_rc->constrained_gf_group &&
-       (refresh_frame_flags->alt_ref_frame || is_intrnl_arf ||
-        (refresh_frame_flags->golden_frame && !rc->is_src_frame_alt_ref)))) {
+       (refresh_frame->alt_ref_frame || is_intrnl_arf ||
+        (refresh_frame->golden_frame && !rc->is_src_frame_alt_ref)))) {
     p_rc->last_boosted_qindex = qindex;
   }
   if (current_frame->frame_type == KEY_FRAME) p_rc->last_kf_qindex = qindex;
@@ -1960,7 +1952,7 @@
 
   if (is_altref_enabled(cpi->oxcf.gf_cfg.lag_in_frames,
                         cpi->oxcf.gf_cfg.enable_auto_arf) &&
-      refresh_frame_flags->alt_ref_frame &&
+      refresh_frame->alt_ref_frame &&
       (current_frame->frame_type != KEY_FRAME && !frame_is_sframe(cm)))
     // Update the alternate reference frame stats as appropriate.
     update_alt_ref_frame_stats(cpi);
@@ -2284,6 +2276,9 @@
     target = ((p_rc->starting_buffer_level / 2) > INT_MAX)
                  ? INT_MAX
                  : (int)(p_rc->starting_buffer_level / 2);
+    if (cpi->svc.number_temporal_layers > 1 && target < (INT_MAX >> 2)) {
+      target = target << AOMMIN(2, (cpi->svc.number_temporal_layers - 1));
+    }
   } else {
     int kf_boost = 32;
     double framerate = cpi->framerate;
@@ -2297,6 +2292,93 @@
   return av1_rc_clamp_iframe_target_size(cpi, target);
 }
 
+#define DEFAULT_KF_BOOST_RT 2300
+#define DEFAULT_GF_BOOST_RT 2000
+
+static void set_baseline_gf_interval(AV1_COMP *cpi, FRAME_TYPE frame_type) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+  if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ)
+    av1_cyclic_refresh_set_golden_update(cpi);
+  else
+    p_rc->baseline_gf_interval = FIXED_GF_INTERVAL;
+  if (p_rc->baseline_gf_interval > rc->frames_to_key &&
+      cpi->oxcf.kf_cfg.auto_key)
+    p_rc->baseline_gf_interval = rc->frames_to_key;
+  p_rc->gfu_boost = DEFAULT_GF_BOOST_RT;
+  p_rc->constrained_gf_group =
+      (p_rc->baseline_gf_interval >= rc->frames_to_key &&
+       cpi->oxcf.kf_cfg.auto_key)
+          ? 1
+          : 0;
+  rc->frames_till_gf_update_due = p_rc->baseline_gf_interval;
+  cpi->gf_frame_index = 0;
+  // SVC does not use GF as periodic boost.
+  // TODO(marpan): Find better way to disable this for SVC.
+  if (cpi->ppi->use_svc) {
+    SVC *const svc = &cpi->svc;
+    p_rc->baseline_gf_interval = MAX_STATIC_GF_GROUP_LENGTH - 1;
+    p_rc->gfu_boost = 1;
+    p_rc->constrained_gf_group = 0;
+    rc->frames_till_gf_update_due = p_rc->baseline_gf_interval;
+    for (int layer = 0;
+         layer < svc->number_spatial_layers * svc->number_temporal_layers;
+         ++layer) {
+      LAYER_CONTEXT *const lc = &svc->layer_context[layer];
+      lc->p_rc.baseline_gf_interval = p_rc->baseline_gf_interval;
+      lc->p_rc.gfu_boost = p_rc->gfu_boost;
+      lc->p_rc.constrained_gf_group = p_rc->constrained_gf_group;
+      lc->rc.frames_till_gf_update_due = rc->frames_till_gf_update_due;
+      lc->group_index = 0;
+    }
+  }
+  gf_group->size = p_rc->baseline_gf_interval;
+  gf_group->update_type[0] = (frame_type == KEY_FRAME) ? KF_UPDATE : GF_UPDATE;
+  gf_group->refbuf_state[cpi->gf_frame_index] =
+      (frame_type == KEY_FRAME) ? REFBUF_RESET : REFBUF_UPDATE;
+}
+
+void av1_adjust_gf_refresh_qp_one_pass_rt(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  SVC *const svc = &cpi->svc;
+  const int resize_pending = is_frame_resize_pending(cpi);
+  if (!resize_pending && !rc->high_source_sad) {
+    // Check if we should disable GF refresh (if period is up),
+    // or force a GF refresh update (if we are at least halfway through
+    // period) based on QP. Look into add info on segment deltaq.
+    PRIMARY_RATE_CONTROL *p_rc = &cpi->ppi->p_rc;
+    const int avg_qp = p_rc->avg_frame_qindex[INTER_FRAME];
+    const int allow_gf_update =
+        rc->frames_till_gf_update_due <= (p_rc->baseline_gf_interval - 10);
+    int gf_update_changed = 0;
+    int thresh = 87;
+    if (rc->frames_till_gf_update_due == 1 &&
+        cm->quant_params.base_qindex > avg_qp) {
+      // Disable GF refresh since QP is above the runninhg average QP.
+      svc->refresh[svc->gld_idx_1layer] = 0;
+      gf_update_changed = 1;
+    } else if (allow_gf_update &&
+               ((cm->quant_params.base_qindex < thresh * avg_qp / 100) ||
+                (rc->avg_frame_low_motion < 20))) {
+      // Force refresh since QP is well below average QP or this is a high
+      // motion frame.
+      svc->refresh[svc->gld_idx_1layer] = 1;
+      gf_update_changed = 1;
+    }
+    if (gf_update_changed) {
+      set_baseline_gf_interval(cpi, INTER_FRAME);
+      int refresh_mask = 0;
+      for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++) {
+        int ref_frame_map_idx = svc->ref_idx[i];
+        refresh_mask |= svc->refresh[ref_frame_map_idx] << ref_frame_map_idx;
+      }
+      cm->current_frame.refresh_frame_flags = refresh_mask;
+    }
+  }
+}
+
 /*!\brief Setup the reference prediction structure for 1 pass real-time
  *
  * Set the reference prediction structure for 1 layer.
@@ -2338,8 +2420,7 @@
   // Set the reference frame flags.
   ext_flags->ref_frame_flags ^= AOM_LAST_FLAG;
   ext_flags->ref_frame_flags ^= AOM_ALT_FLAG;
-  if (cpi->sf.rt_sf.use_golden_frame)
-    ext_flags->ref_frame_flags ^= AOM_GOLD_FLAG;
+  ext_flags->ref_frame_flags ^= AOM_GOLD_FLAG;
   if (cpi->sf.rt_sf.ref_frame_comp_nonrd[1])
     ext_flags->ref_frame_flags ^= AOM_LAST2_FLAG;
   const int sh = 7 - gld_fixed_slot;
@@ -2379,6 +2460,7 @@
     ext_refresh_frame_flags->golden_frame = 1;
     svc->refresh[gld_idx] = 1;
   }
+  svc->gld_idx_1layer = gld_idx;
 }
 
 /*!\brief Check for scene detection, for 1 pass real-time mode.
@@ -2491,9 +2573,6 @@
   }
 }
 
-#define DEFAULT_KF_BOOST_RT 2300
-#define DEFAULT_GF_BOOST_RT 2000
-
 /*!\brief Set the GF baseline interval for 1 pass real-time mode.
  *
  *
@@ -2507,55 +2586,14 @@
 static int set_gf_interval_update_onepass_rt(AV1_COMP *cpi,
                                              FRAME_TYPE frame_type) {
   RATE_CONTROL *const rc = &cpi->rc;
-  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
-  GF_GROUP *const gf_group = &cpi->ppi->gf_group;
-  ResizePendingParams *const resize_pending_params =
-      &cpi->resize_pending_params;
   int gf_update = 0;
-  const int resize_pending =
-      (resize_pending_params->width && resize_pending_params->height &&
-       (cpi->common.width != resize_pending_params->width ||
-        cpi->common.height != resize_pending_params->height));
+  const int resize_pending = is_frame_resize_pending(cpi);
   // GF update based on frames_till_gf_update_due, also
   // force upddate on resize pending frame or for scene change.
   if ((resize_pending || rc->high_source_sad ||
        rc->frames_till_gf_update_due == 0) &&
       cpi->svc.temporal_layer_id == 0 && cpi->svc.spatial_layer_id == 0) {
-    if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ)
-      av1_cyclic_refresh_set_golden_update(cpi);
-    else
-      p_rc->baseline_gf_interval = FIXED_GF_INTERVAL;
-    if (p_rc->baseline_gf_interval > rc->frames_to_key)
-      p_rc->baseline_gf_interval = rc->frames_to_key;
-    p_rc->gfu_boost = DEFAULT_GF_BOOST_RT;
-    p_rc->constrained_gf_group =
-        (p_rc->baseline_gf_interval >= rc->frames_to_key) ? 1 : 0;
-    rc->frames_till_gf_update_due = p_rc->baseline_gf_interval;
-    cpi->gf_frame_index = 0;
-    // SVC does not use GF as periodic boost.
-    // TODO(marpan): Find better way to disable this for SVC.
-    if (cpi->ppi->use_svc) {
-      SVC *const svc = &cpi->svc;
-      p_rc->baseline_gf_interval = MAX_STATIC_GF_GROUP_LENGTH - 1;
-      p_rc->gfu_boost = 1;
-      p_rc->constrained_gf_group = 0;
-      rc->frames_till_gf_update_due = p_rc->baseline_gf_interval;
-      for (int layer = 0;
-           layer < svc->number_spatial_layers * svc->number_temporal_layers;
-           ++layer) {
-        LAYER_CONTEXT *const lc = &svc->layer_context[layer];
-        lc->p_rc.baseline_gf_interval = p_rc->baseline_gf_interval;
-        lc->p_rc.gfu_boost = p_rc->gfu_boost;
-        lc->p_rc.constrained_gf_group = p_rc->constrained_gf_group;
-        lc->rc.frames_till_gf_update_due = rc->frames_till_gf_update_due;
-        lc->group_index = 0;
-      }
-    }
-    gf_group->size = p_rc->baseline_gf_interval;
-    gf_group->update_type[0] =
-        (frame_type == KEY_FRAME) ? KF_UPDATE : GF_UPDATE;
-    gf_group->refbuf_state[cpi->gf_frame_index] =
-        (frame_type == KEY_FRAME) ? REFBUF_RESET : REFBUF_UPDATE;
+    set_baseline_gf_interval(cpi, frame_type);
     gf_update = 1;
   }
   return gf_update;
@@ -2714,6 +2752,30 @@
   return;
 }
 
+static INLINE int set_key_frame(AV1_COMP *cpi, unsigned int frame_flags) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  AV1_COMMON *const cm = &cpi->common;
+  SVC *const svc = &cpi->svc;
+
+  // Very first frame has to be key frame.
+  if (cm->current_frame.frame_number == 0) return 1;
+  // Set key frame if forced by frame flags.
+  if (frame_flags & FRAMEFLAGS_KEY) return 1;
+  if (!cpi->ppi->use_svc) {
+    // Non-SVC
+    if (cpi->oxcf.kf_cfg.auto_key && rc->frames_to_key == 0) return 1;
+  } else {
+    // SVC
+    if (svc->spatial_layer_id == 0 &&
+        (cpi->oxcf.kf_cfg.auto_key &&
+         (cpi->oxcf.kf_cfg.key_freq_max == 0 ||
+          svc->current_superframe % cpi->oxcf.kf_cfg.key_freq_max == 0)))
+      return 1;
+  }
+
+  return 0;
+}
+
 void av1_get_one_pass_rt_params(AV1_COMP *cpi,
                                 EncodeFrameParams *const frame_params,
                                 unsigned int frame_flags) {
@@ -2735,11 +2797,7 @@
     av1_restore_layer_context(cpi);
   }
   // Set frame type.
-  if ((!cpi->ppi->use_svc && rc->frames_to_key == 0) ||
-      (cpi->ppi->use_svc && svc->spatial_layer_id == 0 &&
-       (cpi->oxcf.kf_cfg.key_freq_max == 0 ||
-        svc->current_superframe % cpi->oxcf.kf_cfg.key_freq_max == 0)) ||
-      (frame_flags & FRAMEFLAGS_KEY)) {
+  if (set_key_frame(cpi, frame_flags)) {
     frame_params->frame_type = KEY_FRAME;
     p_rc->this_key_frame_forced =
         cm->current_frame.frame_number != 0 && rc->frames_to_key == 0;
@@ -2785,9 +2843,7 @@
       resize_pending_params->width = cpi->oxcf.frm_dim_cfg.width;
       resize_pending_params->height = cpi->oxcf.frm_dim_cfg.height;
     }
-  } else if (resize_pending_params->width && resize_pending_params->height &&
-             (cpi->common.width != resize_pending_params->width ||
-              cpi->common.height != resize_pending_params->height)) {
+  } else if (is_frame_resize_pending(cpi)) {
     resize_reset_rc(cpi, resize_pending_params->width,
                     resize_pending_params->height, cm->width, cm->height);
   }
@@ -2886,16 +2942,13 @@
  * Intended to be used only with AOM_Q mode.
  */
 void av1_q_mode_compute_gop_q_indices(int gf_frame_index, int base_q_index,
-                                      double arf_qstep_ratio,
+                                      const double *qstep_ratio_list,
                                       aom_bit_depth_t bit_depth,
                                       const struct GF_GROUP *gf_group,
                                       int *q_index_list) {
-  const int arf_q = av1_get_q_index_from_qstep_ratio(
-      base_q_index, arf_qstep_ratio, bit_depth);
-  for (int gf_index = gf_frame_index; gf_index < gf_group->size; ++gf_index) {
-    const int height = gf_group_pyramid_level(gf_group, gf_index);
-    q_index_list[gf_index] = av1_q_mode_get_q_index(
-        base_q_index, gf_group->update_type[gf_index], height, arf_q);
+  for (int i = gf_frame_index; i < gf_group->size; ++i) {
+    q_index_list[i] = av1_get_q_index_from_qstep_ratio(
+        base_q_index, qstep_ratio_list[i], bit_depth);
   }
 }
 #endif  // !CONFIG_REALTIME_ONLY

diff --git a/av1/encoder/ratectrl.h b/av1/encoder/ratectrl.h
index f668968..651f19b 100644
--- a/av1/encoder/ratectrl.h
+++ b/av1/encoder/ratectrl.h

@@ -46,6 +46,9 @@
 #define FIXED_GF_INTERVAL 16
 #define MAX_GF_LENGTH_LAP 16
 
+#define FIXED_GF_INTERVAL_RT 80
+#define MAX_GF_INTERVAL_RT 160
+
 #define MAX_NUM_GF_INTERVALS 15
 
 #define MAX_ARF_LAYERS 6
@@ -564,6 +567,8 @@
 void av1_rc_set_frame_target(struct AV1_COMP *cpi, int target, int width,
                              int height);
 
+void av1_adjust_gf_refresh_qp_one_pass_rt(struct AV1_COMP *cpi);
+
 void av1_set_reference_structure_one_pass_rt(struct AV1_COMP *cpi,
                                              int gf_update);
 
@@ -661,8 +666,7 @@
  *
  * \param[in]       gf_frame_index    Index of the current frame
  * \param[in]       base_q_index      Base q index
- * \param[in]       arf_qstep_ratio   The quantize step ratio between arf q
- *                                    index and base q index
+ * \param[in]       qstep_ratio_list  Stores the qstep_ratio for each frame
  * \param[in]       bit_depth         Bit depth
  * \param[in]       gf_group          Pointer to the GOP
  * \param[out]      q_index_list      An array to store output gop q indices.
@@ -670,7 +674,7 @@
  *                                    greater than gf_group.size()
  */
 void av1_q_mode_compute_gop_q_indices(int gf_frame_index, int base_q_index,
-                                      double arf_qstep_ratio,
+                                      const double *qstep_ratio_list,
                                       aom_bit_depth_t bit_depth,
                                       const struct GF_GROUP *gf_group,
                                       int *q_index_list);

diff --git a/av1/encoder/rd.c b/av1/encoder/rd.c
index d88f563..8d28f43 100644
--- a/av1/encoder/rd.c
+++ b/av1/encoder/rd.c

@@ -620,6 +620,9 @@
 
 void av1_fill_mv_costs(const nmv_context *nmvc, int integer_mv, int usehp,
                        MvCosts *mv_costs) {
+  // Avoid accessing 'mv_costs' when it is not allocated.
+  if (mv_costs == NULL) return;
+
   mv_costs->nmv_cost[0] = &mv_costs->nmv_cost_alloc[0][MV_MAX];
   mv_costs->nmv_cost[1] = &mv_costs->nmv_cost_alloc[1][MV_MAX];
   mv_costs->nmv_cost_hp[0] = &mv_costs->nmv_cost_hp_alloc[0][MV_MAX];

diff --git a/av1/encoder/rd.h b/av1/encoder/rd.h
index 93ab739..7256ff9 100644
--- a/av1/encoder/rd.h
+++ b/av1/encoder/rd.h

@@ -299,37 +299,8 @@
 }
 
 // Used to reset the state of tx/mb rd hash information
-static INLINE void reset_hash_records(TxfmSearchInfo *const txfm_info,
-                                      int use_inter_txb_hash) {
-  int32_t record_idx;
+static INLINE void reset_hash_records(TxfmSearchInfo *const txfm_info) {
   if (!txfm_info->txb_rd_records) return;
-  // Reset the state for use_inter_txb_hash
-  if (use_inter_txb_hash) {
-    for (record_idx = 0;
-         record_idx < ((MAX_MIB_SIZE >> 1) * (MAX_MIB_SIZE >> 1)); record_idx++)
-      txfm_info->txb_rd_records->txb_rd_record_8X8[record_idx].num =
-          txfm_info->txb_rd_records->txb_rd_record_8X8[record_idx].index_start =
-              0;
-    for (record_idx = 0;
-         record_idx < ((MAX_MIB_SIZE >> 2) * (MAX_MIB_SIZE >> 2)); record_idx++)
-      txfm_info->txb_rd_records->txb_rd_record_16X16[record_idx].num =
-          txfm_info->txb_rd_records->txb_rd_record_16X16[record_idx]
-              .index_start = 0;
-    for (record_idx = 0;
-         record_idx < ((MAX_MIB_SIZE >> 3) * (MAX_MIB_SIZE >> 3)); record_idx++)
-      txfm_info->txb_rd_records->txb_rd_record_32X32[record_idx].num =
-          txfm_info->txb_rd_records->txb_rd_record_32X32[record_idx]
-              .index_start = 0;
-    for (record_idx = 0;
-         record_idx < ((MAX_MIB_SIZE >> 4) * (MAX_MIB_SIZE >> 4)); record_idx++)
-      txfm_info->txb_rd_records->txb_rd_record_64X64[record_idx].num =
-          txfm_info->txb_rd_records->txb_rd_record_64X64[record_idx]
-              .index_start = 0;
-  }
-
-  // Reset the state for use_intra_txb_hash
-  txfm_info->txb_rd_records->txb_rd_record_intra.num =
-      txfm_info->txb_rd_records->txb_rd_record_intra.index_start = 0;
 
   // Reset the state for use_mb_rd_hash
   txfm_info->txb_rd_records->mb_rd_record.num =

diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index e6cf62b..c346c92 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c

@@ -1167,12 +1167,10 @@
   return 0;
 }
 
-static INLINE void update_mode_start_end_index(const AV1_COMP *const cpi,
-                                               int *mode_index_start,
-                                               int *mode_index_end,
-                                               int last_motion_mode_allowed,
-                                               int interintra_allowed,
-                                               int eval_motion_mode) {
+static INLINE void update_mode_start_end_index(
+    const AV1_COMP *const cpi, const MB_MODE_INFO *const mbmi,
+    int *mode_index_start, int *mode_index_end, int last_motion_mode_allowed,
+    int interintra_allowed, int eval_motion_mode) {
   *mode_index_start = (int)SIMPLE_TRANSLATION;
   *mode_index_end = (int)last_motion_mode_allowed + interintra_allowed;
   if (cpi->sf.winner_mode_sf.motion_mode_for_winner_cand) {
@@ -1184,6 +1182,8 @@
       *mode_index_start = 1;
     }
   }
+  if (cpi->sf.inter_sf.extra_prune_warped && mbmi->bsize > BLOCK_16X16)
+    *mode_index_end = SIMPLE_TRANSLATION;
 }
 
 /*!\brief AV1 motion mode search
@@ -1328,7 +1328,7 @@
   // if SIMPLE_TRANSLATION has already been searched according to
   // the motion_mode_for_winner_cand speed feature, update the mode_index_start
   // to avoid searching it again.
-  update_mode_start_end_index(cpi, &mode_index_start, &mode_index_end,
+  update_mode_start_end_index(cpi, mbmi, &mode_index_start, &mode_index_end,
                               last_motion_mode_allowed, interintra_allowed,
                               eval_motion_mode);
   // Main function loop. This loops over all of the possible motion modes and
@@ -1758,12 +1758,13 @@
   }
 
   // nearest_refmv_count indicates the closeness of block motion characteristics
-  // with respect to its spatial neighbor. Lower value of nearest_refmv_count
-  // means less correlation with its spatial neighbors. Hence less possibility
-  // for NEARESTMV and NEARMV modes becoming the best mode since these modes
-  // work well for blocks that shares similar motion characteristics with its
-  // neighbor. Thus, when nearest_refmv_count is less w.r.t ref_mv_count prune
-  // the mode.
+  // with respect to its spatial neighbor. Smaller value of nearest_refmv_count
+  // w.r.t to ref_mv_count means less correlation with its spatial neighbors.
+  // Hence less possibility for NEARESTMV and NEARMV modes becoming the best
+  // mode since these modes work well for blocks that shares similar motion
+  // characteristics with its neighbor. Thus, NEARMV mode is pruned when
+  // nearest_refmv_count is relatively smaller than ref_mv_count and NEARESTMV
+  // mode is pruned if none of the ref mvs are populated from nearest candidate.
   const int prune_thresh = 1 + (ref_mv_count >= 2);
   if (nearest_refmv_count < prune_thresh) return 1;
   return 0;
@@ -2661,6 +2662,16 @@
   const int base_rate =
       args->ref_frame_cost + args->single_comp_cost + ref_mv_cost;
 
+  // As per the experiments, in real-time preset impact of model rd based
+  // breakouts is less on encoding time if the following conditions are true.
+  //    (1) compound mode is disabled
+  //    (2) interpolation filter search is disabled
+  // TODO(any): Check the impact of model rd based breakouts in other presets
+  const int skip_interp_search_modelrd_calc =
+      cpi->oxcf.mode == REALTIME &&
+      cm->current_frame.reference_mode == SINGLE_REFERENCE &&
+      cpi->sf.rt_sf.skip_interp_filter_search;
+
   for (i = 0; i < MAX_REF_MV_SEARCH - 1; ++i) {
     save_mv[i][0].as_int = INVALID_MV;
     save_mv[i][1].as_int = INVALID_MV;
@@ -2746,6 +2757,15 @@
         if (this_sse < args->best_single_sse_in_refs[ref]) {
           args->best_single_sse_in_refs[ref] = this_sse;
         }
+
+        if (cpi->sf.rt_sf.skip_newmv_mode_based_on_sse) {
+          const double scale_factor[11] = { 0.7, 0.7, 0.7, 0.7, 0.7, 0.8,
+                                            0.8, 0.9, 0.9, 0.9, 0.9 };
+          assert(num_pels_log2_lookup[bsize] >= 4);
+          if (args->best_pred_sse <
+              scale_factor[num_pels_log2_lookup[bsize] - 4] * this_sse)
+            continue;
+        }
       }
 
       rd_stats->rate += rate_mv;
@@ -2795,39 +2815,41 @@
       if (not_best_mode) continue;
     }
 
+    if (!skip_interp_search_modelrd_calc) {
 #if CONFIG_COLLECT_COMPONENT_TIMING
-    start_timing(cpi, interpolation_filter_search_time);
+      start_timing(cpi, interpolation_filter_search_time);
 #endif
-    // Determine the interpolation filter for this mode
-    ret_val = av1_interpolation_filter_search(
-        x, cpi, tile_data, bsize, &tmp_dst, &orig_dst, &rd, &rs,
-        &skip_build_pred, args, ref_best_rd);
+      // Determine the interpolation filter for this mode
+      ret_val = av1_interpolation_filter_search(
+          x, cpi, tile_data, bsize, &tmp_dst, &orig_dst, &rd, &rs,
+          &skip_build_pred, args, ref_best_rd);
 #if CONFIG_COLLECT_COMPONENT_TIMING
-    end_timing(cpi, interpolation_filter_search_time);
+      end_timing(cpi, interpolation_filter_search_time);
 #endif
-    if (args->modelled_rd != NULL && !is_comp_pred) {
-      args->modelled_rd[this_mode][ref_mv_idx][refs[0]] = rd;
-    }
-    if (ret_val != 0) {
-      restore_dst_buf(xd, orig_dst, num_planes);
-      continue;
-    } else if (cpi->sf.inter_sf.model_based_post_interp_filter_breakout &&
-               ref_best_rd != INT64_MAX && (rd >> 3) * 3 > ref_best_rd) {
-      restore_dst_buf(xd, orig_dst, num_planes);
-      continue;
-    }
+      if (args->modelled_rd != NULL && !is_comp_pred) {
+        args->modelled_rd[this_mode][ref_mv_idx][refs[0]] = rd;
+      }
+      if (ret_val != 0) {
+        restore_dst_buf(xd, orig_dst, num_planes);
+        continue;
+      } else if (cpi->sf.inter_sf.model_based_post_interp_filter_breakout &&
+                 ref_best_rd != INT64_MAX && (rd >> 3) * 3 > ref_best_rd) {
+        restore_dst_buf(xd, orig_dst, num_planes);
+        continue;
+      }
 
-    // Compute modelled RD if enabled
-    if (args->modelled_rd != NULL) {
-      if (is_comp_pred) {
-        const int mode0 = compound_ref0_mode(this_mode);
-        const int mode1 = compound_ref1_mode(this_mode);
-        const int64_t mrd =
-            AOMMIN(args->modelled_rd[mode0][ref_mv_idx][refs[0]],
-                   args->modelled_rd[mode1][ref_mv_idx][refs[1]]);
-        if ((rd >> 3) * 6 > mrd && ref_best_rd < INT64_MAX) {
-          restore_dst_buf(xd, orig_dst, num_planes);
-          continue;
+      // Compute modelled RD if enabled
+      if (args->modelled_rd != NULL) {
+        if (is_comp_pred) {
+          const int mode0 = compound_ref0_mode(this_mode);
+          const int mode1 = compound_ref1_mode(this_mode);
+          const int64_t mrd =
+              AOMMIN(args->modelled_rd[mode0][ref_mv_idx][refs[0]],
+                     args->modelled_rd[mode1][ref_mv_idx][refs[1]]);
+          if ((rd >> 3) * 6 > mrd && ref_best_rd < INT64_MAX) {
+            restore_dst_buf(xd, orig_dst, num_planes);
+            continue;
+          }
         }
       }
     }
@@ -3667,6 +3689,13 @@
     disable_reference(INTRA_FRAME, mask->ref_combo);
   }
 
+  if (!cpi->oxcf.tool_cfg.enable_global_motion) {
+    for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+      mask->pred_modes[ref_frame] |= (1 << GLOBALMV);
+      mask->pred_modes[ref_frame] |= (1 << GLOBAL_GLOBALMV);
+    }
+  }
+
   mask->pred_modes[INTRA_FRAME] |=
       ~(sf->intra_sf.intra_y_mode_mask[max_txsize_lookup[bsize]]);
 }
@@ -5379,7 +5408,8 @@
                                -1,
                                -1,
                                { 0 },
-                               { 0 } };
+                               { 0 },
+                               UINT_MAX };
   for (i = 0; i < MODE_CTX_REF_FRAMES; ++i) args.cmp_mode[i] = -1;
   // Indicates the appropriate number of simple translation winner modes for
   // exhaustive motion mode evaluation
@@ -5598,6 +5628,7 @@
     args.single_newmv_valid = search_state.single_newmv_valid;
     args.single_comp_cost = real_compmode_cost;
     args.ref_frame_cost = ref_frame_cost;
+    args.best_pred_sse = search_state.best_pred_sse;
 
     int64_t skip_rd[2] = { search_state.best_skip_rd[0],
                            search_state.best_skip_rd[1] };

diff --git a/av1/encoder/rdopt_utils.h b/av1/encoder/rdopt_utils.h
index 059ddb1..8dacc2e 100644
--- a/av1/encoder/rdopt_utils.h
+++ b/av1/encoder/rdopt_utils.h

@@ -553,14 +553,7 @@
       set_tx_type_prune(sf, txfm_params,
                         sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning,
                         1);
-
-      // Reset hash state for winner mode processing. Winner mode and subsequent
-      // transform/mode evaluations (palette/IntraBC) cann't reuse old data as
-      // the decisions would have been sub-optimal
-      // TODO(any): Move the evaluation of palette/IntraBC modes before winner
-      // mode is processed and clean-up the code below
-      reset_hash_records(txfm_info, cpi->sf.tx_sf.use_inter_txb_hash);
-
+      reset_hash_records(txfm_info);
       break;
     default: assert(0);
   }

diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index cb2644c..dc6c2ae 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c

@@ -519,12 +519,6 @@
     sf->rt_sf.nonrd_check_partition_merge_mode = 0;
     sf->rt_sf.hybrid_intra_pickmode = 0;
   }
-
-  // Intra txb hash is currently not compatible with multi-winner mode as the
-  // hashes got reset during multi-winner mode processing.
-  assert(IMPLIES(
-      sf->winner_mode_sf.multi_winner_mode_type != MULTI_WINNER_MODE_OFF,
-      !sf->tx_sf.use_intra_txb_hash));
 }
 
 static void set_good_speed_feature_framesize_dependent(
@@ -663,7 +657,7 @@
     if (is_720p_or_larger) {
       sf->part_sf.partition_search_breakout_dist_thr = (1 << 25);
       sf->part_sf.partition_search_breakout_rate_thr = 200;
-      sf->part_sf.skip_non_sq_part_based_on_none = boosted ? 0 : 2;
+      sf->part_sf.skip_non_sq_part_based_on_none = is_lf_frame ? 2 : 0;
     } else {
       sf->part_sf.max_intra_bsize = BLOCK_32X32;
       sf->part_sf.partition_search_breakout_dist_thr = (1 << 23);
@@ -924,7 +918,6 @@
     // clips (e.g. 5% loss on dinner_1080p). We need to examine the sequence a
     // bit more closely to figure out why.
     sf->inter_sf.adaptive_rd_thresh = 1;
-    sf->inter_sf.comp_inter_joint_search_thresh = BLOCK_SIZES_ALL;
     sf->inter_sf.disable_interinter_wedge_var_thresh = 100;
     sf->inter_sf.fast_interintra_wedge_search = 1;
     sf->inter_sf.prune_comp_search_by_single_result = boosted ? 4 : 1;
@@ -954,6 +947,7 @@
     sf->tpl_sf.search_method = DIAMOND;
 
     sf->rd_sf.perform_coeff_opt = is_boosted_arf2_bwd_type ? 3 : 4;
+    sf->rd_sf.use_mb_rd_hash = 1;
 
     sf->lpf_sf.prune_wiener_based_on_src_var = 1;
     sf->lpf_sf.prune_sgr_based_on_wiener = 1;
@@ -1017,7 +1011,6 @@
 
     sf->tx_sf.adaptive_txb_search_level = boosted ? 2 : 3;
     sf->tx_sf.tx_type_search.use_skip_flag_prediction = 2;
-    sf->tx_sf.use_intra_txb_hash = 1;
     sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_3;
     sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 1;
 
@@ -1083,9 +1076,6 @@
 
     sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 2;
     sf->tx_sf.tx_type_search.fast_intra_tx_type_search = 1;
-    // TODO(any): Experiment with enabling of this speed feature as hash state
-    // is reset during winner mode processing
-    sf->tx_sf.use_intra_txb_hash = 0;
 
     sf->rd_sf.perform_coeff_opt = is_boosted_arf2_bwd_type ? 5 : 7;
     sf->rd_sf.tx_domain_dist_thres_level = 2;
@@ -1168,8 +1158,6 @@
     sf->tpl_sf.disable_filtered_key_tpl = 1;
 
     sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 4;
-    sf->tx_sf.use_intra_txb_hash = 1;
-
     sf->rd_sf.perform_coeff_opt = is_boosted_arf2_bwd_type ? 6 : 8;
 
     sf->winner_mode_sf.dc_blk_pred_level = 2;
@@ -1179,12 +1167,6 @@
 
     sf->fp_sf.skip_zeromv_motion_search = 1;
   }
-
-  // Intra txb hash is currently not compatible with multi-winner mode as the
-  // hashes got reset during multi-winner mode processing.
-  assert(IMPLIES(
-      sf->winner_mode_sf.multi_winner_mode_type != MULTI_WINNER_MODE_OFF,
-      !sf->tx_sf.use_intra_txb_hash));
 }
 
 static void set_rt_speed_feature_framesize_dependent(const AV1_COMP *const cpi,
@@ -1200,6 +1182,7 @@
 
   if (!is_360p_or_larger) {
     if (speed >= 6) sf->rt_sf.force_tx_search_off = 1;
+    if (speed >= 7) sf->lpf_sf.cdef_pick_method = CDEF_PICK_FROM_Q;
     if (speed >= 8) {
       sf->rt_sf.use_modeled_non_rd_cost = 0;
       sf->rt_sf.use_nonrd_filter_search = 0;
@@ -1232,6 +1215,7 @@
       sf->rt_sf.use_nonrd_altref_frame = 1;
     }
     if (speed >= 9) {
+      sf->rt_sf.gf_length_lvl = 1;
       sf->rt_sf.skip_cdef_sb = 1;
     }
   }
@@ -1290,7 +1274,6 @@
   sf->inter_sf.prune_ref_frame_for_rect_partitions = !boosted;
   sf->inter_sf.reduce_inter_modes = 1;
   sf->inter_sf.reuse_inter_intra_mode = 1;
-  sf->inter_sf.comp_inter_joint_search_thresh = BLOCK_SIZES_ALL;
   sf->inter_sf.disable_interinter_wedge_var_thresh = 100;
   sf->inter_sf.fast_wedge_sign_estimate = 1;
   sf->inter_sf.prune_comp_type_by_comp_avg = 2;
@@ -1328,8 +1311,10 @@
   sf->part_sf.reuse_prev_rd_results_for_part_ab = 1;
   sf->part_sf.use_best_rd_for_pruning = 1;
   sf->part_sf.prune_ext_partition_types_search_level = 2;
-  sf->part_sf.partition_search_breakout_rate_thr = 80;
   sf->part_sf.less_rectangular_check_level = 2;
+  sf->part_sf.early_term_after_none_split = 1;
+  sf->part_sf.partition_search_breakout_dist_thr = (1 << 25);
+  sf->part_sf.partition_search_breakout_rate_thr = 200;
 
   sf->rd_sf.tx_domain_dist_thres_level = 1;
   sf->rd_sf.tx_domain_dist_level = 1;
@@ -1350,14 +1335,10 @@
   sf->tx_sf.tx_size_search_lgr_block = 1;
   sf->tx_sf.tx_type_search.ml_tx_split_thresh = 4000;
   sf->tx_sf.tx_type_search.skip_tx_search = 1;
-  sf->tx_sf.use_intra_txb_hash = 1;
   sf->tx_sf.inter_tx_size_search_init_depth_rect = 1;
   sf->tx_sf.inter_tx_size_search_init_depth_sqr = 1;
   sf->tx_sf.model_based_prune_tx_search_level = 0;
   sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_2;
-  // TODO(any, yunqing): somehow this is needed by sf->rt_sf.use_nonrd_pick_mode
-  // at speed 7? Need more investigation.
-  sf->tx_sf.use_intra_txb_hash = 0;
 
   sf->winner_mode_sf.tx_size_search_level = frame_is_intra_only(cm) ? 0 : 2;
 
@@ -1401,7 +1382,6 @@
     sf->intra_sf.skip_filter_intra_in_inter_frames = 1;
 
     sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_3;
-    sf->tx_sf.use_inter_txb_hash = 0;
     sf->tx_sf.refine_fast_tx_search_results = 0;
     sf->tx_sf.tx_type_search.fast_intra_tx_type_search = 1;
     sf->tx_sf.tx_type_search.use_skip_flag_prediction = 2;
@@ -1414,7 +1394,7 @@
     sf->lpf_sf.lpf_pick = LPF_PICK_FROM_Q;
     sf->lpf_sf.disable_lr_filter = 1;
 
-    sf->winner_mode_sf.dc_blk_pred_level = 1;
+    sf->winner_mode_sf.dc_blk_pred_level = frame_is_intra_only(cm) ? 0 : 2;
     sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch = 1;
     sf->winner_mode_sf.tx_size_search_level = 1;
 
@@ -1435,20 +1415,31 @@
     if (cpi->oxcf.kf_cfg.key_freq_max != 0 &&
         cm->width * cm->height > 640 * 480)
       sf->rt_sf.use_temporal_noise_estimate = 1;
-    sf->rt_sf.use_golden_frame = 0;
     sf->rt_sf.skip_tx_no_split_var_based_partition = 1;
+    sf->rt_sf.skip_newmv_mode_based_on_sse = 1;
+
+    // For SVC: use better mv search on base temporal layers, and only
+    // on base spatial layer if highest resolution is above 640x360.
+    if (cpi->svc.number_temporal_layers > 1 &&
+        cpi->svc.temporal_layer_id < cpi->svc.number_temporal_layers - 1 &&
+        (cpi->svc.spatial_layer_id == 0 ||
+         cpi->oxcf.frm_dim_cfg.width * cpi->oxcf.frm_dim_cfg.height <=
+             640 * 360)) {
+      sf->mv_sf.search_method = NSTEP;
+      sf->mv_sf.subpel_search_method = SUBPEL_TREE;
+      sf->rt_sf.fullpel_search_step_param = 6;
+    }
   }
 
   if (speed >= 6) {
     sf->mv_sf.use_fullpel_costlist = 1;
 
     sf->tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh = 0;
-
-    sf->part_sf.adjust_var_based_rd_partitioning = 1;
+    sf->inter_sf.prune_warped_prob_thresh = 8;
+    sf->inter_sf.extra_prune_warped = 1;
   }
 
   if (speed >= 7) {
-    sf->rt_sf.use_golden_frame = 1;
     sf->part_sf.default_max_partition_size = BLOCK_128X128;
     sf->part_sf.default_min_partition_size = BLOCK_8X8;
     sf->part_sf.partition_search_type = VAR_BASED_PARTITION;
@@ -1469,8 +1460,8 @@
       sf->intra_sf.intra_y_mode_mask[i] = INTRA_ALL;
     }
 
-    sf->lpf_sf.cdef_pick_method = CDEF_PICK_FROM_Q;
     sf->lpf_sf.lpf_pick = LPF_PICK_FROM_Q;
+    sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL5;
 
     sf->rt_sf.mode_search_skip_flags |= FLAG_SKIP_INTRA_DIRMISMATCH;
     sf->rt_sf.nonrd_prune_ref_frame_search = 1;
@@ -1487,26 +1478,26 @@
     sf->rt_sf.nonrd_check_partition_merge_mode = 1;
     sf->rt_sf.nonrd_check_partition_split = 0;
     sf->rt_sf.skip_intra_pred_if_tx_skip = 1;
-    // For SVC: use better mv search on base temporal layer, and only
+    // For SVC: use better mv search on base temporal layers, and only
     // on base spatial layer if highest resolution is above 640x360.
-    if (cpi->svc.number_temporal_layers > 1) {
-      if (cpi->svc.temporal_layer_id == 0 &&
-          (cpi->svc.spatial_layer_id == 0 ||
-           cpi->oxcf.frm_dim_cfg.width * cpi->oxcf.frm_dim_cfg.height <=
-               640 * 360)) {
-        sf->mv_sf.search_method = NSTEP;
-        sf->mv_sf.subpel_search_method = SUBPEL_TREE;
-        sf->rt_sf.fullpel_search_step_param = 6;
-      } else if (cpi->svc.non_reference_frame) {
-        sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
-        sf->rt_sf.fullpel_search_step_param = 10;
-      }
+    if (cpi->svc.number_temporal_layers > 1 &&
+        cpi->svc.temporal_layer_id < cpi->svc.number_temporal_layers - 1 &&
+        (cpi->svc.spatial_layer_id == 0 ||
+         cpi->oxcf.frm_dim_cfg.width * cpi->oxcf.frm_dim_cfg.height <=
+             640 * 360)) {
+      sf->mv_sf.search_method = NSTEP;
+      sf->mv_sf.subpel_search_method = SUBPEL_TREE;
+      sf->rt_sf.fullpel_search_step_param = 6;
+    } else if (cpi->svc.non_reference_frame) {
+      sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
+      sf->rt_sf.fullpel_search_step_param = 10;
     }
     // TODO(marpan): Look into why enabling skip_loopfilter_non_reference is
     // not bitexact on rtc testset, its very close (< ~0.01 bdrate), but not
     // always bitexact.
     if (cpi->ppi->use_svc && cpi->svc.non_reference_frame &&
-        sf->lpf_sf.cdef_pick_method == CDEF_PICK_FROM_Q &&
+        (sf->lpf_sf.cdef_pick_method == CDEF_PICK_FROM_Q ||
+         sf->lpf_sf.cdef_pick_method == CDEF_FAST_SEARCH_LVL5) &&
         sf->lpf_sf.lpf_pick == LPF_PICK_FROM_Q)
       sf->rt_sf.skip_loopfilter_non_reference = 1;
     // Set mask for intra modes.
@@ -1543,6 +1534,7 @@
     sf->interp_sf.cb_pred_filter_search = 1;
   }
   if (speed >= 9) {
+    sf->lpf_sf.cdef_pick_method = CDEF_PICK_FROM_Q;
     sf->rt_sf.estimate_motion_for_var_based_partition = 0;
     sf->rt_sf.force_large_partition_blocks = 1;
     for (int i = 0; i < BLOCK_SIZES; ++i)
@@ -1553,6 +1545,7 @@
     sf->rt_sf.skip_intra_pred_if_tx_skip = 1;
     sf->rt_sf.nonrd_agressive_skip = 1;
     sf->rt_sf.nonrd_prune_ref_frame_search = 3;
+    sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
   }
 }
 
@@ -1659,7 +1652,6 @@
 }
 
 static AOM_INLINE void init_inter_sf(INTER_MODE_SPEED_FEATURES *inter_sf) {
-  inter_sf->comp_inter_joint_search_thresh = BLOCK_4X4;
   inter_sf->adaptive_rd_thresh = 0;
   inter_sf->model_based_post_interp_filter_breakout = 0;
   inter_sf->reduce_inter_modes = 0;
@@ -1756,8 +1748,6 @@
   tx_sf->tx_type_search.winner_mode_tx_type_pruning = 0;
   tx_sf->txb_split_cap = 1;
   tx_sf->adaptive_txb_search_level = 0;
-  tx_sf->use_intra_txb_hash = 0;
-  tx_sf->use_inter_txb_hash = 1;
   tx_sf->refine_fast_tx_search_results = 1;
   tx_sf->prune_tx_size_level = 0;
 }
@@ -1784,7 +1774,7 @@
   } else {
     assert(0 && "Invalid disable_trellis_quant value");
   }
-  rd_sf->use_mb_rd_hash = 1;
+  rd_sf->use_mb_rd_hash = 0;
   rd_sf->simple_model_rd_from_var = 0;
   rd_sf->tx_domain_dist_level = 0;
   rd_sf->tx_domain_dist_thres_level = 0;
@@ -1828,7 +1818,6 @@
   rt_sf->nonrd_prune_ref_frame_search = 0;
   rt_sf->use_nonrd_pick_mode = 0;
   rt_sf->use_nonrd_altref_frame = 0;
-  rt_sf->use_golden_frame = 0;
   rt_sf->use_comp_ref_nonrd = 0;
   rt_sf->use_real_time_ref_set = 0;
   rt_sf->short_circuit_low_temp_var = 0;
@@ -1853,6 +1842,8 @@
   rt_sf->skip_cdef_sb = 0;
   rt_sf->force_large_partition_blocks_intra = 0;
   rt_sf->skip_tx_no_split_var_based_partition = 0;
+  rt_sf->skip_newmv_mode_based_on_sse = 0;
+  rt_sf->gf_length_lvl = 0;
 }
 
 void av1_set_speed_features_framesize_dependent(AV1_COMP *cpi, int speed) {
@@ -2057,7 +2048,16 @@
   const int is_arf2_bwd_type =
       cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE;
 
-  if (cpi->oxcf.mode == REALTIME) return;
+  if (cpi->oxcf.mode == REALTIME) {
+    if (speed >= 6) {
+      const int qindex_thresh = boosted ? 190 : (is_720p_or_larger ? 120 : 150);
+      sf->part_sf.adjust_var_based_rd_partitioning =
+          frame_is_intra_only(cm)
+              ? 0
+              : cm->quant_params.base_qindex > qindex_thresh;
+    }
+    return;
+  }
 
   if (speed == 0) {
     // qindex_thresh for resolution < 720p
@@ -2090,7 +2090,6 @@
         sf->interp_sf.cb_pred_filter_search = 0;
         sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_2;
         sf->tx_sf.tx_type_search.skip_tx_search = 1;
-        sf->tx_sf.use_intra_txb_hash = 1;
       }
     }
   }

diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index c1861d7..41a56a6 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h

@@ -160,6 +160,7 @@
   CDEF_FAST_SEARCH_LVL3, /**< Search reduced subset of secondary filters than
                               Level 2. */
   CDEF_FAST_SEARCH_LVL4, /**< Search reduced subset of filters than Level 3. */
+  CDEF_FAST_SEARCH_LVL5, /**< Search reduced subset of filters than Level 4. */
   CDEF_PICK_FROM_Q,      /**< Estimate filter strength based on quantizer. */
   CDEF_PICK_METHODS
 } CDEF_PICK_METHOD;
@@ -793,12 +794,6 @@
   // same single inter mode as a group.
   int prune_comp_search_by_single_result;
 
-  // If 1 we iterate finding a best reference for 2 ref frames together - via
-  // a log search that iterates 4 times (check around mv for last for best
-  // error of combined predictor then check around mv for alt). If 0 we
-  // we just use the best motion vector found for each frame by itself.
-  BLOCK_SIZE comp_inter_joint_search_thresh;
-
   // Instead of performing a full MV search, do a simple translation first
   // and only perform a full MV search on the motion vectors that performed
   // well.
@@ -914,6 +909,9 @@
   // Cap the no. of txfm searches for a given prediction mode.
   // 0: no cap, 1: cap beyond first 4 searches, 2: cap beyond first 3 searches.
   int limit_txfm_eval_per_mode;
+
+  // Prune warped motion search based on block size.
+  int extra_prune_warped;
 } INTER_MODE_SPEED_FEATURES;
 
 typedef struct INTERP_FILTER_SPEED_FEATURES {
@@ -1072,16 +1070,6 @@
   // 1-2: progressively increasing aggressiveness of pruning
   int model_based_prune_tx_search_level;
 
-  // Use hash table to store intra(keyframe only) txb transform search results
-  // to avoid repeated search on the same residue signal. This is currently not
-  // compatible with multi-winner mode as the hash states are reset during
-  // winner mode processing.
-  int use_intra_txb_hash;
-
-  // Use hash table to store inter txb transform search results
-  // to avoid repeated search on the same residue signal.
-  int use_inter_txb_hash;
-
   // Refine TX type after fast TX search.
   int refine_fast_tx_search_results;
 
@@ -1233,9 +1221,6 @@
   // Use ALTREF frame in non-RD mode decision.
   int use_nonrd_altref_frame;
 
-  // Use GOLDEN frame in pickmode decision.
-  int use_golden_frame;
-
   // Use compound reference for non-RD mode.
   int use_comp_ref_nonrd;
 
@@ -1320,6 +1305,14 @@
 
   // Skip evaluation of no split in tx size selection for merge partition
   int skip_tx_no_split_var_based_partition;
+
+  // Intermediate termination of newMV mode evaluation based on so far best mode
+  // sse
+  int skip_newmv_mode_based_on_sse;
+
+  // Define gf length multiplier.
+  // Level 0: use large multiplier, level 1: use medium multiplier.
+  int gf_length_lvl;
 } REAL_TIME_SPEED_FEATURES;
 
 /*!\endcond */

diff --git a/av1/encoder/svc_layercontext.c b/av1/encoder/svc_layercontext.c
index 67b3082..4e48218 100644
--- a/av1/encoder/svc_layercontext.c
+++ b/av1/encoder/svc_layercontext.c

@@ -62,7 +62,6 @@
       // (i.e., ss_number_layers > 1), these need to be updated per spatial
       // layer. Cyclic refresh is only applied on base temporal layer.
       if (svc->number_spatial_layers > 1 && tl == 0) {
-        size_t last_coded_q_map_size;
         lc->sb_index = 0;
         lc->actual_num_seg1_blocks = 0;
         lc->actual_num_seg2_blocks = 0;
@@ -71,13 +70,6 @@
         CHECK_MEM_ERROR(cm, lc->map,
                         aom_malloc(mi_rows * mi_cols * sizeof(*lc->map)));
         memset(lc->map, 0, mi_rows * mi_cols);
-        last_coded_q_map_size =
-            mi_rows * mi_cols * sizeof(*lc->last_coded_q_map);
-        if (lc->last_coded_q_map) aom_free(lc->last_coded_q_map);
-        CHECK_MEM_ERROR(cm, lc->last_coded_q_map,
-                        aom_malloc(last_coded_q_map_size));
-        assert(MAXQ <= 255);
-        memset(lc->last_coded_q_map, MAXQ, last_coded_q_map_size);
       }
     }
     svc->downsample_filter_type[sl] = BILINEAR;
@@ -195,7 +187,6 @@
       svc->number_spatial_layers > 1 && svc->temporal_layer_id == 0) {
     CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
     swap_ptr(&cr->map, &lc->map);
-    swap_ptr(&cr->last_coded_q_map, &lc->last_coded_q_map);
     cr->sb_index = lc->sb_index;
     cr->actual_num_seg1_blocks = lc->actual_num_seg1_blocks;
     cr->actual_num_seg2_blocks = lc->actual_num_seg2_blocks;
@@ -234,11 +225,8 @@
       cpi->svc.number_spatial_layers > 1 && svc->temporal_layer_id == 0) {
     CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
     signed char *temp = lc->map;
-    uint8_t *temp2 = lc->last_coded_q_map;
     lc->map = cr->map;
     cr->map = temp;
-    lc->last_coded_q_map = cr->last_coded_q_map;
-    cr->last_coded_q_map = temp2;
     lc->sb_index = cr->sb_index;
     lc->actual_num_seg1_blocks = cr->actual_num_seg1_blocks;
     lc->actual_num_seg2_blocks = cr->actual_num_seg2_blocks;
@@ -301,7 +289,6 @@
       int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
       LAYER_CONTEXT *const lc = &svc->layer_context[layer];
       if (lc->map) aom_free(lc->map);
-      if (lc->last_coded_q_map) aom_free(lc->last_coded_q_map);
     }
   }
 }

diff --git a/av1/encoder/svc_layercontext.h b/av1/encoder/svc_layercontext.h
index a1dff40..310d08a 100644
--- a/av1/encoder/svc_layercontext.h
+++ b/av1/encoder/svc_layercontext.h

@@ -49,11 +49,6 @@
    */
   int8_t *map;
   /*!
-   * Segmentation map for last coded quantization paramters.
-   */
-  uint8_t *last_coded_q_map;
-
-  /*!
    * Number of blocks on segment 1
    */
   int actual_num_seg1_blocks;
@@ -110,6 +105,7 @@
   /*!\cond */
   int ref_idx[INTER_REFS_PER_FRAME];
   int refresh[REF_FRAMES];
+  int gld_idx_1layer;
   double base_framerate;
   unsigned int current_superframe;
   unsigned int buffer_time_index[REF_FRAMES];

diff --git a/av1/encoder/temporal_filter.c b/av1/encoder/temporal_filter.c
index d0648df..b229d43 100644
--- a/av1/encoder/temporal_filter.c
+++ b/av1/encoder/temporal_filter.c

@@ -861,13 +861,13 @@
       }
     }
     tf_normalize_filtered_frame(mbd, block_size, mb_row, mb_col, num_planes,
-                                accum, count, &cpi->ppi->alt_ref_buffer);
+                                accum, count, tf_ctx->output_frame);
 
     if (check_show_existing) {
       const int y_height = mb_height >> mbd->plane[0].subsampling_y;
       const int y_width = mb_width >> mbd->plane[0].subsampling_x;
       const int source_y_stride = frame_to_filter->y_stride;
-      const int filter_y_stride = cpi->ppi->alt_ref_buffer.y_stride;
+      const int filter_y_stride = tf_ctx->output_frame->y_stride;
       const int source_offset =
           mb_row * y_height * source_y_stride + mb_col * y_width;
       const int filter_offset =
@@ -875,7 +875,7 @@
       unsigned int sse = 0;
       cpi->ppi->fn_ptr[block_size].vf(
           frame_to_filter->y_buffer + source_offset, source_y_stride,
-          cpi->ppi->alt_ref_buffer.y_buffer + filter_offset, filter_y_stride,
+          tf_ctx->output_frame->y_buffer + filter_offset, filter_y_stride,
           &sse);
       diff->sum += sse;
       diff->sse += sse * (int64_t)sse;
@@ -1130,12 +1130,14 @@
 //   Nothing will be returned. But the contents of cpi->tf_ctx will be modified.
 static void init_tf_ctx(AV1_COMP *cpi, int filter_frame_lookahead_idx,
                         int is_second_arf, FRAME_UPDATE_TYPE update_type,
-                        int is_forward_keyframe) {
+                        int is_forward_keyframe,
+                        YV12_BUFFER_CONFIG *output_frame) {
   TemporalFilterCtx *tf_ctx = &cpi->tf_ctx;
   // Setup frame buffer for filtering.
   YV12_BUFFER_CONFIG **frames = tf_ctx->frames;
   tf_ctx->num_frames = 0;
   tf_ctx->filter_frame_idx = -1;
+  tf_ctx->output_frame = output_frame;
   tf_setup_filtering_buffer(cpi, filter_frame_lookahead_idx, is_second_arf,
                             update_type, is_forward_keyframe);
   assert(tf_ctx->num_frames > 0);
@@ -1184,7 +1186,8 @@
 
 int av1_temporal_filter(AV1_COMP *cpi, const int filter_frame_lookahead_idx,
                         FRAME_UPDATE_TYPE update_type, int is_forward_keyframe,
-                        int *show_existing_arf) {
+                        int *show_existing_arf,
+                        YV12_BUFFER_CONFIG *output_frame) {
   MultiThreadInfo *const mt_info = &cpi->mt_info;
   // Basic informaton of the current frame.
   const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
@@ -1211,7 +1214,7 @@
 
   // Initialize temporal filter context structure.
   init_tf_ctx(cpi, filter_frame_lookahead_idx, is_second_arf, update_type,
-              is_forward_keyframe);
+              is_forward_keyframe, output_frame);
 
   // Set showable frame.
   if (is_forward_keyframe == 0 && update_type != KF_UPDATE) {

diff --git a/av1/encoder/temporal_filter.h b/av1/encoder/temporal_filter.h
index 3b95637..bc9ff5c 100644
--- a/av1/encoder/temporal_filter.h
+++ b/av1/encoder/temporal_filter.h

@@ -89,6 +89,12 @@
    * Number of frames in the frame buffer.
    */
   int num_frames;
+
+  /*!
+   * Output filtered frame
+   */
+  YV12_BUFFER_CONFIG *output_frame;
+
   /*!
    * Index of the frame to be filtered.
    */
@@ -207,21 +213,22 @@
  * -15 means to filter the 17-th frame, which is a key frame in one-pass mode.
  *
  * \ingroup src_frame_proc
- * \param[in]   cpi                        Top level encoder instance structure
- * \param[in]   filter_frame_lookahead_idx The index of the to-filter frame in
- *                                         the lookahead buffer cpi->lookahead.
- * \param[in]   update_type                This frame's update type.
- * \param[in]   is_forward_keyframe        Indicate whether this is a forward
- *                                         keyframe.
- * \param[in,out]   show_existing_arf      Whether to show existing ARF. This
- *                                         field is updated in this function.
+ * \param[in]      cpi                        Top level encoder instance
+ * structure \param[in]      filter_frame_lookahead_idx The index of the
+ * to-filter frame in the lookahead buffer cpi->lookahead. \param[in]
+ * update_type                This frame's update type. \param[in]
+ * is_forward_keyframe        Indicate whether this is a forward keyframe.
+ * \param[in,out]  show_existing_arf          Whether to show existing ARF. This
+ *                                            field is updated in this function.
+ * \param[out]     output_frame               Ouput filtered frame.
  *
  * \return Whether temporal filtering is successfully done.
  */
 int av1_temporal_filter(struct AV1_COMP *cpi,
                         const int filter_frame_lookahead_idx,
                         FRAME_UPDATE_TYPE update_type, int is_forward_keyframe,
-                        int *show_existing_arf);
+                        int *show_existing_arf,
+                        YV12_BUFFER_CONFIG *output_frame);
 
 /*!\cond */
 // Helper function to get `q` used for encoding.

diff --git a/av1/encoder/tpl_model.c b/av1/encoder/tpl_model.c
index 70817d6..468dfe6 100644
--- a/av1/encoder/tpl_model.c
+++ b/av1/encoder/tpl_model.c

@@ -314,7 +314,8 @@
     tran_low_t *dqcoeff, AV1_COMMON *cm, MACROBLOCK *x,
     const YV12_BUFFER_CONFIG *ref_frame_ptr[2], uint8_t *rec_buffer_pool[3],
     const int rec_stride_pool[3], TX_SIZE tx_size, PREDICTION_MODE best_mode,
-    int mi_row, int mi_col, int use_y_only_rate_distortion) {
+    int mi_row, int mi_col, int use_y_only_rate_distortion,
+    TplTxfmStats *tpl_txfm_stats) {
   const SequenceHeader *seq_params = cm->seq_params;
   *rate_cost = 0;
   *recon_error = 1;
@@ -401,6 +402,11 @@
         block_size_high[bsize_plane], max_txsize_rect_lookup[bsize_plane],
         &this_rate, &this_recon_error, &sse);
 
+    if (plane == 0 && tpl_txfm_stats) {
+      // We only collect Y plane's transform coefficient
+      av1_record_tpl_txfm_block(tpl_txfm_stats, coeff);
+    }
+
     *recon_error += this_recon_error;
     *pred_error += sse;
     *rate_cost += this_rate;
@@ -763,7 +769,7 @@
     get_rate_distortion(&rate_cost, &recon_error, &pred_error, src_diff, coeff,
                         qcoeff, dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool,
                         rec_stride_pool, tx_size, best_mode, mi_row, mi_col,
-                        use_y_only_rate_distortion);
+                        use_y_only_rate_distortion, NULL);
     tpl_stats->srcrf_rate = rate_cost << TPL_DEP_COST_SCALE_LOG2;
   }
 
@@ -790,9 +796,7 @@
   get_rate_distortion(&rate_cost, &recon_error, &pred_error, src_diff, coeff,
                       qcoeff, dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool,
                       rec_stride_pool, tx_size, best_mode, mi_row, mi_col,
-                      use_y_only_rate_distortion);
-
-  av1_record_tpl_txfm_block(tpl_txfm_stats, coeff);
+                      use_y_only_rate_distortion, tpl_txfm_stats);
 
   tpl_stats->recrf_dist = recon_error << (TPL_DEP_COST_SCALE_LOG2);
   tpl_stats->recrf_rate = rate_cost << TPL_DEP_COST_SCALE_LOG2;
@@ -812,7 +816,7 @@
     get_rate_distortion(&rate_cost, &recon_error, &pred_error, src_diff, coeff,
                         qcoeff, dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool,
                         rec_stride_pool, tx_size, best_mode, mi_row, mi_col,
-                        use_y_only_rate_distortion);
+                        use_y_only_rate_distortion, NULL);
     tpl_stats->cmp_recrf_dist[0] = recon_error << TPL_DEP_COST_SCALE_LOG2;
     tpl_stats->cmp_recrf_rate[0] = rate_cost << TPL_DEP_COST_SCALE_LOG2;
 
@@ -833,7 +837,7 @@
     get_rate_distortion(&rate_cost, &recon_error, &pred_error, src_diff, coeff,
                         qcoeff, dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool,
                         rec_stride_pool, tx_size, best_mode, mi_row, mi_col,
-                        use_y_only_rate_distortion);
+                        use_y_only_rate_distortion, NULL);
     tpl_stats->cmp_recrf_dist[1] = recon_error << TPL_DEP_COST_SCALE_LOG2;
     tpl_stats->cmp_recrf_rate[1] = rate_cost << TPL_DEP_COST_SCALE_LOG2;
 
@@ -1960,25 +1964,26 @@
 int av1_q_mode_estimate_base_q(const GF_GROUP *gf_group,
                                const TplTxfmStats *txfm_stats_list,
                                const int *stats_valid_list, double bit_budget,
-                               int gf_frame_index, double arf_qstep_ratio,
-                               aom_bit_depth_t bit_depth, double scale_factor,
+                               int gf_frame_index, aom_bit_depth_t bit_depth,
+                               double scale_factor,
+                               const double *qstep_ratio_list,
                                int *q_index_list,
                                double *estimated_bitrate_byframe) {
   int q_max = 255;  // Maximum q value.
   int q_min = 0;    // Minimum q value.
   int q = (q_max + q_min) / 2;
 
-  av1_q_mode_compute_gop_q_indices(gf_frame_index, q_max, arf_qstep_ratio,
+  av1_q_mode_compute_gop_q_indices(gf_frame_index, q_max, qstep_ratio_list,
                                    bit_depth, gf_group, q_index_list);
   double q_max_estimate = av1_estimate_gop_bitrate(
       q_index_list, gf_group->size, txfm_stats_list, stats_valid_list, NULL);
-  av1_q_mode_compute_gop_q_indices(gf_frame_index, q_min, arf_qstep_ratio,
+  av1_q_mode_compute_gop_q_indices(gf_frame_index, q_min, qstep_ratio_list,
                                    bit_depth, gf_group, q_index_list);
   double q_min_estimate = av1_estimate_gop_bitrate(
       q_index_list, gf_group->size, txfm_stats_list, stats_valid_list, NULL);
 
   while (true) {
-    av1_q_mode_compute_gop_q_indices(gf_frame_index, q, arf_qstep_ratio,
+    av1_q_mode_compute_gop_q_indices(gf_frame_index, q, qstep_ratio_list,
                                      bit_depth, gf_group, q_index_list);
 
     double estimate = av1_estimate_gop_bitrate(
@@ -2009,7 +2014,7 @@
   }
 
   // Update q_index_list and vbr_rc_info.
-  av1_q_mode_compute_gop_q_indices(gf_frame_index, q, arf_qstep_ratio,
+  av1_q_mode_compute_gop_q_indices(gf_frame_index, q, qstep_ratio_list,
                                    bit_depth, gf_group, q_index_list);
   av1_estimate_gop_bitrate(q_index_list, gf_group->size, txfm_stats_list,
                            stats_valid_list, estimated_bitrate_byframe);
@@ -2017,6 +2022,10 @@
 }
 
 double av1_tpl_get_qstep_ratio(const TplParams *tpl_data, int gf_frame_index) {
+  if (!av1_tpl_stats_ready(tpl_data, gf_frame_index)) {
+    return 1;
+  }
+
   const TplDepFrame *tpl_frame = &tpl_data->tpl_frame[gf_frame_index];
   const TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
 
@@ -2069,9 +2078,11 @@
   if (gf_frame_index == 0) {
     vbr_rc_info->q_index_list_ready = 1;
     double gop_bit_budget = vbr_rc_info->gop_bit_budget;
-    // Use the gop_bit_budget to determine q_index_list.
-    const double arf_qstep_ratio =
-        av1_tpl_get_qstep_ratio(tpl_data, gf_frame_index);
+
+    for (int i = gf_frame_index; i < gf_group->size; i++) {
+      vbr_rc_info->qstep_ratio_list[i] = av1_tpl_get_qstep_ratio(tpl_data, i);
+    }
+
     // We update the q indices in vbr_rc_info in vbr_rc_info->q_index_list
     // rather than gf_group->q_val to avoid conflicts with the existing code.
     int stats_valid_list[MAX_LENGTH_TPL_FRAME_STATS] = { 0 };
@@ -2087,10 +2098,17 @@
     double scale_factor =
         vbr_rc_info->scale_factors[gf_group->update_type[gf_frame_index]];
 
-    av1_q_mode_estimate_base_q(
+    vbr_rc_info->base_q_index = av1_q_mode_estimate_base_q(
         gf_group, tpl_data->txfm_stats_list, stats_valid_list, gop_bit_budget,
-        gf_frame_index, arf_qstep_ratio, bit_depth, scale_factor,
+        gf_frame_index, bit_depth, scale_factor, vbr_rc_info->qstep_ratio_list,
         vbr_rc_info->q_index_list, vbr_rc_info->estimated_bitrate_byframe);
+  } else if (gf_frame_index == 1) {
+    for (int i = gf_frame_index; i < gf_group->size; i++) {
+      vbr_rc_info->qstep_ratio_list[i] = av1_tpl_get_qstep_ratio(tpl_data, i);
+    }
+    av1_q_mode_compute_gop_q_indices(gf_frame_index, vbr_rc_info->base_q_index,
+                                     vbr_rc_info->qstep_ratio_list, bit_depth,
+                                     gf_group, vbr_rc_info->q_index_list);
   }
 }
 

diff --git a/av1/encoder/tpl_model.h b/av1/encoder/tpl_model.h
index 718d447..1cd6a6d 100644
--- a/av1/encoder/tpl_model.h
+++ b/av1/encoder/tpl_model.h

@@ -245,6 +245,7 @@
                                                 // MV entropy estimation
 
   // === Below this line are GOP related data that will be updated per GOP ===
+  int base_q_index;  // Stores the base q index.
   int q_index_list_ready;
   int q_index_list[MAX_LENGTH_TPL_FRAME_STATS];  // q indices for the current
                                                  // GOP
@@ -254,6 +255,9 @@
   int actual_bitrate_byframe[MAX_LENGTH_TPL_FRAME_STATS];
   int actual_mv_bitrate_byframe[MAX_LENGTH_TPL_FRAME_STATS];
   int actual_coeff_bitrate_byframe[MAX_LENGTH_TPL_FRAME_STATS];
+
+  // Array to store qstep_ratio for each frame in a GOP
+  double qstep_ratio_list[MAX_LENGTH_TPL_FRAME_STATS];
 } VBR_RATECTRL_INFO;
 
 static INLINE void vbr_rc_reset_gop_data(VBR_RATECTRL_INFO *vbr_rc_info) {
@@ -271,10 +275,10 @@
   vbr_rc_info->total_bit_budget = total_bit_budget;
   vbr_rc_info->show_frame_count = show_frame_count;
   vbr_rc_info->keyframe_bitrate = 0;
-  const double scale_factors[FRAME_UPDATE_TYPES] = { 1.2, 1.2, 1.2, 1.2,
-                                                     1.2, 1.2, 1.2 };
-  const double mv_scale_factors[FRAME_UPDATE_TYPES] = { 5.0, 5.0, 5.0, 5.0,
-                                                        5.0, 5.0, 5.0 };
+  const double scale_factors[FRAME_UPDATE_TYPES] = { 0.94559, 0.12040, 1,
+                                                     1.10199, 1,       1,
+                                                     0.16393 };
+  const double mv_scale_factors[FRAME_UPDATE_TYPES] = { 3, 3, 3, 3, 3, 3, 3 };
   memcpy(vbr_rc_info->scale_factors, scale_factors,
          sizeof(scale_factors[0]) * FRAME_UPDATE_TYPES);
   memcpy(vbr_rc_info->mv_scale_factors, mv_scale_factors,
@@ -298,7 +302,7 @@
 
 static INLINE void vbr_rc_info_log(const VBR_RATECTRL_INFO *vbr_rc_info,
                                    int gf_frame_index, int gf_group_size,
-                                   int *update_type) {
+                                   FRAME_UPDATE_TYPE *update_type) {
   // Add +2 here because this is the last frame this method is called at.
   if (gf_frame_index + 2 >= gf_group_size) {
     printf(
@@ -550,10 +554,10 @@
  *                                    exists
  * \param[in]       bit_budget        The specified bit budget to achieve
  * \param[in]       gf_frame_index    current frame in the GOP
- * \param[in]       arf_qstep_ratio   ARF q step ratio
  * \param[in]       bit_depth         bit depth
  * \param[in]       scale_factor      Scale factor to improve budget estimation
- * \param[out]       q_index_list     array of q_index, one per frame
+ * \param[in]       qstep_ratio_list  Stores the qstep_ratio for each frame
+ * \param[out]      q_index_list      array of q_index, one per frame
  * \param[out]      estimated_bitrate_byframe  bits usage per frame in the GOP
  *
  * \return Returns the optimal base q index to use.
@@ -561,8 +565,9 @@
 int av1_q_mode_estimate_base_q(const struct GF_GROUP *gf_group,
                                const TplTxfmStats *txfm_stats_list,
                                const int *stats_valid_list, double bit_budget,
-                               int gf_frame_index, double arf_qstep_ratio,
-                               aom_bit_depth_t bit_depth, double scale_factor,
+                               int gf_frame_index, aom_bit_depth_t bit_depth,
+                               double scale_factor,
+                               const double *qstep_ratio_list,
                                int *q_index_list,
                                double *estimated_bitrate_byframe);
 

diff --git a/av1/encoder/tx_search.c b/av1/encoder/tx_search.c
index 0e6e647..a3ac0d9 100644
--- a/av1/encoder/tx_search.c
+++ b/av1/encoder/tx_search.c

@@ -49,11 +49,6 @@
   int8_t children[4];
 } RD_RECORD_IDX_NODE;
 
-typedef struct tx_size_rd_info_node {
-  TXB_RD_INFO *rd_info_array;  // Points to array of size TX_TYPES.
-  struct tx_size_rd_info_node *children[4];
-} TXB_RD_INFO_NODE;
-
 // origin_threshold * 128 / 100
 static const uint32_t skip_pred_threshold[3][BLOCK_SIZES_ALL] = {
   {
@@ -87,256 +82,6 @@
                                                      12, 12, 23, 23, 32, 32, 8,
                                                      8,  16, 16, 23, 23 };
 
-static int find_tx_size_rd_info(TXB_RD_RECORD *cur_record,
-                                const uint32_t hash) {
-  // Linear search through the circular buffer to find matching hash.
-  for (int i = cur_record->index_start - 1; i >= 0; i--) {
-    if (cur_record->hash_vals[i] == hash) return i;
-  }
-  for (int i = cur_record->num - 1; i >= cur_record->index_start; i--) {
-    if (cur_record->hash_vals[i] == hash) return i;
-  }
-  int index;
-  // If not found - add new RD info into the buffer and return its index
-  if (cur_record->num < TX_SIZE_RD_RECORD_BUFFER_LEN) {
-    index = (cur_record->index_start + cur_record->num) %
-            TX_SIZE_RD_RECORD_BUFFER_LEN;
-    cur_record->num++;
-  } else {
-    index = cur_record->index_start;
-    cur_record->index_start =
-        (cur_record->index_start + 1) % TX_SIZE_RD_RECORD_BUFFER_LEN;
-  }
-
-  cur_record->hash_vals[index] = hash;
-  av1_zero(cur_record->tx_rd_info[index]);
-  return index;
-}
-
-static const RD_RECORD_IDX_NODE rd_record_tree_8x8[] = {
-  { 1, { 0 } },
-};
-
-static const RD_RECORD_IDX_NODE rd_record_tree_8x16[] = {
-  { 0, { 1, 2, -1, -1 } },
-  { 1, { 0, 0, 0, 0 } },
-  { 1, { 0, 0, 0, 0 } },
-};
-
-static const RD_RECORD_IDX_NODE rd_record_tree_16x8[] = {
-  { 0, { 1, 2, -1, -1 } },
-  { 1, { 0 } },
-  { 1, { 0 } },
-};
-
-static const RD_RECORD_IDX_NODE rd_record_tree_16x16[] = {
-  { 0, { 1, 2, 3, 4 } }, { 1, { 0 } }, { 1, { 0 } }, { 1, { 0 } }, { 1, { 0 } },
-};
-
-static const RD_RECORD_IDX_NODE rd_record_tree_1_2[] = {
-  { 0, { 1, 2, -1, -1 } },
-  { 0, { 3, 4, 5, 6 } },
-  { 0, { 7, 8, 9, 10 } },
-};
-
-static const RD_RECORD_IDX_NODE rd_record_tree_2_1[] = {
-  { 0, { 1, 2, -1, -1 } },
-  { 0, { 3, 4, 7, 8 } },
-  { 0, { 5, 6, 9, 10 } },
-};
-
-static const RD_RECORD_IDX_NODE rd_record_tree_sqr[] = {
-  { 0, { 1, 2, 3, 4 } },     { 0, { 5, 6, 9, 10 } },    { 0, { 7, 8, 11, 12 } },
-  { 0, { 13, 14, 17, 18 } }, { 0, { 15, 16, 19, 20 } },
-};
-
-static const RD_RECORD_IDX_NODE rd_record_tree_64x128[] = {
-  { 0, { 2, 3, 4, 5 } },     { 0, { 6, 7, 8, 9 } },
-  { 0, { 10, 11, 14, 15 } }, { 0, { 12, 13, 16, 17 } },
-  { 0, { 18, 19, 22, 23 } }, { 0, { 20, 21, 24, 25 } },
-  { 0, { 26, 27, 30, 31 } }, { 0, { 28, 29, 32, 33 } },
-  { 0, { 34, 35, 38, 39 } }, { 0, { 36, 37, 40, 41 } },
-};
-
-static const RD_RECORD_IDX_NODE rd_record_tree_128x64[] = {
-  { 0, { 2, 3, 6, 7 } },     { 0, { 4, 5, 8, 9 } },
-  { 0, { 10, 11, 18, 19 } }, { 0, { 12, 13, 20, 21 } },
-  { 0, { 14, 15, 22, 23 } }, { 0, { 16, 17, 24, 25 } },
-  { 0, { 26, 27, 34, 35 } }, { 0, { 28, 29, 36, 37 } },
-  { 0, { 30, 31, 38, 39 } }, { 0, { 32, 33, 40, 41 } },
-};
-
-static const RD_RECORD_IDX_NODE rd_record_tree_128x128[] = {
-  { 0, { 4, 5, 8, 9 } },     { 0, { 6, 7, 10, 11 } },
-  { 0, { 12, 13, 16, 17 } }, { 0, { 14, 15, 18, 19 } },
-  { 0, { 20, 21, 28, 29 } }, { 0, { 22, 23, 30, 31 } },
-  { 0, { 24, 25, 32, 33 } }, { 0, { 26, 27, 34, 35 } },
-  { 0, { 36, 37, 44, 45 } }, { 0, { 38, 39, 46, 47 } },
-  { 0, { 40, 41, 48, 49 } }, { 0, { 42, 43, 50, 51 } },
-  { 0, { 52, 53, 60, 61 } }, { 0, { 54, 55, 62, 63 } },
-  { 0, { 56, 57, 64, 65 } }, { 0, { 58, 59, 66, 67 } },
-  { 0, { 68, 69, 76, 77 } }, { 0, { 70, 71, 78, 79 } },
-  { 0, { 72, 73, 80, 81 } }, { 0, { 74, 75, 82, 83 } },
-};
-
-static const RD_RECORD_IDX_NODE rd_record_tree_1_4[] = {
-  { 0, { 1, -1, 2, -1 } },
-  { 0, { 3, 4, -1, -1 } },
-  { 0, { 5, 6, -1, -1 } },
-};
-
-static const RD_RECORD_IDX_NODE rd_record_tree_4_1[] = {
-  { 0, { 1, 2, -1, -1 } },
-  { 0, { 3, 4, -1, -1 } },
-  { 0, { 5, 6, -1, -1 } },
-};
-
-static const RD_RECORD_IDX_NODE *rd_record_tree[BLOCK_SIZES_ALL] = {
-  NULL,                    // BLOCK_4X4
-  NULL,                    // BLOCK_4X8
-  NULL,                    // BLOCK_8X4
-  rd_record_tree_8x8,      // BLOCK_8X8
-  rd_record_tree_8x16,     // BLOCK_8X16
-  rd_record_tree_16x8,     // BLOCK_16X8
-  rd_record_tree_16x16,    // BLOCK_16X16
-  rd_record_tree_1_2,      // BLOCK_16X32
-  rd_record_tree_2_1,      // BLOCK_32X16
-  rd_record_tree_sqr,      // BLOCK_32X32
-  rd_record_tree_1_2,      // BLOCK_32X64
-  rd_record_tree_2_1,      // BLOCK_64X32
-  rd_record_tree_sqr,      // BLOCK_64X64
-  rd_record_tree_64x128,   // BLOCK_64X128
-  rd_record_tree_128x64,   // BLOCK_128X64
-  rd_record_tree_128x128,  // BLOCK_128X128
-  NULL,                    // BLOCK_4X16
-  NULL,                    // BLOCK_16X4
-  rd_record_tree_1_4,      // BLOCK_8X32
-  rd_record_tree_4_1,      // BLOCK_32X8
-  rd_record_tree_1_4,      // BLOCK_16X64
-  rd_record_tree_4_1,      // BLOCK_64X16
-};
-
-static const int rd_record_tree_size[BLOCK_SIZES_ALL] = {
-  0,                                                            // BLOCK_4X4
-  0,                                                            // BLOCK_4X8
-  0,                                                            // BLOCK_8X4
-  sizeof(rd_record_tree_8x8) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_8X8
-  sizeof(rd_record_tree_8x16) / sizeof(RD_RECORD_IDX_NODE),     // BLOCK_8X16
-  sizeof(rd_record_tree_16x8) / sizeof(RD_RECORD_IDX_NODE),     // BLOCK_16X8
-  sizeof(rd_record_tree_16x16) / sizeof(RD_RECORD_IDX_NODE),    // BLOCK_16X16
-  sizeof(rd_record_tree_1_2) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_16X32
-  sizeof(rd_record_tree_2_1) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_32X16
-  sizeof(rd_record_tree_sqr) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_32X32
-  sizeof(rd_record_tree_1_2) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_32X64
-  sizeof(rd_record_tree_2_1) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_64X32
-  sizeof(rd_record_tree_sqr) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_64X64
-  sizeof(rd_record_tree_64x128) / sizeof(RD_RECORD_IDX_NODE),   // BLOCK_64X128
-  sizeof(rd_record_tree_128x64) / sizeof(RD_RECORD_IDX_NODE),   // BLOCK_128X64
-  sizeof(rd_record_tree_128x128) / sizeof(RD_RECORD_IDX_NODE),  // BLOCK_128X128
-  0,                                                            // BLOCK_4X16
-  0,                                                            // BLOCK_16X4
-  sizeof(rd_record_tree_1_4) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_8X32
-  sizeof(rd_record_tree_4_1) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_32X8
-  sizeof(rd_record_tree_1_4) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_16X64
-  sizeof(rd_record_tree_4_1) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_64X16
-};
-
-static INLINE void init_rd_record_tree(TXB_RD_INFO_NODE *tree,
-                                       BLOCK_SIZE bsize) {
-  const RD_RECORD_IDX_NODE *rd_record = rd_record_tree[bsize];
-  const int size = rd_record_tree_size[bsize];
-  for (int i = 0; i < size; ++i) {
-    if (rd_record[i].leaf) {
-      av1_zero(tree[i].children);
-    } else {
-      for (int j = 0; j < 4; ++j) {
-        const int8_t idx = rd_record[i].children[j];
-        tree[i].children[j] = idx > 0 ? &tree[idx] : NULL;
-      }
-    }
-  }
-}
-
-// Go through all TX blocks that could be used in TX size search, compute
-// residual hash values for them and find matching RD info that stores previous
-// RD search results for these TX blocks. The idea is to prevent repeated
-// rate/distortion computations that happen because of the combination of
-// partition and TX size search. The resulting RD info records are returned in
-// the form of a quadtree for easier access in actual TX size search.
-static int find_tx_size_rd_records(MACROBLOCK *x, BLOCK_SIZE bsize,
-                                   TXB_RD_INFO_NODE *dst_rd_info) {
-  TxfmSearchInfo *txfm_info = &x->txfm_search_info;
-  TXB_RD_RECORD *rd_records_table[4] = {
-    txfm_info->txb_rd_records->txb_rd_record_8X8,
-    txfm_info->txb_rd_records->txb_rd_record_16X16,
-    txfm_info->txb_rd_records->txb_rd_record_32X32,
-    txfm_info->txb_rd_records->txb_rd_record_64X64
-  };
-  const TX_SIZE max_square_tx_size = max_txsize_lookup[bsize];
-  const int bw = block_size_wide[bsize];
-  const int bh = block_size_high[bsize];
-
-  // Hashing is performed only for square TX sizes larger than TX_4X4
-  if (max_square_tx_size < TX_8X8) return 0;
-  const int diff_stride = bw;
-  const struct macroblock_plane *const p = &x->plane[0];
-  const int16_t *diff = &p->src_diff[0];
-  init_rd_record_tree(dst_rd_info, bsize);
-  // Coordinates of the top-left corner of current block within the superblock
-  // measured in pixels:
-  const int mi_row = x->e_mbd.mi_row;
-  const int mi_col = x->e_mbd.mi_col;
-  const int mi_row_in_sb = (mi_row % MAX_MIB_SIZE) << MI_SIZE_LOG2;
-  const int mi_col_in_sb = (mi_col % MAX_MIB_SIZE) << MI_SIZE_LOG2;
-  int cur_rd_info_idx = 0;
-  int cur_tx_depth = 0;
-  TX_SIZE cur_tx_size = max_txsize_rect_lookup[bsize];
-  while (cur_tx_depth <= MAX_VARTX_DEPTH) {
-    const int cur_tx_bw = tx_size_wide[cur_tx_size];
-    const int cur_tx_bh = tx_size_high[cur_tx_size];
-    if (cur_tx_bw < 8 || cur_tx_bh < 8) break;
-    const TX_SIZE next_tx_size = sub_tx_size_map[cur_tx_size];
-    const int tx_size_idx = cur_tx_size - TX_8X8;
-    for (int row = 0; row < bh; row += cur_tx_bh) {
-      for (int col = 0; col < bw; col += cur_tx_bw) {
-        if (cur_tx_bw != cur_tx_bh) {
-          // Use dummy nodes for all rectangular transforms within the
-          // TX size search tree.
-          dst_rd_info[cur_rd_info_idx].rd_info_array = NULL;
-        } else {
-          // Get spatial location of this TX block within the superblock
-          // (measured in cur_tx_bsize units).
-          const int row_in_sb = (mi_row_in_sb + row) / cur_tx_bh;
-          const int col_in_sb = (mi_col_in_sb + col) / cur_tx_bw;
-
-          int16_t hash_data[MAX_SB_SQUARE];
-          int16_t *cur_hash_row = hash_data;
-          const int16_t *cur_diff_row = diff + row * diff_stride + col;
-          for (int i = 0; i < cur_tx_bh; i++) {
-            memcpy(cur_hash_row, cur_diff_row, sizeof(*hash_data) * cur_tx_bw);
-            cur_hash_row += cur_tx_bw;
-            cur_diff_row += diff_stride;
-          }
-          const int hash = av1_get_crc32c_value(
-              &txfm_info->txb_rd_records->mb_rd_record.crc_calculator,
-              (uint8_t *)hash_data, 2 * cur_tx_bw * cur_tx_bh);
-          // Find corresponding RD info based on the hash value.
-          const int record_idx =
-              row_in_sb * (MAX_MIB_SIZE >> (tx_size_idx + 1)) + col_in_sb;
-          TXB_RD_RECORD *records = &rd_records_table[tx_size_idx][record_idx];
-          int idx = find_tx_size_rd_info(records, hash);
-          dst_rd_info[cur_rd_info_idx].rd_info_array =
-              &records->tx_rd_info[idx];
-        }
-        ++cur_rd_info_idx;
-      }
-    }
-    cur_tx_size = next_tx_size;
-    ++cur_tx_depth;
-  }
-  return 1;
-}
-
 static INLINE uint32_t get_block_residue_hash(MACROBLOCK *x, BLOCK_SIZE bsize) {
   const int rows = block_size_high[bsize];
   const int cols = block_size_wide[bsize];
@@ -595,8 +340,7 @@
     TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta,
     ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left,
     RD_STATS *rd_stats, int64_t prev_level_rd, int64_t ref_best_rd,
-    int *is_cost_valid, FAST_TX_SEARCH_MODE ftxs_mode,
-    TXB_RD_INFO_NODE *rd_info_node);
+    int *is_cost_valid, FAST_TX_SEARCH_MODE ftxs_mode);
 
 // NOTE: CONFIG_COLLECT_RD_STATS has 3 possible values
 // 0: Do not collect any RD stats
@@ -1277,66 +1021,10 @@
                          blk_row, blk_col, plane_bsize, tx_bsize);
 }
 
-static uint32_t get_intra_txb_hash(MACROBLOCK *x, int plane, int blk_row,
-                                   int blk_col, BLOCK_SIZE plane_bsize,
-                                   TX_SIZE tx_size) {
-  int16_t tmp_data[64 * 64];
-  const int diff_stride = block_size_wide[plane_bsize];
-  const int16_t *diff = x->plane[plane].src_diff;
-  const int16_t *cur_diff_row = diff + 4 * blk_row * diff_stride + 4 * blk_col;
-  const int txb_w = tx_size_wide[tx_size];
-  const int txb_h = tx_size_high[tx_size];
-  uint8_t *hash_data = (uint8_t *)cur_diff_row;
-  if (txb_w != diff_stride) {
-    int16_t *cur_hash_row = tmp_data;
-    for (int i = 0; i < txb_h; i++) {
-      memcpy(cur_hash_row, cur_diff_row, sizeof(*diff) * txb_w);
-      cur_hash_row += txb_w;
-      cur_diff_row += diff_stride;
-    }
-    hash_data = (uint8_t *)tmp_data;
-  }
-  CRC32C *crc =
-      &x->txfm_search_info.txb_rd_records->mb_rd_record.crc_calculator;
-  const uint32_t hash = av1_get_crc32c_value(crc, hash_data, 2 * txb_w * txb_h);
-  return (hash << 5) + tx_size;
-}
-
 // pruning thresholds for prune_txk_type and prune_txk_type_separ
 static const int prune_factors[5] = { 200, 200, 120, 80, 40 };  // scale 1000
 static const int mul_factors[5] = { 80, 80, 70, 50, 30 };       // scale 100
 
-static INLINE int is_intra_hash_match(const AV1_COMP *cpi, MACROBLOCK *x,
-                                      int plane, int blk_row, int blk_col,
-                                      BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
-                                      const TXB_CTX *const txb_ctx,
-                                      TXB_RD_INFO **intra_txb_rd_info,
-                                      const int tx_type_map_idx,
-                                      uint16_t *cur_joint_ctx) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  TxfmSearchInfo *txfm_info = &x->txfm_search_info;
-  assert(cpi->sf.tx_sf.use_intra_txb_hash &&
-         frame_is_intra_only(&cpi->common) && !is_inter_block(xd->mi[0]) &&
-         plane == 0 && tx_size_wide[tx_size] == tx_size_high[tx_size]);
-  const uint32_t intra_hash =
-      get_intra_txb_hash(x, plane, blk_row, blk_col, plane_bsize, tx_size);
-  const int intra_hash_idx = find_tx_size_rd_info(
-      &txfm_info->txb_rd_records->txb_rd_record_intra, intra_hash);
-  *intra_txb_rd_info = &txfm_info->txb_rd_records->txb_rd_record_intra
-                            .tx_rd_info[intra_hash_idx];
-  *cur_joint_ctx = (txb_ctx->dc_sign_ctx << 8) + txb_ctx->txb_skip_ctx;
-  if ((*intra_txb_rd_info)->entropy_context == *cur_joint_ctx &&
-      txfm_info->txb_rd_records->txb_rd_record_intra.tx_rd_info[intra_hash_idx]
-          .valid) {
-    xd->tx_type_map[tx_type_map_idx] = (*intra_txb_rd_info)->tx_type;
-    const TX_TYPE ref_tx_type =
-        av1_get_tx_type(xd, get_plane_type(plane), blk_row, blk_col, tx_size,
-                        cpi->common.features.reduced_tx_set_used);
-    return (ref_tx_type == (*intra_txb_rd_info)->tx_type);
-  }
-  return 0;
-}
-
 // R-D costs are sorted in ascending order.
 static INLINE void sort_rd(int64_t rds[], int txk[], int len) {
   int i, j, k;
@@ -1398,6 +1086,7 @@
                               int reduced_tx_set_used, int64_t ref_best_rd,
                               int num_sel) {
   const AV1_COMMON *cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
 
   int idx;
 
@@ -1439,6 +1128,9 @@
     tx_type = idx_map[idx];
     txfm_param.tx_type = tx_type;
 
+    av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, tx_type,
+                      &quant_param);
+
     av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param,
                     &quant_param);
 
@@ -1469,6 +1161,9 @@
     tx_type = idx_map_v[idx_v[idx] * 4];
     txfm_param.tx_type = tx_type;
 
+    av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, tx_type,
+                      &quant_param);
+
     av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param,
                     &quant_param);
 
@@ -1529,6 +1224,7 @@
                         uint16_t allowed_tx_mask, int prune_factor,
                         const TXB_CTX *const txb_ctx, int reduced_tx_set_used) {
   const AV1_COMMON *cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
   int tx_type;
 
   int64_t rds[TX_TYPES];
@@ -1553,6 +1249,9 @@
     }
     txfm_param.tx_type = tx_type;
 
+    av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, tx_type,
+                      &quant_param);
+
     // do txfm and quantization
     av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param,
                     &quant_param);
@@ -2283,45 +1982,6 @@
   skip_trellis |= !is_trellis_used(cpi->optimize_seg_arr[xd->mi[0]->segment_id],
                                    DRY_RUN_NORMAL);
 
-  // Hashing based speed feature for intra block. If the hash of the residue
-  // is found in the hash table, use the previous RD search results stored in
-  // the table and terminate early.
-  TXB_RD_INFO *intra_txb_rd_info = NULL;
-  uint16_t cur_joint_ctx = 0;
-  const int is_inter = is_inter_block(mbmi);
-  const int use_intra_txb_hash =
-      cpi->sf.tx_sf.use_intra_txb_hash && frame_is_intra_only(cm) &&
-      !is_inter && plane == 0 && tx_size_wide[tx_size] == tx_size_high[tx_size];
-  if (use_intra_txb_hash) {
-    const int mi_row = xd->mi_row;
-    const int mi_col = xd->mi_col;
-    const int within_border =
-        mi_row >= xd->tile.mi_row_start &&
-        (mi_row + mi_size_high[plane_bsize] < xd->tile.mi_row_end) &&
-        mi_col >= xd->tile.mi_col_start &&
-        (mi_col + mi_size_wide[plane_bsize] < xd->tile.mi_col_end);
-    if (within_border &&
-        is_intra_hash_match(cpi, x, plane, blk_row, blk_col, plane_bsize,
-                            tx_size, txb_ctx, &intra_txb_rd_info,
-                            tx_type_map_idx, &cur_joint_ctx)) {
-      best_rd_stats->rate = intra_txb_rd_info->rate;
-      best_rd_stats->dist = intra_txb_rd_info->dist;
-      best_rd_stats->sse = intra_txb_rd_info->sse;
-      best_rd_stats->skip_txfm = intra_txb_rd_info->eob == 0;
-      x->plane[plane].eobs[block] = intra_txb_rd_info->eob;
-      x->plane[plane].txb_entropy_ctx[block] =
-          intra_txb_rd_info->txb_entropy_ctx;
-      best_eob = intra_txb_rd_info->eob;
-      best_tx_type = intra_txb_rd_info->tx_type;
-      skip_trellis |= !intra_txb_rd_info->perform_block_coeff_opt;
-      update_txk_array(xd, blk_row, blk_col, tx_size, best_tx_type);
-      recon_intra(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
-                  txb_ctx, skip_trellis, best_tx_type, 1, &rate_cost, best_eob);
-      p->dqcoeff = orig_dqcoeff;
-      return;
-    }
-  }
-
   uint8_t best_txb_ctx = 0;
   // txk_allowed = TX_TYPES: >1 tx types are allowed
   // txk_allowed < TX_TYPES: only that specific tx type is allowed.
@@ -2595,18 +2255,6 @@
     best_rd_stats->sse = block_sse;
   }
 
-  if (intra_txb_rd_info != NULL) {
-    intra_txb_rd_info->valid = 1;
-    intra_txb_rd_info->entropy_context = cur_joint_ctx;
-    intra_txb_rd_info->rate = best_rd_stats->rate;
-    intra_txb_rd_info->dist = best_rd_stats->dist;
-    intra_txb_rd_info->sse = best_rd_stats->sse;
-    intra_txb_rd_info->eob = best_eob;
-    intra_txb_rd_info->txb_entropy_ctx = best_txb_ctx;
-    intra_txb_rd_info->perform_block_coeff_opt = perform_block_coeff_opt;
-    if (plane == 0) intra_txb_rd_info->tx_type = best_tx_type;
-  }
-
   // Intra mode needs decoded pixels such that the next transform block
   // can use them for prediction.
   recon_intra(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
@@ -2621,51 +2269,13 @@
                                   int block, int plane_bsize, TXB_CTX *txb_ctx,
                                   RD_STATS *rd_stats,
                                   FAST_TX_SEARCH_MODE ftxs_mode,
-                                  int64_t ref_rdcost,
-                                  TXB_RD_INFO *rd_info_array) {
-  const struct macroblock_plane *const p = &x->plane[0];
-  const uint16_t cur_joint_ctx =
-      (txb_ctx->dc_sign_ctx << 8) + txb_ctx->txb_skip_ctx;
-  MACROBLOCKD *xd = &x->e_mbd;
-  assert(is_inter_block(xd->mi[0]));
-  const int tx_type_map_idx = blk_row * xd->tx_type_map_stride + blk_col;
-  // Look up RD and terminate early in case when we've already processed exactly
-  // the same residue with exactly the same entropy context.
-  if (rd_info_array != NULL && rd_info_array->valid &&
-      rd_info_array->entropy_context == cur_joint_ctx) {
-    xd->tx_type_map[tx_type_map_idx] = rd_info_array->tx_type;
-    const TX_TYPE ref_tx_type =
-        av1_get_tx_type(&x->e_mbd, get_plane_type(0), blk_row, blk_col, tx_size,
-                        cpi->common.features.reduced_tx_set_used);
-    if (ref_tx_type == rd_info_array->tx_type) {
-      rd_stats->rate += rd_info_array->rate;
-      rd_stats->dist += rd_info_array->dist;
-      rd_stats->sse += rd_info_array->sse;
-      rd_stats->skip_txfm &= rd_info_array->eob == 0;
-      p->eobs[block] = rd_info_array->eob;
-      p->txb_entropy_ctx[block] = rd_info_array->txb_entropy_ctx;
-      return;
-    }
-  }
-
+                                  int64_t ref_rdcost) {
   RD_STATS this_rd_stats;
   const int skip_trellis = 0;
   search_tx_type(cpi, x, 0, block, blk_row, blk_col, plane_bsize, tx_size,
                  txb_ctx, ftxs_mode, skip_trellis, ref_rdcost, &this_rd_stats);
 
   av1_merge_rd_stats(rd_stats, &this_rd_stats);
-
-  // Save RD results for possible reuse in future.
-  if (rd_info_array != NULL) {
-    rd_info_array->valid = 1;
-    rd_info_array->entropy_context = cur_joint_ctx;
-    rd_info_array->rate = this_rd_stats.rate;
-    rd_info_array->dist = this_rd_stats.dist;
-    rd_info_array->sse = this_rd_stats.sse;
-    rd_info_array->eob = p->eobs[block];
-    rd_info_array->txb_entropy_ctx = p->txb_entropy_ctx[block];
-    rd_info_array->tx_type = xd->tx_type_map[tx_type_map_idx];
-  }
 }
 
 static AOM_INLINE void try_tx_block_no_split(
@@ -2673,8 +2283,7 @@
     TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize,
     const ENTROPY_CONTEXT *ta, const ENTROPY_CONTEXT *tl,
     int txfm_partition_ctx, RD_STATS *rd_stats, int64_t ref_best_rd,
-    FAST_TX_SEARCH_MODE ftxs_mode, TXB_RD_INFO_NODE *rd_info_node,
-    TxCandidateInfo *no_split) {
+    FAST_TX_SEARCH_MODE ftxs_mode, TxCandidateInfo *no_split) {
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
   struct macroblock_plane *const p = &x->plane[0];
@@ -2690,8 +2299,7 @@
   const int index = av1_get_txb_size_index(plane_bsize, blk_row, blk_col);
   mbmi->inter_tx_size[index] = tx_size;
   tx_type_rd(cpi, x, tx_size, blk_row, blk_col, block, plane_bsize, &txb_ctx,
-             rd_stats, ftxs_mode, ref_best_rd,
-             rd_info_node != NULL ? rd_info_node->rd_info_array : NULL);
+             rd_stats, ftxs_mode, ref_best_rd);
   assert(rd_stats->rate < INT_MAX);
 
   const int pick_skip_txfm =
@@ -2726,8 +2334,7 @@
     TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta,
     ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left,
     int txfm_partition_ctx, int64_t no_split_rd, int64_t ref_best_rd,
-    FAST_TX_SEARCH_MODE ftxs_mode, TXB_RD_INFO_NODE *rd_info_node,
-    RD_STATS *split_rd_stats) {
+    FAST_TX_SEARCH_MODE ftxs_mode, RD_STATS *split_rd_stats) {
   assert(tx_size < TX_SIZES_ALL);
   MACROBLOCKD *const xd = &x->e_mbd;
   const int max_blocks_high = max_block_high(xd, plane_bsize, 0);
@@ -2755,11 +2362,10 @@
 
       RD_STATS this_rd_stats;
       int this_cost_valid = 1;
-      select_tx_block(
-          cpi, x, offsetr, offsetc, block, sub_txs, depth + 1, plane_bsize, ta,
-          tl, tx_above, tx_left, &this_rd_stats, no_split_rd / nblks,
-          ref_best_rd - split_rd_stats->rdcost, &this_cost_valid, ftxs_mode,
-          (rd_info_node != NULL) ? rd_info_node->children[blk_idx] : NULL);
+      select_tx_block(cpi, x, offsetr, offsetc, block, sub_txs, depth + 1,
+                      plane_bsize, ta, tl, tx_above, tx_left, &this_rd_stats,
+                      no_split_rd / nblks, ref_best_rd - split_rd_stats->rdcost,
+                      &this_cost_valid, ftxs_mode);
       if (!this_cost_valid) {
         split_rd_stats->rdcost = INT64_MAX;
         return;
@@ -2874,8 +2480,7 @@
     TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta,
     ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left,
     RD_STATS *rd_stats, int64_t prev_level_rd, int64_t ref_best_rd,
-    int *is_cost_valid, FAST_TX_SEARCH_MODE ftxs_mode,
-    TXB_RD_INFO_NODE *rd_info_node) {
+    int *is_cost_valid, FAST_TX_SEARCH_MODE ftxs_mode) {
   assert(tx_size < TX_SIZES_ALL);
   av1_init_rd_stats(rd_stats);
   if (ref_best_rd < 0) {
@@ -2914,7 +2519,7 @@
   if (try_no_split) {
     try_tx_block_no_split(cpi, x, blk_row, blk_col, block, tx_size, depth,
                           plane_bsize, ta, tl, ctx, rd_stats, ref_best_rd,
-                          ftxs_mode, rd_info_node, &no_split);
+                          ftxs_mode, &no_split);
 
     // Speed features for early termination.
     const int search_level = cpi->sf.tx_sf.adaptive_txb_search_level;
@@ -2950,7 +2555,7 @@
     try_tx_block_split(cpi, x, blk_row, blk_col, block, tx_size, depth,
                        plane_bsize, ta, tl, tx_above, tx_left, ctx, no_split.rd,
                        AOMMIN(no_split.rd, ref_best_rd), ftxs_mode,
-                       rd_info_node, &split_rd_stats);
+                       &split_rd_stats);
   }
 
   if (no_split.rd < split_rd_stats.rdcost) {
@@ -3457,7 +3062,7 @@
             .txb_skip_cost[txb_ctx.txb_skip_ctx][1];
     rd_stats->zero_rate = zero_blk_rate;
     tx_type_rd(cpi, x, tx_size, blk_row, blk_col, block, plane_bsize, &txb_ctx,
-               rd_stats, ftxs_mode, ref_best_rd, NULL);
+               rd_stats, ftxs_mode, ref_best_rd);
     const int mi_width = mi_size_wide[plane_bsize];
     TxfmSearchInfo *txfm_info = &x->txfm_search_info;
     if (RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >=
@@ -3596,8 +3201,7 @@
 // will be saved in rd_stats. The returned value is the corresponding RD cost.
 static int64_t select_tx_size_and_type(const AV1_COMP *cpi, MACROBLOCK *x,
                                        RD_STATS *rd_stats, BLOCK_SIZE bsize,
-                                       int64_t ref_best_rd,
-                                       TXB_RD_INFO_NODE *rd_info_tree) {
+                                       int64_t ref_best_rd) {
   MACROBLOCKD *const xd = &x->e_mbd;
   const TxfmSearchParams *txfm_params = &x->txfm_search_params;
   assert(is_inter_block(xd->mi[0]));
@@ -3650,7 +3254,7 @@
       // Search for the best transform block size and type for the sub-block.
       select_tx_block(cpi, x, idy, idx, block, max_tx_size, init_depth, bsize,
                       ctxa, ctxl, tx_above, tx_left, &pn_rd_stats, INT64_MAX,
-                      best_rd_sofar, &is_cost_valid, ftxs_mode, rd_info_tree);
+                      best_rd_sofar, &is_cost_valid, ftxs_mode);
       if (!is_cost_valid || pn_rd_stats.rate == INT_MAX) {
         av1_invalid_rd_stats(rd_stats);
         return INT64_MAX;
@@ -3660,7 +3264,6 @@
       no_skip_txfm_rd =
           RDCOST(x->rdmult, rd_stats->rate + no_skip_txfm_cost, rd_stats->dist);
       block += step;
-      if (rd_info_tree != NULL) rd_info_tree += 1;
     }
   }
 
@@ -3771,19 +3374,8 @@
   ++x->txfm_search_info.tx_search_count;
 #endif  // CONFIG_SPEED_STATS
 
-  // Pre-compute residue hashes (transform block level) and find existing or
-  // add new RD records to store and reuse rate and distortion values to speed
-  // up TX size/type search.
-  TXB_RD_INFO_NODE matched_rd_info[4 + 16 + 64];
-  int found_rd_info = 0;
-  if (ref_best_rd != INT64_MAX && within_border &&
-      cpi->sf.tx_sf.use_inter_txb_hash) {
-    found_rd_info = find_tx_size_rd_records(x, bsize, matched_rd_info);
-  }
-
   const int64_t rd =
-      select_tx_size_and_type(cpi, x, rd_stats, bsize, ref_best_rd,
-                              found_rd_info ? matched_rd_info : NULL);
+      select_tx_size_and_type(cpi, x, rd_stats, bsize, ref_best_rd);
 
   if (rd == INT64_MAX) {
     // We should always find at least one candidate unless ref_best_rd is less

diff --git a/av1/encoder/x86/av1_quantize_sse2.c b/av1/encoder/x86/av1_quantize_sse2.c
index 5497c7e..b533894 100644
--- a/av1/encoder/x86/av1_quantize_sse2.c
+++ b/av1/encoder/x86/av1_quantize_sse2.c

@@ -15,6 +15,7 @@
 #include "config/av1_rtcd.h"
 
 #include "aom/aom_integer.h"
+#include "aom_dsp/x86/quantize_x86.h"
 
 static INLINE void read_coeff(const tran_low_t *coeff, intptr_t offset,
                               __m128i *c0, __m128i *c1) {
@@ -187,3 +188,102 @@
     *eob_ptr = _mm_extract_epi16(eob, 1);
   }
 }
+
+static INLINE void quantize_lp(const int16_t *iscan_ptr,
+                               const int16_t *coeff_ptr, intptr_t n_coeffs,
+                               int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+                               const __m128i *round0, const __m128i *round1,
+                               const __m128i *quant0, const __m128i *quant1,
+                               const __m128i *dequant0, const __m128i *dequant1,
+                               __m128i *eob) {
+  const int16_t *read = coeff_ptr + n_coeffs;
+  __m128i coeff0 = _mm_load_si128((const __m128i *)read);
+  __m128i coeff1 = _mm_load_si128((const __m128i *)read + 1);
+
+  // Poor man's sign extract
+  const __m128i coeff0_sign = _mm_srai_epi16(coeff0, 15);
+  const __m128i coeff1_sign = _mm_srai_epi16(coeff1, 15);
+  __m128i qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+  __m128i qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+  qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+  qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+  qcoeff0 = _mm_adds_epi16(qcoeff0, *round0);
+  qcoeff1 = _mm_adds_epi16(qcoeff1, *round1);
+  const __m128i qtmp0 = _mm_mulhi_epi16(qcoeff0, *quant0);
+  const __m128i qtmp1 = _mm_mulhi_epi16(qcoeff1, *quant1);
+
+  // Reinsert signs
+  qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+  qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+  qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+  qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+  int16_t *addr = qcoeff_ptr + n_coeffs;
+  _mm_store_si128((__m128i *)addr, qcoeff0);
+  _mm_store_si128((__m128i *)addr + 1, qcoeff1);
+
+  coeff0 = _mm_mullo_epi16(qcoeff0, *dequant0);
+  coeff1 = _mm_mullo_epi16(qcoeff1, *dequant1);
+
+  addr = dqcoeff_ptr + n_coeffs;
+  _mm_store_si128((__m128i *)addr, coeff0);
+  _mm_store_si128((__m128i *)addr + 1, coeff1);
+
+  const __m128i zero = _mm_setzero_si128();
+  // Scan for eob
+  const __m128i zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+  const __m128i zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+  const __m128i nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+  const __m128i nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+
+  const __m128i iscan0 =
+      _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
+  const __m128i iscan1 =
+      _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
+
+  // Add one to convert from indices to counts
+  const __m128i iscan0_nz = _mm_sub_epi16(iscan0, nzero_coeff0);
+  const __m128i iscan1_nz = _mm_sub_epi16(iscan1, nzero_coeff1);
+  const __m128i eob0 = _mm_and_si128(iscan0_nz, nzero_coeff0);
+  const __m128i eob1 = _mm_and_si128(iscan1_nz, nzero_coeff1);
+  const __m128i eob2 = _mm_max_epi16(eob0, eob1);
+  *eob = _mm_max_epi16(*eob, eob2);
+}
+
+void av1_quantize_lp_sse2(const int16_t *coeff_ptr, intptr_t n_coeffs,
+                          const int16_t *round_ptr, const int16_t *quant_ptr,
+                          int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+                          const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                          const int16_t *scan, const int16_t *iscan) {
+  (void)scan;
+  coeff_ptr += n_coeffs;
+  iscan += n_coeffs;
+  qcoeff_ptr += n_coeffs;
+  dqcoeff_ptr += n_coeffs;
+  n_coeffs = -n_coeffs;
+
+  // Setup global values
+  const __m128i round0 = _mm_load_si128((const __m128i *)round_ptr);
+  const __m128i round1 = _mm_unpackhi_epi64(round0, round0);
+  const __m128i quant0 = _mm_load_si128((const __m128i *)quant_ptr);
+  const __m128i quant1 = _mm_unpackhi_epi64(quant0, quant0);
+  const __m128i dequant0 = _mm_load_si128((const __m128i *)dequant_ptr);
+  const __m128i dequant1 = _mm_unpackhi_epi64(dequant0, dequant0);
+  __m128i eob = _mm_setzero_si128();
+
+  // DC and first 15 AC
+  quantize_lp(iscan, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr, &round0,
+              &round1, &quant0, &quant1, &dequant0, &dequant1, &eob);
+  n_coeffs += 8 * 2;
+
+  // AC only loop
+  while (n_coeffs < 0) {
+    quantize_lp(iscan, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr, &round1,
+                &round1, &quant1, &quant1, &dequant1, &dequant1, &eob);
+    n_coeffs += 8 * 2;
+  }
+
+  // Accumulate EOB
+  *eob_ptr = accumulate_eob(eob);
+}

diff --git a/av1/encoder/x86/error_intrin_sse2.c b/av1/encoder/x86/error_intrin_sse2.c
new file mode 100644
index 0000000..e876db1
--- /dev/null
+++ b/av1/encoder/x86/error_intrin_sse2.c

@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>  // SSE2
+
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+
+static AOM_INLINE __m128i reduce_sum_epi64(__m128i reg) {
+  __m128i reg_hi = _mm_srli_si128(reg, 8);
+  reg = _mm_add_epi64(reg, reg_hi);
+
+  return reg;
+}
+
+int64_t av1_block_error_lp_sse2(const int16_t *coeff, const int16_t *dqcoeff,
+                                intptr_t block_size) {
+  assert(block_size % 16 == 0);
+  assert(block_size >= 16);
+
+  const __m128i zero = _mm_setzero_si128();
+  __m128i accum_0 = zero;
+  __m128i accum_1 = zero;
+
+  for (int i = 0; i < block_size; i += 16) {
+    // Load 8 elements for coeff and dqcoeff.
+    const __m128i _coeff_0 = _mm_loadu_si128((const __m128i *)coeff);
+    const __m128i _coeff_1 = _mm_loadu_si128((const __m128i *)(coeff + 8));
+    const __m128i _dqcoeff_0 = _mm_loadu_si128((const __m128i *)dqcoeff);
+    const __m128i _dqcoeff_1 = _mm_loadu_si128((const __m128i *)(dqcoeff + 8));
+    // Compute the diff
+    const __m128i diff_0 = _mm_sub_epi16(_dqcoeff_0, _coeff_0);
+    const __m128i diff_1 = _mm_sub_epi16(_dqcoeff_1, _coeff_1);
+    // Compute the error
+    const __m128i error_0 = _mm_madd_epi16(diff_0, diff_0);
+    const __m128i error_1 = _mm_madd_epi16(diff_1, diff_1);
+
+    const __m128i error_lo_0 = _mm_unpacklo_epi32(error_0, zero);
+    const __m128i error_lo_1 = _mm_unpacklo_epi32(error_1, zero);
+    const __m128i error_hi_0 = _mm_unpackhi_epi32(error_0, zero);
+    const __m128i error_hi_1 = _mm_unpackhi_epi32(error_1, zero);
+
+    // Accumulate
+    accum_0 = _mm_add_epi64(accum_0, error_lo_0);
+    accum_1 = _mm_add_epi64(accum_1, error_lo_1);
+    accum_0 = _mm_add_epi64(accum_0, error_hi_0);
+    accum_1 = _mm_add_epi64(accum_1, error_hi_1);
+
+    // Advance
+    coeff += 16;
+    dqcoeff += 16;
+  }
+
+  __m128i accum = _mm_add_epi64(accum_0, accum_1);
+  // Reduce sum the register
+  accum = reduce_sum_epi64(accum);
+
+  // Store the results.
+#if ARCH_X86_64
+  return _mm_cvtsi128_si64(accum);
+#else
+  int64_t result;
+  _mm_storel_epi64((__m128i *)&result, accum);
+  return result;
+#endif  // ARCH_X86_64
+}

diff --git a/build/cmake/aom_config_defaults.cmake b/build/cmake/aom_config_defaults.cmake
index a5e1b7b..fde8a45 100644
--- a/build/cmake/aom_config_defaults.cmake
+++ b/build/cmake/aom_config_defaults.cmake

@@ -73,11 +73,9 @@
 set_aom_config_var(CONFIG_BIG_ENDIAN 0 "Internal flag.")
 set_aom_config_var(CONFIG_FRAME_PARALLEL_ENCODE 0
                    "Enable frame parallelism during encode.")
-if(CONFIG_FRAME_PARALLEL_ENCODE)
-  set_aom_config_var(
-    CONFIG_FRAME_PARALLEL_ENCODE_2 0
-    "Enable frame parallelism during encode for frames in lower layer depths.")
-endif() # CONFIG_FRAME_PARALLEL_ENCODE
+set_aom_config_var(
+  CONFIG_FRAME_PARALLEL_ENCODE_2 0
+  "Enable frame parallelism during encode for frames in lower layer depths.")
 set_aom_config_var(CONFIG_GCC 0 "Building with GCC (detect).")
 set_aom_config_var(CONFIG_GCOV 0 "Enable gcov support.")
 set_aom_config_var(CONFIG_GPROF 0 "Enable gprof support.")

diff --git a/examples/svc_encoder_rtc.c b/examples/svc_encoder_rtc.c
index 1516b58..e2da3ae 100644
--- a/examples/svc_encoder_rtc.c
+++ b/examples/svc_encoder_rtc.c

@@ -560,7 +560,7 @@
     int layering_mode, int superframe_cnt, aom_svc_layer_id_t *layer_id,
     aom_svc_ref_frame_config_t *ref_frame_config,
     aom_svc_ref_frame_comp_pred_t *ref_frame_comp_pred, int *use_svc_control,
-    int spatial_layer_id, int is_key_frame, int ksvc_mode) {
+    int spatial_layer_id, int is_key_frame, int ksvc_mode, int speed) {
   int i;
   int enable_longterm_temporal_ref = 1;
   int shift = (layering_mode == 8) ? 2 : 0;
@@ -646,8 +646,12 @@
 
       // Keep golden fixed at slot 3.
       ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 3;
-      // Cyclically refresh slots 4, 5, 6, 7, for lag altref.
-      lag_index = 4 + (base_count % 4);
+      // Cyclically refresh slots 5, 6, 7, for lag altref.
+      lag_index = 5;
+      if (base_count > 0) {
+        lag_index = 5 + (base_count % 3);
+        if (superframe_cnt % 4 != 0) lag_index = 5 + ((base_count + 1) % 3);
+      }
       // Set the altref slot to lag_index.
       ref_frame_config->ref_idx[SVC_ALTREF_FRAME] = lag_index;
       if (superframe_cnt % 4 == 0) {
@@ -681,6 +685,8 @@
       // Every frame can reference GOLDEN AND ALTREF.
       ref_frame_config->reference[SVC_GOLDEN_FRAME] = 1;
       ref_frame_config->reference[SVC_ALTREF_FRAME] = 1;
+      // Allow for compound prediction using LAST and ALTREF.
+      if (speed >= 7) ref_frame_comp_pred->use_comp_pred[2] = 1;
       break;
     case 4:
       // 3-temporal layer: but middle layer updates GF, so 2nd TL2 will
@@ -1307,7 +1313,7 @@
         set_layer_pattern(app_input.layering_mode, frame_cnt, &layer_id,
                           &ref_frame_config, &ref_frame_comp_pred,
                           &use_svc_control, slx, is_key_frame,
-                          (app_input.layering_mode == 10));
+                          (app_input.layering_mode == 10), app_input.speed);
         aom_codec_control(&codec, AV1E_SET_SVC_LAYER_ID, &layer_id);
         if (use_svc_control) {
           aom_codec_control(&codec, AV1E_SET_SVC_REF_FRAME_CONFIG,

diff --git a/test/avg_test.cc b/test/avg_test.cc
index 4c6d9b5..92d5c59 100644
--- a/test/avg_test.cc
+++ b/test/avg_test.cc

@@ -9,6 +9,7 @@
  */
 
 #include <stdlib.h>
+#include <ostream>
 #include <tuple>
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
@@ -521,22 +522,39 @@
 #endif
 
 typedef int (*SatdFunc)(const tran_low_t *coeffs, int length);
-typedef ::testing::tuple<int, SatdFunc, SatdFunc> SatdTestParam;
-class SatdTest : public ::testing::Test,
-                 public ::testing::WithParamInterface<SatdTestParam> {
- protected:
-  virtual void SetUp() {
-    satd_size_ = GET_PARAM(0);
-    satd_func_ref_ = GET_PARAM(1);
-    satd_func_simd_ = GET_PARAM(2);
+typedef int (*SatdLpFunc)(const int16_t *coeffs, int length);
 
+template <typename SatdFuncType>
+struct SatdTestParam {
+  SatdTestParam(int s, SatdFuncType f1, SatdFuncType f2)
+      : satd_size(s), func_ref(f1), func_simd(f2) {}
+  friend std::ostream &operator<<(std::ostream &os,
+                                  const SatdTestParam<SatdFuncType> &param) {
+    return os << "satd_size: " << param.satd_size;
+  }
+  int satd_size;
+  SatdFuncType func_ref;
+  SatdFuncType func_simd;
+};
+
+template <typename CoeffType, typename SatdFuncType>
+class SatdTestBase
+    : public ::testing::Test,
+      public ::testing::WithParamInterface<SatdTestParam<SatdFuncType>> {
+ protected:
+  explicit SatdTestBase(const SatdTestParam<SatdFuncType> &func_param) {
+    satd_size_ = func_param.satd_size;
+    satd_func_ref_ = func_param.func_ref;
+    satd_func_simd_ = func_param.func_simd;
+  }
+  virtual void SetUp() {
     rnd_.Reset(ACMRandom::DeterministicSeed());
-    src_ = reinterpret_cast<tran_low_t *>(
+    src_ = reinterpret_cast<CoeffType *>(
         aom_memalign(32, sizeof(*src_) * satd_size_));
     ASSERT_TRUE(src_ != NULL);
   }
   virtual void TearDown() { aom_free(src_); }
-  void FillConstant(const tran_low_t val) {
+  void FillConstant(const CoeffType val) {
     for (int i = 0; i < satd_size_; ++i) src_[i] = val;
   }
   void FillRandom() {
@@ -597,12 +615,17 @@
   int satd_size_;
 
  private:
-  tran_low_t *src_;
-  SatdFunc satd_func_ref_;
-  SatdFunc satd_func_simd_;
+  CoeffType *src_;
+  SatdFuncType satd_func_ref_;
+  SatdFuncType satd_func_simd_;
   ACMRandom rnd_;
 };
 
+class SatdTest : public SatdTestBase<tran_low_t, SatdFunc> {
+ public:
+  SatdTest() : SatdTestBase(GetParam()) {}
+};
+
 TEST_P(SatdTest, MinValue) {
   const int kMin = -32640;
   const int expected = -kMin * satd_size_;
@@ -639,13 +662,21 @@
 }
 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(SatdTest);
 
+INSTANTIATE_TEST_SUITE_P(
+    C, SatdTest,
+    ::testing::Values(SatdTestParam<SatdFunc>(16, &aom_satd_c, &aom_satd_c),
+                      SatdTestParam<SatdFunc>(64, &aom_satd_c, &aom_satd_c),
+                      SatdTestParam<SatdFunc>(256, &aom_satd_c, &aom_satd_c),
+                      SatdTestParam<SatdFunc>(1024, &aom_satd_c, &aom_satd_c)));
+
 #if HAVE_NEON
 INSTANTIATE_TEST_SUITE_P(
     NEON, SatdTest,
-    ::testing::Values(make_tuple(16, &aom_satd_c, &aom_satd_neon),
-                      make_tuple(64, &aom_satd_c, &aom_satd_neon),
-                      make_tuple(256, &aom_satd_c, &aom_satd_neon),
-                      make_tuple(1024, &aom_satd_c, &aom_satd_neon)));
+    ::testing::Values(SatdTestParam<SatdFunc>(16, &aom_satd_c, &aom_satd_neon),
+                      SatdTestParam<SatdFunc>(64, &aom_satd_c, &aom_satd_neon),
+                      SatdTestParam<SatdFunc>(256, &aom_satd_c, &aom_satd_neon),
+                      SatdTestParam<SatdFunc>(1024, &aom_satd_c,
+                                              &aom_satd_neon)));
 INSTANTIATE_TEST_SUITE_P(
     NEON, VectorVarTest,
     ::testing::Values(make_tuple(2, &aom_vector_var_c, &aom_vector_var_neon),
@@ -654,4 +685,104 @@
                       make_tuple(5, &aom_vector_var_c, &aom_vector_var_neon)));
 #endif
 
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, SatdTest,
+    ::testing::Values(SatdTestParam<SatdFunc>(16, &aom_satd_c, &aom_satd_avx2),
+                      SatdTestParam<SatdFunc>(64, &aom_satd_c, &aom_satd_avx2),
+                      SatdTestParam<SatdFunc>(256, &aom_satd_c, &aom_satd_avx2),
+                      SatdTestParam<SatdFunc>(1024, &aom_satd_c,
+                                              &aom_satd_avx2)));
+#endif
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, SatdTest,
+    ::testing::Values(SatdTestParam<SatdFunc>(16, &aom_satd_c, &aom_satd_sse2),
+                      SatdTestParam<SatdFunc>(64, &aom_satd_c, &aom_satd_sse2),
+                      SatdTestParam<SatdFunc>(256, &aom_satd_c, &aom_satd_sse2),
+                      SatdTestParam<SatdFunc>(1024, &aom_satd_c,
+                                              &aom_satd_sse2)));
+#endif
+
+class SatdLpTest : public SatdTestBase<int16_t, SatdLpFunc> {
+ public:
+  SatdLpTest() : SatdTestBase(GetParam()) {}
+};
+
+TEST_P(SatdLpTest, MinValue) {
+  const int kMin = -32640;
+  const int expected = -kMin * satd_size_;
+  FillConstant(kMin);
+  Check(expected);
+}
+TEST_P(SatdLpTest, MaxValue) {
+  const int kMax = 32640;
+  const int expected = kMax * satd_size_;
+  FillConstant(kMax);
+  Check(expected);
+}
+TEST_P(SatdLpTest, Random) {
+  int expected;
+  switch (satd_size_) {
+    case 16: expected = 205298; break;
+    case 64: expected = 1113950; break;
+    case 256: expected = 4268415; break;
+    case 1024: expected = 16954082; break;
+    default:
+      FAIL() << "Invalid satd size (" << satd_size_
+             << ") valid: 16/64/256/1024";
+  }
+  FillRandom();
+  Check(expected);
+}
+TEST_P(SatdLpTest, Match) {
+  FillRandom();
+  RunComparison();
+}
+TEST_P(SatdLpTest, DISABLED_Speed) {
+  FillRandom();
+  RunSpeedTest();
+}
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(SatdLpTest);
+
+// Add the following c test to avoid gtest uninitialized warning.
+INSTANTIATE_TEST_SUITE_P(
+    C, SatdLpTest,
+    ::testing::Values(
+        SatdTestParam<SatdLpFunc>(16, &aom_satd_lp_c, &aom_satd_lp_c),
+        SatdTestParam<SatdLpFunc>(64, &aom_satd_lp_c, &aom_satd_lp_c),
+        SatdTestParam<SatdLpFunc>(256, &aom_satd_lp_c, &aom_satd_lp_c),
+        SatdTestParam<SatdLpFunc>(1024, &aom_satd_lp_c, &aom_satd_lp_c)));
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, SatdLpTest,
+    ::testing::Values(
+        SatdTestParam<SatdLpFunc>(16, &aom_satd_lp_c, &aom_satd_lp_neon),
+        SatdTestParam<SatdLpFunc>(64, &aom_satd_lp_c, &aom_satd_lp_neon),
+        SatdTestParam<SatdLpFunc>(256, &aom_satd_lp_c, &aom_satd_lp_neon),
+        SatdTestParam<SatdLpFunc>(1024, &aom_satd_lp_c, &aom_satd_lp_neon)));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, SatdLpTest,
+    ::testing::Values(
+        SatdTestParam<SatdLpFunc>(16, &aom_satd_lp_c, &aom_satd_lp_avx2),
+        SatdTestParam<SatdLpFunc>(64, &aom_satd_lp_c, &aom_satd_lp_avx2),
+        SatdTestParam<SatdLpFunc>(256, &aom_satd_lp_c, &aom_satd_lp_avx2),
+        SatdTestParam<SatdLpFunc>(1024, &aom_satd_lp_c, &aom_satd_lp_avx2)));
+#endif
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, SatdLpTest,
+    ::testing::Values(
+        SatdTestParam<SatdLpFunc>(16, &aom_satd_lp_c, &aom_satd_lp_sse2),
+        SatdTestParam<SatdLpFunc>(64, &aom_satd_lp_c, &aom_satd_lp_sse2),
+        SatdTestParam<SatdLpFunc>(256, &aom_satd_lp_c, &aom_satd_lp_sse2),
+        SatdTestParam<SatdLpFunc>(1024, &aom_satd_lp_c, &aom_satd_lp_sse2)));
+#endif
+
 }  // namespace

diff --git a/test/error_block_test.cc b/test/error_block_test.cc
index 3ca4340..e4befd5 100644
--- a/test/error_block_test.cc
+++ b/test/error_block_test.cc

@@ -31,16 +31,20 @@
 namespace {
 const int kNumIterations = 1000;
 
-typedef int64_t (*ErrorBlockFunc)(const tran_low_t *coeff,
-                                  const tran_low_t *dqcoeff,
-                                  intptr_t block_size, int64_t *ssz, int bps);
+using ErrorBlockFunc = int64_t (*)(const tran_low_t *coeff,
+                                   const tran_low_t *dqcoeff,
+                                   intptr_t block_size, int64_t *ssz, int bps);
 
-typedef int64_t (*ErrorBlockFunc8Bits)(const tran_low_t *coeff,
-                                       const tran_low_t *dqcoeff,
-                                       intptr_t block_size, int64_t *ssz);
+using ErrorBlockFunc8Bits = int64_t (*)(const tran_low_t *coeff,
+                                        const tran_low_t *dqcoeff,
+                                        intptr_t block_size, int64_t *ssz);
 
-typedef std::tuple<ErrorBlockFunc, ErrorBlockFunc, aom_bit_depth_t>
-    ErrorBlockParam;
+using ErrorBlockLpFunc = int64_t (*)(const int16_t *coeff,
+                                     const int16_t *dqcoeff,
+                                     intptr_t block_size);
+
+using ErrorBlockParam =
+    std::tuple<ErrorBlockFunc, ErrorBlockFunc, aom_bit_depth_t>;
 
 template <ErrorBlockFunc8Bits fn>
 int64_t BlockError8BitWrapper(const tran_low_t *coeff,
@@ -50,6 +54,15 @@
   return fn(coeff, dqcoeff, block_size, ssz);
 }
 
+template <ErrorBlockLpFunc fn>
+int64_t BlockErrorLpWrapper(const tran_low_t *coeff, const tran_low_t *dqcoeff,
+                            intptr_t block_size, int64_t *ssz, int bps) {
+  EXPECT_EQ(bps, 8);
+  *ssz = -1;
+  return fn(reinterpret_cast<const int16_t *>(coeff),
+            reinterpret_cast<const int16_t *>(dqcoeff), block_size);
+}
+
 class ErrorBlockTest : public ::testing::TestWithParam<ErrorBlockParam> {
  public:
   virtual ~ErrorBlockTest() {}
@@ -246,7 +259,9 @@
              AOM_BITS_8),
 #endif
   make_tuple(&BlockError8BitWrapper<av1_block_error_sse2>,
-             &BlockError8BitWrapper<av1_block_error_c>, AOM_BITS_8)
+             &BlockError8BitWrapper<av1_block_error_c>, AOM_BITS_8),
+  make_tuple(&BlockErrorLpWrapper<av1_block_error_lp_sse2>,
+             &BlockErrorLpWrapper<av1_block_error_lp_c>, AOM_BITS_8)
 };
 
 INSTANTIATE_TEST_SUITE_P(SSE2, ErrorBlockTest,
@@ -264,7 +279,9 @@
              AOM_BITS_8),
 #endif
   make_tuple(&BlockError8BitWrapper<av1_block_error_avx2>,
-             &BlockError8BitWrapper<av1_block_error_c>, AOM_BITS_8)
+             &BlockError8BitWrapper<av1_block_error_c>, AOM_BITS_8),
+  make_tuple(&BlockErrorLpWrapper<av1_block_error_lp_avx2>,
+             &BlockErrorLpWrapper<av1_block_error_lp_c>, AOM_BITS_8)
 };
 
 INSTANTIATE_TEST_SUITE_P(AVX2, ErrorBlockTest,
@@ -280,10 +297,14 @@
 #endif  // HAVE_MSA
 
 #if (HAVE_NEON)
-INSTANTIATE_TEST_SUITE_P(
-    NEON, ErrorBlockTest,
-    ::testing::Values(make_tuple(&BlockError8BitWrapper<av1_block_error_neon>,
-                                 &BlockError8BitWrapper<av1_block_error_c>,
-                                 AOM_BITS_8)));
+const ErrorBlockParam kErrorBlockTestParamsNeon[] = {
+  make_tuple(&BlockError8BitWrapper<av1_block_error_neon>,
+             &BlockError8BitWrapper<av1_block_error_c>, AOM_BITS_8),
+  make_tuple(&BlockErrorLpWrapper<av1_block_error_lp_neon>,
+             &BlockErrorLpWrapper<av1_block_error_lp_c>, AOM_BITS_8)
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, ErrorBlockTest,
+                         ::testing::ValuesIn(kErrorBlockTestParamsNeon));
 #endif  // HAVE_NEON
 }  // namespace

diff --git a/test/hadamard_test.cc b/test/hadamard_test.cc
index 0141125..8813f33 100644
--- a/test/hadamard_test.cc
+++ b/test/hadamard_test.cc

@@ -23,16 +23,20 @@
 
 using libaom_test::ACMRandom;
 
-typedef void (*HadamardFunc)(const int16_t *a, ptrdiff_t a_stride,
-                             tran_low_t *b);
+using HadamardFunc = void (*)(const int16_t *a, ptrdiff_t a_stride,
+                              tran_low_t *b);
+// Low precision version of Hadamard Transform
+using HadamardLPFunc = void (*)(const int16_t *a, ptrdiff_t a_stride,
+                                int16_t *b);
 
-void HadamardLoop(const tran_low_t *a, tran_low_t *out) {
-  tran_low_t b[8];
+template <typename OutputType>
+void HadamardLoop(const OutputType *a, OutputType *out) {
+  OutputType b[8];
   for (int i = 0; i < 8; i += 2) {
     b[i + 0] = a[i * 8] + a[(i + 1) * 8];
     b[i + 1] = a[i * 8] - a[(i + 1) * 8];
   }
-  tran_low_t c[8];
+  OutputType c[8];
   for (int i = 0; i < 8; i += 4) {
     c[i + 0] = b[i + 0] + b[i + 2];
     c[i + 1] = b[i + 1] + b[i + 3];
@@ -49,19 +53,21 @@
   out[5] = c[3] - c[7];
 }
 
-void ReferenceHadamard8x8(const int16_t *a, int a_stride, tran_low_t *b) {
-  tran_low_t input[64];
-  tran_low_t buf[64];
+template <typename OutputType>
+void ReferenceHadamard8x8(const int16_t *a, int a_stride, OutputType *b) {
+  OutputType input[64];
+  OutputType buf[64];
   for (int i = 0; i < 8; ++i) {
     for (int j = 0; j < 8; ++j) {
-      input[i * 8 + j] = static_cast<tran_low_t>(a[i * a_stride + j]);
+      input[i * 8 + j] = static_cast<OutputType>(a[i * a_stride + j]);
     }
   }
   for (int i = 0; i < 8; ++i) HadamardLoop(input + i, buf + i * 8);
   for (int i = 0; i < 8; ++i) HadamardLoop(buf + i, b + i * 8);
 }
 
-void ReferenceHadamard16x16(const int16_t *a, int a_stride, tran_low_t *b) {
+template <typename OutputType>
+void ReferenceHadamard16x16(const int16_t *a, int a_stride, OutputType *b) {
   /* The source is a 16x16 block. The destination is rearranged to 8x32.
    * Input is 9 bit. */
   ReferenceHadamard8x8(a + 0 + 0 * a_stride, a_stride, b + 0);
@@ -72,16 +78,16 @@
   /* Overlay the 8x8 blocks and combine. */
   for (int i = 0; i < 64; ++i) {
     /* 8x8 steps the range up to 15 bits. */
-    const tran_low_t a0 = b[0];
-    const tran_low_t a1 = b[64];
-    const tran_low_t a2 = b[128];
-    const tran_low_t a3 = b[192];
+    const OutputType a0 = b[0];
+    const OutputType a1 = b[64];
+    const OutputType a2 = b[128];
+    const OutputType a3 = b[192];
 
     /* Prevent the result from escaping int16_t. */
-    const tran_low_t b0 = (a0 + a1) >> 1;
-    const tran_low_t b1 = (a0 - a1) >> 1;
-    const tran_low_t b2 = (a2 + a3) >> 1;
-    const tran_low_t b3 = (a2 - a3) >> 1;
+    const OutputType b0 = (a0 + a1) >> 1;
+    const OutputType b1 = (a0 - a1) >> 1;
+    const OutputType b2 = (a2 + a3) >> 1;
+    const OutputType b3 = (a2 - a3) >> 1;
 
     /* Store a 16 bit value. */
     b[0] = b0 + b2;
@@ -93,22 +99,23 @@
   }
 }
 
-void ReferenceHadamard32x32(const int16_t *a, int a_stride, tran_low_t *b) {
+template <typename OutputType>
+void ReferenceHadamard32x32(const int16_t *a, int a_stride, OutputType *b) {
   ReferenceHadamard16x16(a + 0 + 0 * a_stride, a_stride, b + 0);
   ReferenceHadamard16x16(a + 16 + 0 * a_stride, a_stride, b + 256);
   ReferenceHadamard16x16(a + 0 + 16 * a_stride, a_stride, b + 512);
   ReferenceHadamard16x16(a + 16 + 16 * a_stride, a_stride, b + 768);
 
   for (int i = 0; i < 256; ++i) {
-    const tran_low_t a0 = b[0];
-    const tran_low_t a1 = b[256];
-    const tran_low_t a2 = b[512];
-    const tran_low_t a3 = b[768];
+    const OutputType a0 = b[0];
+    const OutputType a1 = b[256];
+    const OutputType a2 = b[512];
+    const OutputType a3 = b[768];
 
-    const tran_low_t b0 = (a0 + a1) >> 2;
-    const tran_low_t b1 = (a0 - a1) >> 2;
-    const tran_low_t b2 = (a2 + a3) >> 2;
-    const tran_low_t b3 = (a2 - a3) >> 2;
+    const OutputType b0 = (a0 + a1) >> 2;
+    const OutputType b1 = (a0 - a1) >> 2;
+    const OutputType b2 = (a2 + a3) >> 2;
+    const OutputType b3 = (a2 - a3) >> 2;
 
     b[0] = b0 + b2;
     b[256] = b1 + b3;
@@ -119,45 +126,57 @@
   }
 }
 
-struct HadamardFuncWithSize {
-  HadamardFuncWithSize(HadamardFunc f, int s) : func(f), block_size(s) {}
-  HadamardFunc func;
+template <typename OutputType>
+void ReferenceHadamard(const int16_t *a, int a_stride, OutputType *b, int bwh) {
+  if (bwh == 32)
+    ReferenceHadamard32x32(a, a_stride, b);
+  else if (bwh == 16)
+    ReferenceHadamard16x16(a, a_stride, b);
+  else if (bwh == 8) {
+    ReferenceHadamard8x8(a, a_stride, b);
+  } else {
+    GTEST_FAIL() << "Invalid Hadamard transform size " << bwh << std::endl;
+  }
+}
+
+template <typename HadamardFuncType>
+struct FuncWithSize {
+  FuncWithSize(HadamardFuncType f, int s) : func(f), block_size(s) {}
+  HadamardFuncType func;
   int block_size;
 };
 
-std::ostream &operator<<(std::ostream &os, const HadamardFuncWithSize &hfs) {
+using HadamardFuncWithSize = FuncWithSize<HadamardFunc>;
+using HadamardLPFuncWithSize = FuncWithSize<HadamardLPFunc>;
+
+template <typename HadamardFuncType>
+std::ostream &operator<<(std::ostream &os,
+                         const FuncWithSize<HadamardFuncType> &hfs) {
   return os << "block size: " << hfs.block_size;
 }
 
-class HadamardTestBase : public ::testing::TestWithParam<HadamardFuncWithSize> {
+template <typename OutputType, typename HadamardFuncType>
+class HadamardTestBase
+    : public ::testing::TestWithParam<FuncWithSize<HadamardFuncType>> {
  public:
-  virtual void SetUp() {
-    h_func_ = GetParam().func;
-    bwh_ = GetParam().block_size;
+  explicit HadamardTestBase(const FuncWithSize<HadamardFuncType> &func_param) {
+    h_func_ = func_param.func;
+    bwh_ = func_param.block_size;
     block_size_ = bwh_ * bwh_;
-    rnd_.Reset(ACMRandom::DeterministicSeed());
   }
 
+  virtual void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
+
   virtual int16_t Rand() = 0;
 
-  void ReferenceHadamard(const int16_t *a, int a_stride, tran_low_t *b,
-                         int bwh) {
-    if (bwh == 32)
-      ReferenceHadamard32x32(a, a_stride, b);
-    else if (bwh == 16)
-      ReferenceHadamard16x16(a, a_stride, b);
-    else
-      ReferenceHadamard8x8(a, a_stride, b);
-  }
-
   void CompareReferenceRandom() {
     const int kMaxBlockSize = 32 * 32;
     DECLARE_ALIGNED(16, int16_t, a[kMaxBlockSize]);
-    DECLARE_ALIGNED(16, tran_low_t, b[kMaxBlockSize]);
+    DECLARE_ALIGNED(16, OutputType, b[kMaxBlockSize]);
     memset(a, 0, sizeof(a));
     memset(b, 0, sizeof(b));
 
-    tran_low_t b_ref[kMaxBlockSize];
+    OutputType b_ref[kMaxBlockSize];
     memset(b_ref, 0, sizeof(b_ref));
 
     for (int i = 0; i < block_size_; ++i) a[i] = Rand();
@@ -174,11 +193,11 @@
   void VaryStride() {
     const int kMaxBlockSize = 32 * 32;
     DECLARE_ALIGNED(16, int16_t, a[kMaxBlockSize * 8]);
-    DECLARE_ALIGNED(16, tran_low_t, b[kMaxBlockSize]);
+    DECLARE_ALIGNED(16, OutputType, b[kMaxBlockSize]);
     memset(a, 0, sizeof(a));
     for (int i = 0; i < block_size_ * 8; ++i) a[i] = Rand();
 
-    tran_low_t b_ref[kMaxBlockSize];
+    OutputType b_ref[kMaxBlockSize];
     for (int i = 8; i < 64; i += 8) {
       memset(b, 0, sizeof(b));
       memset(b_ref, 0, sizeof(b_ref));
@@ -196,7 +215,7 @@
   void SpeedTest(int times) {
     const int kMaxBlockSize = 32 * 32;
     DECLARE_ALIGNED(16, int16_t, input[kMaxBlockSize]);
-    DECLARE_ALIGNED(16, tran_low_t, output[kMaxBlockSize]);
+    DECLARE_ALIGNED(16, OutputType, output[kMaxBlockSize]);
     memset(input, 1, sizeof(input));
     memset(output, 0, sizeof(output));
 
@@ -217,11 +236,12 @@
  private:
   int bwh_;
   int block_size_;
-  HadamardFunc h_func_;
+  HadamardFuncType h_func_;
 };
 
-class HadamardLowbdTest : public HadamardTestBase {
+class HadamardLowbdTest : public HadamardTestBase<tran_low_t, HadamardFunc> {
  public:
+  HadamardLowbdTest() : HadamardTestBase(GetParam()) {}
   virtual int16_t Rand() { return rnd_.Rand9Signed(); }
 };
 
@@ -229,6 +249,8 @@
 
 TEST_P(HadamardLowbdTest, VaryStride) { VaryStride(); }
 
+TEST_P(HadamardLowbdTest, DISABLED_SpeedTest) { SpeedTest(1000000); }
+
 INSTANTIATE_TEST_SUITE_P(
     C, HadamardLowbdTest,
     ::testing::Values(HadamardFuncWithSize(&aom_hadamard_8x8_c, 8),
@@ -257,4 +279,44 @@
                       HadamardFuncWithSize(&aom_hadamard_16x16_neon, 16)));
 #endif  // HAVE_NEON
 
+// Tests for low precision
+class HadamardLowbdLPTest : public HadamardTestBase<int16_t, HadamardLPFunc> {
+ public:
+  HadamardLowbdLPTest() : HadamardTestBase(GetParam()) {}
+  virtual int16_t Rand() { return rnd_.Rand9Signed(); }
+};
+
+TEST_P(HadamardLowbdLPTest, CompareReferenceRandom) {
+  CompareReferenceRandom();
+}
+
+TEST_P(HadamardLowbdLPTest, VaryStride) { VaryStride(); }
+
+TEST_P(HadamardLowbdLPTest, DISABLED_SpeedTest) { SpeedTest(1000000); }
+
+INSTANTIATE_TEST_SUITE_P(
+    C, HadamardLowbdLPTest,
+    ::testing::Values(HadamardLPFuncWithSize(&aom_hadamard_lp_8x8_c, 8),
+                      HadamardLPFuncWithSize(&aom_hadamard_lp_16x16_c, 16)));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, HadamardLowbdLPTest,
+    ::testing::Values(HadamardLPFuncWithSize(&aom_hadamard_lp_8x8_sse2, 8),
+                      HadamardLPFuncWithSize(&aom_hadamard_lp_16x16_sse2, 16)));
+#endif  // HAVE_SSE2
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, HadamardLowbdLPTest,
+    ::testing::Values(HadamardLPFuncWithSize(&aom_hadamard_lp_16x16_avx2, 16)));
+#endif  // HAVE_AVX2
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, HadamardLowbdLPTest,
+    ::testing::Values(HadamardLPFuncWithSize(&aom_hadamard_lp_8x8_neon, 8),
+                      HadamardLPFuncWithSize(&aom_hadamard_lp_16x16_neon, 16)));
+#endif  // HAVE_NEON
+
 }  // namespace

diff --git a/test/quantize_lp_func_test.cc b/test/quantize_lp_func_test.cc
index 850ec94..f398623 100644
--- a/test/quantize_lp_func_test.cc
+++ b/test/quantize_lp_func_test.cc

@@ -334,6 +334,20 @@
                          ::testing::ValuesIn(kQParamArrayAVX2));
 #endif
 
+#if HAVE_SSE2
+const QuantizeParam kQParamArraySSE2[] = {
+  make_tuple(&av1_quantize_lp_c, &av1_quantize_lp_sse2,
+             static_cast<TX_SIZE>(TX_16X16), AOM_BITS_8),
+  make_tuple(&av1_quantize_lp_c, &av1_quantize_lp_sse2,
+             static_cast<TX_SIZE>(TX_8X8), AOM_BITS_8),
+  make_tuple(&av1_quantize_lp_c, &av1_quantize_lp_sse2,
+             static_cast<TX_SIZE>(TX_4X4), AOM_BITS_8)
+};
+
+INSTANTIATE_TEST_SUITE_P(SSE2, FullPrecisionQuantizeLpTest,
+                         ::testing::ValuesIn(kQParamArraySSE2));
+#endif
+
 #if HAVE_NEON
 const QuantizeParam kQParamArrayNEON[] = {
   make_tuple(&av1_quantize_lp_c, &av1_quantize_lp_neon,

diff --git a/test/ratectrl_test.cc b/test/ratectrl_test.cc
index 67adf27..4b462e3 100644
--- a/test/ratectrl_test.cc
+++ b/test/ratectrl_test.cc

@@ -41,7 +41,7 @@
 // TODO(angiebird): Move this test to tpl_mode_test.cc
 TEST(RatectrlTest, QModeComputeGOPQIndicesTest) {
   const int base_q_index = 80;
-  const double arf_qstep_ratio = 0.5;
+  double qstep_ratio_list[5] = { 0.5, 1, 1, 1, 0.5 };
   const aom_bit_depth_t bit_depth = AOM_BITS_8;
 
   const int gf_frame_index = 0;
@@ -58,10 +58,10 @@
   }
 
   const int arf_q = av1_get_q_index_from_qstep_ratio(
-      base_q_index, arf_qstep_ratio, bit_depth);
+      base_q_index, qstep_ratio_list[0], bit_depth);
 
   av1_q_mode_compute_gop_q_indices(gf_frame_index, base_q_index,
-                                   arf_qstep_ratio, bit_depth, &gf_group,
+                                   qstep_ratio_list, bit_depth, &gf_group,
                                    gf_group.q_val);
 
   for (int i = 0; i < gf_group.size; i++) {

diff --git a/test/rt_end_to_end_test.cc b/test/rt_end_to_end_test.cc
index b0b8ab2..5e360f2 100644
--- a/test/rt_end_to_end_test.cc
+++ b/test/rt_end_to_end_test.cc

@@ -36,19 +36,22 @@
                            { 6, { { 0, 35.3 }, { 3, 36.2 } } },
                            { 7, { { 0, 34.9 }, { 3, 35.8 } } },
                            { 8, { { 0, 35.0 }, { 3, 35.8 } } },
-                           { 9, { { 0, 34.9 }, { 3, 35.5 } } } } },
+                           { 9, { { 0, 34.9 }, { 3, 35.5 } } },
+                           { 10, { { 0, 34.7 }, { 3, 35.3 } } } } },
                        { "paris_352_288_30.y4m",
                          { { 5, { { 0, 36.2 }, { 3, 36.7 } } },
                            { 6, { { 0, 36.1 }, { 3, 36.5 } } },
                            { 7, { { 0, 35.5 }, { 3, 36.0 } } },
                            { 8, { { 0, 36.0 }, { 3, 36.5 } } },
-                           { 9, { { 0, 35.5 }, { 3, 36.0 } } } } },
+                           { 9, { { 0, 35.5 }, { 3, 36.0 } } },
+                           { 10, { { 0, 35.3 }, { 3, 35.9 } } } } },
                        { "niklas_1280_720_30.y4m",
                          { { 5, { { 0, 34.4 }, { 3, 34.30 } } },
                            { 6, { { 0, 34.2 }, { 3, 34.2 } } },
                            { 7, { { 0, 33.6 }, { 3, 33.6 } } },
                            { 8, { { 0, 33.48 }, { 3, 33.48 } } },
-                           { 9, { { 0, 33.4 }, { 3, 33.4 } } } } } };
+                           { 9, { { 0, 33.4 }, { 3, 33.4 } } },
+                           { 10, { { 0, 33.2 }, { 3, 33.2 } } } } } };
 
 typedef struct {
   const char *filename;
@@ -176,13 +179,13 @@
 TEST_P(RTEndToEndTestThreaded, EndtoEndPSNRTest) { DoTest(); }
 
 AV1_INSTANTIATE_TEST_SUITE(RTEndToEndTest, ::testing::ValuesIn(kTestVectors),
-                           ::testing::Range(5, 10),
+                           ::testing::Range(5, 11),
                            ::testing::Values<unsigned int>(0, 3),
                            ::testing::Values(1), ::testing::Values(1));
 
 AV1_INSTANTIATE_TEST_SUITE(RTEndToEndTestThreaded,
                            ::testing::ValuesIn(kTestVectors),
-                           ::testing::Range(5, 10),
+                           ::testing::Range(5, 11),
                            ::testing::Values<unsigned int>(0, 3),
                            ::testing::Range(2, 5), ::testing::Range(2, 5));
 }  // namespace

diff --git a/test/svc_datarate_test.cc b/test/svc_datarate_test.cc
index 19e7de8..243fcc1 100644
--- a/test/svc_datarate_test.cc
+++ b/test/svc_datarate_test.cc

@@ -474,7 +474,7 @@
     for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
       ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.80)
           << " The datarate for the file is lower than target by too much!";
-      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.30)
+      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.35)
           << " The datarate for the file is greater than target by too much!";
     }
     // Top temporal layers are non_reference, so exlcude them from
@@ -996,7 +996,7 @@
     for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
       ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.80)
           << " The datarate for the file is lower than target by too much!";
-      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.30)
+      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.35)
           << " The datarate for the file is greater than target by too much!";
     }
     // Test that no mismatches have been found.
@@ -1041,7 +1041,7 @@
     for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
       ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.80)
           << " The datarate for the file is lower than target by too much!";
-      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.30)
+      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.35)
           << " The datarate for the file is greater than target by too much!";
     }
     // Test that no mismatches have been found.
@@ -1086,7 +1086,7 @@
     for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
       ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.80)
           << " The datarate for the file is lower than target by too much!";
-      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.30)
+      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.35)
           << " The datarate for the file is greater than target by too much!";
     }
     // Test that no mismatches have been found.
@@ -1132,7 +1132,7 @@
     for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
       ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.80)
           << " The datarate for the file is lower than target by too much!";
-      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.30)
+      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.35)
           << " The datarate for the file is greater than target by too much!";
     }
     // Test that no mismatches have been found.

diff --git a/test/tpl_model_test.cc b/test/tpl_model_test.cc
index 3094c6c..da5a821 100644
--- a/test/tpl_model_test.cc
+++ b/test/tpl_model_test.cc

@@ -235,7 +235,7 @@
  * Helper method to brute-force search for the closest q_index
  * that achieves the specified bit budget.
  */
-int find_gop_q_iterative(double bit_budget, double arf_qstep_ratio,
+int find_gop_q_iterative(double bit_budget, const double *qstep_ratio_list,
                          GF_GROUP gf_group, const int *stats_valid_list,
                          TplTxfmStats *stats_list, int gf_frame_index,
                          aom_bit_depth_t bit_depth) {
@@ -243,7 +243,7 @@
   // Use the result to test against the binary search result.
 
   // Initial estimate when q = 255
-  av1_q_mode_compute_gop_q_indices(gf_frame_index, 255, arf_qstep_ratio,
+  av1_q_mode_compute_gop_q_indices(gf_frame_index, 255, qstep_ratio_list,
                                    bit_depth, &gf_group, gf_group.q_val);
   double curr_estimate = av1_estimate_gop_bitrate(
       gf_group.q_val, gf_group.size, stats_list, stats_valid_list, NULL);
@@ -252,7 +252,7 @@
 
   // Start at q = 254 because we already have an estimate for q = 255.
   for (int q = 254; q >= 0; q--) {
-    av1_q_mode_compute_gop_q_indices(gf_frame_index, q, arf_qstep_ratio,
+    av1_q_mode_compute_gop_q_indices(gf_frame_index, q, qstep_ratio_list,
                                      bit_depth, &gf_group, gf_group.q_val);
     curr_estimate = av1_estimate_gop_bitrate(
         gf_group.q_val, gf_group.size, stats_list, stats_valid_list, NULL);
@@ -274,10 +274,14 @@
                                           5, 6, 6, 1, 5, 1, 5, 6, 1, 5, 1, 4 };
   int stats_valid_list[25] = { 0 };
   const int gf_frame_index = 0;
-  const double arf_qstep_ratio = 2;
   const aom_bit_depth_t bit_depth = AOM_BITS_8;
   const double scale_factor = 1.0;
 
+  double qstep_ratio_list[25];
+  for (int i = 0; i < 25; i++) {
+    qstep_ratio_list[i] = 1;
+  }
+
   for (int i = 0; i < gf_group.size; i++) {
     stats_valid_list[i] = 1;
     gf_group.update_type[i] = gf_group_update_types[i];
@@ -297,9 +301,9 @@
     // Binary search method to find the optimal q.
     const int result = av1_q_mode_estimate_base_q(
         &gf_group, stats_list, stats_valid_list, bit_budget, gf_frame_index,
-        arf_qstep_ratio, bit_depth, scale_factor, q_index_list, NULL);
+        bit_depth, scale_factor, qstep_ratio_list, q_index_list, NULL);
     const int test_result = find_gop_q_iterative(
-        bit_budget, arf_qstep_ratio, gf_group, stats_valid_list, stats_list,
+        bit_budget, qstep_ratio_list, gf_group, stats_valid_list, stats_list,
         gf_frame_index, bit_depth);
 
     if (bit_budget == 0) {

diff --git a/tools/gop_bitrate/python/bitrate_accuracy.py b/tools/gop_bitrate/python/bitrate_accuracy.py
index 565794f..2a5da6a 100644
--- a/tools/gop_bitrate/python/bitrate_accuracy.py
+++ b/tools/gop_bitrate/python/bitrate_accuracy.py

@@ -1,40 +1,185 @@
 import numpy as np
 
+# Model A only.
 # Uses least squares regression to find the solution
 # when there is one unknown variable.
-def print_lstsq_solution(A, B):
+def lstsq_solution(A, B):
     A_inv = np.linalg.pinv(A)
     x = np.matmul(A_inv, B)
-    print("least squares solution:", x[0][0])
+    return x[0][0]
 
+# Model B only.
 # Uses the pseudoinverse matrix to find the solution
 # when there are two unknown variables.
-def print_pinv_solution(A, mv, B):
+def pinv_solution(A, mv, B):
     new_A = np.concatenate((A, mv), axis=1)
     new_A_inv = np.linalg.pinv(new_A)
     new_x = np.matmul(new_A_inv, B)
     print("pinv solution:", new_x[0][0], new_x[1][0])
+    return (new_x[0][0], new_x[1][0])
+
+# Model A only.
+# Finds the coefficient to multiply A by to minimize
+# the percentage error between A and B.
+def minimize_percentage_error_model_a(A, B):
+    R = np.divide(A, B)
+    num = 0
+    den = 0
+    best_x = 0
+    best_error = 100
+    for r_i in R:
+        num += r_i
+        den += r_i**2
+    if den == 0:
+        return 0
+    return (num/den)[0]
+
+# Model B only.
+# Finds the coefficients to multiply to the frame bitrate
+# and the motion vector bitrate to minimize the percent error.
+def minimize_percentage_error_model_b(r_e, r_m, r_f):
+    r_ef = np.divide(r_e, r_f)
+    r_mf = np.divide(r_m, r_f)
+    sum_ef = np.sum(r_ef)
+    sum_ef_sq = np.sum(np.square(r_ef))
+    sum_mf = np.sum(r_mf)
+    sum_mf_sq = np.sum(np.square(r_mf))
+    sum_ef_mf = np.sum(np.multiply(r_ef, r_mf))
+    # Divides x by y. If y is zero, returns 0.
+    divide = lambda x, y : 0 if y == 0 else x / y
+    # Set up and solve the matrix equation
+    A = np.array([[1, divide(sum_ef_mf, sum_ef_sq)],[divide(sum_ef_mf, sum_mf_sq), 1]])
+    B = np.array([divide(sum_ef, sum_ef_sq), divide(sum_mf, sum_mf_sq)])
+    A_inv = np.linalg.pinv(A)
+    x = np.matmul(A_inv, B)
+    return x
+
+# Model A only.
+# Calculates the least squares error between A and B
+# using coefficients in X.
+def average_lstsq_error(A, B, x):
+    error = 0
+    n = 0
+    for i, a in enumerate(A):
+        a = a[0]
+        b = B[i][0]
+        if b == 0:
+            continue
+        n += 1
+        error += (b - x*a)**2
+    if n == 0:
+        return None
+    error /= n
+    return error
+
+# Model A only.
+# Calculates the average percentage error between A and B.
+def average_percent_error_model_a(A, B, x):
+    error = 0
+    n = 0
+    for i, a in enumerate(A):
+        a = a[0]
+        b = B[i][0]
+        if b == 0:
+            continue
+        n += 1
+        error_i = (abs(x*a-b)/b)*100
+        error += error_i
+    error /= n
+    return error
+
+# Model B only.
+# Calculates the average percentage error between A and B.
+def average_percent_error_model_b(A, M, B, x):
+    error = 0
+    for i, a in enumerate(A):
+        a = a[0]
+        mv = M[i]
+        b = B[i][0]
+        if b == 0:
+            continue
+        estimate = x[0]*a
+        estimate += x[1]*mv
+        error += abs(estimate - b) / b
+    error *= 100
+    error /= A.shape[0]
+    return error
+
+def average_squared_error_model_a(A, B, x):
+    error = 0
+    n = 0
+    for i, a in enumerate(A):
+        a = a[0]
+        b = B[i][0]
+        if b == 0:
+            continue
+        n += 1
+        error_i = (1 - x*(a/b))**2
+        error += error_i
+    error /= n
+    error = error**0.5
+    return error * 100
+
+def average_squared_error_model_b(A, M, B, x):
+    error = 0
+    n = 0
+    for i, a in enumerate(A):
+        a = a[0]
+        b = B[i][0]
+        mv = M[i]
+        if b == 0:
+            continue
+        n += 1
+        error_i = 1 - ((x[0]*a + x[1]*mv)/b)
+        error_i = error_i**2
+        error += error_i
+    error /= n
+    error = error**0.5
+    return error * 100
 
 # Traverses the data and prints out one value for
 # each update type.
 def print_solutions(file_path):
     data = np.genfromtxt(file_path, delimiter="\t")
-
     prev_update = 0
     split_list_indices = list()
     for i, val in enumerate(data):
         if prev_update != val[3]:
             split_list_indices.append(i)
             prev_update = val[3]
-
     split = np.split(data, split_list_indices)
-
     for array in split:
         A, mv, B, update = np.hsplit(array, 4)
+        z = np.where(B == 0)[0]
+        r_e = np.delete(A, z, axis=0)
+        r_m = np.delete(mv, z, axis=0)
+        r_f = np.delete(B, z, axis=0)
+        A = r_e
+        mv = r_m
+        B = r_f
+        all_zeros = not A.any()
+        if all_zeros:
+            continue
         print("update type:", update[0][0])
-        print_lstsq_solution(A, B)
-        print_pinv_solution(A, mv, B)
+        x_ls = lstsq_solution(A, B)
+        x_a = minimize_percentage_error_model_a(A, B)
+        x_b = minimize_percentage_error_model_b(A, mv, B)
+        percent_error_a = average_percent_error_model_a(A, B, x_a)
+        percent_error_b = average_percent_error_model_b(A, mv, B, x_b)[0]
+        baseline_percent_error_a = average_percent_error_model_a(A, B, 1)
+        baseline_percent_error_b = average_percent_error_model_b(A, mv, B, [1, 1])[0]
+
+        squared_error_a = average_squared_error_model_a(A, B, x_a)
+        squared_error_b = average_squared_error_model_b(A, mv, B, x_b)[0]
+        baseline_squared_error_a = average_squared_error_model_a(A, B, 1)
+        baseline_squared_error_b = average_squared_error_model_b(A, mv, B, [1, 1])[0]
+
+        print("model,\tframe_coeff,\tmv_coeff,\terror,\tbaseline_error")
+        print("Model A %_error,\t" + str(x_a) + ",\t" + str(0) + ",\t" + str(percent_error_a) + ",\t" + str(baseline_percent_error_a))
+        print("Model A sq_error,\t" + str(x_a) + ",\t" + str(0) + ",\t" + str(squared_error_a) + ",\t" + str(baseline_squared_error_a))
+        print("Model B %_error,\t" + str(x_b[0]) + ",\t" + str(x_b[1]) + ",\t" + str(percent_error_b) + ",\t" + str(baseline_percent_error_b))
+        print("Model B sq_error,\t" + str(x_b[0]) + ",\t" + str(x_b[1]) + ",\t" + str(squared_error_b) + ",\t" + str(baseline_squared_error_b))
         print()
 
 if __name__ == "__main__":
-    print_solutions("data/lowres_64f_target150_data.txt")
+    print_solutions("data2/all_lowres_target_lt600_data.txt")

diff --git a/tools/gop_bitrate/python/bitrate_accuracy_percentage_error.py b/tools/gop_bitrate/python/bitrate_accuracy_percentage_error.py
deleted file mode 100644
index fcde9797..0000000
--- a/tools/gop_bitrate/python/bitrate_accuracy_percentage_error.py
+++ /dev/null

@@ -1,106 +0,0 @@
-import numpy as np
-
-# Finds the coefficient to multiply A by to minimize
-# the percentage error between A and B.
-def minimize_percentage_error_model_a(A, B):
-    z = np.where(B == 0)[0]
-    A = np.delete(A, z, axis=0)
-    B = np.delete(B, z, axis=0)
-    z = np.where(A == 0)[0]
-    A = np.delete(A, z, axis=0)
-    B = np.delete(B, z, axis=0)
-
-    R = np.divide(A, B)
-    num = 0
-    den = 0
-    for r_i in R:
-        num += r_i
-        den += r_i**2
-    if den == 0:
-        x = 0
-    else:
-        x = (num / den)[0]
-    return x
-
-def minimize_percentage_error_model_b(r_e, r_m, r_f):
-    z = np.where(r_f == 0)[0]
-    r_e = np.delete(r_e, z, axis=0)
-    r_m = np.delete(r_m, z, axis=0)
-    r_f = np.delete(r_f, z, axis=0)
-
-    r_ef = np.divide(r_e, r_f)
-    r_mf = np.divide(r_m, r_f)
-    sum_ef = np.sum(r_ef)
-    sum_ef_sq = np.sum(np.square(r_ef))
-    sum_mf = np.sum(r_mf)
-    sum_mf_sq = np.sum(np.square(r_mf))
-    sum_ef_mf = np.sum(np.multiply(r_ef, r_mf))
-
-    # Divides x by y. If y is zero, returns 0.
-    divide = lambda x, y : 0 if y == 0 else x / y
-
-    # Set up and solve the matrix equation
-    A = np.array([[1, divide(sum_ef_mf, sum_ef_sq)],[divide(sum_ef_mf, sum_mf_sq), 1]])
-    B = np.array([divide(sum_ef, sum_ef_sq), divide(sum_mf, sum_mf_sq)])
-    A_inv = np.linalg.pinv(A)
-    x = np.matmul(A_inv, B)
-    return x
-
-# Calculates the average percentage error between A and B.
-def average_error_model_a(A, B, x):
-    error = 0
-    for i, a in enumerate(A):
-        a = a[0]
-        b = B[i][0]
-        if b == 0:
-            continue
-        error += abs(x*a - b) / b
-    error *= 100
-    error /= A.shape[0]
-    return error
-
-def average_error_model_b(A, M, B, x):
-    error = 0
-    for i, a in enumerate(A):
-        a = a[0]
-        mv = M[i]
-        b = B[i][0]
-        if b == 0:
-            continue
-        estimate = x[0]*a
-        estimate += x[1]*mv
-        error += abs(estimate - b) / b
-    error *= 100
-    error /= A.shape[0]
-    return error
-
-# Traverses the data and prints out one value for
-# each update type.
-def print_solutions(file_path):
-    data = np.genfromtxt(file_path, delimiter="\t")
-
-    prev_update = 0
-    split_list_indices = list()
-    for i, val in enumerate(data):
-        if prev_update != val[3]:
-            split_list_indices.append(i)
-            prev_update = val[3]
-
-    split = np.split(data, split_list_indices)
-
-    for array in split:
-        A, mv, B, update = np.hsplit(array, 4)
-        print("update type:", update[0][0])
-        xa = minimize_percentage_error_model_a(A, B)
-        xb = minimize_percentage_error_model_b(A, mv, B)
-        print("Model A coefficients:", xa, " | Model B coefficients:", xb)
-        error_a = average_error_model_a(A, B, xa)
-        error_b = average_error_model_b(A, mv, B, xb)
-        baseline_error_a = average_error_model_a(A, B, 1)
-        baseline_error_b = average_error_model_b(A, mv, B, [1, 1])
-        print("error a:", error_a, " | error b:", error_b)
-        print("baseline error a:", baseline_error_a, "baseline error b:", baseline_error_b)
-        print()
-
-if __name__ == "__main__":
-    print_solutions("data2/lowres_17f_target150_data.txt")
commit	df61427b1bcf6f6beff7d1293895633929af6a76	[log] [tgz]
author	Wan-Teh Chang <wtc@google.com>	Thu Oct 14 09:29:48 2021 -0700
committer	Wan-Teh Chang <wtc@google.com>	Thu Oct 14 09:29:48 2021 -0700
tree	fdca9950ba0c1171fc702326f51e0f18377470e3
parent	af43e2b3aab5d9f6de554698eb62aa1998cbaf95 [diff]
parent	287164de79516c25c8c84fd544f67752c170082a [diff]