Merge tag 'v3.1.3' into HEAD

Create the signed tag v3.1.3

BUG=aomedia:3157

Change-Id: I46ac8c062754b5bea617a47ede96d3114f911fa2
diff --git a/CMakeLists.txt b/CMakeLists.txt
index bcf8965..94395b2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -23,6 +23,11 @@
 
 project(AOM C CXX)
 
+# GENERATED source property global visibility.
+if(POLICY CMP0118)
+  cmake_policy(SET CMP0118 NEW)
+endif()
+
 if(NOT EMSCRIPTEN)
   if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
     set(CMAKE_BUILD_TYPE
@@ -51,7 +56,13 @@
 unset(LT_REVISION)
 unset(LT_AGE)
 
+# Enable generators like Xcode and Visual Studio to place projects in folders.
+set_property(GLOBAL PROPERTY USE_FOLDERS TRUE)
+
 include("${AOM_ROOT}/build/cmake/aom_configure.cmake")
+if(CONFIG_THREE_PASS)
+  include("${AOM_ROOT}/common/ivf_dec.cmake")
+endif()
 include("${AOM_ROOT}/aom_dsp/aom_dsp.cmake")
 include("${AOM_ROOT}/aom_mem/aom_mem.cmake")
 include("${AOM_ROOT}/aom_ports/aom_ports.cmake")
@@ -129,6 +140,7 @@
             "${AOM_ROOT}/aom/aom_codec.h"
             "${AOM_ROOT}/aom/aom_decoder.h"
             "${AOM_ROOT}/aom/aom_encoder.h"
+            "${AOM_ROOT}/aom/aom_external_partition.h"
             "${AOM_ROOT}/aom/aom_frame_buffer.h"
             "${AOM_ROOT}/aom/aom_image.h"
             "${AOM_ROOT}/aom/aom_integer.h"
@@ -143,8 +155,8 @@
             "${AOM_ROOT}/aom/src/aom_integer.c")
 
 list(APPEND AOM_COMMON_APP_UTIL_SOURCES
-            "${AOM_ROOT}/av1/arg_defs.h"
             "${AOM_ROOT}/av1/arg_defs.c"
+            "${AOM_ROOT}/av1/arg_defs.h"
             "${AOM_ROOT}/common/args_helper.c"
             "${AOM_ROOT}/common/args_helper.h"
             "${AOM_ROOT}/common/args.c"
@@ -159,10 +171,11 @@
             "${AOM_ROOT}/common/rawenc.c"
             "${AOM_ROOT}/common/rawenc.h"
             "${AOM_ROOT}/common/y4menc.c"
-            "${AOM_ROOT}/common/y4menc.h")
+            "${AOM_ROOT}/common/y4menc.h"
+            "${AOM_ROOT}/common/ivfdec.c"
+            "${AOM_ROOT}/common/ivfdec.h")
 
-list(APPEND AOM_DECODER_APP_UTIL_SOURCES "${AOM_ROOT}/common/ivfdec.c"
-            "${AOM_ROOT}/common/ivfdec.h" "${AOM_ROOT}/common/obudec.c"
+list(APPEND AOM_DECODER_APP_UTIL_SOURCES "${AOM_ROOT}/common/obudec.c"
             "${AOM_ROOT}/common/obudec.h" "${AOM_ROOT}/common/video_reader.c"
             "${AOM_ROOT}/common/video_reader.h")
 
@@ -290,6 +303,9 @@
 endif()
 
 # Setup dependencies.
+if(CONFIG_THREE_PASS)
+  setup_ivf_dec_targets()
+endif()
 setup_aom_dsp_targets()
 setup_aom_mem_targets()
 setup_aom_ports_targets()
@@ -317,19 +333,23 @@
 #
 if(ENABLE_EXAMPLES OR ENABLE_TESTS OR ENABLE_TOOLS)
   add_library(aom_common_app_util OBJECT ${AOM_COMMON_APP_UTIL_SOURCES})
+  set_property(TARGET aom_common_app_util PROPERTY FOLDER examples)
   if(CONFIG_AV1_DECODER)
     add_library(aom_decoder_app_util OBJECT ${AOM_DECODER_APP_UTIL_SOURCES})
+    set_property(TARGET aom_decoder_app_util PROPERTY FOLDER examples)
     # obudec depends on internal headers that require *rtcd.h
     add_dependencies(aom_decoder_app_util aom_rtcd)
   endif()
   if(CONFIG_AV1_ENCODER)
     add_library(aom_encoder_app_util OBJECT ${AOM_ENCODER_APP_UTIL_SOURCES})
+    set_property(TARGET aom_encoder_app_util PROPERTY FOLDER examples)
   endif()
 endif()
 
 if((CONFIG_AV1_DECODER OR CONFIG_AV1_ENCODER) AND ENABLE_EXAMPLES)
   add_executable(resize_util "${AOM_ROOT}/examples/resize_util.c"
                              $<TARGET_OBJECTS:aom_common_app_util>)
+  set_property(TARGET resize_util PROPERTY FOLDER examples)
   list(APPEND AOM_APP_TARGETS resize_util)
 endif()
 
@@ -425,6 +445,10 @@
     add_executable(noise_model "${AOM_ROOT}/examples/noise_model.c"
                                $<TARGET_OBJECTS:aom_common_app_util>
                                $<TARGET_OBJECTS:aom_encoder_app_util>)
+    add_executable(photon_noise_table
+                   "${AOM_ROOT}/examples/photon_noise_table.c"
+                   $<TARGET_OBJECTS:aom_common_app_util>
+                   $<TARGET_OBJECTS:aom_encoder_app_util>)
     add_executable(scalable_encoder "${AOM_ROOT}/examples/scalable_encoder.c"
                                     $<TARGET_OBJECTS:aom_common_app_util>
                                     $<TARGET_OBJECTS:aom_encoder_app_util>)
@@ -435,8 +459,8 @@
 
     # Maintain a list of encoder example targets.
     list(APPEND AOM_ENCODER_EXAMPLE_TARGETS aomenc lossless_encoder noise_model
-                set_maps simple_encoder scalable_encoder twopass_encoder
-                svc_encoder_rtc)
+                photon_noise_table set_maps simple_encoder scalable_encoder
+                twopass_encoder svc_encoder_rtc)
   endif()
 
   if(ENABLE_TOOLS)
@@ -715,6 +739,17 @@
 endif()
 
 if(BUILD_SHARED_LIBS)
+  if(NOT WIN32 AND NOT APPLE)
+    # The -z defs linker option reports unresolved symbol references from object
+    # files when building a shared library.
+    if("${CMAKE_VERSION}" VERSION_LESS "3.13")
+      # target_link_options() is not available before CMake 3.13.
+      target_link_libraries(aom PRIVATE -Wl,-z,defs)
+    else()
+      target_link_options(aom PRIVATE LINKER:-z,defs)
+    endif()
+  endif()
+
   include("${AOM_ROOT}/build/cmake/exports.cmake")
   setup_exports_target()
 endif()
@@ -770,12 +805,14 @@
 if(ENABLE_EXAMPLES)
   foreach(example ${AOM_EXAMPLE_TARGETS})
     list(APPEND AOM_DIST_EXAMPLES $<TARGET_FILE:${example}>)
+    set_property(TARGET ${example} PROPERTY FOLDER examples)
   endforeach()
 endif()
 
 if(ENABLE_TOOLS)
   foreach(tool ${AOM_TOOL_TARGETS})
     list(APPEND AOM_DIST_TOOLS $<TARGET_FILE:${tool}>)
+    set_property(TARGET ${tool} PROPERTY FOLDER tools)
   endforeach()
 endif()
 
diff --git a/README.md b/README.md
index 21b114e..118fccc 100644
--- a/README.md
+++ b/README.md
@@ -441,7 +441,9 @@
 
 The fastest and easiest way to obtain the test data is to use CMake to generate
 a build using the Unix Makefiles generator, and then to build only the testdata
-rule:
+rule. By default the test files will be downloaded to the current directory. The
+`LIBAOM_TEST_DATA_PATH` environment variable can be used to set a
+custom one.
 
 ~~~
     $ cmake path/to/aom -G "Unix Makefiles"
diff --git a/aom/aom_encoder.h b/aom/aom_encoder.h
index a629eef..c324c56 100644
--- a/aom/aom_encoder.h
+++ b/aom/aom_encoder.h
@@ -31,6 +31,7 @@
 #endif
 
 #include "aom/aom_codec.h"
+#include "aom/aom_external_partition.h"
 
 /*!\brief Current ABI version number
  *
@@ -41,7 +42,7 @@
  * fields to structures
  */
 #define AOM_ENCODER_ABI_VERSION \
-  (9 + AOM_CODEC_ABI_VERSION) /**<\hideinitializer*/
+  (10 + AOM_CODEC_ABI_VERSION + AOM_EXT_PART_ABI_VERSION) /**<\hideinitializer*/
 
 /*! \brief Encoder capabilities bitfield
  *
@@ -154,11 +155,19 @@
   int den;        /**< fraction denominator */
 } aom_rational_t; /**< alias for struct aom_rational */
 
-/*!\brief Multi-pass Encoding Pass */
+/*!\brief Multi-pass Encoding Pass
+ *
+ * AOM_RC_LAST_PASS is kept for backward compatibility.
+ * If passes is not given and pass==2, the codec will assume passes=2.
+ * For new code, it is recommended to use AOM_RC_SECOND_PASS and set
+ * the "passes" member to 2 via the key & val API for two-pass encoding.
+ */
 enum aom_enc_pass {
-  AOM_RC_ONE_PASS,   /**< Single pass mode */
-  AOM_RC_FIRST_PASS, /**< First pass of multi-pass mode */
-  AOM_RC_LAST_PASS   /**< Final pass of multi-pass mode */
+  AOM_RC_ONE_PASS = 0,    /**< Single pass mode */
+  AOM_RC_FIRST_PASS = 1,  /**< First pass of multi-pass mode */
+  AOM_RC_SECOND_PASS = 2, /**< Second pass of multi-pass mode */
+  AOM_RC_THIRD_PASS = 3,  /**< Third pass of multi-pass mode */
+  AOM_RC_LAST_PASS = 2,   /**< Final pass of two-pass mode */
 };
 
 /*!\brief Rate control mode */
@@ -613,7 +622,7 @@
 
   /*!\brief Target data rate
    *
-   * Target bandwidth to use for this stream, in kilobits per second.
+   * Target bitrate to use for this stream, in kilobits per second.
    */
   unsigned int rc_target_bitrate;
 
diff --git a/aom/aom_external_partition.h b/aom/aom_external_partition.h
new file mode 100644
index 0000000..1bb31c4
--- /dev/null
+++ b/aom/aom_external_partition.h
@@ -0,0 +1,404 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AOM_AOM_EXTERNAL_PARTITION_H_
+#define AOM_AOM_AOM_EXTERNAL_PARTITION_H_
+
+/*!\defgroup aom_encoder AOMedia AOM/AV1 Encoder
+ * \ingroup aom
+ *
+ * @{
+ */
+#include <stdint.h>
+
+/*!\file
+ * \brief Provides function pointer definitions for the external partition.
+ */
+
+/*!\brief Current ABI version number
+ *
+ * \internal
+ * If this file is altered in any way that changes the ABI, this value
+ * must be bumped. Examples include, but are not limited to, changing
+ * types, removing or reassigning enums, adding/removing/rearranging
+ * fields to structures.
+ */
+#define AOM_EXT_PART_ABI_VERSION 3
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\brief Abstract external partition model handler
+ */
+typedef void *aom_ext_part_model_t;
+
+/*!\brief Number of features to determine whether to skip partition none and
+ * do partition split directly. The same as "FEATURE_SIZE_SMS_SPLIT".
+ */
+#define AOM_EXT_PART_SIZE_DIRECT_SPLIT 17
+
+/*!\brief Number of features to use simple motion search to prune out
+ * rectangular partition in some direction. The same as
+ * "FEATURE_SIZE_SMS_PRUNE_PART".
+ */
+#define AOM_EXT_PART_SIZE_PRUNE_PART 25
+
+/*!\brief Number of features to prune split and rectangular partition
+ * after PARTITION_NONE.
+ */
+#define AOM_EXT_PART_SIZE_PRUNE_NONE 4
+
+/*!\brief Number of features to terminate partition after partition none using
+ * simple_motion_search features and the rate, distortion, and rdcost of
+ * PARTITION_NONE. The same as "FEATURE_SIZE_SMS_TERM_NONE".
+ */
+#define AOM_EXT_PART_SIZE_TERM_NONE 28
+
+/*!\brief Number of features to terminate partition after partition split.
+ */
+#define AOM_EXT_PART_SIZE_TERM_SPLIT 31
+
+/*!\brief Number of features to prune rectangular partition using stats
+ * collected after partition split.
+ */
+#define AOM_EXT_PART_SIZE_PRUNE_RECT 9
+
+/*!\brief Number of features to prune AB partition using stats
+ * collected after rectangular partition.
+ */
+#define AOM_EXT_PART_SIZE_PRUNE_AB 10
+
+/*!\brief Number of features to prune 4-way partition using stats
+ * collected after AB partition.
+ */
+#define AOM_EXT_PART_SIZE_PRUNE_4_WAY 18
+
+/*!\brief Decision mode of the external partition model.
+ * AOM_EXT_PART_WHOLE_TREE: the external partition model should provide the
+ * whole partition tree for the superblock.
+ *
+ * AOM_EXT_PART_RECURSIVE: the external partition model provides the partition
+ * decision of the current block only. The decision process starts from
+ * the superblock size, down to the smallest block size (4x4) recursively.
+ */
+typedef enum aom_ext_part_decision_mode {
+  AOM_EXT_PART_WHOLE_TREE = 0,
+  AOM_EXT_PART_RECURSIVE = 1,
+} aom_ext_part_decision_mode_t;
+
+/*!\brief Config information sent to the external partition model.
+ *
+ * For example, the maximum superblock size determined by the sequence header.
+ */
+typedef struct aom_ext_part_config {
+  int superblock_size;  ///< super block size (either 64x64 or 128x128)
+} aom_ext_part_config_t;
+
+/*!\brief Features passed to the external model to make partition decisions.
+ * Specifically, features collected before NONE partition.
+ * Features "f" are used to determine:
+ * partition_none_allowed, partition_horz_allowed, partition_vert_allowed,
+ * do_rectangular_split, do_square_split
+ * Features "f_part2" are used to determine:
+ * prune_horz, prune_vert.
+ */
+typedef struct aom_partition_features_before_none {
+  /*! features to determine whether skip partition none and do split directly */
+  float f[AOM_EXT_PART_SIZE_DIRECT_SPLIT];
+  /*! features to determine whether to prune rectangular partition */
+  float f_part2[AOM_EXT_PART_SIZE_PRUNE_PART];
+} aom_partition_features_before_none_t;
+
+/*!\brief Features passed to the external model to make partition decisions.
+ * Specifically, features collected after NONE partition.
+ */
+typedef struct aom_partition_features_none {
+  /*! features to prune split and rectangular partition */
+  float f[AOM_EXT_PART_SIZE_PRUNE_NONE];
+  /*! features to determine termination of partition */
+  float f_terminate[AOM_EXT_PART_SIZE_TERM_NONE];
+} aom_partition_features_none_t;
+
+/*!\brief Features passed to the external model to make partition decisions.
+ * Specifically, features collected after SPLIT partition.
+ */
+typedef struct aom_partition_features_split {
+  /*! features to determine termination of partition */
+  float f_terminate[AOM_EXT_PART_SIZE_TERM_SPLIT];
+  /*! features to determine pruning rect partition */
+  float f_prune_rect[AOM_EXT_PART_SIZE_PRUNE_RECT];
+} aom_partition_features_split_t;
+
+/*!\brief Features passed to the external model to make partition decisions.
+ * Specifically, features collected after RECTANGULAR partition.
+ */
+typedef struct aom_partition_features_rect {
+  /*! features to determine pruning AB partition */
+  float f[AOM_EXT_PART_SIZE_PRUNE_AB];
+} aom_partition_features_rect_t;
+
+/*!\brief Features passed to the external model to make partition decisions.
+ * Specifically, features collected after AB partition: HORZ_A, HORZ_B, VERT_A,
+ * VERT_B.
+ */
+typedef struct aom_partition_features_ab {
+  /*! features to determine pruning 4-way partition */
+  float f[AOM_EXT_PART_SIZE_PRUNE_4_WAY];
+} aom_partition_features_ab_t;
+
+/*!\brief Feature id to tell the external model the current stage in partition
+ * pruning and what features to use to make decisions accordingly.
+ */
+typedef enum {
+  AOM_EXT_PART_FEATURE_BEFORE_NONE,
+  AOM_EXT_PART_FEATURE_BEFORE_NONE_PART2,
+  AOM_EXT_PART_FEATURE_AFTER_NONE,
+  AOM_EXT_PART_FEATURE_AFTER_NONE_PART2,
+  AOM_EXT_PART_FEATURE_AFTER_SPLIT,
+  AOM_EXT_PART_FEATURE_AFTER_SPLIT_PART2,
+  AOM_EXT_PART_FEATURE_AFTER_RECT,
+  AOM_EXT_PART_FEATURE_AFTER_AB
+} AOM_EXT_PART_FEATURE_ID;
+
+/*!\brief Features collected from the tpl process.
+ *
+ * The tpl process collects information that help measure the inter-frame
+ * dependency.
+ * The tpl process is computed in the unit of tpl_bsize_1d (16x16).
+ * Therefore, the max number of units inside a superblock is
+ * 128x128 / (16x16) = 64. Change it if the tpl process changes.
+ */
+typedef struct aom_sb_tpl_features {
+  int available;        ///< If tpl stats are available
+  int tpl_unit_length;  ///< The block length of tpl process
+  int num_units;        ///< The number of units inside the current superblock
+  int64_t intra_cost[64];   ///< The intra cost of each unit
+  int64_t inter_cost[64];   ///< The inter cost of each unit
+  int64_t mc_dep_cost[64];  ///< The motion compensated dependency cost
+} aom_sb_tpl_features_t;
+
+/*!\brief Features collected from the simple motion process.
+ *
+ * The simple motion process collects information by applying motion compensated
+ * prediction on each block.
+ * The block size is 16x16, which could be changed. If it is changed, update
+ * comments and the array size here.
+ */
+typedef struct aom_sb_simple_motion_features {
+  int unit_length;    ///< The block length of the simple motion process
+  int num_units;      ///< The number of units inside the current superblock
+  int block_sse[64];  ///< Sum of squared error of each unit
+  int block_var[64];  ///< Variance of each unit
+} aom_sb_simple_motion_features_t;
+
+/*!\brief Features of each super block.
+ *
+ * Features collected for each super block before partition search.
+ */
+typedef struct aom_sb_features {
+  /*! Features from motion search */
+  aom_sb_simple_motion_features_t motion_features;
+  /*! Features from tpl process */
+  aom_sb_tpl_features_t tpl_features;
+} aom_sb_features_t;
+
+/*!\brief Features passed to the external model to make partition decisions.
+ *
+ * The encoder sends these features to the external model through
+ * "func()" defined in .....
+ *
+ * NOTE: new member variables may be added to this structure in the future.
+ * Once new features are finalized, bump the major version of libaom.
+ */
+typedef struct aom_partition_features {
+  // Features for the current supervised multi-stage ML model.
+  /*! Feature ID to indicate active features */
+  AOM_EXT_PART_FEATURE_ID id;
+  /*! Features collected before NONE partition */
+  aom_partition_features_before_none_t before_part_none;
+  /*! Features collected after NONE partition */
+  aom_partition_features_none_t after_part_none;
+  /*! Features collected after SPLIT partition */
+  aom_partition_features_split_t after_part_split;
+  /*! Features collected after RECTANGULAR partition */
+  aom_partition_features_rect_t after_part_rect;
+  /*! Features collected after AB partition */
+  aom_partition_features_ab_t after_part_ab;
+
+  // Features for a new ML model.
+  aom_sb_features_t sb_features;  ///< Features collected for the super block
+  int mi_row;                     ///< Mi_row position of the block
+  int mi_col;                     ///< Mi_col position of the block
+  int frame_width;                ///< Frame width
+  int frame_height;               ///< Frame height
+  int block_size;                 ///< As "BLOCK_SIZE" in av1/common/enums.h
+} aom_partition_features_t;
+
+/*!\brief Partition decisions received from the external model.
+ *
+ * The encoder receives partition decisions and encodes the superblock
+ * with the given partition type.
+ * The encoder receives it from "func()" defined in ....
+ *
+ * NOTE: new member variables may be added to this structure in the future.
+ * Once new features are finalized, bump the major version of libaom.
+ */
+typedef struct aom_partition_decision {
+  // Decisions for directly set partition types
+  int is_final_decision;         ///< The flag whether it's the final decision
+  int num_nodes;                 ///< The number of leaf nodes
+  int partition_decision[2048];  ///< Partition decisions
+  int current_decision;          ///< Partition decision for the current block
+
+  // Decisions for partition type pruning
+  int terminate_partition_search;  ///< Terminate further partition search
+  int partition_none_allowed;      ///< Allow partition none type
+  int partition_rect_allowed[2];   ///< Allow rectangular partitions
+  int do_rectangular_split;        ///< Try rectangular split partition
+  int do_square_split;             ///< Try square split partition
+  int prune_rect_part[2];          ///< Prune rectangular partition
+  int horza_partition_allowed;     ///< Allow HORZ_A partition
+  int horzb_partition_allowed;     ///< Allow HORZ_B partition
+  int verta_partition_allowed;     ///< Allow VERT_A partition
+  int vertb_partition_allowed;     ///< Allow VERT_B partition
+  int partition_horz4_allowed;     ///< Allow HORZ4 partition
+  int partition_vert4_allowed;     ///< Allow VERT4 partition
+} aom_partition_decision_t;
+
+/*!\brief Encoding stats for the given partition decision.
+ *
+ * The encoding stats collected by encoding the superblock with the
+ * given partition types.
+ * The encoder sends the stats to the external model for training
+ * or inference though "func()" defined in ....
+ */
+typedef struct aom_partition_stats {
+  int rate;        ///< Rate cost of the block
+  int64_t dist;    ///< Distortion of the block
+  int64_t rdcost;  ///< Rate-distortion cost of the block
+} aom_partition_stats_t;
+
+/*!\brief Enum for return status.
+ */
+typedef enum aom_ext_part_status {
+  AOM_EXT_PART_OK = 0,     ///< Status of success
+  AOM_EXT_PART_ERROR = 1,  ///< Status of failure
+  AOM_EXT_PART_TEST = 2,   ///< Status used for tests
+} aom_ext_part_status_t;
+
+/*!\brief Callback of creating an external partition model.
+ *
+ * The callback is invoked by the encoder to create an external partition
+ * model.
+ *
+ * \param[in] priv Callback's private data
+ * \param[in] part_config Config information pointer for model creation
+ * \param[out] ext_part_model Pointer to the model
+ */
+typedef aom_ext_part_status_t (*aom_ext_part_create_model_fn_t)(
+    void *priv, const aom_ext_part_config_t *part_config,
+    aom_ext_part_model_t *ext_part_model);
+
+/*!\brief Callback of sending features to the external partition model.
+ *
+ * The callback is invoked by the encoder to send features to the external
+ * partition model.
+ *
+ * \param[in] ext_part_model The external model
+ * \param[in] part_features Pointer to the features
+ */
+typedef aom_ext_part_status_t (*aom_ext_part_send_features_fn_t)(
+    aom_ext_part_model_t ext_part_model,
+    const aom_partition_features_t *part_features);
+
+/*!\brief Callback of receiving partition decisions from the external
+ * partition model.
+ *
+ * The callback is invoked by the encoder to receive partition decisions from
+ * the external partition model.
+ *
+ * \param[in] ext_part_model The external model
+ * \param[in] ext_part_decision Pointer to the partition decisions
+ */
+typedef aom_ext_part_status_t (*aom_ext_part_get_decision_fn_t)(
+    aom_ext_part_model_t ext_part_model,
+    aom_partition_decision_t *ext_part_decision);
+
+/*!\brief Callback of sending stats to the external partition model.
+ *
+ * The callback is invoked by the encoder to send encoding stats to
+ * the external partition model.
+ *
+ * \param[in] ext_part_model The external model
+ * \param[in] ext_part_stats Pointer to the encoding stats
+ */
+typedef aom_ext_part_status_t (*aom_ext_part_send_partition_stats_fn_t)(
+    aom_ext_part_model_t ext_part_model,
+    const aom_partition_stats_t *ext_part_stats);
+
+/*!\brief Callback of deleting the external partition model.
+ *
+ * The callback is invoked by the encoder to delete the external partition
+ * model.
+ *
+ * \param[in] ext_part_model The external model
+ */
+typedef aom_ext_part_status_t (*aom_ext_part_delete_model_fn_t)(
+    aom_ext_part_model_t ext_part_model);
+
+/*!\brief Callback function set for external partition model.
+ *
+ * Users can enable the external partition model by registering a set of
+ * callback functions with the flag: AV1E_SET_EXTERNAL_PARTITION_MODEL
+ */
+typedef struct aom_ext_part_funcs {
+  /*!
+   * Create an external partition model.
+   */
+  aom_ext_part_create_model_fn_t create_model;
+
+  /*!
+   * Send features to the external partition model to make partition decisions.
+   */
+  aom_ext_part_send_features_fn_t send_features;
+
+  /*!
+   * Get partition decisions from the external partition model.
+   */
+  aom_ext_part_get_decision_fn_t get_partition_decision;
+
+  /*!
+   * Send stats of the current partition to the external model.
+   */
+  aom_ext_part_send_partition_stats_fn_t send_partition_stats;
+
+  /*!
+   * Delete the external partition model.
+   */
+  aom_ext_part_delete_model_fn_t delete_model;
+
+  /*!
+   * The decision mode of the model.
+   */
+  aom_ext_part_decision_mode_t decision_mode;
+
+  /*!
+   * Private data for the external partition model.
+   */
+  void *priv;
+} aom_ext_part_funcs_t;
+
+/*!@} - end defgroup aom_encoder*/
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AOM_AOM_EXTERNAL_PARTITION_H_
diff --git a/aom/aom_image.h b/aom/aom_image.h
index 03bc73e..9ec3225 100644
--- a/aom/aom_image.h
+++ b/aom/aom_image.h
@@ -300,7 +300,8 @@
 /*!\brief Set the rectangle identifying the displayed portion of the image
  *
  * Updates the displayed rectangle (aka viewport) on the image surface to
- * match the specified coordinates and size.
+ * match the specified coordinates and size. Specifically, sets img->d_w,
+ * img->d_h, and elements of the img->planes[] array.
  *
  * \param[in]    img       Image descriptor
  * \param[in]    x         leftmost column
@@ -309,7 +310,7 @@
  * \param[in]    h         height
  * \param[in]    border    A border that is padded on four sides of the image.
  *
- * \return 0 if the requested rectangle is valid, nonzero otherwise.
+ * \return 0 if the requested rectangle is valid, nonzero (-1) otherwise.
  */
 int aom_img_set_rect(aom_image_t *img, unsigned int x, unsigned int y,
                      unsigned int w, unsigned int h, unsigned int border);
diff --git a/aom/aomcx.h b/aom/aomcx.h
index b4eeeb1..9a2287b 100644
--- a/aom/aomcx.h
+++ b/aom/aomcx.h
@@ -18,6 +18,7 @@
  */
 #include "aom/aom.h"
 #include "aom/aom_encoder.h"
+#include "aom/aom_external_partition.h"
 
 /*!\file
  * \brief Provides definitions for using AOM or AV1 encoder algorithm within the
@@ -222,10 +223,14 @@
 
   /* NOTE: enum 15 unused */
 
-  /*!\brief Codec control function to set loop filter sharpness,
+  /*!\brief Codec control function to set the sharpness parameter,
    * unsigned int parameter.
    *
-   * Valid range: 0..7. The default is 0.
+   * This parameter controls the level at which rate-distortion optimization of
+   * transform coefficients favours sharpness in the block.
+   *
+   * Valid range: 0..7. The default is 0. Values 1-7 will avoid eob and skip
+   * block optimization and will change rdmult in favour of block sharpness.
    */
   AOME_SET_SHARPNESS = AOME_SET_ENABLEAUTOALTREF + 2,  // 16
 
@@ -1108,7 +1113,9 @@
    *
    * - 0 = deltaq signaling off
    * - 1 = use modulation to maximize objective quality (default)
-   * - 2 = use modulation to maximize perceptual quality
+   * - 2 = use modulation for local test
+   * - 3 = use modulation for key frame perceptual quality optimization
+   * - 4 = use modulation for user rating based perceptual quality optimization
    */
   AV1E_SET_DELTAQ_MODE = 107,
 
@@ -1320,13 +1327,56 @@
   /*!\brief Codec control function to turn on / off D45 to D203 intra mode
    * usage, int parameter
    *
-   * This will enable or disable usage of D45 to D203 intra modes.
+   * This will enable or disable usage of D45 to D203 intra modes, which are a
+   * subset of directional modes. This control has no effect if directional
+   * modes are disabled (AV1E_SET_ENABLE_DIRECTIONAL_INTRA set to 0).
    *
    * - 0 = disable
    * - 1 = enable (default)
    */
   AV1E_SET_ENABLE_DIAGONAL_INTRA = 141,
 
+  /*!\brief Control to set frequency of the cost updates for intrabc motion
+   * vectors, unsigned int parameter
+   *
+   * - 0 = update at SB level (default)
+   * - 1 = update at SB row level in tile
+   * - 2 = update at tile level
+   * - 3 = turn off
+   */
+  AV1E_SET_DV_COST_UPD_FREQ = 142,
+
+  /*!\brief Codec control to set the path for partition stats read and write.
+   * const char * parameter.
+   */
+  AV1E_SET_PARTITION_INFO_PATH = 143,
+
+  /*!\brief Codec control to use an external partition model
+   * A set of callback functions is passed through this control
+   * to let the encoder encode with given partitions.
+   */
+  AV1E_SET_EXTERNAL_PARTITION = 144,
+
+  /*!\brief Codec control function to turn on / off directional intra mode
+   * usage, int parameter
+   *
+   * - 0 = disable
+   * - 1 = enable (default)
+   */
+  AV1E_SET_ENABLE_DIRECTIONAL_INTRA = 145,
+
+  /*!\brief Control to turn on / off transform size search.
+   *
+   * - 0 = disable, transforms always have the largest possible size
+   * - 1 = enable, search for the best transform size for each block (default)
+   */
+  AV1E_SET_ENABLE_TX_SIZE_SEARCH = 146,
+
+  /*!\brief Codec control function to set reference frame compound prediction.
+   * aom_svc_ref_frame_comp_pred_t* parameter
+   */
+  AV1E_SET_SVC_REF_FRAME_COMP_PRED = 147,
+
   // Any new encoder control IDs should be added above.
   // Maximum allowed encoder control ID is 229.
   // No encoder control ID should be added below.
@@ -1461,6 +1511,13 @@
   int refresh[8]; /**< Refresh flag for each of the 8 slots. */
 } aom_svc_ref_frame_config_t;
 
+/*!\brief Parameters for setting ref frame compound prediction */
+typedef struct aom_svc_ref_frame_comp_pred {
+  // Use compound prediction for the ref_frame pairs GOLDEN_LAST (0),
+  // LAST2_LAST (1), and ALTREF_LAST (2).
+  int use_comp_pred[3]; /**<Compound reference flag. */
+} aom_svc_ref_frame_comp_pred_t;
+
 /*!\cond */
 /*!\brief Encoder control function parameter type
  *
@@ -1860,6 +1917,25 @@
 AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_DNL_DENOISING, int)
 #define AOM_CTRL_AV1E_SET_ENABLE_DNL_DENOISING
 
+AOM_CTRL_USE_TYPE(AV1E_SET_DV_COST_UPD_FREQ, unsigned int)
+#define AOM_CTRL_AV1E_SET_DV_COST_UPD_FREQ
+
+AOM_CTRL_USE_TYPE(AV1E_SET_PARTITION_INFO_PATH, const char *)
+#define AOM_CTRL_AV1E_SET_PARTITION_INFO_PATH
+
+AOM_CTRL_USE_TYPE(AV1E_SET_EXTERNAL_PARTITION, aom_ext_part_funcs_t *)
+#define AOM_CTRL_AV1E_SET_EXTERNAL_PARTITION
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_DIRECTIONAL_INTRA, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_DIRECTIONAL_INTRA
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_TX_SIZE_SEARCH, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_TX_SIZE_SEARCH
+
+AOM_CTRL_USE_TYPE(AV1E_SET_SVC_REF_FRAME_COMP_PRED,
+                  aom_svc_ref_frame_comp_pred_t *)
+#define AOME_CTRL_AV1E_SET_SVC_REF_FRAME_COMP_PRED
+
 /*!\endcond */
 /*! @} - end defgroup aom_encoder */
 #ifdef __cplusplus
diff --git a/aom/aomdx.h b/aom/aomdx.h
index b3fd90e..33d8fbc 100644
--- a/aom/aomdx.h
+++ b/aom/aomdx.h
@@ -425,6 +425,20 @@
   /*!\brief Codec control function to get the S_FRAME coding information
    */
   AOMD_GET_S_FRAME_INFO,
+
+  /*!\brief Codec control function to get the show frame flag, int parameter
+   */
+  AOMD_GET_SHOW_FRAME_FLAG,
+
+  /*!\brief Codec control function to get the base q index of a frame, int
+   * parameter
+   */
+  AOMD_GET_BASE_Q_IDX,
+
+  /*!\brief Codec control function to get the order hint of a frame, unsigned
+   * int parameter
+   */
+  AOMD_GET_ORDER_HINT,
 };
 
 /*!\cond */
diff --git a/aom/internal/aom_codec_internal.h b/aom/internal/aom_codec_internal.h
index 0ad33bd..457da92 100644
--- a/aom/internal/aom_codec_internal.h
+++ b/aom/internal/aom_codec_internal.h
@@ -278,7 +278,7 @@
 typedef aom_image_t *(*aom_codec_get_preview_frame_fn_t)(
     aom_codec_alg_priv_t *ctx);
 
-/*!\brief Decoder algorithm interface interface
+/*!\brief Decoder algorithm interface
  *
  * All decoders \ref MUST expose a variable of this type.
  */
diff --git a/aom/src/aom_encoder.c b/aom/src/aom_encoder.c
index bb51c93..5dfda96 100644
--- a/aom/src/aom_encoder.c
+++ b/aom/src/aom_encoder.c
@@ -50,7 +50,11 @@
     res = AOM_CODEC_INCAPABLE;
   else if ((flags & AOM_CODEC_USE_PSNR) && !(iface->caps & AOM_CODEC_CAP_PSNR))
     res = AOM_CODEC_INCAPABLE;
-  else {
+  else if (cfg->g_bit_depth > 8 && (flags & AOM_CODEC_USE_HIGHBITDEPTH) == 0) {
+    res = AOM_CODEC_INVALID_PARAM;
+    ctx->err_detail =
+        "High bit-depth used without the AOM_CODEC_USE_HIGHBITDEPTH flag.";
+  } else {
     ctx->iface = iface;
     ctx->name = iface->name;
     ctx->priv = NULL;
diff --git a/aom/src/aom_image.c b/aom/src/aom_image.c
index 13f71b2..3c28263 100644
--- a/aom/src/aom_image.c
+++ b/aom/src/aom_image.c
@@ -9,6 +9,7 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include <limits.h>
 #include <stdlib.h>
 #include <string.h>
 
@@ -200,9 +201,8 @@
 
 int aom_img_set_rect(aom_image_t *img, unsigned int x, unsigned int y,
                      unsigned int w, unsigned int h, unsigned int border) {
-  unsigned char *data;
-
-  if (x + w <= img->w && y + h <= img->h) {
+  if (x <= UINT_MAX - w && x + w <= img->w && y <= UINT_MAX - h &&
+      y + h <= img->h) {
     img->d_w = w;
     img->d_h = h;
 
@@ -216,7 +216,7 @@
     } else {
       const int bytes_per_sample =
           (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1;
-      data = img->img_data;
+      unsigned char *data = img->img_data;
 
       img->planes[AOM_PLANE_Y] =
           data + x * bytes_per_sample + y * img->stride[AOM_PLANE_Y];
diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake
index cf7072d..15a7615 100644
--- a/aom_dsp/aom_dsp.cmake
+++ b/aom_dsp/aom_dsp.cmake
@@ -34,6 +34,8 @@
             "${AOM_ROOT}/aom_dsp/intrapred.c"
             "${AOM_ROOT}/aom_dsp/intrapred_common.h"
             "${AOM_ROOT}/aom_dsp/loopfilter.c"
+            "${AOM_ROOT}/aom_dsp/odintrin.c"
+            "${AOM_ROOT}/aom_dsp/odintrin.h"
             "${AOM_ROOT}/aom_dsp/prob.h"
             "${AOM_ROOT}/aom_dsp/recenter.h"
             "${AOM_ROOT}/aom_dsp/simd/v128_intrinsics.h"
@@ -187,8 +189,10 @@
               "${AOM_ROOT}/aom_dsp/quantize.c"
               "${AOM_ROOT}/aom_dsp/quantize.h"
               "${AOM_ROOT}/aom_dsp/sad.c"
-              "${AOM_ROOT}/aom_dsp/sse.c"
               "${AOM_ROOT}/aom_dsp/sad_av1.c"
+              "${AOM_ROOT}/aom_dsp/sse.c"
+              "${AOM_ROOT}/aom_dsp/ssim.c"
+              "${AOM_ROOT}/aom_dsp/ssim.h"
               "${AOM_ROOT}/aom_dsp/sum_squares.c"
               "${AOM_ROOT}/aom_dsp/variance.c"
               "${AOM_ROOT}/aom_dsp/variance.h")
@@ -311,8 +315,7 @@
 
   if(CONFIG_INTERNAL_STATS)
     list(APPEND AOM_DSP_ENCODER_SOURCES "${AOM_ROOT}/aom_dsp/fastssim.c"
-                "${AOM_ROOT}/aom_dsp/psnrhvs.c" "${AOM_ROOT}/aom_dsp/ssim.c"
-                "${AOM_ROOT}/aom_dsp/ssim.h")
+                "${AOM_ROOT}/aom_dsp/psnrhvs.c")
   endif()
 
   if(CONFIG_TUNE_VMAF)
diff --git a/aom_dsp/aom_dsp_common.h b/aom_dsp/aom_dsp_common.h
index 150d35d..efb634a 100644
--- a/aom_dsp/aom_dsp_common.h
+++ b/aom_dsp/aom_dsp_common.h
@@ -21,6 +21,8 @@
 extern "C" {
 #endif
 
+#define PI 3.141592653589793238462643383279502884
+
 #ifndef MAX_SB_SIZE
 #define MAX_SB_SIZE 128
 #endif  // ndef MAX_SB_SIZE
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 96375df..b39bfaa 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -1113,7 +1113,7 @@
   specialize qw/aom_hadamard_lp_8x8 sse2 neon/;
 
   add_proto qw/void aom_hadamard_lp_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff";
-  specialize qw/aom_hadamard_lp_16x16 avx2 neon/;
+  specialize qw/aom_hadamard_lp_16x16 sse2 avx2 neon/;
 
 
   if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
@@ -1127,25 +1127,25 @@
     specialize qw/aom_highbd_hadamard_32x32 avx2/;
   }
   add_proto qw/int aom_satd/, "const tran_low_t *coeff, int length";
-  specialize qw/aom_satd neon avx2/;
+  specialize qw/aom_satd neon sse2 avx2/;
 
   add_proto qw/int aom_satd_lp/, "const int16_t *coeff, int length";
-  specialize qw/aom_satd_lp avx2 neon/;
+  specialize qw/aom_satd_lp sse2 avx2 neon/;
 
 
   #
   # Structured Similarity (SSIM)
   #
-  if (aom_config("CONFIG_INTERNAL_STATS") eq "yes") {
-    add_proto qw/void aom_ssim_parms_8x8/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
-    specialize qw/aom_ssim_parms_8x8/, "$sse2_x86_64";
+  add_proto qw/void aom_ssim_parms_8x8/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
+  specialize qw/aom_ssim_parms_8x8/, "$sse2_x86_64";
 
+  if (aom_config("CONFIG_INTERNAL_STATS") eq "yes") {
     add_proto qw/void aom_ssim_parms_16x16/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
     specialize qw/aom_ssim_parms_16x16/, "$sse2_x86_64";
+  }
 
-    if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
-      add_proto qw/void aom_highbd_ssim_parms_8x8/, "const uint16_t *s, int sp, const uint16_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
-    }
+  if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+    add_proto qw/void aom_highbd_ssim_parms_8x8/, "const uint16_t *s, int sp, const uint16_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
   }
 }  # CONFIG_AV1_ENCODER
 
@@ -1155,7 +1155,6 @@
   # Specialty Variance
   #
   add_proto qw/void aom_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-
   add_proto qw/void aom_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
 
   specialize qw/aom_get16x16var                neon msa/;
@@ -1190,51 +1189,6 @@
   #
   #
   #
-  add_proto qw/void aom_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-                                          const MV *const mv, uint8_t *comp_pred, int width, int height, int subpel_x_q3,
-                                          int subpel_y_q3, const uint8_t *ref, int ref_stride, int subpel_search";
-  specialize qw/aom_upsampled_pred sse2/;
-
-  add_proto qw/void aom_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-                                                   const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
-                                                   int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
-                                                   int ref_stride, int subpel_search";
-  specialize qw/aom_comp_avg_upsampled_pred sse2/;
-
-  add_proto qw/void aom_dist_wtd_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-                                                       const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
-                                                       int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
-                                                       int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search";
-  specialize qw/aom_dist_wtd_comp_avg_upsampled_pred ssse3/;
-
-  add_proto qw/void aom_comp_mask_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-                                                       const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
-                                                       int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
-                                                       int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
-                                                       int subpel_search";
-  specialize qw/aom_comp_mask_upsampled_pred sse2/;
-
-  if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
-    add_proto qw/void aom_highbd_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-                                                   const MV *const mv, uint8_t *comp_pred8, int width, int height, int subpel_x_q3,
-                                                   int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd, int subpel_search";
-    specialize qw/aom_highbd_upsampled_pred sse2/;
-
-    add_proto qw/void aom_highbd_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-                                                            const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
-                                                            int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd, int subpel_search";
-    specialize qw/aom_highbd_comp_avg_upsampled_pred sse2/;
-
-    add_proto qw/void aom_highbd_dist_wtd_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-                                                                const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
-                                                                int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
-                                                                int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search";
-    specialize qw/aom_highbd_dist_wtd_comp_avg_upsampled_pred sse2/;
-  }
-
-  #
-  #
-  #
   add_proto qw/unsigned int aom_get_mb_ss/, "const int16_t *";
   add_proto qw/unsigned int aom_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride";
 
diff --git a/aom_dsp/arm/avg_neon.c b/aom_dsp/arm/avg_neon.c
index c3d4de2..616e395 100644
--- a/aom_dsp/arm/avg_neon.c
+++ b/aom_dsp/arm/avg_neon.c
@@ -12,9 +12,9 @@
 
 #include "config/aom_dsp_rtcd.h"
 #include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
 #include "aom_dsp/arm/sum_neon.h"
-#include "av1/common/arm/mem_neon.h"
-#include "av1/common/arm/transpose_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
 
 unsigned int aom_avg_4x4_neon(const uint8_t *a, int a_stride) {
   const uint8x16_t b = load_unaligned_u8q(a, a_stride);
diff --git a/aom_dsp/arm/blend_a64_mask_neon.c b/aom_dsp/arm/blend_a64_mask_neon.c
index e7f08a5..8709e38 100644
--- a/aom_dsp/arm/blend_a64_mask_neon.c
+++ b/aom_dsp/arm/blend_a64_mask_neon.c
@@ -15,8 +15,8 @@
 #include "aom/aom_integer.h"
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/blend.h"
+#include "aom_dsp/arm/mem_neon.h"
 #include "aom_ports/mem.h"
-#include "av1/common/arm/mem_neon.h"
 #include "config/aom_dsp_rtcd.h"
 
 static INLINE void blend8x1(int16x8_t mask, int16x8_t src_0, int16x8_t src_1,
diff --git a/aom_dsp/arm/fwd_txfm_neon.c b/aom_dsp/arm/fwd_txfm_neon.c
index ce93523..7fccdab 100644
--- a/aom_dsp/arm/fwd_txfm_neon.c
+++ b/aom_dsp/arm/fwd_txfm_neon.c
@@ -14,8 +14,8 @@
 #include "config/aom_config.h"
 
 #include "aom_dsp/txfm_common.h"
-#include "av1/common/arm/mem_neon.h"
-#include "av1/common/arm/transpose_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
 
 static void aom_fdct4x4_helper(const int16_t *input, int stride,
                                int16x4_t *input_0, int16x4_t *input_1,
diff --git a/aom_dsp/arm/hadamard_neon.c b/aom_dsp/arm/hadamard_neon.c
index 929792a..7897155 100644
--- a/aom_dsp/arm/hadamard_neon.c
+++ b/aom_dsp/arm/hadamard_neon.c
@@ -12,8 +12,8 @@
 
 #include "config/aom_dsp_rtcd.h"
 #include "aom/aom_integer.h"
-#include "av1/common/arm/mem_neon.h"
-#include "av1/common/arm/transpose_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
 
 static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
                                  int16x8_t *a3, int16x8_t *a4, int16x8_t *a5,
diff --git a/aom_dsp/arm/intrapred_neon.c b/aom_dsp/arm/intrapred_neon.c
index 6d41708..945e7e4 100644
--- a/aom_dsp/arm/intrapred_neon.c
+++ b/aom_dsp/arm/intrapred_neon.c
@@ -11,8 +11,6 @@
 
 #include <arm_neon.h>
 
-#include "common/tools_common.h"
-
 #include "config/aom_config.h"
 #include "config/aom_dsp_rtcd.h"
 
diff --git a/aom_dsp/arm/loopfilter_neon.c b/aom_dsp/arm/loopfilter_neon.c
index aafac89..95d3a17 100644
--- a/aom_dsp/arm/loopfilter_neon.c
+++ b/aom_dsp/arm/loopfilter_neon.c
@@ -15,8 +15,8 @@
 #include "config/aom_config.h"
 
 #include "aom/aom_integer.h"
-#include "av1/common/arm/mem_neon.h"
-#include "av1/common/arm/transpose_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
 
 static INLINE uint8x8_t lpf_mask(uint8x8_t p3q3, uint8x8_t p2q2, uint8x8_t p1q1,
                                  uint8x8_t p0q0, const uint8_t blimit,
diff --git a/av1/common/arm/mem_neon.h b/aom_dsp/arm/mem_neon.h
similarity index 98%
rename from av1/common/arm/mem_neon.h
rename to aom_dsp/arm/mem_neon.h
index 171055f..c8236da 100644
--- a/av1/common/arm/mem_neon.h
+++ b/aom_dsp/arm/mem_neon.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef AOM_AV1_COMMON_ARM_MEM_NEON_H_
-#define AOM_AV1_COMMON_ARM_MEM_NEON_H_
+#ifndef AOM_AOM_DSP_ARM_MEM_NEON_H_
+#define AOM_AOM_DSP_ARM_MEM_NEON_H_
 
 #include <arm_neon.h>
 #include <string.h>
@@ -536,4 +536,4 @@
   vst1q_s32(buf + 4, v1);
 }
 
-#endif  // AOM_AV1_COMMON_ARM_MEM_NEON_H_
+#endif  // AOM_AOM_DSP_ARM_MEM_NEON_H_
diff --git a/aom_dsp/arm/sse_neon.c b/aom_dsp/arm/sse_neon.c
index 1f73443..35b784a 100644
--- a/aom_dsp/arm/sse_neon.c
+++ b/aom_dsp/arm/sse_neon.c
@@ -12,9 +12,9 @@
 
 #include "config/aom_dsp_rtcd.h"
 #include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
 #include "aom_dsp/arm/sum_neon.h"
-#include "av1/common/arm/mem_neon.h"
-#include "av1/common/arm/transpose_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
 
 static INLINE void sse_w16_neon(uint32x4_t *sum, const uint8_t *a,
                                 const uint8_t *b) {
diff --git a/aom_dsp/arm/sum_squares_neon.c b/aom_dsp/arm/sum_squares_neon.c
index 1ce12ec..0b7337a 100644
--- a/aom_dsp/arm/sum_squares_neon.c
+++ b/aom_dsp/arm/sum_squares_neon.c
@@ -12,7 +12,7 @@
 #include <arm_neon.h>
 #include <assert.h>
 
-#include "av1/common/arm/mem_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
 #include "config/aom_dsp_rtcd.h"
 
 static INLINE uint32x4_t sum_squares_i16_4x4_neon(const int16_t *src,
diff --git a/av1/common/arm/transpose_neon.h b/aom_dsp/arm/transpose_neon.h
similarity index 99%
rename from av1/common/arm/transpose_neon.h
rename to aom_dsp/arm/transpose_neon.h
index 91d89b4..ed513f6 100644
--- a/av1/common/arm/transpose_neon.h
+++ b/aom_dsp/arm/transpose_neon.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef AOM_AV1_COMMON_ARM_TRANSPOSE_NEON_H_
-#define AOM_AV1_COMMON_ARM_TRANSPOSE_NEON_H_
+#ifndef AOM_AOM_DSP_ARM_TRANSPOSE_NEON_H_
+#define AOM_AOM_DSP_ARM_TRANSPOSE_NEON_H_
 
 #include <arm_neon.h>
 
@@ -599,4 +599,4 @@
   *a3 = c1.val[1];
 }
 
-#endif  // AOM_AV1_COMMON_ARM_TRANSPOSE_NEON_H_
+#endif  // AOM_AOM_DSP_ARM_TRANSPOSE_NEON_H_
diff --git a/aom_dsp/binary_codes_reader.c b/aom_dsp/binary_codes_reader.c
index 7cd903d..ee0ce62 100644
--- a/aom_dsp/binary_codes_reader.c
+++ b/aom_dsp/binary_codes_reader.c
@@ -11,7 +11,6 @@
 
 #include "aom_dsp/binary_codes_reader.h"
 #include "aom_dsp/recenter.h"
-#include "av1/common/common.h"
 
 uint16_t aom_read_primitive_quniform_(aom_reader *r,
                                       uint16_t n ACCT_STR_PARAM) {
diff --git a/aom_dsp/binary_codes_writer.c b/aom_dsp/binary_codes_writer.c
index adf1c13..55ce842 100644
--- a/aom_dsp/binary_codes_writer.c
+++ b/aom_dsp/binary_codes_writer.c
@@ -13,7 +13,6 @@
 #include "aom_dsp/binary_codes_writer.h"
 #include "aom_dsp/recenter.h"
 #include "aom_ports/bitops.h"
-#include "av1/common/common.h"
 
 // Codes a symbol v in [-2^mag_bits, 2^mag_bits].
 // mag_bits is number of bits for magnitude. The alphabet is of size
diff --git a/aom_dsp/bitreader.h b/aom_dsp/bitreader.h
index 255d98c..29321f9 100644
--- a/aom_dsp/bitreader.h
+++ b/aom_dsp/bitreader.h
@@ -20,8 +20,8 @@
 #include "aom/aomdx.h"
 #include "aom/aom_integer.h"
 #include "aom_dsp/entdec.h"
+#include "aom_dsp/odintrin.h"
 #include "aom_dsp/prob.h"
-#include "av1/common/odintrin.h"
 
 #if CONFIG_BITSTREAM_DEBUG
 #include "aom_util/debug_util.h"
diff --git a/aom_dsp/entcode.h b/aom_dsp/entcode.h
index 7518879..526ca59 100644
--- a/aom_dsp/entcode.h
+++ b/aom_dsp/entcode.h
@@ -14,7 +14,7 @@
 
 #include <limits.h>
 #include <stddef.h>
-#include "av1/common/odintrin.h"
+#include "aom_dsp/odintrin.h"
 #include "aom_dsp/prob.h"
 
 #define EC_PROB_SHIFT 6
diff --git a/aom_dsp/fastssim.c b/aom_dsp/fastssim.c
index 3804519..ea58048 100644
--- a/aom_dsp/fastssim.c
+++ b/aom_dsp/fastssim.c
@@ -20,7 +20,6 @@
 #include "config/aom_dsp_rtcd.h"
 
 #include "aom_dsp/ssim.h"
-#include "aom_ports/system_state.h"
 
 typedef struct fs_level fs_level;
 typedef struct fs_ctx fs_ctx;
@@ -31,6 +30,7 @@
 #define SSIM_C1_12 (4095 * 4095 * 0.01 * 0.01)
 #define SSIM_C2_10 (1023 * 1023 * 0.03 * 0.03)
 #define SSIM_C2_12 (4095 * 4095 * 0.03 * 0.03)
+#define MAX_SSIM_DB 100.0
 
 #define FS_MINI(_a, _b) ((_a) < (_b) ? (_a) : (_b))
 #define FS_MAXI(_a, _b) ((_a) > (_b) ? (_a) : (_b))
@@ -467,7 +467,6 @@
                          uint32_t in_bd) {
   double ssimv;
   uint32_t bd_shift = 0;
-  aom_clear_system_state();
   assert(bd >= in_bd);
   assert(source->flags == dest->flags);
   int buf_is_hbd = source->flags & YV12_FLAG_HIGHBITDEPTH;
diff --git a/aom_dsp/grain_synthesis.c b/aom_dsp/grain_synthesis.c
index 626eb76..d476aff 100644
--- a/aom_dsp/grain_synthesis.c
+++ b/aom_dsp/grain_synthesis.c
@@ -914,7 +914,7 @@
   }
 }
 
-int av1_add_film_grain(const aom_film_grain_t *params, const aom_image_t *src,
+int aom_add_film_grain(const aom_film_grain_t *params, const aom_image_t *src,
                        aom_image_t *dst) {
   uint8_t *luma, *cb, *cr;
   int height, width, luma_stride, chroma_stride;
@@ -1015,12 +1015,12 @@
   luma_stride = dst->stride[AOM_PLANE_Y] >> use_high_bit_depth;
   chroma_stride = dst->stride[AOM_PLANE_U] >> use_high_bit_depth;
 
-  return av1_add_film_grain_run(
+  return aom_add_film_grain_run(
       params, luma, cb, cr, height, width, luma_stride, chroma_stride,
       use_high_bit_depth, chroma_subsamp_y, chroma_subsamp_x, mc_identity);
 }
 
-int av1_add_film_grain_run(const aom_film_grain_t *params, uint8_t *luma,
+int aom_add_film_grain_run(const aom_film_grain_t *params, uint8_t *luma,
                            uint8_t *cb, uint8_t *cr, int height, int width,
                            int luma_stride, int chroma_stride,
                            int use_high_bit_depth, int chroma_subsamp_y,
diff --git a/aom_dsp/grain_synthesis.h b/aom_dsp/grain_synthesis.h
index 9155b39..62f44f5 100644
--- a/aom_dsp/grain_synthesis.h
+++ b/aom_dsp/grain_synthesis.h
@@ -31,7 +31,7 @@
  */
 typedef struct {
   // This structure is compared element-by-element in the function
-  // av1_check_grain_params_equiv: this function must be updated if any changes
+  // aom_check_grain_params_equiv: this function must be updated if any changes
   // are made to this structure.
   int apply_grain;
 
@@ -85,7 +85,7 @@
 
   uint16_t random_seed;
   // This structure is compared element-by-element in the function
-  // av1_check_grain_params_equiv: this function must be updated if any changes
+  // aom_check_grain_params_equiv: this function must be updated if any changes
   // are made to this structure.
 } aom_film_grain_t;
 
@@ -98,7 +98,7 @@
  * \param[in]    pb               The second set of parameters to compare
  * \return       Returns 1 if the params are equivalent, 0 otherwise
  */
-static INLINE int av1_check_grain_params_equiv(
+static INLINE int aom_check_grain_params_equiv(
     const aom_film_grain_t *const pa, const aom_film_grain_t *const pb) {
   if (pa->apply_grain != pb->apply_grain) return 0;
   // Don't compare update_parameters
@@ -166,7 +166,7 @@
  * \param[in]    luma_stride      luma plane stride
  * \param[in]    chroma_stride    chroma plane stride
  */
-int av1_add_film_grain_run(const aom_film_grain_t *grain_params, uint8_t *luma,
+int aom_add_film_grain_run(const aom_film_grain_t *grain_params, uint8_t *luma,
                            uint8_t *cb, uint8_t *cr, int height, int width,
                            int luma_stride, int chroma_stride,
                            int use_high_bit_depth, int chroma_subsamp_y,
@@ -182,7 +182,7 @@
  * \param[in]    src              Source image
  * \param[out]   dst              Resulting image with grain
  */
-int av1_add_film_grain(const aom_film_grain_t *grain_params,
+int aom_add_film_grain(const aom_film_grain_t *grain_params,
                        const aom_image_t *src, aom_image_t *dst);
 
 #ifdef __cplusplus
diff --git a/aom_dsp/grain_table.c b/aom_dsp/grain_table.c
index 66c604e..b22752a 100644
--- a/aom_dsp/grain_table.c
+++ b/aom_dsp/grain_table.c
@@ -202,7 +202,7 @@
                                 int64_t end_time, int erase,
                                 aom_film_grain_t *grain) {
   aom_film_grain_table_entry_t *entry = t->head;
-  aom_film_grain_table_entry_t *prev_entry = 0;
+  aom_film_grain_table_entry_t *prev_entry = NULL;
   uint16_t random_seed = grain ? grain->random_seed : 0;
   if (grain) memset(grain, 0, sizeof(*grain));
 
@@ -241,7 +241,7 @@
         entry->end_time = time_stamp;
         if (t->tail == entry) t->tail = new_entry;
       }
-      // If segments aren't aligned, delete from the beggining of subsequent
+      // If segments aren't aligned, delete from the beginning of subsequent
       // segments
       if (end_time > entry_end_time) {
         aom_film_grain_table_lookup(t, entry_end_time, end_time, 1, 0);
@@ -275,12 +275,12 @@
     return error_info->error_code;
   }
 
-  aom_film_grain_table_entry_t *prev_entry = 0;
+  aom_film_grain_table_entry_t *prev_entry = NULL;
   while (!feof(file)) {
     aom_film_grain_table_entry_t *entry = aom_malloc(sizeof(*entry));
     memset(entry, 0, sizeof(*entry));
     grain_table_entry_read(file, error_info, entry);
-    entry->next = 0;
+    entry->next = NULL;
 
     if (prev_entry) prev_entry->next = entry;
     if (!t->head) t->head = entry;
diff --git a/av1/encoder/mathutils.h b/aom_dsp/mathutils.h
similarity index 97%
rename from av1/encoder/mathutils.h
rename to aom_dsp/mathutils.h
index 576de07..a52a2df 100644
--- a/av1/encoder/mathutils.h
+++ b/aom_dsp/mathutils.h
@@ -9,14 +9,15 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AOM_AV1_ENCODER_MATHUTILS_H_
-#define AOM_AV1_ENCODER_MATHUTILS_H_
+#ifndef AOM_AOM_DSP_MATHUTILS_H_
+#define AOM_AOM_DSP_MATHUTILS_H_
 
-#include <memory.h>
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
 #include <assert.h>
+#include <math.h>
+#include <string.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
 
 static const double TINY_NEAR_ZERO = 1.0E-16;
 
@@ -85,7 +86,7 @@
     for (k = 0; k < rows; ++k) Atb[i] += A[k * stride + i] * b[k];
   }
   int ret = linsolve(n, AtA, n, Atb, x);
-  if (scratch_) aom_free(scratch_);
+  aom_free(scratch_);
   return ret;
 }
 
@@ -356,4 +357,4 @@
   return 0;
 }
 
-#endif  // AOM_AV1_ENCODER_MATHUTILS_H_
+#endif  // AOM_AOM_DSP_MATHUTILS_H_
diff --git a/aom_dsp/noise_model.c b/aom_dsp/noise_model.c
index 0b74009..5930199 100644
--- a/aom_dsp/noise_model.c
+++ b/aom_dsp/noise_model.c
@@ -15,11 +15,10 @@
 #include <string.h>
 
 #include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/mathutils.h"
 #include "aom_dsp/noise_model.h"
 #include "aom_dsp/noise_util.h"
 #include "aom_mem/aom_mem.h"
-#include "av1/common/common.h"
-#include "av1/encoder/mathutils.h"
 
 #define kLowPolyNumParams 3
 
@@ -214,7 +213,7 @@
 
 int aom_noise_strength_lut_init(aom_noise_strength_lut_t *lut, int num_points) {
   if (!lut) return 0;
-  if (num_points < 0) return 0;
+  if (num_points <= 0) return 0;
   lut->num_points = 0;
   lut->points = (double(*)[2])aom_malloc(num_points * sizeof(*lut->points));
   if (!lut->points) return 0;
@@ -1153,12 +1152,24 @@
 
   // Convert the scaling functions to 8 bit values
   aom_noise_strength_lut_t scaling_points[3];
-  aom_noise_strength_solver_fit_piecewise(
-      &noise_model->combined_state[0].strength_solver, 14, scaling_points + 0);
-  aom_noise_strength_solver_fit_piecewise(
-      &noise_model->combined_state[1].strength_solver, 10, scaling_points + 1);
-  aom_noise_strength_solver_fit_piecewise(
-      &noise_model->combined_state[2].strength_solver, 10, scaling_points + 2);
+  if (!aom_noise_strength_solver_fit_piecewise(
+          &noise_model->combined_state[0].strength_solver, 14,
+          scaling_points + 0)) {
+    return 0;
+  }
+  if (!aom_noise_strength_solver_fit_piecewise(
+          &noise_model->combined_state[1].strength_solver, 10,
+          scaling_points + 1)) {
+    aom_noise_strength_lut_free(scaling_points + 0);
+    return 0;
+  }
+  if (!aom_noise_strength_solver_fit_piecewise(
+          &noise_model->combined_state[2].strength_solver, 10,
+          scaling_points + 2)) {
+    aom_noise_strength_lut_free(scaling_points + 0);
+    aom_noise_strength_lut_free(scaling_points + 1);
+    return 0;
+  }
 
   // Both the domain and the range of the scaling functions in the film_grain
   // are normalized to 8-bit (e.g., they are implicitly scaled during grain
diff --git a/av1/common/odintrin.c b/aom_dsp/odintrin.c
similarity index 99%
rename from av1/common/odintrin.c
rename to aom_dsp/odintrin.c
index 7584b2e..eb6d8d8 100644
--- a/av1/common/odintrin.c
+++ b/aom_dsp/odintrin.c
@@ -11,7 +11,7 @@
 
 /* clang-format off */
 
-#include "av1/common/odintrin.h"
+#include "aom_dsp/odintrin.h"
 
 /*Constants for use with OD_DIVU_SMALL().
   See \cite{Rob05} for details on computing these constants.
diff --git a/av1/common/odintrin.h b/aom_dsp/odintrin.h
similarity index 95%
rename from av1/common/odintrin.h
rename to aom_dsp/odintrin.h
index e1db0f4..20a7f58 100644
--- a/av1/common/odintrin.h
+++ b/aom_dsp/odintrin.h
@@ -11,8 +11,8 @@
 
 /* clang-format off */
 
-#ifndef AOM_AV1_COMMON_ODINTRIN_H_
-#define AOM_AV1_COMMON_ODINTRIN_H_
+#ifndef AOM_AOM_DSP_ODINTRIN_H_
+#define AOM_AOM_DSP_ODINTRIN_H_
 
 #include <stdlib.h>
 #include <string.h>
@@ -20,7 +20,6 @@
 #include "aom/aom_integer.h"
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_ports/bitops.h"
-#include "av1/common/enums.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -93,4 +92,4 @@
 }  // extern "C"
 #endif
 
-#endif  // AOM_AV1_COMMON_ODINTRIN_H_
+#endif  // AOM_AOM_DSP_ODINTRIN_H_
diff --git a/aom_dsp/psnrhvs.c b/aom_dsp/psnrhvs.c
index 69a1d99..966ba00 100644
--- a/aom_dsp/psnrhvs.c
+++ b/aom_dsp/psnrhvs.c
@@ -22,7 +22,6 @@
 
 #include "aom_dsp/psnr.h"
 #include "aom_dsp/ssim.h"
-#include "aom_ports/system_state.h"
 
 static void od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x,
                            int xstride) {
@@ -34,6 +33,7 @@
       *(y + ystride * i + j) = (*(y + ystride * i + j) + 4) >> 3;
 }
 
+#if CONFIG_AV1_HIGHBITDEPTH
 static void hbd_od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x,
                                int xstride) {
   int i, j;
@@ -43,6 +43,7 @@
     for (j = 0; j < 8; j++)
       *(y + ystride * i + j) = (*(y + ystride * i + j) + 4) >> 3;
 }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 /* Normalized inverse quantization matrix for 8x8 DCT at the point of
  * transparency. This is not the JPEG based matrix from the paper,
@@ -210,6 +211,7 @@
         }
       }
       s_gvar = 1.f / (36 - n + 1) * s_gmean / 36.f;
+#if CONFIG_AV1_HIGHBITDEPTH
       if (!buf_is_hbd) {
         od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8);
         od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8);
@@ -217,6 +219,10 @@
         hbd_od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8);
         hbd_od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8);
       }
+#else
+      od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8);
+      od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8);
+#endif  // CONFIG_AV1_HIGHBITDEPTH
       for (i = 0; i < 8; i++)
         for (j = (i == 0); j < 8; j++)
           s_mask += dct_s_coef[i * 8 + j] * dct_s_coef[i * 8 + j] * mask[i][j];
@@ -246,7 +252,6 @@
   const double par = 1.0;
   const int step = 7;
   uint32_t bd_shift = 0;
-  aom_clear_system_state();
   assert(bd == 8 || bd == 10 || bd == 12);
   assert(bd >= in_bd);
   assert(src->flags == dst->flags);
diff --git a/aom_dsp/quantize.c b/aom_dsp/quantize.c
index edd4d96..36ca58f 100644
--- a/aom_dsp/quantize.c
+++ b/aom_dsp/quantize.c
@@ -11,7 +11,6 @@
 
 #include "aom_dsp/quantize.h"
 #include "aom_mem/aom_mem.h"
-#include "av1/encoder/av1_quantize.h"
 
 void aom_quantize_b_adaptive_helper_c(
     const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
diff --git a/aom_dsp/quantize.h b/aom_dsp/quantize.h
index 3956318..efe253d 100644
--- a/aom_dsp/quantize.h
+++ b/aom_dsp/quantize.h
@@ -20,6 +20,9 @@
 extern "C" {
 #endif
 
+#define EOB_FACTOR 325
+#define SKIP_EOB_FACTOR_ADJUST 200
+
 void aom_quantize_b_adaptive_helper_c(
     const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
     const int16_t *round_ptr, const int16_t *quant_ptr,
diff --git a/aom_dsp/ssim.c b/aom_dsp/ssim.c
index 357da99..35d493b 100644
--- a/aom_dsp/ssim.c
+++ b/aom_dsp/ssim.c
@@ -16,8 +16,8 @@
 
 #include "aom_dsp/ssim.h"
 #include "aom_ports/mem.h"
-#include "aom_ports/system_state.h"
 
+#if CONFIG_INTERNAL_STATS
 void aom_ssim_parms_16x16_c(const uint8_t *s, int sp, const uint8_t *r, int rp,
                             uint32_t *sum_s, uint32_t *sum_r,
                             uint32_t *sum_sq_s, uint32_t *sum_sq_r,
@@ -33,6 +33,7 @@
     }
   }
 }
+#endif  // CONFIG_INTERNAL_STATS
 
 void aom_ssim_parms_8x8_c(const uint8_t *s, int sp, const uint8_t *r, int rp,
                           uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s,
@@ -49,24 +50,6 @@
   }
 }
 
-#if CONFIG_AV1_HIGHBITDEPTH
-void aom_highbd_ssim_parms_8x8_c(const uint16_t *s, int sp, const uint16_t *r,
-                                 int rp, uint32_t *sum_s, uint32_t *sum_r,
-                                 uint32_t *sum_sq_s, uint32_t *sum_sq_r,
-                                 uint32_t *sum_sxr) {
-  int i, j;
-  for (i = 0; i < 8; i++, s += sp, r += rp) {
-    for (j = 0; j < 8; j++) {
-      *sum_s += s[j];
-      *sum_r += r[j];
-      *sum_sq_s += s[j] * s[j];
-      *sum_sq_r += r[j] * r[j];
-      *sum_sxr += s[j] * r[j];
-    }
-  }
-}
-#endif
-
 static const int64_t cc1 = 26634;        // (64^2*(.01*255)^2
 static const int64_t cc2 = 239708;       // (64^2*(.03*255)^2
 static const int64_t cc1_10 = 428658;    // (64^2*(.01*1023)^2
@@ -78,7 +61,7 @@
                          uint32_t sum_sq_r, uint32_t sum_sxr, int count,
                          uint32_t bd) {
   double ssim_n, ssim_d;
-  int64_t c1, c2;
+  int64_t c1 = 0, c2 = 0;
   if (bd == 8) {
     // scale the constants by number of pixels
     c1 = (cc1 * count * count) >> 12;
@@ -90,8 +73,9 @@
     c1 = (cc1_12 * count * count) >> 12;
     c2 = (cc2_12 * count * count) >> 12;
   } else {
-    c1 = c2 = 0;
     assert(0);
+    // Return similarity as zero for unsupported bit-depth values.
+    return 0;
   }
 
   ssim_n = (2.0 * sum_s * sum_r + c1) *
@@ -111,21 +95,11 @@
   return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64, 8);
 }
 
-static double highbd_ssim_8x8(const uint16_t *s, int sp, const uint16_t *r,
-                              int rp, uint32_t bd, uint32_t shift) {
-  uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
-  aom_highbd_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
-                            &sum_sxr);
-  return similarity(sum_s >> shift, sum_r >> shift, sum_sq_s >> (2 * shift),
-                    sum_sq_r >> (2 * shift), sum_sxr >> (2 * shift), 64, bd);
-}
-
 // We are using a 8x8 moving window with starting location of each 8x8 window
 // on the 4x4 pixel grid. Such arrangement allows the windows to overlap
 // block boundaries to penalize blocking artifacts.
-static double aom_ssim2(const uint8_t *img1, const uint8_t *img2,
-                        int stride_img1, int stride_img2, int width,
-                        int height) {
+double aom_ssim2(const uint8_t *img1, const uint8_t *img2, int stride_img1,
+                 int stride_img2, int width, int height) {
   int i, j;
   int samples = 0;
   double ssim_total = 0;
@@ -143,31 +117,10 @@
   return ssim_total;
 }
 
-static double aom_highbd_ssim2(const uint8_t *img1, const uint8_t *img2,
-                               int stride_img1, int stride_img2, int width,
-                               int height, uint32_t bd, uint32_t shift) {
-  int i, j;
-  int samples = 0;
-  double ssim_total = 0;
-
-  // sample point start with each 4x4 location
-  for (i = 0; i <= height - 8;
-       i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) {
-    for (j = 0; j <= width - 8; j += 4) {
-      double v = highbd_ssim_8x8(CONVERT_TO_SHORTPTR(img1 + j), stride_img1,
-                                 CONVERT_TO_SHORTPTR(img2 + j), stride_img2, bd,
-                                 shift);
-      ssim_total += v;
-      samples++;
-    }
-  }
-  ssim_total /= samples;
-  return ssim_total;
-}
-
-void aom_calc_ssim(const YV12_BUFFER_CONFIG *source,
-                   const YV12_BUFFER_CONFIG *dest, double *weight,
-                   double *fast_ssim) {
+#if CONFIG_INTERNAL_STATS
+void aom_lowbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
+                         const YV12_BUFFER_CONFIG *dest, double *weight,
+                         double *fast_ssim) {
   double abc[3];
   for (int i = 0; i < 3; ++i) {
     const int is_uv = i > 0;
@@ -273,7 +226,6 @@
   int c = 0;
   double norm;
   double old_ssim_total = 0;
-  aom_clear_system_state();
   // We can sample points as frequently as we like start with 1 per 4x4.
   for (i = 0; i < height;
        i += 4, img1 += img1_pitch * 4, img2 += img2_pitch * 4) {
@@ -421,7 +373,57 @@
   m->dssim = dssim_total;
   return inconsistency_total;
 }
+#endif  // CONFIG_INTERNAL_STATS
 
+#if CONFIG_AV1_HIGHBITDEPTH
+void aom_highbd_ssim_parms_8x8_c(const uint16_t *s, int sp, const uint16_t *r,
+                                 int rp, uint32_t *sum_s, uint32_t *sum_r,
+                                 uint32_t *sum_sq_s, uint32_t *sum_sq_r,
+                                 uint32_t *sum_sxr) {
+  int i, j;
+  for (i = 0; i < 8; i++, s += sp, r += rp) {
+    for (j = 0; j < 8; j++) {
+      *sum_s += s[j];
+      *sum_r += r[j];
+      *sum_sq_s += s[j] * s[j];
+      *sum_sq_r += r[j] * r[j];
+      *sum_sxr += s[j] * r[j];
+    }
+  }
+}
+
+static double highbd_ssim_8x8(const uint16_t *s, int sp, const uint16_t *r,
+                              int rp, uint32_t bd, uint32_t shift) {
+  uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
+  aom_highbd_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
+                            &sum_sxr);
+  return similarity(sum_s >> shift, sum_r >> shift, sum_sq_s >> (2 * shift),
+                    sum_sq_r >> (2 * shift), sum_sxr >> (2 * shift), 64, bd);
+}
+
+double aom_highbd_ssim2(const uint8_t *img1, const uint8_t *img2,
+                        int stride_img1, int stride_img2, int width, int height,
+                        uint32_t bd, uint32_t shift) {
+  int i, j;
+  int samples = 0;
+  double ssim_total = 0;
+
+  // sample point start with each 4x4 location
+  for (i = 0; i <= height - 8;
+       i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) {
+    for (j = 0; j <= width - 8; j += 4) {
+      double v = highbd_ssim_8x8(CONVERT_TO_SHORTPTR(img1 + j), stride_img1,
+                                 CONVERT_TO_SHORTPTR(img2 + j), stride_img2, bd,
+                                 shift);
+      ssim_total += v;
+      samples++;
+    }
+  }
+  ssim_total /= samples;
+  return ssim_total;
+}
+
+#if CONFIG_INTERNAL_STATS
 void aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
                           const YV12_BUFFER_CONFIG *dest, double *weight,
                           uint32_t bd, uint32_t in_bd, double *fast_ssim) {
@@ -455,3 +457,25 @@
     fast_ssim[1] = abc[0] * .8 + .1 * (abc[1] + abc[2]);
   }
 }
+#endif  // CONFIG_INTERNAL_STATS
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+#if CONFIG_INTERNAL_STATS
+void aom_calc_ssim(const YV12_BUFFER_CONFIG *orig,
+                   const YV12_BUFFER_CONFIG *recon, const uint32_t bit_depth,
+                   const uint32_t in_bit_depth, int is_hbd, double *weight,
+                   double *frame_ssim2) {
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (is_hbd) {
+    aom_highbd_calc_ssim(orig, recon, weight, bit_depth, in_bit_depth,
+                         frame_ssim2);
+    return;
+  }
+#else
+  (void)bit_depth;
+  (void)in_bit_depth;
+  (void)is_hbd;
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+  aom_lowbd_calc_ssim(orig, recon, weight, frame_ssim2);
+}
+#endif  // CONFIG_INTERNAL_STATS
diff --git a/aom_dsp/ssim.h b/aom_dsp/ssim.h
index d635ef5..fb92556 100644
--- a/aom_dsp/ssim.h
+++ b/aom_dsp/ssim.h
@@ -12,14 +12,13 @@
 #ifndef AOM_AOM_DSP_SSIM_H_
 #define AOM_AOM_DSP_SSIM_H_
 
-#define MAX_SSIM_DB 100.0;
-
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 #include "config/aom_config.h"
 
+#if CONFIG_INTERNAL_STATS
 #include "aom_scale/yv12config.h"
 
 // metrics used for calculating ssim, ssim2, dssim, and ssimc
@@ -68,18 +67,35 @@
                             int img2_pitch, int width, int height, Ssimv *sv2,
                             Metrics *m, int do_inconsistency);
 
-void aom_calc_ssim(const YV12_BUFFER_CONFIG *source,
-                   const YV12_BUFFER_CONFIG *dest, double *weight,
-                   double *fast_ssim);
+void aom_lowbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
+                         const YV12_BUFFER_CONFIG *dest, double *weight,
+                         double *fast_ssim);
 
 double aom_calc_fastssim(const YV12_BUFFER_CONFIG *source,
                          const YV12_BUFFER_CONFIG *dest, double *ssim_y,
                          double *ssim_u, double *ssim_v, uint32_t bd,
                          uint32_t in_bd);
 
+#if CONFIG_AV1_HIGHBITDEPTH
 void aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
                           const YV12_BUFFER_CONFIG *dest, double *weight,
                           uint32_t bd, uint32_t in_bd, double *fast_ssim);
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+void aom_calc_ssim(const YV12_BUFFER_CONFIG *orig,
+                   const YV12_BUFFER_CONFIG *recon, const uint32_t bit_depth,
+                   const uint32_t in_bit_depth, int is_hbd, double *weight,
+                   double *frame_ssim2);
+#endif  // CONFIG_INTERNAL_STATS
+
+double aom_ssim2(const uint8_t *img1, const uint8_t *img2, int stride_img1,
+                 int stride_img2, int width, int height);
+
+#if CONFIG_AV1_HIGHBITDEPTH
+double aom_highbd_ssim2(const uint8_t *img1, const uint8_t *img2,
+                        int stride_img1, int stride_img2, int width, int height,
+                        uint32_t bd, uint32_t shift);
+#endif  // CONFIG_AV1_HIGHBITDEPTH
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/aom_dsp/txfm_common.h b/aom_dsp/txfm_common.h
index f13d690..67d9e90 100644
--- a/aom_dsp/txfm_common.h
+++ b/aom_dsp/txfm_common.h
@@ -13,7 +13,6 @@
 #define AOM_AOM_DSP_TXFM_COMMON_H_
 
 #include "aom_dsp/aom_dsp_common.h"
-#include "av1/common/enums.h"
 
 // Constants and Macros used by all idct/dct functions
 #define DCT_CONST_BITS 14
@@ -22,6 +21,71 @@
 #define UNIT_QUANT_SHIFT 2
 #define UNIT_QUANT_FACTOR (1 << UNIT_QUANT_SHIFT)
 
+// block transform size
+enum {
+  TX_4X4,             // 4x4 transform
+  TX_8X8,             // 8x8 transform
+  TX_16X16,           // 16x16 transform
+  TX_32X32,           // 32x32 transform
+  TX_64X64,           // 64x64 transform
+  TX_4X8,             // 4x8 transform
+  TX_8X4,             // 8x4 transform
+  TX_8X16,            // 8x16 transform
+  TX_16X8,            // 16x8 transform
+  TX_16X32,           // 16x32 transform
+  TX_32X16,           // 32x16 transform
+  TX_32X64,           // 32x64 transform
+  TX_64X32,           // 64x32 transform
+  TX_4X16,            // 4x16 transform
+  TX_16X4,            // 16x4 transform
+  TX_8X32,            // 8x32 transform
+  TX_32X8,            // 32x8 transform
+  TX_16X64,           // 16x64 transform
+  TX_64X16,           // 64x16 transform
+  TX_SIZES_ALL,       // Includes rectangular transforms
+  TX_SIZES = TX_4X8,  // Does NOT include rectangular transforms
+  TX_SIZES_LARGEST = TX_64X64,
+  TX_INVALID = 255  // Invalid transform size
+} UENUM1BYTE(TX_SIZE);
+
+enum {
+  DCT_DCT,            // DCT in both horizontal and vertical
+  ADST_DCT,           // ADST in vertical, DCT in horizontal
+  DCT_ADST,           // DCT in vertical, ADST in horizontal
+  ADST_ADST,          // ADST in both directions
+  FLIPADST_DCT,       // FLIPADST in vertical, DCT in horizontal
+  DCT_FLIPADST,       // DCT in vertical, FLIPADST in horizontal
+  FLIPADST_FLIPADST,  // FLIPADST in both directions
+  ADST_FLIPADST,      // ADST in vertical, FLIPADST in horizontal
+  FLIPADST_ADST,      // FLIPADST in vertical, ADST in horizontal
+  IDTX,               // Identity in both directions
+  V_DCT,              // DCT in vertical, identity in horizontal
+  H_DCT,              // Identity in vertical, DCT in horizontal
+  V_ADST,             // ADST in vertical, identity in horizontal
+  H_ADST,             // Identity in vertical, ADST in horizontal
+  V_FLIPADST,         // FLIPADST in vertical, identity in horizontal
+  H_FLIPADST,         // Identity in vertical, FLIPADST in horizontal
+  TX_TYPES,
+  DCT_ADST_TX_MASK = 0x000F,  // Either DCT or ADST in each direction
+  TX_TYPE_INVALID = 255,      // Invalid transform type
+} UENUM1BYTE(TX_TYPE);
+
+enum {
+  // DCT only
+  EXT_TX_SET_DCTONLY,
+  // DCT + Identity only
+  EXT_TX_SET_DCT_IDTX,
+  // Discrete Trig transforms w/o flip (4) + Identity (1)
+  EXT_TX_SET_DTT4_IDTX,
+  // Discrete Trig transforms w/o flip (4) + Identity (1) + 1D Hor/vert DCT (2)
+  EXT_TX_SET_DTT4_IDTX_1DDCT,
+  // Discrete Trig transforms w/ flip (9) + Identity (1) + 1D Hor/Ver DCT (2)
+  EXT_TX_SET_DTT9_IDTX_1DDCT,
+  // Discrete Trig transforms w/ flip (9) + Identity (1) + 1D Hor/Ver (6)
+  EXT_TX_SET_ALL16,
+  EXT_TX_SET_TYPES
+} UENUM1BYTE(TxSetType);
+
 typedef struct txfm_param {
   // for both forward and inverse transforms
   TX_TYPE tx_type;
diff --git a/aom_dsp/variance.c b/aom_dsp/variance.c
index 20af52b..cb9356e 100644
--- a/aom_dsp/variance.c
+++ b/aom_dsp/variance.c
@@ -14,7 +14,6 @@
 
 #include "config/aom_config.h"
 #include "config/aom_dsp_rtcd.h"
-#include "config/av1_rtcd.h"
 
 #include "aom/aom_integer.h"
 #include "aom_ports/mem.h"
@@ -23,10 +22,8 @@
 #include "aom_dsp/blend.h"
 #include "aom_dsp/variance.h"
 
-#include "av1/common/av1_common_int.h"
 #include "av1/common/filter.h"
 #include "av1/common/reconinter.h"
-#include "av1/encoder/reconinter_enc.h"
 
 uint32_t aom_get4x4sse_cs_c(const uint8_t *a, int a_stride, const uint8_t *b,
                             int b_stride) {
@@ -284,101 +281,6 @@
   }
 }
 
-// Get pred block from up-sampled reference.
-void aom_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
-                          int mi_row, int mi_col, const MV *const mv,
-                          uint8_t *comp_pred, int width, int height,
-                          int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
-                          int ref_stride, int subpel_search) {
-  // expect xd == NULL only in tests
-  if (xd != NULL) {
-    const MB_MODE_INFO *mi = xd->mi[0];
-    const int ref_num = 0;
-    const int is_intrabc = is_intrabc_block(mi);
-    const struct scale_factors *const sf =
-        is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
-    const int is_scaled = av1_is_scaled(sf);
-
-    if (is_scaled) {
-      int plane = 0;
-      const int mi_x = mi_col * MI_SIZE;
-      const int mi_y = mi_row * MI_SIZE;
-      const struct macroblockd_plane *const pd = &xd->plane[plane];
-      const struct buf_2d *const dst_buf = &pd->dst;
-      const struct buf_2d *const pre_buf =
-          is_intrabc ? dst_buf : &pd->pre[ref_num];
-
-      InterPredParams inter_pred_params;
-      inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
-      const int_interpfilters filters =
-          av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
-      av1_init_inter_params(
-          &inter_pred_params, width, height, mi_y >> pd->subsampling_y,
-          mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y,
-          xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters);
-      av1_enc_build_one_inter_predictor(comp_pred, width, mv,
-                                        &inter_pred_params);
-      return;
-    }
-  }
-
-  const InterpFilterParams *filter = av1_get_filter(subpel_search);
-
-  if (!subpel_x_q3 && !subpel_y_q3) {
-    for (int i = 0; i < height; i++) {
-      memcpy(comp_pred, ref, width * sizeof(*comp_pred));
-      comp_pred += width;
-      ref += ref_stride;
-    }
-  } else if (!subpel_y_q3) {
-    const int16_t *const kernel =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
-    aom_convolve8_horiz_c(ref, ref_stride, comp_pred, width, kernel, 16, NULL,
-                          -1, width, height);
-  } else if (!subpel_x_q3) {
-    const int16_t *const kernel =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
-    aom_convolve8_vert_c(ref, ref_stride, comp_pred, width, NULL, -1, kernel,
-                         16, width, height);
-  } else {
-    DECLARE_ALIGNED(16, uint8_t,
-                    temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
-    const int16_t *const kernel_x =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
-    const int16_t *const kernel_y =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
-    const int intermediate_height =
-        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
-    assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
-    aom_convolve8_horiz_c(ref - ref_stride * ((filter->taps >> 1) - 1),
-                          ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1,
-                          width, intermediate_height);
-    aom_convolve8_vert_c(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1),
-                         MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16,
-                         width, height);
-  }
-}
-
-void aom_comp_avg_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
-                                   int mi_row, int mi_col, const MV *const mv,
-                                   uint8_t *comp_pred, const uint8_t *pred,
-                                   int width, int height, int subpel_x_q3,
-                                   int subpel_y_q3, const uint8_t *ref,
-                                   int ref_stride, int subpel_search) {
-  int i, j;
-
-  aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
-                       subpel_x_q3, subpel_y_q3, ref, ref_stride,
-                       subpel_search);
-  for (i = 0; i < height; i++) {
-    for (j = 0; j < width; j++) {
-      comp_pred[j] = ROUND_POWER_OF_TWO(comp_pred[j] + pred[j], 1);
-    }
-    comp_pred += width;
-    pred += width;
-  }
-}
-
 void aom_dist_wtd_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred,
                                   int width, int height, const uint8_t *ref,
                                   int ref_stride,
@@ -399,30 +301,6 @@
   }
 }
 
-void aom_dist_wtd_comp_avg_upsampled_pred_c(
-    MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col,
-    const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
-    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
-    int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search) {
-  int i, j;
-  const int fwd_offset = jcp_param->fwd_offset;
-  const int bck_offset = jcp_param->bck_offset;
-
-  aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
-                       subpel_x_q3, subpel_y_q3, ref, ref_stride,
-                       subpel_search);
-
-  for (i = 0; i < height; i++) {
-    for (j = 0; j < width; j++) {
-      int tmp = pred[j] * bck_offset + comp_pred[j] * fwd_offset;
-      tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
-      comp_pred[j] = (uint8_t)tmp;
-    }
-    comp_pred += width;
-    pred += width;
-  }
-}
-
 #if CONFIG_AV1_HIGHBITDEPTH
 static void highbd_variance64(const uint8_t *a8, int a_stride,
                               const uint8_t *b8, int b_stride, int w, int h,
@@ -831,107 +709,6 @@
   }
 }
 
-void aom_highbd_upsampled_pred_c(MACROBLOCKD *xd,
-                                 const struct AV1Common *const cm, int mi_row,
-                                 int mi_col, const MV *const mv,
-                                 uint8_t *comp_pred8, int width, int height,
-                                 int subpel_x_q3, int subpel_y_q3,
-                                 const uint8_t *ref8, int ref_stride, int bd,
-                                 int subpel_search) {
-  // expect xd == NULL only in tests
-  if (xd != NULL) {
-    const MB_MODE_INFO *mi = xd->mi[0];
-    const int ref_num = 0;
-    const int is_intrabc = is_intrabc_block(mi);
-    const struct scale_factors *const sf =
-        is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
-    const int is_scaled = av1_is_scaled(sf);
-
-    if (is_scaled) {
-      int plane = 0;
-      const int mi_x = mi_col * MI_SIZE;
-      const int mi_y = mi_row * MI_SIZE;
-      const struct macroblockd_plane *const pd = &xd->plane[plane];
-      const struct buf_2d *const dst_buf = &pd->dst;
-      const struct buf_2d *const pre_buf =
-          is_intrabc ? dst_buf : &pd->pre[ref_num];
-
-      InterPredParams inter_pred_params;
-      inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
-      const int_interpfilters filters =
-          av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
-      av1_init_inter_params(
-          &inter_pred_params, width, height, mi_y >> pd->subsampling_y,
-          mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y,
-          xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters);
-      av1_enc_build_one_inter_predictor(comp_pred8, width, mv,
-                                        &inter_pred_params);
-      return;
-    }
-  }
-
-  const InterpFilterParams *filter = av1_get_filter(subpel_search);
-
-  if (!subpel_x_q3 && !subpel_y_q3) {
-    const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
-    uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
-    for (int i = 0; i < height; i++) {
-      memcpy(comp_pred, ref, width * sizeof(*comp_pred));
-      comp_pred += width;
-      ref += ref_stride;
-    }
-  } else if (!subpel_y_q3) {
-    const int16_t *const kernel =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
-    aom_highbd_convolve8_horiz_c(ref8, ref_stride, comp_pred8, width, kernel,
-                                 16, NULL, -1, width, height, bd);
-  } else if (!subpel_x_q3) {
-    const int16_t *const kernel =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
-    aom_highbd_convolve8_vert_c(ref8, ref_stride, comp_pred8, width, NULL, -1,
-                                kernel, 16, width, height, bd);
-  } else {
-    DECLARE_ALIGNED(16, uint16_t,
-                    temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]);
-    const int16_t *const kernel_x =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
-    const int16_t *const kernel_y =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
-    const int intermediate_height =
-        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
-    assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
-    aom_highbd_convolve8_horiz_c(ref8 - ref_stride * ((filter->taps >> 1) - 1),
-                                 ref_stride, CONVERT_TO_BYTEPTR(temp),
-                                 MAX_SB_SIZE, kernel_x, 16, NULL, -1, width,
-                                 intermediate_height, bd);
-    aom_highbd_convolve8_vert_c(
-        CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1)),
-        MAX_SB_SIZE, comp_pred8, width, NULL, -1, kernel_y, 16, width, height,
-        bd);
-  }
-}
-
-void aom_highbd_comp_avg_upsampled_pred_c(
-    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
-    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
-    int ref_stride, int bd, int subpel_search) {
-  int i, j;
-
-  const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
-  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
-  aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
-                            height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
-                            bd, subpel_search);
-  for (i = 0; i < height; ++i) {
-    for (j = 0; j < width; ++j) {
-      comp_pred[j] = ROUND_POWER_OF_TWO(pred[j] + comp_pred[j], 1);
-    }
-    comp_pred += width;
-    pred += width;
-  }
-}
-
 void aom_highbd_dist_wtd_comp_avg_pred_c(
     uint8_t *comp_pred8, const uint8_t *pred8, int width, int height,
     const uint8_t *ref8, int ref_stride,
@@ -954,32 +731,6 @@
     ref += ref_stride;
   }
 }
-
-void aom_highbd_dist_wtd_comp_avg_upsampled_pred_c(
-    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
-    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
-    int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param,
-    int subpel_search) {
-  int i, j;
-  const int fwd_offset = jcp_param->fwd_offset;
-  const int bck_offset = jcp_param->bck_offset;
-  const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
-  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
-  aom_highbd_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
-                              height, subpel_x_q3, subpel_y_q3, ref8,
-                              ref_stride, bd, subpel_search);
-
-  for (i = 0; i < height; i++) {
-    for (j = 0; j < width; j++) {
-      int tmp = pred[j] * bck_offset + comp_pred[j] * fwd_offset;
-      tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
-      comp_pred[j] = (uint16_t)tmp;
-    }
-    comp_pred += width;
-    pred += width;
-  }
-}
 #endif  // CONFIG_AV1_HIGHBITDEPTH
 
 void aom_comp_mask_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
@@ -1002,25 +753,6 @@
   }
 }
 
-void aom_comp_mask_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
-                                    int mi_row, int mi_col, const MV *const mv,
-                                    uint8_t *comp_pred, const uint8_t *pred,
-                                    int width, int height, int subpel_x_q3,
-                                    int subpel_y_q3, const uint8_t *ref,
-                                    int ref_stride, const uint8_t *mask,
-                                    int mask_stride, int invert_mask,
-                                    int subpel_search) {
-  if (subpel_x_q3 | subpel_y_q3) {
-    aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
-                         subpel_x_q3, subpel_y_q3, ref, ref_stride,
-                         subpel_search);
-    ref = comp_pred;
-    ref_stride = width;
-  }
-  aom_comp_mask_pred_c(comp_pred, pred, width, height, ref, ref_stride, mask,
-                       mask_stride, invert_mask);
-}
-
 #define MASK_SUBPIX_VAR(W, H)                                                  \
   unsigned int aom_masked_sub_pixel_variance##W##x##H##_c(                     \
       const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
@@ -1091,19 +823,6 @@
   }
 }
 
-void aom_highbd_comp_mask_upsampled_pred(
-    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
-    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
-    int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
-    int bd, int subpel_search) {
-  aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
-                            height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
-                            bd, subpel_search);
-  aom_highbd_comp_mask_pred(comp_pred8, pred8, width, height, comp_pred8, width,
-                            mask, mask_stride, invert_mask);
-}
-
 #define HIGHBD_MASK_SUBPIX_VAR(W, H)                                           \
   unsigned int aom_highbd_8_masked_sub_pixel_variance##W##x##H##_c(            \
       const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
diff --git a/aom_dsp/variance.h b/aom_dsp/variance.h
index 428afd0..dae4197 100644
--- a/aom_dsp/variance.h
+++ b/aom_dsp/variance.h
@@ -69,13 +69,6 @@
     const uint8_t *ref, int ref_stride, const uint8_t *second_pred,
     const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
 
-void aom_highbd_comp_mask_upsampled_pred(
-    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
-    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
-    int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
-    int bd, int subpel_search);
-
 typedef unsigned int (*aom_obmc_sad_fn_t)(const uint8_t *pred, int pred_stride,
                                           const int32_t *wsrc,
                                           const int32_t *msk);
diff --git a/aom_dsp/vmaf.c b/aom_dsp/vmaf.c
index 219e278..a93a496 100644
--- a/aom_dsp/vmaf.c
+++ b/aom_dsp/vmaf.c
@@ -23,7 +23,6 @@
 
 #include <libvmaf/libvmaf.h>
 #include "aom_dsp/blend.h"
-#include "aom_ports/system_state.h"
 
 static void vmaf_fatal_error(const char *message) {
   fprintf(stderr, "Fatal error: %s\n", message);
diff --git a/aom_dsp/x86/adaptive_quantize_avx2.c b/aom_dsp/x86/adaptive_quantize_avx2.c
index e33dff2..b3dede7 100644
--- a/aom_dsp/x86/adaptive_quantize_avx2.c
+++ b/aom_dsp/x86/adaptive_quantize_avx2.c
@@ -12,7 +12,7 @@
 #include <immintrin.h>
 #include "config/aom_dsp_rtcd.h"
 #include "aom/aom_integer.h"
-#include "av1/encoder/av1_quantize.h"
+#include "aom_dsp/quantize.h"
 #include "aom_dsp/x86/quantize_x86.h"
 
 static INLINE void load_b_values_avx2(const int16_t *zbin_ptr, __m256i *zbin,
diff --git a/aom_dsp/x86/adaptive_quantize_sse2.c b/aom_dsp/x86/adaptive_quantize_sse2.c
index 584cd67..503b9b4 100644
--- a/aom_dsp/x86/adaptive_quantize_sse2.c
+++ b/aom_dsp/x86/adaptive_quantize_sse2.c
@@ -13,7 +13,7 @@
 #include <emmintrin.h>
 #include "config/aom_dsp_rtcd.h"
 #include "aom/aom_integer.h"
-#include "av1/encoder/av1_quantize.h"
+#include "aom_dsp/quantize.h"
 #include "aom_dsp/x86/quantize_x86.h"
 
 void aom_quantize_b_adaptive_sse2(
diff --git a/aom_dsp/x86/avg_intrin_sse2.c b/aom_dsp/x86/avg_intrin_sse2.c
index 260ca2a..a52abd0 100644
--- a/aom_dsp/x86/avg_intrin_sse2.c
+++ b/aom_dsp/x86/avg_intrin_sse2.c
@@ -272,8 +272,8 @@
   hadamard_8x8_sse2(src_diff, src_stride, coeff, 1);
 }
 
-void aom_hadamard_lp_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
-                              int16_t *coeff) {
+static INLINE void hadamard_lp_8x8_sse2(const int16_t *src_diff,
+                                        ptrdiff_t src_stride, int16_t *coeff) {
   __m128i src[8];
   src[0] = _mm_load_si128((const __m128i *)src_diff);
   src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
@@ -304,6 +304,50 @@
   _mm_store_si128((__m128i *)coeff, src[7]);
 }
 
+void aom_hadamard_lp_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
+                              int16_t *coeff) {
+  hadamard_lp_8x8_sse2(src_diff, src_stride, coeff);
+}
+
+void aom_hadamard_lp_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
+                                int16_t *coeff) {
+  for (int idx = 0; idx < 4; ++idx) {
+    const int16_t *src_ptr =
+        src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
+    hadamard_lp_8x8_sse2(src_ptr, src_stride, coeff + idx * 64);
+  }
+
+  int16_t *t_coeff = coeff;
+  for (int idx = 0; idx < 64; idx += 8) {
+    __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff);
+    __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 64));
+    __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 128));
+    __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 192));
+
+    __m128i b0 = _mm_add_epi16(coeff0, coeff1);
+    __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
+    __m128i b2 = _mm_add_epi16(coeff2, coeff3);
+    __m128i b3 = _mm_sub_epi16(coeff2, coeff3);
+
+    b0 = _mm_srai_epi16(b0, 1);
+    b1 = _mm_srai_epi16(b1, 1);
+    b2 = _mm_srai_epi16(b2, 1);
+    b3 = _mm_srai_epi16(b3, 1);
+
+    coeff0 = _mm_add_epi16(b0, b2);
+    coeff1 = _mm_add_epi16(b1, b3);
+    coeff2 = _mm_sub_epi16(b0, b2);
+    coeff3 = _mm_sub_epi16(b1, b3);
+
+    _mm_store_si128((__m128i *)t_coeff, coeff0);
+    _mm_store_si128((__m128i *)(t_coeff + 64), coeff1);
+    _mm_store_si128((__m128i *)(t_coeff + 128), coeff2);
+    _mm_store_si128((__m128i *)(t_coeff + 192), coeff3);
+
+    t_coeff += 8;
+  }
+}
+
 static INLINE void hadamard_16x16_sse2(const int16_t *src_diff,
                                        ptrdiff_t src_stride, tran_low_t *coeff,
                                        int is_final) {
@@ -416,17 +460,50 @@
 int aom_satd_sse2(const tran_low_t *coeff, int length) {
   int i;
   const __m128i zero = _mm_setzero_si128();
+  const __m128i one = _mm_set1_epi16(1);
   __m128i accum = zero;
 
-  for (i = 0; i < length; i += 8) {
-    const __m128i src_line = load_tran_low(coeff);
-    const __m128i inv = _mm_sub_epi16(zero, src_line);
-    const __m128i abs = _mm_max_epi16(src_line, inv);  // abs(src_line)
-    const __m128i abs_lo = _mm_unpacklo_epi16(abs, zero);
-    const __m128i abs_hi = _mm_unpackhi_epi16(abs, zero);
-    const __m128i sum = _mm_add_epi32(abs_lo, abs_hi);
-    accum = _mm_add_epi32(accum, sum);
-    coeff += 8;
+  for (i = 0; i < length; i += 16) {
+    const __m128i src_line0 = load_tran_low(coeff);
+    const __m128i src_line1 = load_tran_low(coeff + 8);
+    const __m128i inv0 = _mm_sub_epi16(zero, src_line0);
+    const __m128i inv1 = _mm_sub_epi16(zero, src_line1);
+    const __m128i abs0 = _mm_max_epi16(src_line0, inv0);  // abs(src_line)
+    const __m128i abs1 = _mm_max_epi16(src_line1, inv1);  // abs(src_line)
+    const __m128i sum0 = _mm_madd_epi16(abs0, one);
+    const __m128i sum1 = _mm_madd_epi16(abs1, one);
+    accum = _mm_add_epi32(accum, sum0);
+    accum = _mm_add_epi32(accum, sum1);
+    coeff += 16;
+  }
+
+  {  // cascading summation of accum
+    __m128i hi = _mm_srli_si128(accum, 8);
+    accum = _mm_add_epi32(accum, hi);
+    hi = _mm_srli_epi64(accum, 32);
+    accum = _mm_add_epi32(accum, hi);
+  }
+
+  return _mm_cvtsi128_si32(accum);
+}
+
+int aom_satd_lp_sse2(const int16_t *coeff, int length) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i one = _mm_set1_epi16(1);
+  __m128i accum = zero;
+
+  for (int i = 0; i < length; i += 16) {
+    const __m128i src_line0 = _mm_loadu_si128((const __m128i *)coeff);
+    const __m128i src_line1 = _mm_loadu_si128((const __m128i *)(coeff + 8));
+    const __m128i inv0 = _mm_sub_epi16(zero, src_line0);
+    const __m128i inv1 = _mm_sub_epi16(zero, src_line1);
+    const __m128i abs0 = _mm_max_epi16(src_line0, inv0);  // abs(src_line)
+    const __m128i abs1 = _mm_max_epi16(src_line1, inv1);  // abs(src_line)
+    const __m128i sum0 = _mm_madd_epi16(abs0, one);
+    const __m128i sum1 = _mm_madd_epi16(abs1, one);
+    accum = _mm_add_epi32(accum, sum0);
+    accum = _mm_add_epi32(accum, sum1);
+    coeff += 16;
   }
 
   {  // cascading summation of accum
diff --git a/aom_dsp/x86/convolve_common_intrin.h b/aom_dsp/x86/convolve_common_intrin.h
index 9312e9e..9e8662a 100644
--- a/aom_dsp/x86/convolve_common_intrin.h
+++ b/aom_dsp/x86/convolve_common_intrin.h
@@ -99,22 +99,4 @@
   ss[5] = _mm_unpackhi_epi8(s[10], zero);
   return convolve_12tap(ss, coeffs);
 }
-
-void av1_convolve_y_sr_12tap_sse2(const uint8_t *src, int src_stride,
-                                  uint8_t *dst, int dst_stride, int w, int h,
-                                  const InterpFilterParams *filter_params_y,
-                                  int subpel_y_qn);
-
-void av1_convolve_x_sr_12tap_sse2(const uint8_t *src, int src_stride,
-                                  uint8_t *dst, int dst_stride, int w, int h,
-                                  const InterpFilterParams *filter_params_x,
-                                  int subpel_x_qn, ConvolveParams *conv_params);
-
-void av1_convolve_2d_sr_12tap_sse2(const uint8_t *src, int src_stride,
-                                   uint8_t *dst, int dst_stride, int w, int h,
-                                   const InterpFilterParams *filter_params_x,
-                                   const InterpFilterParams *filter_params_y,
-                                   const int subpel_x_qn, const int subpel_y_qn,
-                                   ConvolveParams *conv_params);
-
 #endif  // AOM_AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_
diff --git a/aom_dsp/x86/convolve_sse2.h b/aom_dsp/x86/convolve_sse2.h
index 6014895..36b7d62 100644
--- a/aom_dsp/x86/convolve_sse2.h
+++ b/aom_dsp/x86/convolve_sse2.h
@@ -9,16 +9,13 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "av1/common/resize.h"
-#include "config/av1_rtcd.h"
-#include "config/aom_scale_rtcd.h"
-
 #ifndef AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_
 #define AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_
 
+#include "config/aom_scale_rtcd.h"
+
 // Note:
 //  This header file should be put below any x86 intrinsics head file
-
 static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params,
                                   const int subpel_q4,
                                   __m128i *const coeffs /* [4] */) {
diff --git a/aom_dsp/x86/highbd_adaptive_quantize_avx2.c b/aom_dsp/x86/highbd_adaptive_quantize_avx2.c
index c500b0a..05c87bc 100644
--- a/aom_dsp/x86/highbd_adaptive_quantize_avx2.c
+++ b/aom_dsp/x86/highbd_adaptive_quantize_avx2.c
@@ -14,10 +14,9 @@
 #include "config/aom_dsp_rtcd.h"
 
 #include "aom/aom_integer.h"
+#include "aom_dsp/quantize.h"
 #include "aom_dsp/x86/quantize_x86.h"
 
-#include "av1/encoder/av1_quantize.h"
-
 static INLINE void highbd_load_b_values_avx2(
     const int16_t *zbin_ptr, __m256i *zbin, const int16_t *round_ptr,
     __m256i *round, const int16_t *quant_ptr, __m256i *quant,
diff --git a/aom_dsp/x86/highbd_adaptive_quantize_sse2.c b/aom_dsp/x86/highbd_adaptive_quantize_sse2.c
index 8f31f35..ae31116 100644
--- a/aom_dsp/x86/highbd_adaptive_quantize_sse2.c
+++ b/aom_dsp/x86/highbd_adaptive_quantize_sse2.c
@@ -13,8 +13,8 @@
 #include "config/aom_dsp_rtcd.h"
 
 #include "aom/aom_integer.h"
+#include "aom_dsp/quantize.h"
 #include "aom_dsp/x86/quantize_x86.h"
-#include "av1/encoder/av1_quantize.h"
 
 static INLINE __m128i highbd_invert_sign_64bit_sse2(__m128i a, __m128i sign) {
   a = _mm_xor_si128(a, sign);
diff --git a/aom_dsp/x86/highbd_sad_sse2.asm b/aom_dsp/x86/highbd_sad_sse2.asm
index 58f1ac9..a2510d5 100644
--- a/aom_dsp/x86/highbd_sad_sse2.asm
+++ b/aom_dsp/x86/highbd_sad_sse2.asm
@@ -20,20 +20,21 @@
 ; Arg 2: Height
 ; Arg 3: Number of general purpose registers: 5 for 32-bit build, 6 for 64-bit
 ; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows
-%macro HIGH_SAD_FN 4
+; Arg 5: Number of xmm registers. 8xh needs 8, others only need 7
+%macro HIGH_SAD_FN 4-5 7
 %if %4 == 0
 %if %3 == 5
-cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows
+cglobal highbd_sad%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, n_rows
 %else ; %3 == 7
-cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \
+cglobal highbd_sad%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, \
                             src_stride3, ref_stride3, n_rows
 %endif ; %3 == 5/7
 %elif %4 == 1 ; avg
 %if %3 == 5
-cglobal highbd_sad%1x%2_avg, 5, 1 + %3, 7, src, src_stride, ref, ref_stride, \
+cglobal highbd_sad%1x%2_avg, 5, 1 + %3, %5, src, src_stride, ref, ref_stride, \
                                     second_pred, n_rows
 %else ; %3 == 7
-cglobal highbd_sad%1x%2_avg, 5, ARCH_X86_64 + %3, 7, src, src_stride, \
+cglobal highbd_sad%1x%2_avg, 5, ARCH_X86_64 + %3, %5, src, src_stride, \
                                               ref, ref_stride, \
                                               second_pred, \
                                               src_stride3, ref_stride3
@@ -356,7 +357,7 @@
 ; unsigned int aom_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride,
 ;                                    uint8_t *ref, int ref_stride);
 %macro HIGH_SAD8XN 1-2 0
-  HIGH_SAD_FN 8, %1, 7, %2
+  HIGH_SAD_FN 8, %1, 7, %2, 8
 %if %2 == 2  ; skip rows, so divide number of rows by 2
   mov              n_rowsd, %1/8
 %else
@@ -377,22 +378,30 @@
   pavgw                 m4, [second_predq+mmsize*3]
   lea         second_predq, [second_predq+mmsize*4]
 %endif
-  mova                  m5, [srcq]
-  psubusw               m5, m1
-  psubusw               m1, [srcq]
+  mova                  m7, m1
+  movu                  m5, [srcq]
+  psubusw               m1, m5
+  psubusw               m5, m7
   por                   m1, m5
-  mova                  m5, [srcq+src_strideq*2]
-  psubusw               m5, m2
-  psubusw               m2, [srcq+src_strideq*2]
+
+  mova                  m7, m2
+  movu                  m5, [srcq+src_strideq*2]
+  psubusw               m2, m5
+  psubusw               m5, m7
   por                   m2, m5
-  mova                  m5, [srcq+src_strideq*4]
-  psubusw               m5, m3
-  psubusw               m3, [srcq+src_strideq*4]
+
+  mova                  m7, m3
+  movu                  m5, [srcq+src_strideq*4]
+  psubusw               m3, m5
+  psubusw               m5, m7
   por                   m3, m5
-  mova                  m5, [srcq+src_stride3q*2]
-  psubusw               m5, m4
-  psubusw               m4, [srcq+src_stride3q*2]
+
+  mova                  m7, m4
+  movu                  m5, [srcq+src_stride3q*2]
+  psubusw               m4, m5
+  psubusw               m5, m7
   por                   m4, m5
+
   paddw                 m1, m2
   paddw                 m3, m4
   movhlps               m2, m1
diff --git a/aom_dsp/x86/highbd_variance_sse2.c b/aom_dsp/x86/highbd_variance_sse2.c
index d1bd7d4..ae36d73 100644
--- a/aom_dsp/x86/highbd_variance_sse2.c
+++ b/aom_dsp/x86/highbd_variance_sse2.c
@@ -14,16 +14,12 @@
 
 #include "config/aom_config.h"
 #include "config/aom_dsp_rtcd.h"
-#include "config/av1_rtcd.h"
 
 #include "aom_dsp/x86/synonyms.h"
-
 #include "aom_ports/mem.h"
 
-#include "av1/common/av1_common_int.h"
 #include "av1/common/filter.h"
 #include "av1/common/reconinter.h"
-#include "av1/encoder/reconinter_enc.h"
 
 typedef uint32_t (*high_variance_fn_t)(const uint16_t *src, int src_stride,
                                        const uint16_t *ref, int ref_stride,
@@ -613,133 +609,6 @@
 #undef FNS
 #undef FN
 
-void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd,
-                                    const struct AV1Common *const cm,
-                                    int mi_row, int mi_col, const MV *const mv,
-                                    uint8_t *comp_pred8, int width, int height,
-                                    int subpel_x_q3, int subpel_y_q3,
-                                    const uint8_t *ref8, int ref_stride, int bd,
-                                    int subpel_search) {
-  // expect xd == NULL only in tests
-  if (xd != NULL) {
-    const MB_MODE_INFO *mi = xd->mi[0];
-    const int ref_num = 0;
-    const int is_intrabc = is_intrabc_block(mi);
-    const struct scale_factors *const sf =
-        is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
-    const int is_scaled = av1_is_scaled(sf);
-
-    if (is_scaled) {
-      int plane = 0;
-      const int mi_x = mi_col * MI_SIZE;
-      const int mi_y = mi_row * MI_SIZE;
-      const struct macroblockd_plane *const pd = &xd->plane[plane];
-      const struct buf_2d *const dst_buf = &pd->dst;
-      const struct buf_2d *const pre_buf =
-          is_intrabc ? dst_buf : &pd->pre[ref_num];
-
-      InterPredParams inter_pred_params;
-      inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
-      const int_interpfilters filters =
-          av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
-      av1_init_inter_params(
-          &inter_pred_params, width, height, mi_y >> pd->subsampling_y,
-          mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y,
-          xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters);
-      av1_enc_build_one_inter_predictor(comp_pred8, width, mv,
-                                        &inter_pred_params);
-      return;
-    }
-  }
-
-  const InterpFilterParams *filter = av1_get_filter(subpel_search);
-  int filter_taps = (subpel_search <= USE_4_TAPS) ? 4 : SUBPEL_TAPS;
-  if (!subpel_x_q3 && !subpel_y_q3) {
-    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
-    uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
-    if (width >= 8) {
-      int i;
-      assert(!(width & 7));
-      /*Read 8 pixels one row at a time.*/
-      for (i = 0; i < height; i++) {
-        int j;
-        for (j = 0; j < width; j += 8) {
-          __m128i s0 = _mm_loadu_si128((const __m128i *)ref);
-          _mm_storeu_si128((__m128i *)comp_pred, s0);
-          comp_pred += 8;
-          ref += 8;
-        }
-        ref += ref_stride - width;
-      }
-    } else {
-      int i;
-      assert(!(width & 3));
-      /*Read 4 pixels two rows at a time.*/
-      for (i = 0; i < height; i += 2) {
-        __m128i s0 = _mm_loadl_epi64((const __m128i *)ref);
-        __m128i s1 = _mm_loadl_epi64((const __m128i *)(ref + ref_stride));
-        __m128i t0 = _mm_unpacklo_epi64(s0, s1);
-        _mm_storeu_si128((__m128i *)comp_pred, t0);
-        comp_pred += 8;
-        ref += 2 * ref_stride;
-      }
-    }
-  } else if (!subpel_y_q3) {
-    const int16_t *const kernel =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
-    aom_highbd_convolve8_horiz(ref8, ref_stride, comp_pred8, width, kernel, 16,
-                               NULL, -1, width, height, bd);
-  } else if (!subpel_x_q3) {
-    const int16_t *const kernel =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
-    aom_highbd_convolve8_vert(ref8, ref_stride, comp_pred8, width, NULL, -1,
-                              kernel, 16, width, height, bd);
-  } else {
-    DECLARE_ALIGNED(16, uint16_t,
-                    temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]);
-    const int16_t *const kernel_x =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
-    const int16_t *const kernel_y =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
-    const uint8_t *ref_start = ref8 - ref_stride * ((filter_taps >> 1) - 1);
-    uint16_t *temp_start_horiz = (subpel_search <= USE_4_TAPS)
-                                     ? temp + (filter_taps >> 1) * MAX_SB_SIZE
-                                     : temp;
-    uint16_t *temp_start_vert = temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1);
-    const int intermediate_height =
-        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter_taps;
-    assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
-    aom_highbd_convolve8_horiz(
-        ref_start, ref_stride, CONVERT_TO_BYTEPTR(temp_start_horiz),
-        MAX_SB_SIZE, kernel_x, 16, NULL, -1, width, intermediate_height, bd);
-    aom_highbd_convolve8_vert(CONVERT_TO_BYTEPTR(temp_start_vert), MAX_SB_SIZE,
-                              comp_pred8, width, NULL, -1, kernel_y, 16, width,
-                              height, bd);
-  }
-}
-
-void aom_highbd_comp_avg_upsampled_pred_sse2(
-    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
-    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
-    int ref_stride, int bd, int subpel_search) {
-  aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
-                            height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
-                            bd, subpel_search);
-  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
-  uint16_t *comp_pred16 = CONVERT_TO_SHORTPTR(comp_pred8);
-  /*The total number of pixels must be a multiple of 8 (e.g., 4x4).*/
-  assert(!(width * height & 7));
-  int n = width * height >> 3;
-  for (int i = 0; i < n; i++) {
-    __m128i s0 = _mm_loadu_si128((const __m128i *)comp_pred16);
-    __m128i p0 = _mm_loadu_si128((const __m128i *)pred);
-    _mm_storeu_si128((__m128i *)comp_pred16, _mm_avg_epu16(s0, p0));
-    comp_pred16 += 8;
-    pred += 8;
-  }
-}
-
 static INLINE void highbd_compute_dist_wtd_comp_avg(__m128i *p0, __m128i *p1,
                                                     const __m128i *w0,
                                                     const __m128i *w1,
@@ -806,41 +675,6 @@
   }
 }
 
-void aom_highbd_dist_wtd_comp_avg_upsampled_pred_sse2(
-    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
-    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
-    int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param,
-    int subpel_search) {
-  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
-  int n;
-  int i;
-  aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
-                            height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
-                            bd, subpel_search);
-  assert(!(width * height & 7));
-  n = width * height >> 3;
-
-  const uint16_t wt0 = (uint16_t)jcp_param->fwd_offset;
-  const uint16_t wt1 = (uint16_t)jcp_param->bck_offset;
-  const __m128i w0 = _mm_set_epi16(wt0, wt0, wt0, wt0, wt0, wt0, wt0, wt0);
-  const __m128i w1 = _mm_set_epi16(wt1, wt1, wt1, wt1, wt1, wt1, wt1, wt1);
-  const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1);
-  const __m128i r =
-      _mm_set_epi16(round, round, round, round, round, round, round, round);
-
-  uint16_t *comp_pred16 = CONVERT_TO_SHORTPTR(comp_pred8);
-  for (i = 0; i < n; i++) {
-    __m128i p0 = xx_loadu_128(comp_pred16);
-    __m128i p1 = xx_loadu_128(pred);
-
-    highbd_compute_dist_wtd_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred16);
-
-    comp_pred16 += 8;
-    pred += 8;
-  }
-}
-
 uint64_t aom_mse_4xh_16bit_highbd_sse2(uint16_t *dst, int dstride,
                                        uint16_t *src, int sstride, int h) {
   uint64_t sum = 0;
diff --git a/aom_dsp/x86/intrapred_avx2.c b/aom_dsp/x86/intrapred_avx2.c
index 58789c3..ebd7c1b 100644
--- a/aom_dsp/x86/intrapred_avx2.c
+++ b/aom_dsp/x86/intrapred_avx2.c
@@ -2529,7 +2529,7 @@
   a16 = _mm256_set1_epi32(16);
   c1 = _mm256_srli_epi32(a16, 4);
   c8 = _mm256_srli_epi32(a16, 1);
-  min_base_y256 = _mm256_set1_epi16(min_base_y);
+  min_base_y256 = _mm256_set1_epi32(min_base_y);
   c3f = _mm256_set1_epi32(0x3f);
   dy256 = _mm256_set1_epi32(dy);
   c0123 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
diff --git a/aom_dsp/x86/jnt_sad_ssse3.c b/aom_dsp/x86/jnt_sad_ssse3.c
index 2e3e2be..4e6fe8f 100644
--- a/aom_dsp/x86/jnt_sad_ssse3.c
+++ b/aom_dsp/x86/jnt_sad_ssse3.c
@@ -15,7 +15,6 @@
 
 #include "config/aom_config.h"
 #include "config/aom_dsp_rtcd.h"
-#include "config/av1_rtcd.h"
 
 #include "aom_dsp/x86/synonyms.h"
 
diff --git a/aom_dsp/x86/jnt_variance_ssse3.c b/aom_dsp/x86/jnt_variance_ssse3.c
index b74f4bf..6ec5dd8 100644
--- a/aom_dsp/x86/jnt_variance_ssse3.c
+++ b/aom_dsp/x86/jnt_variance_ssse3.c
@@ -15,7 +15,6 @@
 
 #include "config/aom_config.h"
 #include "config/aom_dsp_rtcd.h"
-#include "config/av1_rtcd.h"
 
 #include "aom_dsp/x86/synonyms.h"
 
@@ -116,38 +115,6 @@
   }
 }
 
-void aom_dist_wtd_comp_avg_upsampled_pred_ssse3(
-    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-    const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
-    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
-    int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search) {
-  int n;
-  int i;
-  aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
-                     subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search);
-  /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/
-  assert(!(width * height & 15));
-  n = width * height >> 4;
-
-  const uint8_t w0 = (uint8_t)jcp_param->fwd_offset;
-  const uint8_t w1 = (uint8_t)jcp_param->bck_offset;
-  const __m128i w = _mm_set_epi8(w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, w1, w0,
-                                 w1, w0, w1, w0);
-  const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1);
-  const __m128i r =
-      _mm_set_epi16(round, round, round, round, round, round, round, round);
-
-  for (i = 0; i < n; i++) {
-    __m128i p0 = xx_loadu_128(comp_pred);
-    __m128i p1 = xx_loadu_128(pred);
-
-    compute_dist_wtd_avg(&p0, &p1, &w, &r, comp_pred);
-
-    comp_pred += 16;
-    pred += 16;
-  }
-}
-
 #define DIST_WTD_SUBPIX_AVG_VAR(W, H)                                      \
   uint32_t aom_dist_wtd_sub_pixel_avg_variance##W##x##H##_ssse3(           \
       const uint8_t *a, int a_stride, int xoffset, int yoffset,            \
diff --git a/aom_dsp/x86/txfm_common_avx2.h b/aom_dsp/x86/txfm_common_avx2.h
index ea57c9f..4105250 100644
--- a/aom_dsp/x86/txfm_common_avx2.h
+++ b/aom_dsp/x86/txfm_common_avx2.h
@@ -246,21 +246,19 @@
   }
 }
 
-static INLINE __m256i av1_round_shift_32_avx2(__m256i vec, int bit) {
+static INLINE __m256i round_shift_32_avx2(__m256i vec, int bit) {
   __m256i tmp, round;
   round = _mm256_set1_epi32(1 << (bit - 1));
   tmp = _mm256_add_epi32(vec, round);
   return _mm256_srai_epi32(tmp, bit);
 }
 
-static INLINE void av1_round_shift_array_32_avx2(__m256i *input,
-                                                 __m256i *output,
-                                                 const int size,
-                                                 const int bit) {
+static INLINE void round_shift_array_32_avx2(__m256i *input, __m256i *output,
+                                             const int size, const int bit) {
   if (bit > 0) {
     int i;
     for (i = 0; i < size; i++) {
-      output[i] = av1_round_shift_32_avx2(input[i], bit);
+      output[i] = round_shift_32_avx2(input[i], bit);
     }
   } else {
     int i;
@@ -270,25 +268,24 @@
   }
 }
 
-static INLINE void av1_round_shift_rect_array_32_avx2(__m256i *input,
-                                                      __m256i *output,
-                                                      const int size,
-                                                      const int bit,
-                                                      const int val) {
+static INLINE void round_shift_rect_array_32_avx2(__m256i *input,
+                                                  __m256i *output,
+                                                  const int size, const int bit,
+                                                  const int val) {
   const __m256i sqrt2 = _mm256_set1_epi32(val);
   if (bit > 0) {
     int i;
     for (i = 0; i < size; i++) {
-      const __m256i r0 = av1_round_shift_32_avx2(input[i], bit);
+      const __m256i r0 = round_shift_32_avx2(input[i], bit);
       const __m256i r1 = _mm256_mullo_epi32(sqrt2, r0);
-      output[i] = av1_round_shift_32_avx2(r1, NewSqrt2Bits);
+      output[i] = round_shift_32_avx2(r1, NewSqrt2Bits);
     }
   } else {
     int i;
     for (i = 0; i < size; i++) {
       const __m256i r0 = _mm256_slli_epi32(input[i], -bit);
       const __m256i r1 = _mm256_mullo_epi32(sqrt2, r0);
-      output[i] = av1_round_shift_32_avx2(r1, NewSqrt2Bits);
+      output[i] = round_shift_32_avx2(r1, NewSqrt2Bits);
     }
   }
 }
diff --git a/aom_dsp/x86/variance_impl_avx2.c b/aom_dsp/x86/variance_impl_avx2.c
index f779270..163e4cc 100644
--- a/aom_dsp/x86/variance_impl_avx2.c
+++ b/aom_dsp/x86/variance_impl_avx2.c
@@ -616,7 +616,7 @@
         src += src_stride;
         dst += dst_stride;
       }
-    } else if (y_offset == 8) {
+    } else if (y_offset == 4) {
       __m256i src_next_reg;
       for (i = 0; i < height; i++) {
         LOAD_SRC_DST
@@ -652,8 +652,8 @@
         dst += dst_stride;
       }
     }
-    // x_offset = 8  and y_offset = 0
-  } else if (x_offset == 8) {
+    // x_offset = 4  and y_offset = 0
+  } else if (x_offset == 4) {
     if (y_offset == 0) {
       __m256i src_next_reg;
       for (i = 0; i < height; i++) {
@@ -668,8 +668,8 @@
         src += src_stride;
         dst += dst_stride;
       }
-      // x_offset = 8  and y_offset = 8
-    } else if (y_offset == 8) {
+      // x_offset = 4  and y_offset = 4
+    } else if (y_offset == 4) {
       __m256i src_next_reg, src_avg;
       // load source and another source starting from the next
       // following byte
@@ -691,7 +691,7 @@
         CALC_SUM_SSE_INSIDE_LOOP
         dst += dst_stride;
       }
-      // x_offset = 8  and y_offset = bilin interpolation
+      // x_offset = 4  and y_offset = bilin interpolation
     } else {
       __m256i filter, pw8, src_next_reg, src_avg;
       y_offset <<= 5;
@@ -741,8 +741,8 @@
         src += src_stride;
         dst += dst_stride;
       }
-      // x_offset = bilin interpolation and y_offset = 8
-    } else if (y_offset == 8) {
+      // x_offset = bilin interpolation and y_offset = 4
+    } else if (y_offset == 4) {
       __m256i filter, pw8, src_next_reg, src_pack;
       x_offset <<= 5;
       filter = _mm256_load_si256(
diff --git a/aom_dsp/x86/variance_sse2.c b/aom_dsp/x86/variance_sse2.c
index e372a4b..49919cc 100644
--- a/aom_dsp/x86/variance_sse2.c
+++ b/aom_dsp/x86/variance_sse2.c
@@ -14,19 +14,12 @@
 
 #include "config/aom_config.h"
 #include "config/aom_dsp_rtcd.h"
-#include "config/av1_rtcd.h"
 
 #include "aom_dsp/blend.h"
 #include "aom_dsp/x86/mem_sse2.h"
 #include "aom_dsp/x86/synonyms.h"
-
 #include "aom_ports/mem.h"
 
-#include "av1/common/av1_common_int.h"
-#include "av1/common/filter.h"
-#include "av1/common/reconinter.h"
-#include "av1/encoder/reconinter_enc.h"
-
 unsigned int aom_get_mb_ss_sse2(const int16_t *src) {
   __m128i vsum = _mm_setzero_si128();
   int i;
@@ -539,162 +532,6 @@
 #undef FNS
 #undef FN
 
-void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm,
-                             int mi_row, int mi_col, const MV *const mv,
-                             uint8_t *comp_pred, int width, int height,
-                             int subpel_x_q3, int subpel_y_q3,
-                             const uint8_t *ref, int ref_stride,
-                             int subpel_search) {
-  // expect xd == NULL only in tests
-  if (xd != NULL) {
-    const MB_MODE_INFO *mi = xd->mi[0];
-    const int ref_num = 0;
-    const int is_intrabc = is_intrabc_block(mi);
-    const struct scale_factors *const sf =
-        is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
-    const int is_scaled = av1_is_scaled(sf);
-
-    if (is_scaled) {
-      int plane = 0;
-      const int mi_x = mi_col * MI_SIZE;
-      const int mi_y = mi_row * MI_SIZE;
-      const struct macroblockd_plane *const pd = &xd->plane[plane];
-      const struct buf_2d *const dst_buf = &pd->dst;
-      const struct buf_2d *const pre_buf =
-          is_intrabc ? dst_buf : &pd->pre[ref_num];
-
-      InterPredParams inter_pred_params;
-      inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
-      const int_interpfilters filters =
-          av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
-      av1_init_inter_params(
-          &inter_pred_params, width, height, mi_y >> pd->subsampling_y,
-          mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y,
-          xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters);
-      av1_enc_build_one_inter_predictor(comp_pred, width, mv,
-                                        &inter_pred_params);
-      return;
-    }
-  }
-
-  const InterpFilterParams *filter = av1_get_filter(subpel_search);
-  // (TODO:yunqing) 2-tap case uses 4-tap functions since there is no SIMD for
-  // 2-tap yet.
-  int filter_taps = (subpel_search <= USE_4_TAPS) ? 4 : SUBPEL_TAPS;
-
-  if (!subpel_x_q3 && !subpel_y_q3) {
-    if (width >= 16) {
-      int i;
-      assert(!(width & 15));
-      /*Read 16 pixels one row at a time.*/
-      for (i = 0; i < height; i++) {
-        int j;
-        for (j = 0; j < width; j += 16) {
-          xx_storeu_128(comp_pred, xx_loadu_128(ref));
-          comp_pred += 16;
-          ref += 16;
-        }
-        ref += ref_stride - width;
-      }
-    } else if (width >= 8) {
-      int i;
-      assert(!(width & 7));
-      assert(!(height & 1));
-      /*Read 8 pixels two rows at a time.*/
-      for (i = 0; i < height; i += 2) {
-        __m128i s0 = xx_loadl_64(ref + 0 * ref_stride);
-        __m128i s1 = xx_loadl_64(ref + 1 * ref_stride);
-        xx_storeu_128(comp_pred, _mm_unpacklo_epi64(s0, s1));
-        comp_pred += 16;
-        ref += 2 * ref_stride;
-      }
-    } else {
-      int i;
-      assert(!(width & 3));
-      assert(!(height & 3));
-      /*Read 4 pixels four rows at a time.*/
-      for (i = 0; i < height; i++) {
-        const __m128i row0 = xx_loadl_64(ref + 0 * ref_stride);
-        const __m128i row1 = xx_loadl_64(ref + 1 * ref_stride);
-        const __m128i row2 = xx_loadl_64(ref + 2 * ref_stride);
-        const __m128i row3 = xx_loadl_64(ref + 3 * ref_stride);
-        const __m128i reg = _mm_unpacklo_epi64(_mm_unpacklo_epi32(row0, row1),
-                                               _mm_unpacklo_epi32(row2, row3));
-        xx_storeu_128(comp_pred, reg);
-        comp_pred += 16;
-        ref += 4 * ref_stride;
-      }
-    }
-  } else if (!subpel_y_q3) {
-    const int16_t *const kernel =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
-    aom_convolve8_horiz(ref, ref_stride, comp_pred, width, kernel, 16, NULL, -1,
-                        width, height);
-  } else if (!subpel_x_q3) {
-    const int16_t *const kernel =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
-    aom_convolve8_vert(ref, ref_stride, comp_pred, width, NULL, -1, kernel, 16,
-                       width, height);
-  } else {
-    DECLARE_ALIGNED(16, uint8_t,
-                    temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
-    const int16_t *const kernel_x =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
-    const int16_t *const kernel_y =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
-    const uint8_t *ref_start = ref - ref_stride * ((filter_taps >> 1) - 1);
-    uint8_t *temp_start_horiz = (subpel_search <= USE_4_TAPS)
-                                    ? temp + (filter_taps >> 1) * MAX_SB_SIZE
-                                    : temp;
-    uint8_t *temp_start_vert = temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1);
-    int intermediate_height =
-        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter_taps;
-    assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
-    aom_convolve8_horiz(ref_start, ref_stride, temp_start_horiz, MAX_SB_SIZE,
-                        kernel_x, 16, NULL, -1, width, intermediate_height);
-    aom_convolve8_vert(temp_start_vert, MAX_SB_SIZE, comp_pred, width, NULL, -1,
-                       kernel_y, 16, width, height);
-  }
-}
-
-void aom_comp_avg_upsampled_pred_sse2(
-    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-    const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
-    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
-    int ref_stride, int subpel_search) {
-  int n;
-  int i;
-  aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
-                     subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search);
-  /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/
-  assert(!(width * height & 15));
-  n = width * height >> 4;
-  for (i = 0; i < n; i++) {
-    __m128i s0 = xx_loadu_128(comp_pred);
-    __m128i p0 = xx_loadu_128(pred);
-    xx_storeu_128(comp_pred, _mm_avg_epu8(s0, p0));
-    comp_pred += 16;
-    pred += 16;
-  }
-}
-
-void aom_comp_mask_upsampled_pred_sse2(
-    MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col,
-    const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
-    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
-    int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
-    int subpel_search) {
-  if (subpel_x_q3 | subpel_y_q3) {
-    aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
-                       subpel_x_q3, subpel_y_q3, ref, ref_stride,
-                       subpel_search);
-    ref = comp_pred;
-    ref_stride = width;
-  }
-  aom_comp_mask_pred(comp_pred, pred, width, height, ref, ref_stride, mask,
-                     mask_stride, invert_mask);
-}
-
 static INLINE __m128i highbd_comp_mask_pred_line_sse2(const __m128i s0,
                                                       const __m128i s1,
                                                       const __m128i a) {
diff --git a/aom_ports/aom_ports.cmake b/aom_ports/aom_ports.cmake
index d579896..e02157a 100644
--- a/aom_ports/aom_ports.cmake
+++ b/aom_ports/aom_ports.cmake
@@ -13,19 +13,14 @@
 endif() # AOM_AOM_PORTS_AOM_PORTS_CMAKE_
 set(AOM_AOM_PORTS_AOM_PORTS_CMAKE_ 1)
 
-list(APPEND AOM_PORTS_INCLUDES
-            "${AOM_ROOT}/aom_ports/aom_once.h"
-            "${AOM_ROOT}/aom_ports/aom_timer.h"
-            "${AOM_ROOT}/aom_ports/bitops.h"
+list(APPEND AOM_PORTS_INCLUDES "${AOM_ROOT}/aom_ports/aom_once.h"
+            "${AOM_ROOT}/aom_ports/aom_timer.h" "${AOM_ROOT}/aom_ports/bitops.h"
             "${AOM_ROOT}/aom_ports/emmintrin_compat.h"
-            "${AOM_ROOT}/aom_ports/mem.h"
-            "${AOM_ROOT}/aom_ports/mem_ops.h"
+            "${AOM_ROOT}/aom_ports/mem.h" "${AOM_ROOT}/aom_ports/mem_ops.h"
             "${AOM_ROOT}/aom_ports/mem_ops_aligned.h"
-            "${AOM_ROOT}/aom_ports/msvc.h"
-            "${AOM_ROOT}/aom_ports/sanitizer.h"
-            "${AOM_ROOT}/aom_ports/system_state.h")
+            "${AOM_ROOT}/aom_ports/msvc.h" "${AOM_ROOT}/aom_ports/sanitizer.h")
 
-list(APPEND AOM_PORTS_ASM_X86 "${AOM_ROOT}/aom_ports/emms.asm")
+list(APPEND AOM_PORTS_ASM_X86 "${AOM_ROOT}/aom_ports/float.asm")
 
 list(APPEND AOM_PORTS_INCLUDES_X86 "${AOM_ROOT}/aom_ports/x86_abi_support.asm")
 
@@ -48,7 +43,7 @@
 #
 # * The libaom target must exist before this function is called.
 function(setup_aom_ports_targets)
-  if("${AOM_TARGET_CPU}" MATCHES "^x86")
+  if(WIN32 AND "${AOM_TARGET_CPU}" STREQUAL "x86_64")
     add_asm_library("aom_ports" "AOM_PORTS_ASM_X86")
     set(aom_ports_has_symbols 1)
   elseif("${AOM_TARGET_CPU}" MATCHES "arm")
@@ -69,8 +64,7 @@
   if(aom_ports_has_symbols)
     target_sources(aom_ports PRIVATE ${AOM_PORTS_INCLUDES})
 
-    if("${AOM_TARGET_CPU}" STREQUAL "x86"
-       OR "${AOM_TARGET_CPU}" STREQUAL "x86_64")
+    if("${AOM_TARGET_CPU}" STREQUAL "x86_64")
       target_sources(aom_ports PRIVATE ${AOM_PORTS_INCLUDES_X86})
     endif()
 
diff --git a/aom_ports/emms.asm b/aom_ports/float.asm
similarity index 91%
rename from aom_ports/emms.asm
rename to aom_ports/float.asm
index 038635d..abff60a 100644
--- a/aom_ports/emms.asm
+++ b/aom_ports/float.asm
@@ -9,18 +9,10 @@
 ; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 ;
 
-;
-
 
 %include "aom_ports/x86_abi_support.asm"
 
 section .text
-globalsym(aom_reset_mmx_state)
-sym(aom_reset_mmx_state):
-    emms
-    ret
-
-
 %if LIBAOM_YASM_WIN64
 globalsym(aom_winx64_fldcw)
 sym(aom_winx64_fldcw):
diff --git a/aom_ports/system_state.h b/aom_ports/system_state.h
deleted file mode 100644
index 46797e3..0000000
--- a/aom_ports/system_state.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_PORTS_SYSTEM_STATE_H_
-#define AOM_AOM_PORTS_SYSTEM_STATE_H_
-
-#include "config/aom_config.h"
-
-#if ARCH_X86 || ARCH_X86_64
-void aom_reset_mmx_state(void);
-#define aom_clear_system_state()
-#else
-#define aom_clear_system_state()
-#endif  // ARCH_X86 || ARCH_X86_64
-#endif  // AOM_AOM_PORTS_SYSTEM_STATE_H_
diff --git a/aom_ports/x86.h b/aom_ports/x86.h
index cf1c3e6..1e42fa2 100644
--- a/aom_ports/x86.h
+++ b/aom_ports/x86.h
@@ -164,15 +164,14 @@
 #define HAS_AVX2 0x80
 #define HAS_SSE4_2 0x100
 #ifndef BIT
-#define BIT(n) (1 << n)
+#define BIT(n) (1u << (n))
 #endif
 
 static INLINE int x86_simd_caps(void) {
   unsigned int flags = 0;
-  unsigned int mask = ~0;
+  unsigned int mask = ~0u;
   unsigned int max_cpuid_val, reg_eax, reg_ebx, reg_ecx, reg_edx;
   char *env;
-  (void)reg_ebx;
 
   /* See if the CPU capabilities are being overridden by the environment */
   env = getenv("AOM_SIMD_CAPS");
@@ -207,6 +206,7 @@
 
   // bits 27 (OSXSAVE) & 28 (256-bit AVX)
   if ((reg_ecx & (BIT(27) | BIT(28))) == (BIT(27) | BIT(28))) {
+    // Check for OS-support of YMM state. Necessary for AVX and AVX2.
     if ((xgetbv() & 0x6) == 0x6) {
       flags |= HAS_AVX;
 
@@ -219,12 +219,14 @@
     }
   }
 
+  (void)reg_eax;  // Avoid compiler warning on unused-but-set variable.
+
   return flags & mask;
 }
 
 // Fine-Grain Measurement Functions
 //
-// If you are a timing a small region of code, access the timestamp counter
+// If you are timing a small region of code, access the timestamp counter
 // (TSC) via:
 //
 // unsigned int start = x86_tsc_start();
@@ -302,14 +304,26 @@
 
 static INLINE unsigned int x86_tsc_start(void) {
   unsigned int reg_eax, reg_ebx, reg_ecx, reg_edx;
+  // This call should not be removed. See function notes above.
   cpuid(0, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
+  // Avoid compiler warnings on unused-but-set variables.
+  (void)reg_eax;
+  (void)reg_ebx;
+  (void)reg_ecx;
+  (void)reg_edx;
   return x86_readtsc();
 }
 
 static INLINE unsigned int x86_tsc_end(void) {
   uint32_t v = x86_readtscp();
   unsigned int reg_eax, reg_ebx, reg_ecx, reg_edx;
+  // This call should not be removed. See function notes above.
   cpuid(0, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
+  // Avoid compiler warnings on unused-but-set variables.
+  (void)reg_eax;
+  (void)reg_ebx;
+  (void)reg_ecx;
+  (void)reg_edx;
   return v;
 }
 
@@ -362,12 +376,21 @@
 
 static INLINE unsigned int x87_set_double_precision(void) {
   unsigned int mode = x87_get_control_word();
+  // Intel 64 and IA-32 Architectures Developer's Manual: Vol. 1
+  // https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-vol-1-manual.pdf
+  // 8.1.5.2 Precision Control Field
+  // Bits 8 and 9 (0x300) of the x87 FPU Control Word ("Precision Control")
+  // determine the number of bits used in floating point calculations. To match
+  // later SSE instructions restrict x87 operations to Double Precision (0x200).
+  // Precision                     PC Field
+  // Single Precision (24-Bits)    00B
+  // Reserved                      01B
+  // Double Precision (53-Bits)    10B
+  // Extended Precision (64-Bits)  11B
   x87_set_control_word((mode & ~0x300) | 0x200);
   return mode;
 }
 
-extern void aom_reset_mmx_state(void);
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/apps/aomdec.c b/apps/aomdec.c
index b1929a6..341c5bc 100644
--- a/apps/aomdec.c
+++ b/apps/aomdec.c
@@ -187,20 +187,22 @@
   size_t frame_size = 0;
 
   if (fread(raw_hdr, RAW_FRAME_HDR_SZ, 1, infile) != 1) {
-    if (!feof(infile)) warn("Failed to read RAW frame size\n");
+    if (!feof(infile)) aom_tools_warn("Failed to read RAW frame size\n");
   } else {
     const size_t kCorruptFrameThreshold = 256 * 1024 * 1024;
     const size_t kFrameTooSmallThreshold = 256 * 1024;
     frame_size = mem_get_le32(raw_hdr);
 
     if (frame_size > kCorruptFrameThreshold) {
-      warn("Read invalid frame size (%u)\n", (unsigned int)frame_size);
+      aom_tools_warn("Read invalid frame size (%u)\n",
+                     (unsigned int)frame_size);
       frame_size = 0;
     }
 
     if (frame_size < kFrameTooSmallThreshold) {
-      warn("Warning: Read invalid frame size (%u) - not a raw file?\n",
-           (unsigned int)frame_size);
+      aom_tools_warn(
+          "Warning: Read invalid frame size (%u) - not a raw file?\n",
+          (unsigned int)frame_size);
     }
 
     if (frame_size > *buffer_size) {
@@ -209,7 +211,7 @@
         *buffer = new_buf;
         *buffer_size = 2 * frame_size;
       } else {
-        warn("Failed to allocate compressed data buffer\n");
+        aom_tools_warn("Failed to allocate compressed data buffer\n");
         frame_size = 0;
       }
     }
@@ -217,7 +219,7 @@
 
   if (!feof(infile)) {
     if (fread(*buffer, 1, frame_size, infile) != frame_size) {
-      warn("Failed to read full frame\n");
+      aom_tools_warn("Failed to read full frame\n");
       return 1;
     }
     *bytes_read = frame_size;
@@ -671,8 +673,8 @@
     fatal("Unsupported fourcc: %x\n", aom_input_ctx.fourcc);
 
   if (interface && fourcc_interface && interface != fourcc_interface)
-    warn("Header indicates codec: %s\n",
-         aom_codec_iface_name(fourcc_interface));
+    aom_tools_warn("Header indicates codec: %s\n",
+                   aom_codec_iface_name(fourcc_interface));
   else
     interface = fourcc_interface;
 
@@ -760,10 +762,10 @@
 
         if (aom_codec_decode(&decoder, buf, bytes_in_buffer, NULL)) {
           const char *detail = aom_codec_error_detail(&decoder);
-          warn("Failed to decode frame %d: %s", frame_in,
-               aom_codec_error(&decoder));
+          aom_tools_warn("Failed to decode frame %d: %s", frame_in,
+                         aom_codec_error(&decoder));
 
-          if (detail) warn("Additional information: %s", detail);
+          if (detail) aom_tools_warn("Additional information: %s", detail);
           if (!keep_going) goto fail;
         }
 
@@ -771,8 +773,8 @@
           int qp;
           if (AOM_CODEC_CONTROL_TYPECHECKED(&decoder, AOMD_GET_LAST_QUANTIZER,
                                             &qp)) {
-            warn("Failed AOMD_GET_LAST_QUANTIZER: %s",
-                 aom_codec_error(&decoder));
+            aom_tools_warn("Failed AOMD_GET_LAST_QUANTIZER: %s",
+                           aom_codec_error(&decoder));
             if (!keep_going) goto fail;
           }
           fprintf(framestats_file, "%d,%d\r\n", (int)bytes_in_buffer, qp);
@@ -792,7 +794,8 @@
     if (flush_decoder) {
       // Flush the decoder.
       if (aom_codec_decode(&decoder, NULL, 0, NULL)) {
-        warn("Failed to flush decoder: %s", aom_codec_error(&decoder));
+        aom_tools_warn("Failed to flush decoder: %s",
+                       aom_codec_error(&decoder));
       }
     }
 
@@ -806,7 +809,8 @@
 
       if (AOM_CODEC_CONTROL_TYPECHECKED(&decoder, AOMD_GET_FRAME_CORRUPTED,
                                         &corrupted)) {
-        warn("Failed AOM_GET_FRAME_CORRUPTED: %s", aom_codec_error(&decoder));
+        aom_tools_warn("Failed AOM_GET_FRAME_CORRUPTED: %s",
+                       aom_codec_error(&decoder));
         if (!keep_going) goto fail;
       }
       frames_corrupted += corrupted;
diff --git a/apps/aomenc.c b/apps/aomenc.c
index b86b80a..f65efa3 100644
--- a/apps/aomenc.c
+++ b/apps/aomenc.c
@@ -227,6 +227,10 @@
 #if CONFIG_TUNE_VMAF
                                         AV1E_SET_VMAF_MODEL_PATH,
 #endif
+                                        AV1E_SET_DV_COST_UPD_FREQ,
+                                        AV1E_SET_PARTITION_INFO_PATH,
+                                        AV1E_SET_ENABLE_DIRECTIONAL_INTRA,
+                                        AV1E_SET_ENABLE_TX_SIZE_SEARCH,
                                         0 };
 
 const arg_def_t *main_args[] = { &g_av1_codec_arg_defs.help,
@@ -422,10 +426,17 @@
 #if CONFIG_TUNE_VMAF
   &g_av1_codec_arg_defs.vmaf_model_path,
 #endif
+  &g_av1_codec_arg_defs.dv_cost_upd_freq,
+  &g_av1_codec_arg_defs.partition_info_path,
+  &g_av1_codec_arg_defs.enable_directional_intra,
+  &g_av1_codec_arg_defs.enable_tx_size_search,
   NULL,
 };
 
 const arg_def_t *av1_key_val_args[] = {
+  &g_av1_codec_arg_defs.passes,
+  &g_av1_codec_arg_defs.two_pass_output,
+  &g_av1_codec_arg_defs.fwd_kf_dist,
   NULL,
 };
 
@@ -505,7 +516,12 @@
 #if CONFIG_TUNE_VMAF
   const char *vmaf_model_path;
 #endif
+  const char *partition_info_path;
   aom_color_range_t color_range;
+  const char *two_pass_input;
+  const char *two_pass_output;
+  int two_pass_width;
+  int two_pass_height;
 };
 
 struct stream_state {
@@ -530,6 +546,12 @@
   int mismatch_seen;
   unsigned int chroma_subsampling_x;
   unsigned int chroma_subsampling_y;
+  const char *orig_out_fn;
+  unsigned int orig_width;
+  unsigned int orig_height;
+  int orig_write_webm;
+  int orig_write_ivf;
+  char tmp_out_fn[40];
 };
 
 static void validate_positive_rational(const char *msg,
@@ -591,12 +613,12 @@
     } else if (arg_match(&arg, &g_av1_codec_arg_defs.passes, argi)) {
       global->passes = arg_parse_uint(&arg);
 
-      if (global->passes < 1 || global->passes > 2)
+      if (global->passes < 1 || global->passes > 3)
         die("Error: Invalid number of passes (%d)\n", global->passes);
     } else if (arg_match(&arg, &g_av1_codec_arg_defs.pass_arg, argi)) {
       global->pass = arg_parse_uint(&arg);
 
-      if (global->pass < 1 || global->pass > 2)
+      if (global->pass < 1 || global->pass > 3)
         die("Error: Invalid pass selected (%d)\n", global->pass);
     } else if (arg_match(&arg,
                          &g_av1_codec_arg_defs.input_chroma_sample_position,
@@ -658,8 +680,8 @@
   if (global->pass) {
     /* DWIM: Assume the user meant passes=2 if pass=2 is specified */
     if (global->pass > global->passes) {
-      warn("Assuming --pass=%d implies --passes=%d\n", global->pass,
-           global->pass);
+      aom_tools_warn("Assuming --pass=%d implies --passes=%d\n", global->pass,
+                     global->pass);
       global->passes = global->pass;
     }
   }
@@ -680,12 +702,14 @@
   }
 
   if (global->usage == AOM_USAGE_REALTIME && global->passes > 1) {
-    warn("Enforcing one-pass encoding in realtime mode\n");
+    aom_tools_warn("Enforcing one-pass encoding in realtime mode\n");
+    if (global->pass > 1)
+      die("Error: Invalid --pass=%d for one-pass encoding\n", global->pass);
     global->passes = 1;
   }
 
   if (global->usage == AOM_USAGE_ALL_INTRA && global->passes > 1) {
-    warn("Enforcing one-pass encoding in all intra mode\n");
+    aom_tools_warn("Enforcing one-pass encoding in all intra mode\n");
     global->passes = 1;
   }
 }
@@ -794,6 +818,10 @@
 
   /* Output files must be specified for each stream */
   stream->config.out_fn = NULL;
+  stream->config.two_pass_input = NULL;
+  stream->config.two_pass_output = NULL;
+  stream->config.two_pass_width = 0;
+  stream->config.two_pass_height = 0;
 
   stream->next = NULL;
   return stream;
@@ -830,7 +858,8 @@
   config->arg_ctrls[j][1] = arg_parse_enum_or_int(arg);
 
   if (key == AOME_SET_ENABLEAUTOALTREF && config->arg_ctrls[j][1] > 1) {
-    warn("auto-alt-ref > 1 is deprecated... setting auto-alt-ref=1\n");
+    aom_tools_warn(
+        "auto-alt-ref > 1 is deprecated... setting auto-alt-ref=1\n");
     config->arg_ctrls[j][1] = 1;
   }
 
@@ -866,7 +895,8 @@
   if (strcmp(name, g_av1_codec_arg_defs.auto_altref.long_name) == 0) {
     int auto_altref = arg_parse_int(arg);
     if (auto_altref > 1) {
-      warn("auto-alt-ref > 1 is deprecated... setting auto-alt-ref=1\n");
+      aom_tools_warn(
+          "auto-alt-ref > 1 is deprecated... setting auto-alt-ref=1\n");
       config->arg_key_vals[j][1] = "1";
     }
   }
@@ -990,7 +1020,8 @@
                          argi)) {
       config->use_16bit_internal = CONFIG_AV1_HIGHBITDEPTH;
       if (!config->use_16bit_internal) {
-        warn("%s option ignored with CONFIG_AV1_HIGHBITDEPTH=0.\n", arg.name);
+        aom_tools_warn("%s option ignored with CONFIG_AV1_HIGHBITDEPTH=0.\n",
+                       arg.name);
       }
     } else if (arg_match(&arg, &g_av1_codec_arg_defs.dropframe_thresh, argi)) {
       config->cfg.rc_dropframe_thresh = arg_parse_uint(&arg);
@@ -1036,17 +1067,17 @@
     } else if (arg_match(&arg, &g_av1_codec_arg_defs.bias_pct, argi)) {
       config->cfg.rc_2pass_vbr_bias_pct = arg_parse_uint(&arg);
       if (global->passes < 2)
-        warn("option %s ignored in one-pass mode.\n", arg.name);
+        aom_tools_warn("option %s ignored in one-pass mode.\n", arg.name);
     } else if (arg_match(&arg, &g_av1_codec_arg_defs.minsection_pct, argi)) {
       config->cfg.rc_2pass_vbr_minsection_pct = arg_parse_uint(&arg);
 
       if (global->passes < 2)
-        warn("option %s ignored in one-pass mode.\n", arg.name);
+        aom_tools_warn("option %s ignored in one-pass mode.\n", arg.name);
     } else if (arg_match(&arg, &g_av1_codec_arg_defs.maxsection_pct, argi)) {
       config->cfg.rc_2pass_vbr_maxsection_pct = arg_parse_uint(&arg);
 
       if (global->passes < 2)
-        warn("option %s ignored in one-pass mode.\n", arg.name);
+        aom_tools_warn("option %s ignored in one-pass mode.\n", arg.name);
     } else if (arg_match(&arg, &g_av1_codec_arg_defs.fwd_kf_enabled, argi)) {
       config->cfg.fwd_kf_enabled = arg_parse_uint(&arg);
     } else if (arg_match(&arg, &g_av1_codec_arg_defs.kf_min_dist, argi)) {
@@ -1071,6 +1102,9 @@
     } else if (arg_match(&arg, &g_av1_codec_arg_defs.vmaf_model_path, argi)) {
       config->vmaf_model_path = arg.val;
 #endif
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.partition_info_path,
+                         argi)) {
+      config->partition_info_path = arg.val;
     } else if (arg_match(&arg, &g_av1_codec_arg_defs.use_fixed_qp_offsets,
                          argi)) {
       config->cfg.use_fixed_qp_offsets = arg_parse_uint(&arg);
@@ -1087,8 +1121,17 @@
                arg_match(&arg, &g_av1_codec_arg_defs.enable_restoration,
                          argi)) {
       if (arg_parse_uint(&arg) == 1) {
-        warn("non-zero %s option ignored in realtime mode.\n", arg.name);
+        aom_tools_warn("non-zero %s option ignored in realtime mode.\n",
+                       arg.name);
       }
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.two_pass_input, argi)) {
+      config->two_pass_input = arg.val;
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.two_pass_output, argi)) {
+      config->two_pass_output = arg.val;
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.two_pass_width, argi)) {
+      config->two_pass_width = arg_parse_int(&arg);
+    } else if (arg_match(&arg, &g_av1_codec_arg_defs.two_pass_height, argi)) {
+      config->two_pass_height = arg_parse_int(&arg);
     } else {
       int i, match = 0;
       // check if the control ID API supports this arg
@@ -1117,22 +1160,50 @@
   config->use_16bit_internal |= config->cfg.g_bit_depth > AOM_BITS_8;
 
   if (global->usage == AOM_USAGE_REALTIME && config->cfg.g_lag_in_frames != 0) {
-    warn("non-zero lag-in-frames option ignored in realtime mode.\n");
+    aom_tools_warn("non-zero lag-in-frames option ignored in realtime mode.\n");
     config->cfg.g_lag_in_frames = 0;
   }
 
   if (global->usage == AOM_USAGE_ALL_INTRA) {
     if (config->cfg.g_lag_in_frames != 0) {
-      warn("non-zero lag-in-frames option ignored in all intra mode.\n");
+      aom_tools_warn(
+          "non-zero lag-in-frames option ignored in all intra mode.\n");
       config->cfg.g_lag_in_frames = 0;
     }
     if (config->cfg.kf_max_dist != 0) {
-      warn(
+      aom_tools_warn(
           "non-zero max key frame distance option ignored in all intra "
           "mode.\n");
       config->cfg.kf_max_dist = 0;
     }
   }
+
+  // set the passes field using key & val API
+  if (config->arg_key_val_cnt >= ARG_KEY_VAL_CNT_MAX) {
+    die("Not enough buffer for the key & value API.");
+  }
+  config->arg_key_vals[config->arg_key_val_cnt][0] = "passes";
+  switch (global->passes) {
+    case 0: config->arg_key_vals[config->arg_key_val_cnt][1] = "0"; break;
+    case 1: config->arg_key_vals[config->arg_key_val_cnt][1] = "1"; break;
+    case 2: config->arg_key_vals[config->arg_key_val_cnt][1] = "2"; break;
+    case 3: config->arg_key_vals[config->arg_key_val_cnt][1] = "3"; break;
+    default: die("Invalid value of --passes.");
+  }
+  config->arg_key_val_cnt++;
+
+  // set the two_pass_output field
+  if (!config->two_pass_output && global->passes == 3) {
+    snprintf(stream->tmp_out_fn, sizeof(stream->tmp_out_fn),
+             "tmp_2pass_output_%d.ivf", stream->index);
+    stream->config.two_pass_output = stream->tmp_out_fn;
+  }
+  if (config->two_pass_output) {
+    config->arg_key_vals[config->arg_key_val_cnt][0] = "two-pass-output";
+    config->arg_key_vals[config->arg_key_val_cnt][1] = config->two_pass_output;
+    config->arg_key_val_cnt++;
+  }
+
   return eos_mark_found;
 }
 
@@ -1387,9 +1458,17 @@
       fatal("Failed to open statistics store");
   }
 
-  stream->config.cfg.g_pass = global->passes == 2
-                                  ? pass ? AOM_RC_LAST_PASS : AOM_RC_FIRST_PASS
-                                  : AOM_RC_ONE_PASS;
+  if (global->passes == 1) {
+    stream->config.cfg.g_pass = AOM_RC_ONE_PASS;
+  } else {
+    switch (pass) {
+      case 0: stream->config.cfg.g_pass = AOM_RC_FIRST_PASS; break;
+      case 1: stream->config.cfg.g_pass = AOM_RC_SECOND_PASS; break;
+      case 2: stream->config.cfg.g_pass = AOM_RC_THIRD_PASS; break;
+      default: fatal("Failed to set pass");
+    }
+  }
+
   if (pass) {
     stream->config.cfg.rc_twopass_stats_in = stats_get(&stream->stats);
   }
@@ -1436,6 +1515,11 @@
                                   stream->config.vmaf_model_path);
   }
 #endif
+  if (stream->config.partition_info_path) {
+    AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder,
+                                  AV1E_SET_PARTITION_INFO_PATH,
+                                  stream->config.partition_info_path);
+  }
 
   if (stream->config.film_grain_filename) {
     AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AV1E_SET_FILM_GRAIN_TABLE,
@@ -1472,6 +1556,33 @@
 #endif
 }
 
+// Convert the input image 'img' to a monochrome image. The Y plane of the
+// output image is a shallow copy of the Y plane of the input image, therefore
+// the input image must remain valid for the lifetime of the output image. The U
+// and V planes of the output image are set to null pointers. The output image
+// format is AOM_IMG_FMT_I420 because libaom does not have AOM_IMG_FMT_I400.
+static void convert_image_to_monochrome(const struct aom_image *img,
+                                        struct aom_image *monochrome_img) {
+  *monochrome_img = *img;
+  monochrome_img->fmt = AOM_IMG_FMT_I420;
+  if (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) {
+    monochrome_img->fmt |= AOM_IMG_FMT_HIGHBITDEPTH;
+  }
+  monochrome_img->monochrome = 1;
+  monochrome_img->csp = AOM_CSP_UNKNOWN;
+  monochrome_img->x_chroma_shift = 1;
+  monochrome_img->y_chroma_shift = 1;
+  monochrome_img->planes[AOM_PLANE_U] = NULL;
+  monochrome_img->planes[AOM_PLANE_V] = NULL;
+  monochrome_img->stride[AOM_PLANE_U] = 0;
+  monochrome_img->stride[AOM_PLANE_V] = 0;
+  monochrome_img->sz = 0;
+  monochrome_img->bps = (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 16 : 8;
+  monochrome_img->img_data = NULL;
+  monochrome_img->img_data_owner = 0;
+  monochrome_img->self_allocd = 0;
+}
+
 static void encode_frame(struct stream_state *stream,
                          struct AvxEncoderConfig *global, struct aom_image *img,
                          unsigned int frames_in) {
@@ -1551,6 +1662,12 @@
 #endif
   }
 
+  struct aom_image monochrome_img;
+  if (img && cfg->monochrome) {
+    convert_image_to_monochrome(img, &monochrome_img);
+    img = &monochrome_img;
+  }
+
   aom_usec_timer_start(&timer);
   aom_codec_encode(&stream->encoder, img, frame_start,
                    (uint32_t)(next_frame_start - frame_start), 0);
@@ -1797,6 +1914,30 @@
   }
 }
 
+static void clear_stream_count_state(struct stream_state *stream) {
+  // PSNR counters
+  for (int k = 0; k < 2; k++) {
+    stream->psnr_sse_total[k] = 0;
+    stream->psnr_samples_total[k] = 0;
+    for (int i = 0; i < 4; i++) {
+      stream->psnr_totals[k][i] = 0;
+    }
+    stream->psnr_count[k] = 0;
+  }
+  // q hist
+  memset(stream->counts, 0, sizeof(stream->counts));
+}
+
+// aomenc will downscale the second pass if:
+// 1. the specific pass is not given by commandline (aomenc will perform all
+//    passes)
+// 2. there are more than 2 passes in total
+// 3. current pass is the second pass (the parameter pass starts with 0 so
+//    pass == 1)
+static int pass_need_downscale(int global_pass, int global_passes, int pass) {
+  return !global_pass && global_passes > 2 && pass == 1;
+}
+
 int main(int argc, const char **argv_) {
   int pass;
   aom_image_t raw;
@@ -1871,6 +2012,14 @@
 
   /* Handle non-option arguments */
   input.filename = argv[0];
+  const char *orig_input_filename = input.filename;
+  FOREACH_STREAM(stream, streams) {
+    stream->orig_out_fn = stream->config.out_fn;
+    stream->orig_width = stream->config.cfg.g_w;
+    stream->orig_height = stream->config.cfg.g_h;
+    stream->orig_write_ivf = stream->config.write_ivf;
+    stream->orig_write_webm = stream->config.write_webm;
+  }
 
   if (!input.filename) {
     fprintf(stderr, "No input file specified!\n");
@@ -1882,10 +2031,48 @@
     input.only_i420 = 0;
 
   for (pass = global.pass ? global.pass - 1 : 0; pass < global.passes; pass++) {
+    if (pass > 1) {
+      FOREACH_STREAM(stream, streams) { clear_stream_count_state(stream); }
+    }
+
     int frames_in = 0, seen_frames = 0;
     int64_t estimated_time_left = -1;
     int64_t average_rate = -1;
     int64_t lagged_count = 0;
+    const int need_downscale =
+        pass_need_downscale(global.pass, global.passes, pass);
+
+    // Set the output to the specified two-pass output file, and
+    // restore the width and height to the original values.
+    FOREACH_STREAM(stream, streams) {
+      if (need_downscale) {
+        stream->config.out_fn = stream->config.two_pass_output;
+        // Libaom currently only supports the ivf format for the third pass.
+        stream->config.write_ivf = 1;
+        stream->config.write_webm = 0;
+      } else {
+        stream->config.out_fn = stream->orig_out_fn;
+        stream->config.write_ivf = stream->orig_write_ivf;
+        stream->config.write_webm = stream->orig_write_webm;
+      }
+      stream->config.cfg.g_w = stream->orig_width;
+      stream->config.cfg.g_h = stream->orig_height;
+    }
+
+    // For second pass in three-pass encoding, set the input to
+    // the given two-pass-input file if available. If the scaled input is not
+    // given, we will attempt to re-scale the original input.
+    input.filename = orig_input_filename;
+    const char *two_pass_input = NULL;
+    if (need_downscale) {
+      FOREACH_STREAM(stream, streams) {
+        if (stream->config.two_pass_input) {
+          two_pass_input = stream->config.two_pass_input;
+          input.filename = two_pass_input;
+          break;
+        }
+      }
+    }
 
     open_input_file(&input, global.csp);
 
@@ -1893,20 +2080,55 @@
      * the data from the first stream's configuration.
      */
     if (!input.width || !input.height) {
-      FOREACH_STREAM(stream, streams) {
-        if (stream->config.cfg.g_w && stream->config.cfg.g_h) {
-          input.width = stream->config.cfg.g_w;
-          input.height = stream->config.cfg.g_h;
-          break;
+      if (two_pass_input) {
+        FOREACH_STREAM(stream, streams) {
+          if (stream->config.two_pass_width && stream->config.two_pass_height) {
+            input.width = stream->config.two_pass_width;
+            input.height = stream->config.two_pass_height;
+            break;
+          }
         }
-      };
+      } else {
+        FOREACH_STREAM(stream, streams) {
+          if (stream->config.cfg.g_w && stream->config.cfg.g_h) {
+            input.width = stream->config.cfg.g_w;
+            input.height = stream->config.cfg.g_h;
+            break;
+          }
+        }
+      }
     }
 
     /* Update stream configurations from the input file's parameters */
-    if (!input.width || !input.height)
-      fatal(
-          "Specify stream dimensions with --width (-w) "
-          " and --height (-h)");
+    if (!input.width || !input.height) {
+      if (two_pass_input) {
+        fatal(
+            "Specify downscaled stream dimensions with --two-pass-width "
+            " and --two-pass-height");
+      } else {
+        fatal(
+            "Specify stream dimensions with --width (-w) "
+            " and --height (-h)");
+      }
+    }
+
+    if (need_downscale) {
+      FOREACH_STREAM(stream, streams) {
+        if (stream->config.two_pass_width && stream->config.two_pass_height) {
+          stream->config.cfg.g_w = stream->config.two_pass_width;
+          stream->config.cfg.g_h = stream->config.two_pass_height;
+        } else if (two_pass_input) {
+          stream->config.cfg.g_w = input.width;
+          stream->config.cfg.g_h = input.height;
+        } else if (stream->orig_width && stream->orig_height) {
+          stream->config.cfg.g_w = (stream->orig_width + 1) / 2;
+          stream->config.cfg.g_h = (stream->orig_height + 1) / 2;
+        } else {
+          stream->config.cfg.g_w = (input.width + 1) / 2;
+          stream->config.cfg.g_h = (input.height + 1) / 2;
+        }
+      }
+    }
 
     /* If input file does not specify bit-depth but input-bit-depth parameter
      * exists, assume that to be the input bit-depth. However, if the
@@ -1940,8 +2162,10 @@
                 stream->config.cfg.g_profile = 1;
                 profile_updated = 1;
               }
-            } else if (input.bit_depth == 12 || input.fmt == AOM_IMG_FMT_I422 ||
-                       input.fmt == AOM_IMG_FMT_I42216) {
+            } else if (input.bit_depth == 12 ||
+                       ((input.fmt == AOM_IMG_FMT_I422 ||
+                         input.fmt == AOM_IMG_FMT_I42216) &&
+                        !stream->config.cfg.monochrome)) {
               stream->config.cfg.g_profile = 2;
               profile_updated = 1;
             }
@@ -2052,14 +2276,14 @@
     FOREACH_STREAM(stream, streams) { validate_stream_config(stream, &global); }
 
     /* Ensure that --passes and --pass are consistent. If --pass is set and
-     * --passes=2, ensure --fpf was set.
+     * --passes >= 2, ensure --fpf was set.
      */
-    if (global.pass && global.passes == 2) {
+    if (global.pass > 0 && global.pass <= 3 && global.passes >= 2) {
       FOREACH_STREAM(stream, streams) {
         if (!stream->config.stats_fn)
           die("Stream %d: Must specify --fpf when --pass=%d"
-              " and --passes=2\n",
-              stream->index, global.pass);
+              " and --passes=%d\n",
+              stream->index, global.pass, global.passes);
       }
     }
 
@@ -2068,7 +2292,7 @@
       if (stream->config.write_webm) {
         stream->config.write_webm = 0;
         stream->config.write_ivf = 0;
-        warn("aomenc compiled w/o WebM support. Writing OBU stream.");
+        aom_tools_warn("aomenc compiled w/o WebM support. Writing OBU stream.");
       }
     }
 #endif
@@ -2170,6 +2394,11 @@
                   cx_time > 9999999 ? "ms" : "us", fps >= 1.0 ? fps : fps * 60,
                   fps >= 1.0 ? "fps" : "fpm");
           print_time("ETA", estimated_time_left);
+          // mingw-w64 gcc does not match msvc for stderr buffering behavior
+          // and uses line buffering, thus the progress output is not
+          // real-time. The fflush() is here to make sure the progress output
+          // is sent out while the clip is being processed.
+          fflush(stderr);
         }
 
       } else {
@@ -2275,6 +2504,8 @@
                                           : stream->cx_time,
                 stream->cx_time > 9999999 ? "ms" : "us",
                 usec_to_fps(stream->cx_time, seen_frames));
+        // This instance of cr does not need fflush as it is followed by a
+        // newline in the same string.
       }
     }
 
diff --git a/av1/arg_defs.c b/av1/arg_defs.c
index e79f9b2..327b664 100644
--- a/av1/arg_defs.c
+++ b/av1/arg_defs.c
@@ -145,8 +145,8 @@
   .use_i422 = ARG_DEF(NULL, "i422", 0, "Input file is I422"),
   .use_i444 = ARG_DEF(NULL, "i444", 0, "Input file is I444"),
   .codecarg = ARG_DEF(NULL, "codec", 1, "Codec to use"),
-  .passes = ARG_DEF("p", "passes", 1, "Number of passes (1/2)"),
-  .pass_arg = ARG_DEF(NULL, "pass", 1, "Pass to execute (1/2)"),
+  .passes = ARG_DEF("p", "passes", 1, "Number of passes (1/2/3)"),
+  .pass_arg = ARG_DEF(NULL, "pass", 1, "Pass to execute (1/2/3)"),
   .fpf_name = ARG_DEF(NULL, "fpf", 1, "First pass statistics file name"),
   .limit = ARG_DEF(NULL, "limit", 1, "Stop encoding after n input frames"),
   .skip = ARG_DEF(NULL, "skip", 1, "Skip the first n input frames"),
@@ -271,7 +271,9 @@
   .noise_sens = ARG_DEF(NULL, "noise-sensitivity", 1,
                         "Noise sensitivity (frames to blur)"),
   .sharpness = ARG_DEF(NULL, "sharpness", 1,
-                       "Loop filter sharpness (0..7), default is 0"),
+                       "Bias towards block sharpness in rate-distortion "
+                       "optimization of transform coefficients "
+                       "(0..7), default is 0"),
   .static_thresh =
       ARG_DEF(NULL, "static-thresh", 1, "Motion detection threshold"),
   .auto_altref =
@@ -405,10 +407,15 @@
   .enable_cfl_intra = ARG_DEF(NULL, "enable-cfl-intra", 1,
                               "Enable chroma from luma intra prediction mode "
                               "(0: false, 1: true (default))"),
+  .enable_directional_intra =
+      ARG_DEF(NULL, "enable-directional-intra", 1,
+              "Enable directional intra prediction modes "
+              "(0: false, 1: true (default))"),
   .enable_diagonal_intra =
       ARG_DEF(NULL, "enable-diagonal-intra", 1,
-              "Enable diagonal (D45 to D203) intra prediction modes "
-              "(0: false, 1: true (default))"),
+              "Enable diagonal (D45 to D203) intra prediction modes, which are "
+              "a subset of directional modes. Has no effect if "
+              "enable-directional-intra is 0 (0: false, 1: true (default))"),
   .force_video_mode = ARG_DEF(NULL, "force-video-mode", 1,
                               "Force video mode (0: false, 1: true (default))"),
   .enable_obmc = ARG_DEF(NULL, "enable-obmc", 1,
@@ -448,13 +455,16 @@
               "Use Default-transform only for INTRA modes"),
   .quant_b_adapt = ARG_DEF(NULL, "quant-b-adapt", 1, "Use adaptive quantize_b"),
   .coeff_cost_upd_freq = ARG_DEF(NULL, "coeff-cost-upd-freq", 1,
-                                 "Update freq for coeff costs"
+                                 "Update freq for coeff costs. "
                                  "0: SB, 1: SB Row per Tile, 2: Tile, 3: Off"),
   .mode_cost_upd_freq = ARG_DEF(NULL, "mode-cost-upd-freq", 1,
-                                "Update freq for mode costs"
+                                "Update freq for mode costs. "
                                 "0: SB, 1: SB Row per Tile, 2: Tile, 3: Off"),
   .mv_cost_upd_freq = ARG_DEF(NULL, "mv-cost-upd-freq", 1,
-                              "Update freq for mv costs"
+                              "Update freq for mv costs. "
+                              "0: SB, 1: SB Row per Tile, 2: Tile, 3: Off"),
+  .dv_cost_upd_freq = ARG_DEF(NULL, "dv-cost-upd-freq", 1,
+                              "Update freq for dv costs. "
                               "0: SB, 1: SB Row per Tile, 2: Tile, 3: Off"),
   .num_tg = ARG_DEF(NULL, "num-tile-groups", 1,
                     "Maximum number of tile groups, default is 1"),
@@ -471,6 +481,8 @@
   .vmaf_model_path =
       ARG_DEF(NULL, "vmaf-model-path", 1, "Path to the VMAF model file"),
 #endif
+  .partition_info_path = ARG_DEF(NULL, "partition-info-path", 1,
+                                 "Partition information read and write path"),
   .film_grain_test = ARG_DEF(
       NULL, "film-grain-test", 1,
       "Film grain test vectors (0: none (default), 1: test-1  2: test-2, "
@@ -505,7 +517,8 @@
   .deltaq_mode =
       ARG_DEF(NULL, "deltaq-mode", 1,
               "Delta qindex mode (0: off, 1: deltaq objective (default), "
-              "2: deltaq perceptual). "
+              "2: deltaq placeholder, 3: key frame visual quality, 4: user "
+              "rating based visual quality optimization). "
               "Currently this requires enable-tpl-model as a prerequisite."),
   .deltalf_mode = ARG_DEF(NULL, "delta-lf-mode", 1,
                           "Enable delta-lf-mode (0: off (default), 1: on)"),
@@ -605,6 +618,28 @@
   .vbr_corpus_complexity_lap = ARG_DEF(
       NULL, "vbr-corpus-complexity-lap", 1,
       "Set average corpus complexity per mb for single pass VBR using lap. "
-      "(0..10000), default is 0")
+      "(0..10000), default is 0"),
+
+  .fwd_kf_dist =
+      ARG_DEF(NULL, "fwd-kf-dist", -1,
+              "Set distance between forward keyframes. A value of -1 means no "
+              "repetitive forward keyframes. Default is -1."),
+
+  .enable_tx_size_search = ARG_DEF(
+      NULL, "enable-tx-size-search", 1,
+      "Enable transform size search to find the best size for each block. "
+      "If false, transforms always have the largest possible size "
+      "(0: false, 1: true (default))"),
+
+  .two_pass_input =
+      ARG_DEF(NULL, "two-pass-input", 1,
+              "The input file for the second pass for three-pass encoding."),
+  .two_pass_output = ARG_DEF(
+      NULL, "two-pass-output", 1,
+      "The output file for the first two passes for three-pass encoding."),
+  .two_pass_width =
+      ARG_DEF(NULL, "two-pass-width", 1, "The width of two-pass-input."),
+  .two_pass_height =
+      ARG_DEF(NULL, "two-pass-height", 1, "The height of two-pass-input."),
 #endif  // CONFIG_AV1_ENCODER
 };
diff --git a/av1/arg_defs.h b/av1/arg_defs.h
index f86e915..c89dbba 100644
--- a/av1/arg_defs.h
+++ b/av1/arg_defs.h
@@ -154,6 +154,7 @@
   arg_def_t enable_smooth_intra;
   arg_def_t enable_paeth_intra;
   arg_def_t enable_cfl_intra;
+  arg_def_t enable_directional_intra;
   arg_def_t enable_diagonal_intra;
   arg_def_t force_video_mode;
   arg_def_t enable_obmc;
@@ -173,12 +174,14 @@
   arg_def_t coeff_cost_upd_freq;
   arg_def_t mode_cost_upd_freq;
   arg_def_t mv_cost_upd_freq;
+  arg_def_t dv_cost_upd_freq;
   arg_def_t num_tg;
   arg_def_t mtu_size;
   arg_def_t timing_info;
 #if CONFIG_TUNE_VMAF
   arg_def_t vmaf_model_path;
 #endif
+  arg_def_t partition_info_path;
   arg_def_t film_grain_test;
   arg_def_t film_grain_table;
 #if CONFIG_DENOISE
@@ -214,6 +217,12 @@
   arg_def_t use_fixed_qp_offsets;
   arg_def_t fixed_qp_offsets;
   arg_def_t vbr_corpus_complexity_lap;
+  arg_def_t fwd_kf_dist;
+  arg_def_t enable_tx_size_search;
+  arg_def_t two_pass_input;
+  arg_def_t two_pass_output;
+  arg_def_t two_pass_width;
+  arg_def_t two_pass_height;
 #endif  // CONFIG_AV1_ENCODER
 } av1_codec_arg_definitions_t;
 
diff --git a/av1/av1.cmake b/av1/av1.cmake
index 3eb4a1f..bcffa76 100644
--- a/av1/av1.cmake
+++ b/av1/av1.cmake
@@ -60,8 +60,6 @@
             "${AOM_ROOT}/av1/common/mvref_common.h"
             "${AOM_ROOT}/av1/common/obu_util.c"
             "${AOM_ROOT}/av1/common/obu_util.h"
-            "${AOM_ROOT}/av1/common/odintrin.c"
-            "${AOM_ROOT}/av1/common/odintrin.h"
             "${AOM_ROOT}/av1/common/pred_common.c"
             "${AOM_ROOT}/av1/common/pred_common.h"
             "${AOM_ROOT}/av1/common/quant_common.c"
@@ -99,10 +97,6 @@
                    "${AOM_ROOT}/av1/common/warped_motion.h")
 endif()
 
-if(CONFIG_LPF_MASK)
-  list(APPEND AOM_AV1_COMMON_SOURCES "${AOM_ROOT}/av1/common/loopfiltermask.c")
-endif()
-
 list(APPEND AOM_AV1_DECODER_SOURCES
             "${AOM_ROOT}/av1/av1_dx_iface.c"
             "${AOM_ROOT}/av1/decoder/decodeframe.c"
@@ -127,6 +121,8 @@
             "${AOM_ROOT}/av1/encoder/aq_cyclicrefresh.h"
             "${AOM_ROOT}/av1/encoder/aq_variance.c"
             "${AOM_ROOT}/av1/encoder/aq_variance.h"
+            "${AOM_ROOT}/av1/encoder/allintra_vis.c"
+            "${AOM_ROOT}/av1/encoder/allintra_vis.h"
             "${AOM_ROOT}/av1/encoder/enc_enums.h"
             "${AOM_ROOT}/av1/encoder/av1_fwd_txfm1d.c"
             "${AOM_ROOT}/av1/encoder/av1_fwd_txfm1d.h"
@@ -170,6 +166,8 @@
             "${AOM_ROOT}/av1/encoder/ethread.h"
             "${AOM_ROOT}/av1/encoder/extend.c"
             "${AOM_ROOT}/av1/encoder/extend.h"
+            "${AOM_ROOT}/av1/encoder/external_partition.c"
+            "${AOM_ROOT}/av1/encoder/external_partition.h"
             "${AOM_ROOT}/av1/encoder/firstpass.c"
             "${AOM_ROOT}/av1/encoder/firstpass.h"
             "${AOM_ROOT}/av1/encoder/global_motion.c"
@@ -230,6 +228,7 @@
             "${AOM_ROOT}/av1/encoder/reconinter_enc.h"
             "${AOM_ROOT}/av1/encoder/segmentation.c"
             "${AOM_ROOT}/av1/encoder/segmentation.h"
+            "${AOM_ROOT}/av1/encoder/sorting_network.h"
             "${AOM_ROOT}/av1/encoder/speed_features.c"
             "${AOM_ROOT}/av1/encoder/speed_features.h"
             "${AOM_ROOT}/av1/encoder/superres_scale.c"
@@ -238,6 +237,8 @@
             "${AOM_ROOT}/av1/encoder/svc_layercontext.h"
             "${AOM_ROOT}/av1/encoder/temporal_filter.c"
             "${AOM_ROOT}/av1/encoder/temporal_filter.h"
+            "${AOM_ROOT}/av1/encoder/thirdpass.c"
+            "${AOM_ROOT}/av1/encoder/thirdpass.h"
             "${AOM_ROOT}/av1/encoder/tokenize.c"
             "${AOM_ROOT}/av1/encoder/tokenize.h"
             "${AOM_ROOT}/av1/encoder/tpl_model.c"
@@ -391,12 +392,14 @@
 list(APPEND AOM_AV1_ENCODER_INTRIN_SSE2
             "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm_sse2.c"
             "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm_sse2.h"
+            "${AOM_ROOT}/av1/encoder/x86/av1_k_means_sse2.c"
             "${AOM_ROOT}/av1/encoder/x86/av1_quantize_sse2.c"
             "${AOM_ROOT}/av1/encoder/x86/encodetxb_sse2.c"
+            "${AOM_ROOT}/av1/encoder/x86/error_intrin_sse2.c"
             "${AOM_ROOT}/av1/encoder/x86/highbd_block_error_intrin_sse2.c"
-            "${AOM_ROOT}/av1/encoder/x86/temporal_filter_sse2.c"
-            "${AOM_ROOT}/av1/encoder/x86/av1_k_means_sse2.c"
             "${AOM_ROOT}/av1/encoder/x86/highbd_temporal_filter_sse2.c"
+            "${AOM_ROOT}/av1/encoder/x86/reconinter_enc_sse2.c"
+            "${AOM_ROOT}/av1/encoder/x86/temporal_filter_sse2.c"
             "${AOM_ROOT}/av1/encoder/x86/wedge_utils_sse2.c")
 
 if(CONFIG_AV1_TEMPORAL_DENOISING)
@@ -413,6 +416,9 @@
 
 list(APPEND AOM_AV1_ENCODER_INTRIN_SSE3 "${AOM_ROOT}/av1/encoder/x86/ml_sse3.c")
 
+list(APPEND AOM_AV1_ENCODER_INTRIN_SSSE3
+            "${AOM_ROOT}/av1/encoder/x86/reconinter_enc_ssse3.c")
+
 list(APPEND AOM_AV1_ENCODER_ASM_SSSE3_X86_64
             "${AOM_ROOT}/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm")
 
@@ -489,8 +495,6 @@
             "${AOM_ROOT}/av1/common/arm/convolve_neon.c"
             "${AOM_ROOT}/av1/common/arm/convolve_neon.h"
             "${AOM_ROOT}/av1/common/arm/jnt_convolve_neon.c"
-            "${AOM_ROOT}/av1/common/arm/mem_neon.h"
-            "${AOM_ROOT}/av1/common/arm/transpose_neon.h"
             "${AOM_ROOT}/av1/common/arm/blend_a64_hmask_neon.c"
             "${AOM_ROOT}/av1/common/arm/blend_a64_vmask_neon.c"
             "${AOM_ROOT}/av1/common/arm/reconinter_neon.c"
@@ -622,6 +626,12 @@
                                       "AOM_AV1_DECODER_INTRIN_SSSE3")
       endif()
     endif()
+    if(CONFIG_AV1_ENCODER)
+      if(AOM_AV1_ENCODER_INTRIN_SSSE3)
+        add_intrinsics_object_library("-mssse3" "ssse3" "aom_av1_encoder"
+                                      "AOM_AV1_ENCODER_INTRIN_SSSE3")
+      endif()
+    endif()
   endif()
 
   if(HAVE_SSE4_1)
diff --git a/av1/av1_cx_iface.c b/av1/av1_cx_iface.c
index e394cf3..62ea28d 100644
--- a/av1/av1_cx_iface.c
+++ b/av1/av1_cx_iface.c
@@ -17,7 +17,6 @@
 
 #include "aom_ports/aom_once.h"
 #include "aom_ports/mem_ops.h"
-#include "aom_ports/system_state.h"
 
 #include "aom/aom_encoder.h"
 #include "aom/internal/aom_codec_internal.h"
@@ -26,6 +25,7 @@
 #include "av1/encoder/bitstream.h"
 #include "av1/encoder/encoder.h"
 #include "av1/encoder/ethread.h"
+#include "av1/encoder/external_partition.h"
 #include "av1/encoder/firstpass.h"
 #include "av1/arg_defs.h"
 
@@ -51,6 +51,7 @@
   unsigned int gf_max_pyr_height;
   aom_tune_metric tuning;
   const char *vmaf_model_path;
+  const char *partition_info_path;
   unsigned int cq_level;  // constrained quality level
   unsigned int rc_max_intra_bitrate_pct;
   unsigned int rc_max_inter_bitrate_pct;
@@ -125,6 +126,7 @@
   int enable_smooth_intra;           // enable smooth intra modes for sequence
   int enable_paeth_intra;            // enable Paeth intra mode for sequence
   int enable_cfl_intra;              // enable CFL uv intra mode for sequence
+  int enable_directional_intra;      // enable directional modes for sequence
   int enable_diagonal_intra;  // enable D45 to D203 intra modes for sequence
   int enable_superres;
   int enable_overlay;  // enable overlay for filtered arf frames
@@ -143,6 +145,7 @@
   int use_intra_dct_only;
   int use_inter_dct_only;
   int use_intra_default_tx_only;
+  int enable_tx_size_search;
   int quant_b_adapt;
   unsigned int vbr_corpus_complexity_lap;
   AV1_LEVEL target_seq_level_idx[MAX_NUM_OPERATING_POINTS];
@@ -154,11 +157,167 @@
   COST_UPDATE_TYPE coeff_cost_upd_freq;
   COST_UPDATE_TYPE mode_cost_upd_freq;
   COST_UPDATE_TYPE mv_cost_upd_freq;
+  COST_UPDATE_TYPE dv_cost_upd_freq;
   unsigned int ext_tile_debug;
   unsigned int sb_multipass_unit_test;
+  // Total number of passes. If this number is -1, then we assume passes = 1 or
+  // 2 (passes = 1 if pass == AOM_RC_ONE_PASS and passes = 2 otherwise).
+  int passes;
+  int fwd_kf_dist;
+  // the name of the second pass output file when passes > 2
+  const char *two_pass_output;
 };
 
-static struct av1_extracfg default_extra_cfg = {
+#if CONFIG_REALTIME_ONLY
+// Settings changed for realtime only build:
+// cpu_used: 7
+// enable_tpl_model: 0
+// enable_restoration: 0
+// enable_obmc: 0
+// deltaq_mode: NO_DELTA_Q
+// enable_global_motion usage: 0
+// enable_warped_motion at sequence level: 0
+// allow_warped_motion at frame level: 0
+// coeff_cost_upd_freq: COST_UPD_OFF
+// mode_cost_upd_freq: COST_UPD_OFF
+// mv_cost_upd_freq: COST_UPD_OFF
+static const struct av1_extracfg default_extra_cfg = {
+  7,              // cpu_used
+  1,              // enable_auto_alt_ref
+  0,              // enable_auto_bwd_ref
+  0,              // noise_sensitivity
+  0,              // sharpness
+  0,              // static_thresh
+  1,              // row_mt
+  0,              // tile_columns
+  0,              // tile_rows
+  0,              // enable_tpl_model
+  1,              // enable_keyframe_filtering
+  7,              // arnr_max_frames
+  5,              // arnr_strength
+  0,              // min_gf_interval; 0 -> default decision
+  0,              // max_gf_interval; 0 -> default decision
+  0,              // gf_min_pyr_height
+  5,              // gf_max_pyr_height
+  AOM_TUNE_PSNR,  // tuning
+  "/usr/local/share/model/vmaf_v0.6.1.json",  // VMAF model path
+  ".",                                        // partition info path
+  10,                                         // cq_level
+  0,                                          // rc_max_intra_bitrate_pct
+  0,                                          // rc_max_inter_bitrate_pct
+  0,                                          // gf_cbr_boost_pct
+  0,                                          // lossless
+  1,                                          // enable_cdef
+  0,                                          // enable_restoration
+  0,                                          // force_video_mode
+  0,                                          // enable_obmc
+  3,                                          // disable_trellis_quant
+  0,                                          // enable_qm
+  DEFAULT_QM_Y,                               // qm_y
+  DEFAULT_QM_U,                               // qm_u
+  DEFAULT_QM_V,                               // qm_v
+  DEFAULT_QM_FIRST,                           // qm_min
+  DEFAULT_QM_LAST,                            // qm_max
+  1,                                          // max number of tile groups
+  0,                                          // mtu_size
+  AOM_TIMING_UNSPECIFIED,       // No picture timing signaling in bitstream
+  0,                            // frame_parallel_decoding_mode
+  1,                            // enable dual filter
+  0,                            // enable delta quant in chroma planes
+  NO_AQ,                        // aq_mode
+  NO_DELTA_Q,                   // deltaq_mode
+  0,                            // delta lf mode
+  0,                            // frame_periodic_boost
+  AOM_BITS_8,                   // Bit depth
+  AOM_CONTENT_DEFAULT,          // content
+  AOM_CICP_CP_UNSPECIFIED,      // CICP color primaries
+  AOM_CICP_TC_UNSPECIFIED,      // CICP transfer characteristics
+  AOM_CICP_MC_UNSPECIFIED,      // CICP matrix coefficients
+  AOM_CSP_UNKNOWN,              // chroma sample position
+  0,                            // color range
+  0,                            // render width
+  0,                            // render height
+  AOM_SUPERBLOCK_SIZE_DYNAMIC,  // superblock_size
+  1,                            // this depends on large_scale_tile.
+  0,                            // error_resilient_mode off by default.
+  0,                            // s_frame_mode off by default.
+  0,                            // film_grain_test_vector
+  NULL,                         // film_grain_table_filename
+  0,                            // motion_vector_unit_test
+  1,                            // CDF update mode
+  1,                            // enable rectangular partitions
+  1,                            // enable ab shape partitions
+  1,                            // enable 1:4 and 4:1 partitions
+  4,                            // min_partition_size
+  128,                          // max_partition_size
+  1,                            // enable intra edge filter
+  1,                            // frame order hint
+  1,                            // enable 64-pt transform usage
+  1,                            // enable flip and identity transform
+  1,                            // enable rectangular transform usage
+  1,                            // dist-wtd compound
+  7,                            // max_reference_frames
+  0,                            // enable_reduced_reference_set
+  1,                            // enable_ref_frame_mvs sequence level
+  1,                            // allow ref_frame_mvs frame level
+  1,                            // enable masked compound at sequence level
+  1,                            // enable one sided compound at sequence level
+  1,                            // enable interintra compound at sequence level
+  1,                            // enable smooth interintra mode
+  1,                            // enable difference-weighted compound
+  1,                            // enable interinter wedge compound
+  1,                            // enable interintra wedge compound
+  0,                            // enable_global_motion usage
+  0,                            // enable_warped_motion at sequence level
+  0,                            // allow_warped_motion at frame level
+  1,                            // enable filter intra at sequence level
+  1,                            // enable smooth intra modes usage for sequence
+  1,                            // enable Paeth intra mode usage for sequence
+  1,                            // enable CFL uv intra mode usage for sequence
+  1,                       // enable directional intra mode usage for sequence
+  1,                       // enable D45 to D203 intra mode usage for sequence
+  1,                       // superres
+  1,                       // enable overlay
+  1,                       // enable palette
+  !CONFIG_SHARP_SETTINGS,  // enable intrabc
+  1,                       // enable angle delta
+#if CONFIG_DENOISE
+  0,   // noise_level
+  32,  // noise_block_size
+  1,   // enable_dnl_denoising
+#endif
+  0,  // chroma_subsampling_x
+  0,  // chroma_subsampling_y
+  0,  // reduced_tx_type_set
+  0,  // use_intra_dct_only
+  0,  // use_inter_dct_only
+  0,  // use_intra_default_tx_only
+  1,  // enable_tx_size_search
+  0,  // quant_b_adapt
+  0,  // vbr_corpus_complexity_lap
+  {
+      SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX,
+      SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX,
+      SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX,
+      SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX,
+      SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX,
+      SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX,
+      SEQ_LEVEL_MAX, SEQ_LEVEL_MAX,
+  },             // target_seq_level_idx
+  0,             // tier_mask
+  0,             // min_cr
+  COST_UPD_OFF,  // coeff_cost_upd_freq
+  COST_UPD_OFF,  // mode_cost_upd_freq
+  COST_UPD_OFF,  // mv_cost_upd_freq
+  COST_UPD_OFF,  // dv_cost_upd_freq
+  0,             // ext_tile_debug
+  0,             // sb_multipass_unit_test
+  -1,            // passes
+  -1,            // fwd_kf_dist
+  NULL,          // two_pass_output
+};
+#else
+static const struct av1_extracfg default_extra_cfg = {
   0,              // cpu_used
   1,              // enable_auto_alt_ref
   0,              // enable_auto_bwd_ref
@@ -178,6 +337,7 @@
   5,              // gf_max_pyr_height
   AOM_TUNE_PSNR,  // tuning
   "/usr/local/share/model/vmaf_v0.6.1.json",  // VMAF model path
+  ".",                                        // partition info path
   10,                                         // cq_level
   0,                                          // rc_max_intra_bitrate_pct
   0,                                          // rc_max_inter_bitrate_pct
@@ -218,7 +378,7 @@
   0,                            // error_resilient_mode off by default.
   0,                            // s_frame_mode off by default.
   0,                            // film_grain_test_vector
-  0,                            // film_grain_table_filename
+  NULL,                         // film_grain_table_filename
   0,                            // motion_vector_unit_test
   1,                            // CDF update mode
   1,                            // enable rectangular partitions
@@ -250,6 +410,7 @@
   1,                            // enable smooth intra modes usage for sequence
   1,                            // enable Paeth intra mode usage for sequence
   1,                            // enable CFL uv intra mode usage for sequence
+  1,                       // enable directional intra mode usage for sequence
   1,                       // enable D45 to D203 intra mode usage for sequence
   1,                       // superres
   1,                       // enable overlay
@@ -257,18 +418,19 @@
   !CONFIG_SHARP_SETTINGS,  // enable intrabc
   1,                       // enable angle delta
 #if CONFIG_DENOISE
-  0,   // noise_level
-  32,  // noise_block_size
-  1,   // enable_dnl_denoising
+  0,                       // noise_level
+  32,                      // noise_block_size
+  1,                       // enable_dnl_denoising
 #endif
-  0,  // chroma_subsampling_x
-  0,  // chroma_subsampling_y
-  0,  // reduced_tx_type_set
-  0,  // use_intra_dct_only
-  0,  // use_inter_dct_only
-  0,  // use_intra_default_tx_only
-  0,  // quant_b_adapt
-  0,  // vbr_corpus_complexity_lap
+  0,                       // chroma_subsampling_x
+  0,                       // chroma_subsampling_y
+  0,                       // reduced_tx_type_set
+  0,                       // use_intra_dct_only
+  0,                       // use_inter_dct_only
+  0,                       // use_intra_default_tx_only
+  1,                       // enable_tx_size_search
+  0,                       // quant_b_adapt
+  0,                       // vbr_corpus_complexity_lap
   {
       SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX,
       SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX, SEQ_LEVEL_MAX,
@@ -283,9 +445,14 @@
   COST_UPD_SB,  // coeff_cost_upd_freq
   COST_UPD_SB,  // mode_cost_upd_freq
   COST_UPD_SB,  // mv_cost_upd_freq
+  COST_UPD_SB,  // dv_cost_upd_freq
   0,            // ext_tile_debug
   0,            // sb_multipass_unit_test
+  -1,           // passes
+  -1,           // fwd_kf_dist
+  NULL,         // two_pass_output
 };
+#endif
 
 struct aom_codec_alg_priv {
   aom_codec_priv_t base;
@@ -341,6 +508,41 @@
   return res;
 }
 
+// This function deep copies a string src to *dst. For default string we will
+// use a string literal, and otherwise we will allocate memory for the string.
+static aom_codec_err_t allocate_and_set_string(const char *src,
+                                               const char *default_src,
+                                               const char **dst,
+                                               char *err_detail) {
+  if (!src) {
+    snprintf(err_detail, ARG_ERR_MSG_MAX_LEN,
+             "Null pointer given to a string parameter.");
+    return AOM_CODEC_INVALID_PARAM;
+  }
+  if (*dst && strcmp(src, *dst) == 0) return AOM_CODEC_OK;
+  // If the input is exactly the same as default, we will use the string
+  // literal, so do not free here.
+  if (*dst != default_src) {
+    aom_free((void *)*dst);
+  }
+
+  if (default_src && strcmp(src, default_src) == 0) {
+    // default_src should be a string literal
+    *dst = default_src;
+  } else {
+    size_t len = strlen(src) + 1;
+    char *tmp = aom_malloc(len * sizeof(*tmp));
+    if (!tmp) {
+      snprintf(err_detail, ARG_ERR_MSG_MAX_LEN,
+               "Failed to allocate memory for copying parameters.");
+      return AOM_CODEC_MEM_ERROR;
+    }
+    memcpy(tmp, src, len);
+    *dst = tmp;
+  }
+  return 0;
+}
+
 #undef ERROR
 #define ERROR(str)                  \
   do {                              \
@@ -380,7 +582,11 @@
   RANGE_CHECK_HI(extra_cfg, deltaq_mode, DELTA_Q_MODE_COUNT - 1);
   RANGE_CHECK_HI(extra_cfg, deltalf_mode, 1);
   RANGE_CHECK_HI(extra_cfg, frame_periodic_boost, 1);
-  RANGE_CHECK_HI(cfg, g_usage, 2);
+#if CONFIG_REALTIME_ONLY
+  RANGE_CHECK(cfg, g_usage, AOM_USAGE_REALTIME, AOM_USAGE_REALTIME);
+#else
+  RANGE_CHECK_HI(cfg, g_usage, AOM_USAGE_ALL_INTRA);
+#endif
   RANGE_CHECK_HI(cfg, g_threads, MAX_NUM_THREADS);
   RANGE_CHECK(cfg, rc_end_usage, AOM_VBR, AOM_Q);
   RANGE_CHECK_HI(cfg, rc_undershoot_pct, 100);
@@ -388,7 +594,7 @@
   RANGE_CHECK_HI(cfg, rc_2pass_vbr_bias_pct, 100);
   RANGE_CHECK(cfg, kf_mode, AOM_KF_DISABLED, AOM_KF_AUTO);
   RANGE_CHECK_HI(cfg, rc_dropframe_thresh, 100);
-  RANGE_CHECK(cfg, g_pass, AOM_RC_ONE_PASS, AOM_RC_LAST_PASS);
+  RANGE_CHECK(cfg, g_pass, AOM_RC_ONE_PASS, AOM_RC_THIRD_PASS);
   if (cfg->g_pass == AOM_RC_ONE_PASS) {
     RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_TOTAL_BUFFERS);
   } else {
@@ -431,7 +637,8 @@
   RANGE_CHECK_HI(extra_cfg, ext_tile_debug, 1);
   RANGE_CHECK_HI(extra_cfg, enable_auto_alt_ref, 1);
   RANGE_CHECK_HI(extra_cfg, enable_auto_bwd_ref, 2);
-  RANGE_CHECK(extra_cfg, cpu_used, 0, 9);
+  RANGE_CHECK(extra_cfg, cpu_used, 0,
+              (cfg->g_usage == AOM_USAGE_REALTIME) ? 10 : 9);
   RANGE_CHECK_HI(extra_cfg, noise_sensitivity, 6);
   RANGE_CHECK(extra_cfg, superblock_size, AOM_SUPERBLOCK_SIZE_64X64,
               AOM_SUPERBLOCK_SIZE_DYNAMIC);
@@ -458,7 +665,7 @@
   RANGE_CHECK(cfg, g_input_bit_depth, 8, 12);
   RANGE_CHECK(extra_cfg, content, AOM_CONTENT_DEFAULT, AOM_CONTENT_INVALID - 1);
 
-  if (cfg->g_pass == AOM_RC_LAST_PASS) {
+  if (cfg->g_pass >= AOM_RC_SECOND_PASS) {
     const size_t packet_sz = sizeof(FIRSTPASS_STATS);
     const int n_packets = (int)(cfg->rc_twopass_stats_in.sz / packet_sz);
     const FIRSTPASS_STATS *stats;
@@ -479,6 +686,15 @@
       ERROR("rc_twopass_stats_in missing EOS stats packet");
   }
 
+  if (extra_cfg->passes != -1 && cfg->g_pass == AOM_RC_ONE_PASS &&
+      extra_cfg->passes != 1) {
+    ERROR("One pass encoding but passes != 1.");
+  }
+
+  if (extra_cfg->passes != -1 && (int)cfg->g_pass > extra_cfg->passes) {
+    ERROR("Current pass is larger than total number of passes.");
+  }
+
   if (cfg->g_profile <= (unsigned int)PROFILE_1 &&
       cfg->g_bit_depth > AOM_BITS_10) {
     ERROR("Codec bit-depth 12 not supported in profile < 2");
@@ -563,6 +779,7 @@
   RANGE_CHECK(extra_cfg, coeff_cost_upd_freq, 0, 3);
   RANGE_CHECK(extra_cfg, mode_cost_upd_freq, 0, 3);
   RANGE_CHECK(extra_cfg, mv_cost_upd_freq, 0, 3);
+  RANGE_CHECK(extra_cfg, dv_cost_upd_freq, 0, 3);
 
   RANGE_CHECK(extra_cfg, min_partition_size, 4, 128);
   RANGE_CHECK(extra_cfg, max_partition_size, 4, 128);
@@ -774,7 +991,7 @@
   input_cfg->input_bit_depth = cfg->g_input_bit_depth;
   // guess a frame rate if out of whack, use 30
   input_cfg->init_framerate = (double)cfg->g_timebase.den / cfg->g_timebase.num;
-  if (cfg->g_pass == AOM_RC_LAST_PASS) {
+  if (cfg->g_pass >= AOM_RC_SECOND_PASS) {
     const size_t packet_sz = sizeof(FIRSTPASS_STATS);
     const int n_packets = (int)(cfg->rc_twopass_stats_in.sz / packet_sz);
     input_cfg->limit = n_packets - 1;
@@ -819,10 +1036,17 @@
     dec_model_cfg->display_model_info_present_flag = 1;
   }
 
-  switch (cfg->g_pass) {
-    case AOM_RC_ONE_PASS: oxcf->pass = 0; break;
-    case AOM_RC_FIRST_PASS: oxcf->pass = 1; break;
-    case AOM_RC_LAST_PASS: oxcf->pass = 2; break;
+  oxcf->pass = cfg->g_pass;
+  // For backward compatibility, assume that if extra_cfg->passes==-1, then
+  // passes = 1 or 2.
+  if (extra_cfg->passes == -1) {
+    if (cfg->g_pass == AOM_RC_ONE_PASS) {
+      oxcf->passes = 1;
+    } else {
+      oxcf->passes = 2;
+    }
+  } else {
+    oxcf->passes = extra_cfg->passes;
   }
 
   // Set Rate Control configuration.
@@ -886,6 +1110,10 @@
   q_cfg->deltaq_mode = extra_cfg->deltaq_mode;
   q_cfg->use_fixed_qp_offsets =
       cfg->use_fixed_qp_offsets && (rc_cfg->mode == AOM_Q);
+  q_cfg->enable_hdr_deltaq =
+      (q_cfg->deltaq_mode == DELTA_Q_HDR) &&
+      (cfg->g_bit_depth == AOM_BITS_10) &&
+      (extra_cfg->color_primaries == AOM_CICP_CP_BT_2020);
   for (int i = 0; i < FIXED_QP_OFFSET_COUNT; ++i) {
     if (q_cfg->use_fixed_qp_offsets) {
       if (cfg->fixed_qp_offsets[i] >= 0) {  // user-provided qp offset
@@ -907,8 +1135,7 @@
   oxcf->cost_upd_freq.coeff = (COST_UPDATE_TYPE)extra_cfg->coeff_cost_upd_freq;
   oxcf->cost_upd_freq.mode = (COST_UPDATE_TYPE)extra_cfg->mode_cost_upd_freq;
   oxcf->cost_upd_freq.mv = (COST_UPDATE_TYPE)extra_cfg->mv_cost_upd_freq;
-  // TODO(chiyotsai@google.com): Add command line support in a separate cl.
-  oxcf->cost_upd_freq.dv = COST_UPD_SB;
+  oxcf->cost_upd_freq.dv = (COST_UPDATE_TYPE)extra_cfg->dv_cost_upd_freq;
 
   // Set frame resize mode configuration.
   resize_cfg->resize_mode = (RESIZE_MODE)cfg->rc_resize_mode;
@@ -934,6 +1161,9 @@
   // Set two-pass stats configuration.
   oxcf->twopass_stats_in = cfg->rc_twopass_stats_in;
 
+  if (extra_cfg->two_pass_output)
+    oxcf->two_pass_output = extra_cfg->two_pass_output;
+
   // Set Key frame configuration.
   kf_cfg->fwd_kf_enabled = cfg->fwd_kf_enabled;
   kf_cfg->auto_key =
@@ -944,6 +1174,7 @@
   kf_cfg->sframe_mode = cfg->sframe_mode;
   kf_cfg->enable_sframe = extra_cfg->s_frame_mode;
   kf_cfg->enable_keyframe_filtering = extra_cfg->enable_keyframe_filtering;
+  kf_cfg->fwd_kf_dist = extra_cfg->fwd_kf_dist;
   // Disable key frame filtering in all intra mode.
   if (cfg->kf_max_dist == 0) {
     kf_cfg->enable_keyframe_filtering = 0;
@@ -960,7 +1191,10 @@
   color_cfg->chroma_sample_position = extra_cfg->chroma_sample_position;
 
   // Set Group of frames configuration.
-  gf_cfg->lag_in_frames = clamp(cfg->g_lag_in_frames, 0, MAX_LAG_BUFFERS);
+  // Force lag_in_frames to 0 for REALTIME mode
+  gf_cfg->lag_in_frames = (oxcf->mode == REALTIME)
+                              ? 0
+                              : clamp(cfg->g_lag_in_frames, 0, MAX_LAG_BUFFERS);
   gf_cfg->enable_auto_arf = extra_cfg->enable_auto_alt_ref;
   gf_cfg->enable_auto_brf = extra_cfg->enable_auto_bwd_ref;
   gf_cfg->min_gf_interval = extra_cfg->min_gf_interval;
@@ -1036,10 +1270,15 @@
   // Set motion mode related configuration.
   oxcf->motion_mode_cfg.enable_obmc = extra_cfg->enable_obmc;
   oxcf->motion_mode_cfg.enable_warped_motion = extra_cfg->enable_warped_motion;
+#if !CONFIG_REALTIME_ONLY
   oxcf->motion_mode_cfg.allow_warped_motion =
-      (cfg->g_usage == AOM_USAGE_REALTIME)
+      (extra_cfg->allow_warped_motion & extra_cfg->enable_warped_motion);
+#else
+  oxcf->motion_mode_cfg.allow_warped_motion =
+      (cfg->g_usage == AOM_USAGE_REALTIME && oxcf->speed >= 7)
           ? false
           : (extra_cfg->allow_warped_motion & extra_cfg->enable_warped_motion);
+#endif
 
   // Set partition related configuration.
   part_cfg->enable_rect_partitions = extra_cfg->enable_rect_partitions;
@@ -1056,6 +1295,8 @@
   intra_mode_cfg->enable_smooth_intra = extra_cfg->enable_smooth_intra;
   intra_mode_cfg->enable_paeth_intra = extra_cfg->enable_paeth_intra;
   intra_mode_cfg->enable_cfl_intra = extra_cfg->enable_cfl_intra;
+  intra_mode_cfg->enable_directional_intra =
+      extra_cfg->enable_directional_intra;
   intra_mode_cfg->enable_diagonal_intra = extra_cfg->enable_diagonal_intra;
 
   // Set transform size/type configuration.
@@ -1066,6 +1307,7 @@
   txfm_cfg->use_intra_dct_only = extra_cfg->use_intra_dct_only;
   txfm_cfg->use_inter_dct_only = extra_cfg->use_inter_dct_only;
   txfm_cfg->use_intra_default_tx_only = extra_cfg->use_intra_default_tx_only;
+  txfm_cfg->enable_tx_size_search = extra_cfg->enable_tx_size_search;
 
   // Set compound type configuration.
   comp_type_cfg->enable_dist_wtd_comp =
@@ -1134,6 +1376,8 @@
          sizeof(oxcf->target_seq_level_idx));
   oxcf->tier_mask = extra_cfg->tier_mask;
 
+  oxcf->partition_info_path = extra_cfg->partition_info_path;
+
   return AOM_CODEC_OK;
 }
 
@@ -1172,10 +1416,20 @@
     ctx->cfg = *cfg;
     set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg);
     // On profile change, request a key frame
-    force_key |= ctx->ppi->cpi->common.seq_params.profile != ctx->oxcf.profile;
-    av1_change_config(ctx->ppi->cpi, &ctx->oxcf);
+    force_key |= ctx->ppi->seq_params.profile != ctx->oxcf.profile;
+    bool is_sb_size_changed = false;
+    av1_change_config_seq(ctx->ppi, &ctx->oxcf, &is_sb_size_changed);
+#if CONFIG_FRAME_PARALLEL_ENCODE
+    int i;
+    for (i = 0; i < ctx->ppi->num_fp_contexts; i++) {
+      av1_change_config(ctx->ppi->parallel_cpi[i], &ctx->oxcf,
+                        is_sb_size_changed);
+    }
+#else
+    av1_change_config(ctx->ppi->cpi, &ctx->oxcf, is_sb_size_changed);
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
     if (ctx->ppi->cpi_lap != NULL) {
-      av1_change_config(ctx->ppi->cpi_lap, &ctx->oxcf);
+      av1_change_config(ctx->ppi->cpi_lap, &ctx->oxcf, is_sb_size_changed);
     }
   }
 
@@ -1185,7 +1439,7 @@
 }
 
 static aom_fixed_buf_t *encoder_get_global_headers(aom_codec_alg_priv_t *ctx) {
-  return av1_get_global_headers(ctx->ppi->cpi);
+  return av1_get_global_headers(ctx->ppi);
 }
 
 static aom_codec_err_t ctrl_get_quantizer(aom_codec_alg_priv_t *ctx,
@@ -1208,7 +1462,7 @@
                                                      va_list args) {
   int *const arg = va_arg(args, int *);
   if (arg == NULL) return AOM_CODEC_INVALID_PARAM;
-  *arg = ctx->ppi->cpi->rc.baseline_gf_interval;
+  *arg = ctx->ppi->p_rc.baseline_gf_interval;
   return AOM_CODEC_OK;
 }
 
@@ -1218,9 +1472,19 @@
   if (res == AOM_CODEC_OK) {
     ctx->extra_cfg = *extra_cfg;
     set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg);
-    av1_change_config(ctx->ppi->cpi, &ctx->oxcf);
+    bool is_sb_size_changed = false;
+    av1_change_config_seq(ctx->ppi, &ctx->oxcf, &is_sb_size_changed);
+#if CONFIG_FRAME_PARALLEL_ENCODE
+    int i;
+    for (i = 0; i < ctx->ppi->num_fp_contexts; i++) {
+      av1_change_config(ctx->ppi->parallel_cpi[i], &ctx->oxcf,
+                        is_sb_size_changed);
+    }
+#else
+    av1_change_config(ctx->ppi->cpi, &ctx->oxcf, is_sb_size_changed);
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
     if (ctx->ppi->cpi_lap != NULL) {
-      av1_change_config(ctx->ppi->cpi_lap, &ctx->oxcf);
+      av1_change_config(ctx->ppi->cpi_lap, &ctx->oxcf, is_sb_size_changed);
     }
   }
   return res;
@@ -1292,7 +1556,13 @@
 static aom_codec_err_t ctrl_set_enable_tpl_model(aom_codec_alg_priv_t *ctx,
                                                  va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
-  extra_cfg.enable_tpl_model = CAST(AV1E_SET_ENABLE_TPL_MODEL, args);
+  const unsigned int tpl_model_arg = CAST(AV1E_SET_ENABLE_TPL_MODEL, args);
+#if CONFIG_REALTIME_ONLY
+  if (tpl_model_arg) {
+    ERROR("TPL model can't be turned on in realtime only build.");
+  }
+#endif
+  extra_cfg.enable_tpl_model = tpl_model_arg;
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
@@ -1372,7 +1642,13 @@
 static aom_codec_err_t ctrl_set_enable_restoration(aom_codec_alg_priv_t *ctx,
                                                    va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
-  extra_cfg.enable_restoration = CAST(AV1E_SET_ENABLE_RESTORATION, args);
+  const unsigned int restoration_arg = CAST(AV1E_SET_ENABLE_RESTORATION, args);
+#if CONFIG_REALTIME_ONLY
+  if (restoration_arg) {
+    ERROR("Restoration can't be turned on in realtime only build.");
+  }
+#endif
+  extra_cfg.enable_restoration = restoration_arg;
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
@@ -1386,7 +1662,13 @@
 static aom_codec_err_t ctrl_set_enable_obmc(aom_codec_alg_priv_t *ctx,
                                             va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
-  extra_cfg.enable_obmc = CAST(AV1E_SET_ENABLE_OBMC, args);
+  const unsigned int obmc_arg = CAST(AV1E_SET_ENABLE_OBMC, args);
+#if CONFIG_REALTIME_ONLY
+  if (obmc_arg) {
+    ERROR("OBMC can't be enabled in realtime only build.");
+  }
+#endif
+  extra_cfg.enable_obmc = obmc_arg;
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
@@ -1630,14 +1912,26 @@
 static aom_codec_err_t ctrl_set_enable_global_motion(aom_codec_alg_priv_t *ctx,
                                                      va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
-  extra_cfg.enable_global_motion = CAST(AV1E_SET_ENABLE_GLOBAL_MOTION, args);
+  const int global_motion_arg = CAST(AV1E_SET_ENABLE_GLOBAL_MOTION, args);
+#if CONFIG_REALTIME_ONLY
+  if (global_motion_arg) {
+    ERROR("Global motion can't be enabled in realtime only build.");
+  }
+#endif
+  extra_cfg.enable_global_motion = global_motion_arg;
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
 static aom_codec_err_t ctrl_set_enable_warped_motion(aom_codec_alg_priv_t *ctx,
                                                      va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
-  extra_cfg.enable_warped_motion = CAST(AV1E_SET_ENABLE_WARPED_MOTION, args);
+  const int warped_motion_arg = CAST(AV1E_SET_ENABLE_WARPED_MOTION, args);
+#if CONFIG_REALTIME_ONLY
+  if (warped_motion_arg) {
+    ERROR("Warped motion can't be enabled in realtime only build.");
+  }
+#endif
+  extra_cfg.enable_warped_motion = warped_motion_arg;
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
@@ -1662,6 +1956,14 @@
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
+static aom_codec_err_t ctrl_set_enable_directional_intra(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.enable_directional_intra =
+      CAST(AV1E_SET_ENABLE_DIRECTIONAL_INTRA, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
 static aom_codec_err_t ctrl_set_enable_diagonal_intra(aom_codec_alg_priv_t *ctx,
                                                       va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
@@ -1783,6 +2085,13 @@
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
+static aom_codec_err_t ctrl_set_enable_tx_size_search(aom_codec_alg_priv_t *ctx,
+                                                      va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.enable_tx_size_search = CAST(AV1E_SET_ENABLE_TX_SIZE_SEARCH, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
 static aom_codec_err_t ctrl_set_quant_b_adapt(aom_codec_alg_priv_t *ctx,
                                               va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
@@ -1818,10 +2127,32 @@
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
+static aom_codec_err_t ctrl_set_dv_cost_upd_freq(aom_codec_alg_priv_t *ctx,
+                                                 va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.dv_cost_upd_freq = CAST(AV1E_SET_DV_COST_UPD_FREQ, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
 static aom_codec_err_t ctrl_set_vmaf_model_path(aom_codec_alg_priv_t *ctx,
                                                 va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
-  extra_cfg.vmaf_model_path = CAST(AV1E_SET_VMAF_MODEL_PATH, args);
+  const char *str = CAST(AV1E_SET_VMAF_MODEL_PATH, args);
+  const aom_codec_err_t ret = allocate_and_set_string(
+      str, default_extra_cfg.vmaf_model_path, &extra_cfg.vmaf_model_path,
+      ctx->ppi->error.detail);
+  if (ret != AOM_CODEC_OK) return ret;
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_partition_info_path(aom_codec_alg_priv_t *ctx,
+                                                    va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  const char *str = CAST(AV1E_SET_PARTITION_INFO_PATH, args);
+  const aom_codec_err_t ret = allocate_and_set_string(
+      str, default_extra_cfg.partition_info_path,
+      &extra_cfg.partition_info_path, ctx->ppi->error.detail);
+  if (ret != AOM_CODEC_OK) return ret;
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
@@ -1836,7 +2167,16 @@
 static aom_codec_err_t ctrl_set_film_grain_table(aom_codec_alg_priv_t *ctx,
                                                  va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
-  extra_cfg.film_grain_table_filename = CAST(AV1E_SET_FILM_GRAIN_TABLE, args);
+  const char *str = CAST(AV1E_SET_FILM_GRAIN_TABLE, args);
+  if (str == NULL) {
+    // this parameter allows NULL as its value
+    extra_cfg.film_grain_table_filename = str;
+  } else {
+    const aom_codec_err_t ret = allocate_and_set_string(
+        str, default_extra_cfg.film_grain_table_filename,
+        &extra_cfg.film_grain_table_filename, ctx->ppi->error.detail);
+    if (ret != AOM_CODEC_OK) return ret;
+  }
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
@@ -1883,7 +2223,13 @@
 static aom_codec_err_t ctrl_set_deltaq_mode(aom_codec_alg_priv_t *ctx,
                                             va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
-  extra_cfg.deltaq_mode = CAST(AV1E_SET_DELTAQ_MODE, args);
+  const DELTAQ_MODE deltaq_arg = CAST(AV1E_SET_DELTAQ_MODE, args);
+#if CONFIG_REALTIME_ONLY
+  if (deltaq_arg > NO_DELTA_Q) {
+    ERROR("Delta Q mode can't be enabled in realtime only build.");
+  }
+#endif
+  extra_cfg.deltaq_mode = deltaq_arg;
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
@@ -1979,6 +2325,18 @@
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
+static aom_codec_err_t ctrl_set_external_partition(aom_codec_alg_priv_t *ctx,
+                                                   va_list args) {
+  AV1_COMP *const cpi = ctx->ppi->cpi;
+  aom_ext_part_funcs_t funcs = *CAST(AV1E_SET_EXTERNAL_PARTITION, args);
+  aom_ext_part_config_t config;
+  // TODO(chengchen): verify the sb_size has been set at this point.
+  config.superblock_size = cpi->common.seq_params->sb_size;
+  const aom_codec_err_t status =
+      av1_ext_part_create(funcs, config, &cpi->ext_part_controller);
+  return status;
+}
+
 #if !CONFIG_REALTIME_ONLY
 static aom_codec_err_t create_stats_buffer(FIRSTPASS_STATS **frame_stats_buffer,
                                            STATS_BUFFER_CTX *stats_buf_context,
@@ -2007,22 +2365,21 @@
 
 static aom_codec_err_t create_context_and_bufferpool(
     AV1_PRIMARY *ppi, AV1_COMP **p_cpi, BufferPool **p_buffer_pool,
-    AV1EncoderConfig *oxcf, FIRSTPASS_STATS *frame_stats_buf,
-    COMPRESSOR_STAGE stage, int num_lap_buffers, int lap_lag_in_frames,
-    STATS_BUFFER_CTX *stats_buf_context) {
+    AV1EncoderConfig *oxcf, COMPRESSOR_STAGE stage, int lap_lag_in_frames) {
   aom_codec_err_t res = AOM_CODEC_OK;
 
-  *p_buffer_pool = (BufferPool *)aom_calloc(1, sizeof(BufferPool));
-  if (*p_buffer_pool == NULL) return AOM_CODEC_MEM_ERROR;
+  if (*p_buffer_pool == NULL) {
+    *p_buffer_pool = (BufferPool *)aom_calloc(1, sizeof(BufferPool));
+    if (*p_buffer_pool == NULL) return AOM_CODEC_MEM_ERROR;
 
 #if CONFIG_MULTITHREAD
-  if (pthread_mutex_init(&((*p_buffer_pool)->pool_mutex), NULL)) {
-    return AOM_CODEC_MEM_ERROR;
-  }
+    if (pthread_mutex_init(&((*p_buffer_pool)->pool_mutex), NULL)) {
+      return AOM_CODEC_MEM_ERROR;
+    }
 #endif
-  *p_cpi = av1_create_compressor(ppi, oxcf, *p_buffer_pool, frame_stats_buf,
-                                 stage, num_lap_buffers, lap_lag_in_frames,
-                                 stats_buf_context);
+  }
+  *p_cpi = av1_create_compressor(ppi, oxcf, *p_buffer_pool, stage,
+                                 lap_lag_in_frames);
   if (*p_cpi == NULL) res = AOM_CODEC_MEM_ERROR;
 
   return res;
@@ -2044,6 +2401,14 @@
     ctx->config.enc = &priv->cfg;
 
     priv->extra_cfg = default_extra_cfg;
+    // Special handling:
+    // By default, if omitted, --enable-cdef = 1.
+    // Here we set its default value to 0 when --allintra is turned on.
+    // However, if users set --enable-cdef = 1 from command line,
+    // The encoder still respects it.
+    if (priv->cfg.g_usage == ALLINTRA) {
+      priv->extra_cfg.enable_cdef = 0;
+    }
     aom_once(av1_initialize_enc);
 
     res = validate_config(priv, &priv->cfg, &priv->extra_cfg);
@@ -2058,8 +2423,8 @@
       reduce_ratio(&priv->timestamp_ratio);
 
       set_encoder_config(&priv->oxcf, &priv->cfg, &priv->extra_cfg);
-      if (priv->oxcf.rc_cfg.mode != AOM_CBR && priv->oxcf.pass == 0 &&
-          priv->oxcf.mode == GOOD) {
+      if (priv->oxcf.rc_cfg.mode != AOM_CBR &&
+          priv->oxcf.pass == AOM_RC_ONE_PASS && priv->oxcf.mode == GOOD) {
         // Enable look ahead - enabled for AOM_Q, AOM_CQ, AOM_VBR
         *num_lap_buffers =
             AOMMIN((int)priv->cfg.g_lag_in_frames,
@@ -2073,27 +2438,60 @@
       priv->oxcf.use_highbitdepth =
           (ctx->init_flags & AOM_CODEC_USE_HIGHBITDEPTH) ? 1 : 0;
 
-      priv->ppi = av1_create_primary_compressor(&priv->pkt_list.head);
+      priv->ppi = av1_create_primary_compressor(&priv->pkt_list.head,
+                                                *num_lap_buffers, &priv->oxcf);
       if (!priv->ppi) return AOM_CODEC_MEM_ERROR;
 
 #if !CONFIG_REALTIME_ONLY
       res = create_stats_buffer(&priv->frame_stats_buffer,
                                 &priv->stats_buf_context, *num_lap_buffers);
       if (res != AOM_CODEC_OK) return AOM_CODEC_MEM_ERROR;
+
+      assert(MAX_LAP_BUFFERS >= MAX_LAG_BUFFERS);
+      int size = get_stats_buf_size(*num_lap_buffers, MAX_LAG_BUFFERS);
+      for (int i = 0; i < size; i++)
+        priv->ppi->twopass.frame_stats_arr[i] = &priv->frame_stats_buffer[i];
+
+      priv->ppi->twopass.stats_buf_ctx = &priv->stats_buf_context;
 #endif
 
-      res = create_context_and_bufferpool(
-          priv->ppi, &priv->ppi->cpi, &priv->buffer_pool, &priv->oxcf,
-          priv->frame_stats_buffer, ENCODE_STAGE, *num_lap_buffers, -1,
-          &priv->stats_buf_context);
+#if CONFIG_FRAME_PARALLEL_ENCODE
+      assert(priv->ppi->num_fp_contexts >= 1);
+      int i;
+      for (i = 0; i < priv->ppi->num_fp_contexts; i++) {
+        res = create_context_and_bufferpool(
+            priv->ppi, &priv->ppi->parallel_cpi[i], &priv->buffer_pool,
+            &priv->oxcf, ENCODE_STAGE, -1);
+        if (res != AOM_CODEC_OK) {
+          return res;
+        }
+        if (i == 0) {
+          // Calculate the maximum number of frames that can be encoded in
+          // parallel
+          priv->ppi->num_fp_contexts = av1_compute_num_fp_contexts(
+              priv->ppi, &priv->ppi->parallel_cpi[i]->oxcf);
+        }
+#if !CONFIG_REALTIME_ONLY
+        priv->ppi->parallel_cpi[i]->twopass_frame.stats_in =
+            priv->ppi->twopass.stats_buf_ctx->stats_in_start;
+#endif
+      }
+      priv->ppi->cpi = priv->ppi->parallel_cpi[0];
+#else
+      res = create_context_and_bufferpool(priv->ppi, &priv->ppi->cpi,
+                                          &priv->buffer_pool, &priv->oxcf,
+                                          ENCODE_STAGE, -1);
+#if !CONFIG_REALTIME_ONLY
+      priv->ppi->cpi->twopass_frame.stats_in =
+          priv->ppi->twopass.stats_buf_ctx->stats_in_start;
+#endif
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
 
       // Create another compressor if look ahead is enabled
       if (res == AOM_CODEC_OK && *num_lap_buffers) {
         res = create_context_and_bufferpool(
             priv->ppi, &priv->ppi->cpi_lap, &priv->buffer_pool_lap, &priv->oxcf,
-            priv->frame_stats_buffer, LAP_STAGE, *num_lap_buffers,
-            clamp(lap_lag_in_frames, 0, MAX_LAG_BUFFERS),
-            &priv->stats_buf_context);
+            LAP_STAGE, clamp(lap_lag_in_frames, 0, MAX_LAG_BUFFERS));
       }
     }
   }
@@ -2102,12 +2500,16 @@
 }
 
 static void destroy_context_and_bufferpool(AV1_COMP *cpi,
-                                           BufferPool *buffer_pool) {
+                                           BufferPool **p_buffer_pool) {
   av1_remove_compressor(cpi);
+  if (*p_buffer_pool) {
+    av1_free_ref_frame_buffers(*p_buffer_pool);
 #if CONFIG_MULTITHREAD
-  if (buffer_pool) pthread_mutex_destroy(&buffer_pool->pool_mutex);
+    pthread_mutex_destroy(&(*p_buffer_pool)->pool_mutex);
 #endif
-  aom_free(buffer_pool);
+    aom_free(*p_buffer_pool);
+    *p_buffer_pool = NULL;
+  }
 }
 
 static void destroy_stats_buffer(STATS_BUFFER_CTX *stats_buf_context,
@@ -2117,19 +2519,62 @@
   aom_free(frame_stats_buffer);
 }
 
+static void check_and_free_string(const char *default_str, const char **ptr) {
+  if (*ptr == default_str) {
+    // Default should be a literal. Do not free.
+    return;
+  }
+  aom_free((void *)*ptr);
+  *ptr = NULL;
+}
+
+static void destroy_extra_config(struct av1_extracfg *extra_cfg) {
+#if CONFIG_TUNE_VMAF
+  check_and_free_string(default_extra_cfg.vmaf_model_path,
+                        &extra_cfg->vmaf_model_path);
+#endif
+  check_and_free_string(default_extra_cfg.two_pass_output,
+                        &extra_cfg->two_pass_output);
+  check_and_free_string(default_extra_cfg.partition_info_path,
+                        &extra_cfg->partition_info_path);
+  check_and_free_string(default_extra_cfg.film_grain_table_filename,
+                        &extra_cfg->film_grain_table_filename);
+}
+
 static aom_codec_err_t encoder_destroy(aom_codec_alg_priv_t *ctx) {
   free(ctx->cx_data);
+  destroy_extra_config(&ctx->extra_cfg);
 
   if (ctx->ppi) {
     AV1_PRIMARY *ppi = ctx->ppi;
-    destroy_context_and_bufferpool(ppi->cpi, ctx->buffer_pool);
+#if CONFIG_FRAME_PARALLEL_ENCODE
+    for (int i = 0; i < ppi->num_fp_contexts - 1; i++) {
+      if (ppi->parallel_frames_data[i].cx_data) {
+        free(ppi->parallel_frames_data[i].cx_data);
+      }
+    }
+#endif
+#if CONFIG_ENTROPY_STATS
+    print_entropy_stats(ppi);
+#endif
+#if CONFIG_INTERNAL_STATS
+    print_internal_stats(ppi);
+#endif
+#if CONFIG_FRAME_PARALLEL_ENCODE
+    int i;
+    for (i = 0; i < ppi->num_fp_contexts; i++) {
+      destroy_context_and_bufferpool(ppi->parallel_cpi[i], &ctx->buffer_pool);
+    }
+    ppi->cpi = NULL;
+#else
+    destroy_context_and_bufferpool(ppi->cpi, &ctx->buffer_pool);
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
     if (ppi->cpi_lap) {
-      destroy_context_and_bufferpool(ppi->cpi_lap, ctx->buffer_pool_lap);
+      destroy_context_and_bufferpool(ppi->cpi_lap, &ctx->buffer_pool_lap);
     }
     av1_remove_primary_compressor(ppi);
   }
   destroy_stats_buffer(&ctx->stats_buf_context, ctx->frame_stats_buffer);
-
   aom_free(ctx);
   return AOM_CODEC_OK;
 }
@@ -2140,7 +2585,7 @@
   aom_codec_frame_flags_t flags = lib_flags << 16;
 
   if (lib_flags & FRAMEFLAGS_KEY ||
-      (cpi->use_svc &&
+      (cpi->ppi->use_svc &&
        svc->layer_context[svc->spatial_layer_id * svc->number_temporal_layers +
                           svc->temporal_layer_id]
            .is_key_frame))
@@ -2164,14 +2609,17 @@
   const size_t kMinCompressedSize = 8192;
   volatile aom_codec_err_t res = AOM_CODEC_OK;
   AV1_PRIMARY *const ppi = ctx->ppi;
-  AV1_COMP *const cpi = ppi->cpi;
-  const aom_rational64_t *const timestamp_ratio = &ctx->timestamp_ratio;
   volatile aom_codec_pts_t ptsvol = pts;
+  AV1_COMP_DATA cpi_data = { 0 };
+
+  cpi_data.timestamp_ratio = &ctx->timestamp_ratio;
+  cpi_data.flush = !img;
   // LAP context
   AV1_COMP *cpi_lap = ppi->cpi_lap;
-  if (cpi == NULL) return AOM_CODEC_INVALID_PARAM;
+  if (ppi->cpi == NULL) return AOM_CODEC_INVALID_PARAM;
 
-  if (cpi->lap_enabled && cpi_lap == NULL && cpi->oxcf.pass == 0)
+  if (ppi->lap_enabled && cpi_lap == NULL &&
+      ppi->cpi->oxcf.pass == AOM_RC_ONE_PASS)
     return AOM_CODEC_INVALID_PARAM;
 
   if (img != NULL) {
@@ -2191,8 +2639,8 @@
       // frame size. Hence the size of the buffer is chosen as 2 times the
       // uncompressed frame size.
       int multiplier = 8;
-      if (cpi->oxcf.kf_cfg.key_freq_max == 0 &&
-          !cpi->oxcf.kf_cfg.fwd_kf_enabled)
+      if (ppi->cpi->oxcf.kf_cfg.key_freq_max == 0 &&
+          !ppi->cpi->oxcf.kf_cfg.fwd_kf_enabled)
         multiplier = 2;
       size_t data_sz = uncompressed_frame_sz * multiplier;
       if (data_sz < kMinCompressedSize) data_sz = kMinCompressedSize;
@@ -2205,6 +2653,21 @@
           return AOM_CODEC_MEM_ERROR;
         }
       }
+#if CONFIG_FRAME_PARALLEL_ENCODE
+      for (int i = 0; i < ppi->num_fp_contexts - 1; i++) {
+        if (ppi->parallel_frames_data[i].cx_data == NULL) {
+          ppi->parallel_frames_data[i].cx_data_sz = uncompressed_frame_sz;
+          ppi->parallel_frames_data[i].frame_display_order_hint = -1;
+          ppi->parallel_frames_data[i].frame_size = 0;
+          ppi->parallel_frames_data[i].cx_data =
+              (unsigned char *)malloc(ppi->parallel_frames_data[i].cx_data_sz);
+          if (ppi->parallel_frames_data[i].cx_data == NULL) {
+            ppi->parallel_frames_data[i].cx_data_sz = 0;
+            return AOM_CODEC_MEM_ERROR;
+          }
+        }
+      }
+#endif
     }
   }
 
@@ -2215,30 +2678,20 @@
   // The jmp_buf is valid only for the duration of the function that calls
   // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
   // before it returns.
-  if (setjmp(cpi->common.error.jmp)) {
-    cpi->common.error.setjmp = 0;
-    res = update_error_state(ctx, &cpi->common.error);
-    aom_clear_system_state();
+  if (setjmp(ppi->error.jmp)) {
+    ppi->error.setjmp = 0;
+    res = update_error_state(ctx, &ppi->error);
     return res;
   }
-  cpi->common.error.setjmp = 1;
-  if (cpi_lap != NULL) {
-    if (setjmp(cpi_lap->common.error.jmp)) {
-      cpi_lap->common.error.setjmp = 0;
-      res = update_error_state(ctx, &cpi_lap->common.error);
-      aom_clear_system_state();
-      return res;
-    }
-    cpi_lap->common.error.setjmp = 1;
-  }
+  ppi->error.setjmp = 1;
 
-  if (cpi->use_svc && cpi->svc.use_flexible_mode == 0 && flags == 0)
-    av1_set_svc_fixed_mode(cpi);
+  if (ppi->use_svc && ppi->cpi->svc.use_flexible_mode == 0 && flags == 0)
+    av1_set_svc_fixed_mode(ppi->cpi);
 
   // Note(yunqing): While applying encoding flags, always start from enabling
   // all, and then modifying according to the flags. Previous frame's flags are
   // overwritten.
-  av1_apply_encoding_flags(cpi, flags);
+  av1_apply_encoding_flags(ppi->cpi, flags);
   if (cpi_lap != NULL) {
     av1_apply_encoding_flags(cpi_lap, flags);
   }
@@ -2246,16 +2699,16 @@
 #if CONFIG_TUNE_VMAF
   if (ctx->extra_cfg.tuning >= AOM_TUNE_VMAF_WITH_PREPROCESSING &&
       ctx->extra_cfg.tuning <= AOM_TUNE_VMAF_NEG_MAX_GAIN) {
-    aom_init_vmaf_model(&cpi->vmaf_info.vmaf_model,
-                        cpi->oxcf.tune_cfg.vmaf_model_path);
+    aom_init_vmaf_model(&ppi->cpi->vmaf_info.vmaf_model,
+                        ppi->cpi->oxcf.tune_cfg.vmaf_model_path);
   }
 #endif
 
   // Handle fixed keyframe intervals
-  if (is_stat_generation_stage(cpi)) {
+  if (is_stat_generation_stage(ppi->cpi)) {
     if (ctx->cfg.kf_mode == AOM_KF_AUTO &&
         ctx->cfg.kf_min_dist == ctx->cfg.kf_max_dist) {
-      if (cpi->common.spatial_layer_id == 0 &&
+      if (ppi->cpi->common.spatial_layer_id == 0 &&
           ++ctx->fixed_kf_cntr > ctx->cfg.kf_min_dist) {
         flags |= AOM_EFLAG_FORCE_KF;
         ctx->fixed_kf_cntr = 1;
@@ -2264,8 +2717,14 @@
   }
 
   if (res == AOM_CODEC_OK) {
+#if CONFIG_FRAME_PARALLEL_ENCODE
+    AV1_COMP *cpi = ppi->cpi;
+#else
+    AV1_COMP *const cpi = ppi->cpi;
+#endif
+
     // Set up internal flags
-    if (ctx->base.init_flags & AOM_CODEC_USE_PSNR) cpi->b_calculate_psnr = 1;
+    if (ctx->base.init_flags & AOM_CODEC_USE_PSNR) ppi->b_calculate_psnr = 1;
 
     if (img != NULL) {
       if (!ctx->pts_offset_initialized) {
@@ -2273,9 +2732,10 @@
         ctx->pts_offset_initialized = 1;
       }
       ptsvol -= ctx->pts_offset;
-      int64_t src_time_stamp = timebase_units_to_ticks(timestamp_ratio, ptsvol);
+      int64_t src_time_stamp =
+          timebase_units_to_ticks(cpi_data.timestamp_ratio, ptsvol);
       int64_t src_end_time_stamp =
-          timebase_units_to_ticks(timestamp_ratio, ptsvol + duration);
+          timebase_units_to_ticks(cpi_data.timestamp_ratio, ptsvol + duration);
 
       YV12_BUFFER_CONFIG sd;
       res = image2yuvconfig(img, &sd);
@@ -2301,11 +2761,18 @@
             cpi->oxcf.tool_cfg.enable_global_motion);
       }
       if (!ppi->lookahead)
-        aom_internal_error(&cpi->common.error, AOM_CODEC_MEM_ERROR,
+        aom_internal_error(&ppi->error, AOM_CODEC_MEM_ERROR,
                            "Failed to allocate lag buffers");
-
+#if CONFIG_FRAME_PARALLEL_ENCODE
+      int i;
+      for (i = 0; i < ppi->num_fp_contexts; i++) {
+        av1_check_initial_width(ppi->parallel_cpi[i], use_highbitdepth,
+                                subsampling_x, subsampling_y);
+      }
+#else
       av1_check_initial_width(cpi, use_highbitdepth, subsampling_x,
                               subsampling_y);
+#endif
       if (cpi_lap != NULL) {
         av1_check_initial_width(cpi_lap, use_highbitdepth, subsampling_x,
                                 subsampling_y);
@@ -2315,87 +2782,135 @@
       // key frame flag when we actually encode this frame.
       if (av1_receive_raw_frame(cpi, flags | ctx->next_frame_flags, &sd,
                                 src_time_stamp, src_end_time_stamp)) {
-        res = update_error_state(ctx, &cpi->common.error);
+        res = update_error_state(ctx, &ppi->error);
       }
       ctx->next_frame_flags = 0;
     }
 
-    unsigned char *cx_data = ctx->cx_data;
-    size_t cx_data_sz = ctx->cx_data_sz;
+    cpi_data.cx_data = ctx->cx_data;
+    cpi_data.cx_data_sz = ctx->cx_data_sz;
 
     /* Any pending invisible frames? */
     if (ctx->pending_cx_data_sz) {
-      cx_data += ctx->pending_cx_data_sz;
-      cx_data_sz -= ctx->pending_cx_data_sz;
+      cpi_data.cx_data += ctx->pending_cx_data_sz;
+      cpi_data.cx_data_sz -= ctx->pending_cx_data_sz;
 
       /* TODO: this is a minimal check, the underlying codec doesn't respect
        * the buffer size anyway.
        */
-      if (cx_data_sz < ctx->cx_data_sz / 2) {
-        aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR,
+      if (cpi_data.cx_data_sz < ctx->cx_data_sz / 2) {
+        aom_internal_error(&ppi->error, AOM_CODEC_ERROR,
                            "Compressed data buffer too small");
       }
     }
 
-    size_t frame_size = 0;
-    unsigned int lib_flags = 0;
     int is_frame_visible = 0;
     int has_no_show_keyframe = 0;
     int num_workers = 0;
 
-    if (cpi->oxcf.pass == 1) {
+    if (cpi->oxcf.pass == AOM_RC_FIRST_PASS) {
 #if !CONFIG_REALTIME_ONLY
-      num_workers = av1_fp_compute_num_enc_workers(cpi);
+      num_workers = ppi->p_mt_info.num_mod_workers[MOD_FP] =
+          av1_fp_compute_num_enc_workers(cpi);
 #endif
     } else {
       av1_compute_num_workers_for_mt(cpi);
       num_workers = av1_get_max_num_workers(cpi);
     }
-    if ((num_workers > 1) && (cpi->mt_info.num_workers == 0)) {
-      av1_create_workers(cpi, num_workers);
-      if (cpi->oxcf.pass != 1) {
-        av1_create_second_pass_workers(cpi, num_workers);
+    if ((num_workers > 1) && (ppi->p_mt_info.num_workers == 0)) {
+      av1_create_workers(ppi, num_workers);
+      av1_init_tile_thread_data(ppi, cpi->oxcf.pass == AOM_RC_FIRST_PASS);
+#if CONFIG_MULTITHREAD
+#if CONFIG_FRAME_PARALLEL_ENCODE
+      for (int i = 0; i < ppi->num_fp_contexts; i++) {
+        av1_init_mt_sync(ppi->parallel_cpi[i],
+                         ppi->parallel_cpi[i]->oxcf.pass == AOM_RC_FIRST_PASS);
       }
+#else
+      av1_init_mt_sync(cpi, cpi->oxcf.pass == 1);
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+      if (cpi_lap != NULL) {
+        av1_init_mt_sync(cpi_lap, 1);
+      }
+#endif  // CONFIG_MULTITHREAD
+    }
+#if CONFIG_FRAME_PARALLEL_ENCODE
+    for (int i = 0; i < ppi->num_fp_contexts; i++) {
+      av1_init_frame_mt(ppi, ppi->parallel_cpi[i]);
+    }
+#else
+    av1_init_frame_mt(ppi, cpi);
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+    if (cpi_lap != NULL) {
+      av1_init_frame_mt(ppi, cpi_lap);
     }
 
     // Call for LAP stage
     if (cpi_lap != NULL) {
-      int64_t dst_time_stamp_la;
-      int64_t dst_end_time_stamp_la;
-      if (cpi_lap->mt_info.workers == NULL) {
-        cpi_lap->mt_info.workers = cpi->mt_info.workers;
-        cpi_lap->mt_info.tile_thr_data = cpi->mt_info.tile_thr_data;
-      }
-      cpi_lap->mt_info.num_workers = cpi->mt_info.num_workers;
-      const int status = av1_get_compressed_data(
-          cpi_lap, &lib_flags, &frame_size, NULL, &dst_time_stamp_la,
-          &dst_end_time_stamp_la, !img, timestamp_ratio);
+      AV1_COMP_DATA cpi_lap_data = { 0 };
+      cpi_lap_data.flush = !img;
+      cpi_lap_data.timestamp_ratio = &ctx->timestamp_ratio;
+      const int status = av1_get_compressed_data(cpi_lap, &cpi_lap_data);
       if (status != -1) {
         if (status != AOM_CODEC_OK) {
-          aom_internal_error(&cpi_lap->common.error, AOM_CODEC_ERROR, NULL);
+          aom_internal_error(&ppi->error, AOM_CODEC_ERROR, NULL);
         }
-        cpi_lap->ppi->seq_params_locked = 1;
       }
-      lib_flags = 0;
-      frame_size = 0;
+      av1_post_encode_updates(cpi_lap, &cpi_lap_data);
     }
 
     // Get the next visible frame. Invisible frames get packed with the next
     // visible frame.
-    int64_t dst_time_stamp;
-    int64_t dst_end_time_stamp;
-    while (cx_data_sz >= ctx->cx_data_sz / 2 && !is_frame_visible) {
-      const int status = av1_get_compressed_data(
-          cpi, &lib_flags, &frame_size, cx_data, &dst_time_stamp,
-          &dst_end_time_stamp, !img, timestamp_ratio);
+    while (cpi_data.cx_data_sz >= ctx->cx_data_sz / 2 && !is_frame_visible) {
+#if CONFIG_FRAME_PARALLEL_ENCODE
+      int status = -1;
+      cpi->do_frame_data_update = true;
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+      cpi->ref_idx_to_skip = INVALID_IDX;
+      cpi->ref_refresh_index = INVALID_IDX;
+      cpi->refresh_idx_available = false;
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+      if (ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 0) {
+        status = av1_get_compressed_data(cpi, &cpi_data);
+      } else if (ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 1) {
+        status = av1_compress_parallel_frames(ppi, &cpi_data);
+      } else {
+        cpi = av1_get_parallel_frame_enc_data(ppi, &cpi_data);
+        status = AOM_CODEC_OK;
+      }
+#else
+      const int status = av1_get_compressed_data(cpi, &cpi_data);
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
       if (status == -1) break;
       if (status != AOM_CODEC_OK) {
-        aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, NULL);
+        aom_internal_error(&ppi->error, AOM_CODEC_ERROR, NULL);
       }
+#if CONFIG_FRAME_PARALLEL_ENCODE
+      if (ppi->num_fp_contexts > 0 && frame_is_intra_only(&cpi->common)) {
+        av1_init_sc_decisions(ppi);
+      }
+#endif
 
-      cpi->ppi->seq_params_locked = 1;
-      if (!frame_size) continue;
-      assert(cx_data != NULL && cx_data_sz != 0);
+      ppi->seq_params_locked = 1;
+      av1_post_encode_updates(cpi, &cpi_data);
+
+#if CONFIG_ENTROPY_STATS
+      if (ppi->cpi->oxcf.pass != 1 && !cpi->common.show_existing_frame)
+        av1_accumulate_frame_counts(&ppi->aggregate_fc, &cpi->counts);
+#endif
+#if CONFIG_INTERNAL_STATS
+      if (ppi->cpi->oxcf.pass != 1) {
+        ppi->total_time_compress_data += cpi->time_compress_data;
+        ppi->total_recode_hits += cpi->frame_recode_hits;
+        ppi->total_bytes += cpi->bytes;
+        for (int i = 0; i < MAX_MODES; i++) {
+          ppi->total_mode_chosen_counts[i] += cpi->mode_chosen_counts[i];
+        }
+      }
+#endif  // CONFIG_INTERNAL_STATS
+
+      if (!cpi_data.frame_size) continue;
+      assert(cpi_data.cx_data != NULL && cpi_data.cx_data_sz != 0);
       const int write_temporal_delimiter =
           !cpi->common.spatial_layer_id && !ctx->pending_cx_data_sz;
 
@@ -2406,41 +2921,45 @@
             aom_uleb_size_in_bytes(obu_payload_size);
 
         const size_t move_offset = obu_header_size + length_field_size;
-        memmove(ctx->cx_data + move_offset, ctx->cx_data, frame_size);
-        obu_header_size = av1_write_obu_header(
-            &cpi->level_params, OBU_TEMPORAL_DELIMITER, 0, ctx->cx_data);
+        memmove(ctx->cx_data + move_offset, ctx->cx_data, cpi_data.frame_size);
+        obu_header_size =
+            av1_write_obu_header(&ppi->level_params, &cpi->frame_header_count,
+                                 OBU_TEMPORAL_DELIMITER, 0, ctx->cx_data);
 
         // OBUs are preceded/succeeded by an unsigned leb128 coded integer.
         if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size,
                                     ctx->cx_data) != AOM_CODEC_OK) {
-          aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, NULL);
+          aom_internal_error(&ppi->error, AOM_CODEC_ERROR, NULL);
         }
 
-        frame_size += obu_header_size + obu_payload_size + length_field_size;
+        cpi_data.frame_size +=
+            obu_header_size + obu_payload_size + length_field_size;
       }
 
       if (ctx->oxcf.save_as_annexb) {
-        size_t curr_frame_size = frame_size;
-        if (av1_convert_sect5obus_to_annexb(cx_data, &curr_frame_size) !=
-            AOM_CODEC_OK) {
-          aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, NULL);
+        size_t curr_frame_size = cpi_data.frame_size;
+        if (av1_convert_sect5obus_to_annexb(cpi_data.cx_data,
+                                            &curr_frame_size) != AOM_CODEC_OK) {
+          aom_internal_error(&ppi->error, AOM_CODEC_ERROR, NULL);
         }
-        frame_size = curr_frame_size;
+        cpi_data.frame_size = curr_frame_size;
 
         // B_PRIME (add frame size)
-        const size_t length_field_size = aom_uleb_size_in_bytes(frame_size);
-        memmove(cx_data + length_field_size, cx_data, frame_size);
-        if (av1_write_uleb_obu_size(0, (uint32_t)frame_size, cx_data) !=
-            AOM_CODEC_OK) {
-          aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, NULL);
+        const size_t length_field_size =
+            aom_uleb_size_in_bytes(cpi_data.frame_size);
+        memmove(cpi_data.cx_data + length_field_size, cpi_data.cx_data,
+                cpi_data.frame_size);
+        if (av1_write_uleb_obu_size(0, (uint32_t)cpi_data.frame_size,
+                                    cpi_data.cx_data) != AOM_CODEC_OK) {
+          aom_internal_error(&ppi->error, AOM_CODEC_ERROR, NULL);
         }
-        frame_size += length_field_size;
+        cpi_data.frame_size += length_field_size;
       }
 
-      ctx->pending_cx_data_sz += frame_size;
+      ctx->pending_cx_data_sz += cpi_data.frame_size;
 
-      cx_data += frame_size;
-      cx_data_sz -= frame_size;
+      cpi_data.cx_data += cpi_data.frame_size;
+      cpi_data.cx_data_sz -= cpi_data.frame_size;
 
       is_frame_visible = cpi->common.show_frame;
 
@@ -2453,7 +2972,7 @@
       aom_codec_cx_pkt_t pkt;
 
       // decrement frames_left counter
-      cpi->frames_left = AOMMAX(0, cpi->frames_left - 1);
+      ppi->frames_left = AOMMAX(0, ppi->frames_left - 1);
       if (ctx->oxcf.save_as_annexb) {
         //  B_PRIME (add TU size)
         size_t tu_size = ctx->pending_cx_data_sz;
@@ -2461,7 +2980,7 @@
         memmove(ctx->cx_data + length_field_size, ctx->cx_data, tu_size);
         if (av1_write_uleb_obu_size(0, (uint32_t)tu_size, ctx->cx_data) !=
             AOM_CODEC_OK) {
-          aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, NULL);
+          aom_internal_error(&ppi->error, AOM_CODEC_ERROR, NULL);
         }
         ctx->pending_cx_data_sz += length_field_size;
       }
@@ -2471,19 +2990,20 @@
       pkt.data.frame.buf = ctx->cx_data;
       pkt.data.frame.sz = ctx->pending_cx_data_sz;
       pkt.data.frame.partition_id = -1;
-      pkt.data.frame.vis_frame_size = frame_size;
+      pkt.data.frame.vis_frame_size = cpi_data.frame_size;
 
-      pkt.data.frame.pts =
-          ticks_to_timebase_units(timestamp_ratio, dst_time_stamp) +
-          ctx->pts_offset;
-      pkt.data.frame.flags = get_frame_pkt_flags(cpi, lib_flags);
+      pkt.data.frame.pts = ticks_to_timebase_units(cpi_data.timestamp_ratio,
+                                                   cpi_data.ts_frame_start) +
+                           ctx->pts_offset;
+      pkt.data.frame.flags = get_frame_pkt_flags(cpi, cpi_data.lib_flags);
       if (has_no_show_keyframe) {
         // If one of the invisible frames in the packet is a keyframe, set
         // the delayed random access point flag.
         pkt.data.frame.flags |= AOM_FRAME_IS_DELAYED_RANDOM_ACCESS_POINT;
       }
       pkt.data.frame.duration = (uint32_t)ticks_to_timebase_units(
-          timestamp_ratio, dst_end_time_stamp - dst_time_stamp);
+          cpi_data.timestamp_ratio,
+          cpi_data.ts_frame_end - cpi_data.ts_frame_start);
 
       aom_codec_pkt_list_add(&ctx->pkt_list.head, &pkt);
 
@@ -2491,7 +3011,7 @@
     }
   }
 
-  cpi->common.error.setjmp = 0;
+  ppi->error.setjmp = 0;
   return res;
 }
 
@@ -2669,7 +3189,7 @@
   const int number_spatial_layers = va_arg(args, int);
   if (number_spatial_layers > MAX_NUM_SPATIAL_LAYERS)
     return AOM_CODEC_INVALID_PARAM;
-  ctx->ppi->cpi->common.number_spatial_layers = number_spatial_layers;
+  ctx->ppi->number_spatial_layers = number_spatial_layers;
   return AOM_CODEC_OK;
 }
 
@@ -2685,19 +3205,20 @@
 
 static aom_codec_err_t ctrl_set_svc_params(aom_codec_alg_priv_t *ctx,
                                            va_list args) {
-  AV1_COMP *const cpi = ctx->ppi->cpi;
+  AV1_PRIMARY *const ppi = ctx->ppi;
+  AV1_COMP *const cpi = ppi->cpi;
   AV1_COMMON *const cm = &cpi->common;
   aom_svc_params_t *const params = va_arg(args, aom_svc_params_t *);
-  cm->number_spatial_layers = params->number_spatial_layers;
-  cm->number_temporal_layers = params->number_temporal_layers;
+  ppi->number_spatial_layers = params->number_spatial_layers;
+  ppi->number_temporal_layers = params->number_temporal_layers;
   cpi->svc.number_spatial_layers = params->number_spatial_layers;
   cpi->svc.number_temporal_layers = params->number_temporal_layers;
-  if (cm->number_spatial_layers > 1 || cm->number_temporal_layers > 1) {
+  if (ppi->number_spatial_layers > 1 || ppi->number_temporal_layers > 1) {
     unsigned int sl, tl;
-    cpi->use_svc = 1;
-    for (sl = 0; sl < cm->number_spatial_layers; ++sl) {
-      for (tl = 0; tl < cm->number_temporal_layers; ++tl) {
-        const int layer = LAYER_IDS_TO_IDX(sl, tl, cm->number_temporal_layers);
+    ctx->ppi->use_svc = 1;
+    for (sl = 0; sl < ppi->number_spatial_layers; ++sl) {
+      for (tl = 0; tl < ppi->number_temporal_layers; ++tl) {
+        const int layer = LAYER_IDS_TO_IDX(sl, tl, ppi->number_temporal_layers);
         LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer];
         lc->max_q = params->max_quantizers[layer];
         lc->min_q = params->min_quantizers[layer];
@@ -2709,10 +3230,10 @@
     }
     if (cm->current_frame.frame_number == 0) {
       if (!cpi->ppi->seq_params_locked) {
-        SequenceHeader *const seq_params = &cm->seq_params;
+        SequenceHeader *const seq_params = &ppi->seq_params;
         seq_params->operating_points_cnt_minus_1 =
-            cm->number_spatial_layers * cm->number_temporal_layers - 1;
-        av1_init_seq_coding_tools(&cm->seq_params, cm, &cpi->oxcf, 1);
+            ppi->number_spatial_layers * ppi->number_temporal_layers - 1;
+        av1_init_seq_coding_tools(ppi, &cpi->oxcf, 1);
       }
       av1_init_layer_context(cpi);
     }
@@ -2739,6 +3260,17 @@
   return AOM_CODEC_OK;
 }
 
+static aom_codec_err_t ctrl_set_svc_ref_frame_comp_pred(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  AV1_COMP *const cpi = ctx->ppi->cpi;
+  aom_svc_ref_frame_comp_pred_t *const data =
+      va_arg(args, aom_svc_ref_frame_comp_pred_t *);
+  cpi->svc.ref_frame_comp[0] = data->use_comp_pred[0];
+  cpi->svc.ref_frame_comp[1] = data->use_comp_pred[1];
+  cpi->svc.ref_frame_comp[2] = data->use_comp_pred[2];
+  return AOM_CODEC_OK;
+}
+
 static aom_codec_err_t ctrl_set_tune_content(aom_codec_alg_priv_t *ctx,
                                              va_list args) {
   struct av1_extracfg extra_cfg = ctx->extra_cfg;
@@ -2828,23 +3360,23 @@
   // Used to mock the argv with just one string "--{name}={value}"
   char *argv[2] = { NULL, "" };
   size_t len = strlen(name) + strlen(value) + 4;
-  char *err_string = ctx->ppi->cpi->common.error.detail;
+  char *const err_string = ctx->ppi->error.detail;
 
 #if __STDC_VERSION__ >= 201112L
   // We use the keyword _Static_assert because clang-cl does not allow the
   // convenience macro static_assert to be used in function scope. See
   // https://bugs.llvm.org/show_bug.cgi?id=48904.
-  _Static_assert(
-      sizeof(ctx->ppi->cpi->common.error.detail) >= ARG_ERR_MSG_MAX_LEN,
-      "The size of the err_msg buffer for arg_match_helper must be "
-      "at least ARG_ERR_MSG_MAX_LEN");
+  _Static_assert(sizeof(ctx->ppi->error.detail) >= ARG_ERR_MSG_MAX_LEN,
+                 "The size of the err_msg buffer for arg_match_helper must be "
+                 "at least ARG_ERR_MSG_MAX_LEN");
 #else
-  assert(sizeof(ctx->ppi->cpi->common.error.detail) >= ARG_ERR_MSG_MAX_LEN);
+  assert(sizeof(ctx->ppi->error.detail) >= ARG_ERR_MSG_MAX_LEN);
 #endif
 
   argv[0] = aom_malloc(len * sizeof(argv[1][0]));
   snprintf(argv[0], len, "--%s=%s", name, value);
   struct arg arg;
+  aom_codec_err_t err = AOM_CODEC_OK;
 
   int match = 1;
   if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_keyframe_filtering,
@@ -2903,11 +3435,16 @@
 #if CONFIG_TUNE_VMAF
   else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.vmaf_model_path, argv,
                             err_string)) {
-    extra_cfg.vmaf_model_path = value;
+    err = allocate_and_set_string(value, default_extra_cfg.vmaf_model_path,
+                                  &extra_cfg.vmaf_model_path, err_string);
   }
 #endif
-  else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.cq_level, argv,
-                            err_string)) {
+  else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.partition_info_path,
+                            argv, err_string)) {
+    err = allocate_and_set_string(value, default_extra_cfg.partition_info_path,
+                                  &extra_cfg.partition_info_path, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.cq_level, argv,
+                              err_string)) {
     extra_cfg.cq_level = arg_parse_uint_helper(&arg, err_string);
   } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.max_intra_rate_pct,
                               argv, err_string)) {
@@ -3012,7 +3549,14 @@
     extra_cfg.film_grain_test_vector = arg_parse_int_helper(&arg, err_string);
   } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.film_grain_table,
                               argv, err_string)) {
-    extra_cfg.film_grain_table_filename = value;
+    if (value == NULL) {
+      // this parameter allows NULL as its value
+      extra_cfg.film_grain_table_filename = value;
+    } else {
+      err = allocate_and_set_string(
+          value, default_extra_cfg.film_grain_table_filename,
+          &extra_cfg.film_grain_table_filename, err_string);
+    }
   } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.cdf_update_mode, argv,
                               err_string)) {
     extra_cfg.cdf_update_mode = arg_parse_int_helper(&arg, err_string);
@@ -3106,6 +3650,10 @@
   } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_cfl_intra,
                               argv, err_string)) {
     extra_cfg.enable_cfl_intra = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg,
+                              &g_av1_codec_arg_defs.enable_directional_intra,
+                              argv, err_string)) {
+    extra_cfg.enable_directional_intra = arg_parse_int_helper(&arg, err_string);
   } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.enable_diagonal_intra,
                               argv, err_string)) {
     extra_cfg.enable_diagonal_intra = arg_parse_int_helper(&arg, err_string);
@@ -3158,6 +3706,9 @@
   } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.mv_cost_upd_freq,
                               argv, err_string)) {
     extra_cfg.mv_cost_upd_freq = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.dv_cost_upd_freq,
+                              argv, err_string)) {
+    extra_cfg.dv_cost_upd_freq = arg_parse_uint_helper(&arg, err_string);
   }
 #if CONFIG_DENOISE
   else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.denoise_noise_level,
@@ -3189,6 +3740,16 @@
                               &g_av1_codec_arg_defs.input_chroma_subsampling_y,
                               argv, err_string)) {
     extra_cfg.chroma_subsampling_y = arg_parse_uint_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.passes, argv,
+                              err_string)) {
+    extra_cfg.passes = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.fwd_kf_dist, argv,
+                              err_string)) {
+    extra_cfg.fwd_kf_dist = arg_parse_int_helper(&arg, err_string);
+  } else if (arg_match_helper(&arg, &g_av1_codec_arg_defs.two_pass_output, argv,
+                              err_string)) {
+    err = allocate_and_set_string(value, default_extra_cfg.two_pass_output,
+                                  &extra_cfg.two_pass_output, err_string);
   } else {
     match = 0;
     snprintf(err_string, ARG_ERR_MSG_MAX_LEN, "Cannot find aom option %s",
@@ -3196,6 +3757,11 @@
   }
   aom_free(argv[0]);
 
+  if (err != AOM_CODEC_OK) {
+    ctx->base.err_detail = err_string;
+    return err;
+  }
+
   if (strlen(err_string) != 0) {
     ctx->base.err_detail = err_string;
     return AOM_CODEC_INVALID_PARAM;
@@ -3212,9 +3778,8 @@
 static aom_codec_err_t ctrl_get_seq_level_idx(aom_codec_alg_priv_t *ctx,
                                               va_list args) {
   int *const arg = va_arg(args, int *);
-  const AV1_COMP *const cpi = ctx->ppi->cpi;
   if (arg == NULL) return AOM_CODEC_INVALID_PARAM;
-  return av1_get_seq_level_idx(&cpi->common.seq_params, &cpi->level_params,
+  return av1_get_seq_level_idx(&ctx->ppi->seq_params, &ctx->ppi->level_params,
                                arg);
 }
 
@@ -3295,6 +3860,7 @@
   { AV1E_SET_ENABLE_SMOOTH_INTRA, ctrl_set_enable_smooth_intra },
   { AV1E_SET_ENABLE_PAETH_INTRA, ctrl_set_enable_paeth_intra },
   { AV1E_SET_ENABLE_CFL_INTRA, ctrl_set_enable_cfl_intra },
+  { AV1E_SET_ENABLE_DIRECTIONAL_INTRA, ctrl_set_enable_directional_intra },
   { AV1E_SET_ENABLE_DIAGONAL_INTRA, ctrl_set_enable_diagonal_intra },
   { AV1E_SET_ENABLE_SUPERRES, ctrl_set_enable_superres },
   { AV1E_SET_ENABLE_OVERLAY, ctrl_set_enable_overlay },
@@ -3329,6 +3895,7 @@
   { AV1E_SET_SUPERBLOCK_SIZE, ctrl_set_superblock_size },
   { AV1E_SET_SINGLE_TILE_DECODING, ctrl_set_single_tile_decoding },
   { AV1E_SET_VMAF_MODEL_PATH, ctrl_set_vmaf_model_path },
+  { AV1E_SET_PARTITION_INFO_PATH, ctrl_set_partition_info_path },
   { AV1E_SET_FILM_GRAIN_TEST_VECTOR, ctrl_set_film_grain_test_vector },
   { AV1E_SET_FILM_GRAIN_TABLE, ctrl_set_film_grain_table },
   { AV1E_SET_DENOISE_NOISE_LEVEL, ctrl_set_denoise_noise_level },
@@ -3342,8 +3909,12 @@
   { AV1E_SET_SVC_LAYER_ID, ctrl_set_layer_id },
   { AV1E_SET_SVC_PARAMS, ctrl_set_svc_params },
   { AV1E_SET_SVC_REF_FRAME_CONFIG, ctrl_set_svc_ref_frame_config },
+  { AV1E_SET_SVC_REF_FRAME_COMP_PRED, ctrl_set_svc_ref_frame_comp_pred },
   { AV1E_SET_VBR_CORPUS_COMPLEXITY_LAP, ctrl_set_vbr_corpus_complexity_lap },
   { AV1E_ENABLE_SB_MULTIPASS_UNIT_TEST, ctrl_enable_sb_multipass_unit_test },
+  { AV1E_SET_DV_COST_UPD_FREQ, ctrl_set_dv_cost_upd_freq },
+  { AV1E_SET_EXTERNAL_PARTITION, ctrl_set_external_partition },
+  { AV1E_SET_ENABLE_TX_SIZE_SEARCH, ctrl_set_enable_tx_size_search },
 
   // Getters
   { AOME_GET_LAST_QUANTIZER, ctrl_get_quantizer },
@@ -3361,6 +3932,7 @@
 };
 
 static const aom_codec_enc_cfg_t encoder_usage_cfg[] = {
+#if !CONFIG_REALTIME_ONLY
   {
       // NOLINT
       AOM_USAGE_GOOD_QUALITY,  // g_usage - non-realtime usage
@@ -3381,7 +3953,7 @@
 
       AOM_RC_ONE_PASS,  // g_pass
 
-      19,  // g_lag_in_frames
+      35,  // g_lag_in_frames
 
       0,                // rc_dropframe_thresh
       RESIZE_NONE,      // rc_resize_mode
@@ -3397,7 +3969,7 @@
       AOM_VBR,      // rc_end_usage
       { NULL, 0 },  // rc_twopass_stats_in
       { NULL, 0 },  // rc_firstpass_mb_stats_in
-      256,          // rc_target_bandwidth
+      256,          // rc_target_bitrate
       0,            // rc_min_quantizer
       63,           // rc_max_quantizer
       25,           // rc_undershoot_pct
@@ -3431,6 +4003,7 @@
       { 0, 128, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0,   0,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  // cfg
   },
+#endif  // !CONFIG_REALTIME_ONLY
   {
       // NOLINT
       AOM_USAGE_REALTIME,  // g_usage - real-time usage
@@ -3467,7 +4040,7 @@
       AOM_CBR,      // rc_end_usage
       { NULL, 0 },  // rc_twopass_stats_in
       { NULL, 0 },  // rc_firstpass_mb_stats_in
-      256,          // rc_target_bandwidth
+      256,          // rc_target_bitrate
       0,            // rc_min_quantizer
       63,           // rc_max_quantizer
       25,           // rc_undershoot_pct
@@ -3501,6 +4074,7 @@
       { 0, 128, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0,   0,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  // cfg
   },
+#if !CONFIG_REALTIME_ONLY
   {
       // NOLINT
       AOM_USAGE_ALL_INTRA,  // g_usage - all intra usage
@@ -3537,7 +4111,7 @@
       AOM_Q,        // rc_end_usage
       { NULL, 0 },  // rc_twopass_stats_in
       { NULL, 0 },  // rc_firstpass_mb_stats_in
-      256,          // rc_target_bandwidth
+      256,          // rc_target_bitrate
       0,            // rc_min_quantizer
       63,           // rc_max_quantizer
       25,           // rc_undershoot_pct
@@ -3571,6 +4145,7 @@
       { 0, 128, 128, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0,   0,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },  // cfg
   },
+#endif  // !CONFIG_REALTIME_ONLY
 };
 
 // This data structure and function are exported in aom/aomcx.h
@@ -3595,13 +4170,13 @@
   },
   {
       // NOLINT
-      3,                           // 3 cfg
-      encoder_usage_cfg,           // aom_codec_enc_cfg_t
-      encoder_encode,              // aom_codec_encode_fn_t
-      encoder_get_cxdata,          // aom_codec_get_cx_data_fn_t
-      encoder_set_config,          // aom_codec_enc_config_set_fn_t
-      encoder_get_global_headers,  // aom_codec_get_global_headers_fn_t
-      encoder_get_preview          // aom_codec_get_preview_frame_fn_t
+      NELEMENTS(encoder_usage_cfg),  // cfg_count
+      encoder_usage_cfg,             // aom_codec_enc_cfg_t
+      encoder_encode,                // aom_codec_encode_fn_t
+      encoder_get_cxdata,            // aom_codec_get_cx_data_fn_t
+      encoder_set_config,            // aom_codec_enc_config_set_fn_t
+      encoder_get_global_headers,    // aom_codec_get_global_headers_fn_t
+      encoder_get_preview            // aom_codec_get_preview_frame_fn_t
   },
   encoder_set_option  // aom_codec_set_option_fn_t
 };
diff --git a/av1/av1_dx_iface.c b/av1/av1_dx_iface.c
index 1f3f0c4..2708699 100644
--- a/av1/av1_dx_iface.c
+++ b/av1/av1_dx_iface.c
@@ -115,14 +115,18 @@
   if (ctx->frame_worker != NULL) {
     AVxWorker *const worker = ctx->frame_worker;
     FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+    AV1Decoder *const pbi = frame_worker_data->pbi;
     aom_get_worker_interface()->end(worker);
-    aom_free(frame_worker_data->pbi->common.tpl_mvs);
-    frame_worker_data->pbi->common.tpl_mvs = NULL;
+    aom_free(pbi->common.tpl_mvs);
+    pbi->common.tpl_mvs = NULL;
     av1_remove_common(&frame_worker_data->pbi->common);
+    av1_free_cdef_buffers(&pbi->common, &pbi->cdef_worker, &pbi->cdef_sync,
+                          pbi->num_workers);
+    av1_free_cdef_sync(&pbi->cdef_sync);
 #if !CONFIG_REALTIME_ONLY
-    av1_free_restoration_buffers(&frame_worker_data->pbi->common);
+    av1_free_restoration_buffers(&pbi->common);
 #endif
-    av1_decoder_remove(frame_worker_data->pbi);
+    av1_decoder_remove(pbi);
     aom_free(frame_worker_data);
 #if CONFIG_MULTITHREAD
     pthread_mutex_destroy(&ctx->buffer_pool->pool_mutex);
@@ -392,7 +396,7 @@
     pool->release_fb_cb = av1_release_frame_buffer;
 
     if (av1_alloc_internal_frame_buffers(&pool->int_frame_buffers))
-      aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+      aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR,
                          "Failed to initialize internal frame buffers");
 
     pool->cb_priv = &pool->int_frame_buffers;
@@ -527,14 +531,13 @@
   *data = frame_worker_data->data_end;
 
   if (worker->had_error)
-    return update_error_state(ctx, &frame_worker_data->pbi->common.error);
+    return update_error_state(ctx, &frame_worker_data->pbi->error);
 
   check_resync(ctx, frame_worker_data->pbi);
 
   return AOM_CODEC_OK;
 }
 
-#if CONFIG_INSPECTION
 // This function enables the inspector to inspect non visible frames.
 static aom_codec_err_t decoder_inspect(aom_codec_alg_priv_t *ctx,
                                        const uint8_t *data, size_t data_sz,
@@ -552,13 +555,15 @@
       (FrameWorkerData *)ctx->frame_worker->data1;
   AV1Decoder *const pbi = frame_worker_data->pbi;
   AV1_COMMON *const cm = &pbi->common;
+#if CONFIG_INSPECTION
   frame_worker_data->pbi->inspect_cb = ctx->inspect_cb;
   frame_worker_data->pbi->inspect_ctx = ctx->inspect_ctx;
+#endif
   res = av1_receive_compressed_data(frame_worker_data->pbi, data_sz, &data);
   check_resync(ctx, frame_worker_data->pbi);
 
   if (ctx->frame_worker->had_error)
-    return update_error_state(ctx, &frame_worker_data->pbi->common.error);
+    return update_error_state(ctx, &frame_worker_data->pbi->error);
 
   // Allow extra zero bytes after the frame end
   while (data < data_end) {
@@ -574,7 +579,6 @@
   data2->show_existing = cm->show_existing_frame;
   return res;
 }
-#endif
 
 static aom_codec_err_t decoder_decode(aom_codec_alg_priv_t *ctx,
                                       const uint8_t *data, size_t data_sz,
@@ -589,6 +593,7 @@
   // Release any pending output frames from the previous decoder_decode call.
   // We need to do this even if the decoder is being flushed or the input
   // arguments are invalid.
+  // TODO(aomedia:3131): decoder_inspect should also do this.
   if (ctx->frame_worker) {
     BufferPool *const pool = ctx->buffer_pool;
     lock_buffer_pool(pool);
@@ -629,6 +634,7 @@
   const uint8_t *data_start = data;
   const uint8_t *data_end = data + data_sz;
 
+  // TODO(aomedia:3131): decoder_inspect should also do this.
   if (ctx->is_annexb) {
     // read the size of this temporal unit
     size_t length_of_size;
@@ -711,7 +717,7 @@
 
   grain_img->user_priv = img->user_priv;
   grain_img->fb_priv = fb->priv;
-  if (av1_add_film_grain(grain_params, img, grain_img)) {
+  if (aom_add_film_grain(grain_params, img, grain_img)) {
     pool->release_fb_cb(pool->cb_priv, fb);
     return NULL;
   }
@@ -823,7 +829,7 @@
         aom_image_t *res =
             add_grain_if_needed(ctx, img, &ctx->image_with_grain, grain_params);
         if (!res) {
-          aom_internal_error(&pbi->common.error, AOM_CODEC_CORRUPT_FRAME,
+          aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
                              "Grain systhesis failed\n");
         }
         *index += 1;  // Advance the iterator to point to the next image
@@ -1091,10 +1097,9 @@
       FrameWorkerData *const frame_worker_data =
           (FrameWorkerData *)worker->data1;
       const AV1Decoder *pbi = frame_worker_data->pbi;
-      still_picture_info->is_still_picture =
-          (int)pbi->common.seq_params.still_picture;
+      still_picture_info->is_still_picture = (int)pbi->seq_params.still_picture;
       still_picture_info->is_reduced_still_picture_hdr =
-          (int)(pbi->common.seq_params.reduced_still_picture_hdr);
+          (int)(pbi->seq_params.reduced_still_picture_hdr);
       return AOM_CODEC_OK;
     } else {
       return AOM_CODEC_ERROR;
@@ -1112,7 +1117,7 @@
       FrameWorkerData *const frame_worker_data =
           (FrameWorkerData *)worker->data1;
       const AV1Decoder *pbi = frame_worker_data->pbi;
-      if (pbi->common.seq_params.sb_size == BLOCK_128X128) {
+      if (pbi->seq_params.sb_size == BLOCK_128X128) {
         *sb_size = AOM_SUPERBLOCK_SIZE_128X128;
       } else {
         *sb_size = AOM_SUPERBLOCK_SIZE_64X64;
@@ -1291,7 +1296,7 @@
       FrameWorkerData *const frame_worker_data =
           (FrameWorkerData *)worker->data1;
       const AV1_COMMON *const cm = &frame_worker_data->pbi->common;
-      *bit_depth = cm->seq_params.bit_depth;
+      *bit_depth = cm->seq_params->bit_depth;
       return AOM_CODEC_OK;
     } else {
       return AOM_CODEC_ERROR;
@@ -1327,9 +1332,9 @@
           (FrameWorkerData *)worker->data1;
       const AV1_COMMON *const cm = &frame_worker_data->pbi->common;
 
-      *img_fmt = get_img_format(cm->seq_params.subsampling_x,
-                                cm->seq_params.subsampling_y,
-                                cm->seq_params.use_highbitdepth);
+      *img_fmt = get_img_format(cm->seq_params->subsampling_x,
+                                cm->seq_params->subsampling_y,
+                                cm->seq_params->use_highbitdepth);
       return AOM_CODEC_OK;
     } else {
       return AOM_CODEC_ERROR;
@@ -1378,6 +1383,39 @@
   return AOM_CODEC_INVALID_PARAM;
 }
 
+static aom_codec_err_t ctrl_get_base_q_idx(aom_codec_alg_priv_t *ctx,
+                                           va_list args) {
+  int *const arg = va_arg(args, int *);
+  if (arg == NULL) return AOM_CODEC_INVALID_PARAM;
+  if (ctx->frame_worker == NULL) return AOM_CODEC_ERROR;
+  FrameWorkerData *const frame_worker_data =
+      (FrameWorkerData *)ctx->frame_worker->data1;
+  *arg = frame_worker_data->pbi->common.quant_params.base_qindex;
+  return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t ctrl_get_show_frame_flag(aom_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  int *const arg = va_arg(args, int *);
+  if (arg == NULL) return AOM_CODEC_INVALID_PARAM;
+  if (ctx->frame_worker == NULL) return AOM_CODEC_ERROR;
+  FrameWorkerData *const frame_worker_data =
+      (FrameWorkerData *)ctx->frame_worker->data1;
+  *arg = frame_worker_data->pbi->common.show_frame;
+  return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t ctrl_get_order_hint(aom_codec_alg_priv_t *ctx,
+                                           va_list args) {
+  unsigned int *const arg = va_arg(args, unsigned int *);
+  if (arg == NULL) return AOM_CODEC_INVALID_PARAM;
+  if (ctx->frame_worker == NULL) return AOM_CODEC_ERROR;
+  FrameWorkerData *const frame_worker_data =
+      (FrameWorkerData *)ctx->frame_worker->data1;
+  *arg = frame_worker_data->pbi->common.current_frame.order_hint;
+  return AOM_CODEC_OK;
+}
+
 static aom_codec_err_t ctrl_set_invert_tile_order(aom_codec_alg_priv_t *ctx,
                                                   va_list args) {
   ctx->invert_tile_order = va_arg(args, int);
@@ -1565,7 +1603,9 @@
   { AOMD_GET_SB_SIZE, ctrl_get_sb_size },
   { AOMD_GET_SHOW_EXISTING_FRAME_FLAG, ctrl_get_show_existing_frame_flag },
   { AOMD_GET_S_FRAME_INFO, ctrl_get_s_frame_info },
-
+  { AOMD_GET_SHOW_FRAME_FLAG, ctrl_get_show_frame_flag },
+  { AOMD_GET_BASE_Q_IDX, ctrl_get_base_q_idx },
+  { AOMD_GET_ORDER_HINT, ctrl_get_order_hint },
   CTRL_MAP_END,
 };
 
@@ -1602,4 +1642,36 @@
   NULL  // aom_codec_set_option_fn_t
 };
 
+// Decoder interface for inspecting frame data. It uses decoder_inspect instead
+// of decoder_decode so it only decodes one frame at a time, whether the frame
+// is shown or not.
+aom_codec_iface_t aom_codec_av1_inspect_algo = {
+  "AOMedia Project AV1 Decoder Inspector" VERSION_STRING,
+  AOM_CODEC_INTERNAL_ABI_VERSION,
+  AOM_CODEC_CAP_DECODER |
+      AOM_CODEC_CAP_EXTERNAL_FRAME_BUFFER,  // aom_codec_caps_t
+  decoder_init,                             // aom_codec_init_fn_t
+  decoder_destroy,                          // aom_codec_destroy_fn_t
+  decoder_ctrl_maps,                        // aom_codec_ctrl_fn_map_t
+  {
+      // NOLINT
+      decoder_peek_si,    // aom_codec_peek_si_fn_t
+      decoder_get_si,     // aom_codec_get_si_fn_t
+      decoder_inspect,    // aom_codec_decode_fn_t
+      decoder_get_frame,  // aom_codec_get_frame_fn_t
+      decoder_set_fb_fn,  // aom_codec_set_fb_fn_t
+  },
+  {
+      // NOLINT
+      0,
+      NULL,  // aom_codec_enc_cfg_t
+      NULL,  // aom_codec_encode_fn_t
+      NULL,  // aom_codec_get_cx_data_fn_t
+      NULL,  // aom_codec_enc_config_set_fn_t
+      NULL,  // aom_codec_get_global_headers_fn_t
+      NULL   // aom_codec_get_preview_frame_fn_t
+  },
+  NULL  // aom_codec_set_option_fn_t
+};
+
 aom_codec_iface_t *aom_codec_av1_dx(void) { return &aom_codec_av1_dx_algo; }
diff --git a/av1/av1_iface_common.h b/av1/av1_iface_common.h
index f5775d3..57dd1b8 100644
--- a/av1/av1_iface_common.h
+++ b/av1/av1_iface_common.h
@@ -16,8 +16,11 @@
 #include "aom_ports/mem.h"
 #include "aom_scale/yv12config.h"
 
-static void yuvconfig2image(aom_image_t *img, const YV12_BUFFER_CONFIG *yv12,
-                            void *user_priv) {
+extern aom_codec_iface_t aom_codec_av1_inspect_algo;
+
+static AOM_INLINE void yuvconfig2image(aom_image_t *img,
+                                       const YV12_BUFFER_CONFIG *yv12,
+                                       void *user_priv) {
   /* aom_img_wrap() doesn't allow specifying independent strides for
    * the Y, U, and V planes, nor other alignment adjustments that
    * might be representable by a YV12_BUFFER_CONFIG, so we just
@@ -80,8 +83,8 @@
   img->metadata = NULL;
 }
 
-static aom_codec_err_t image2yuvconfig(const aom_image_t *img,
-                                       YV12_BUFFER_CONFIG *yv12) {
+static AOM_INLINE aom_codec_err_t image2yuvconfig(const aom_image_t *img,
+                                                  YV12_BUFFER_CONFIG *yv12) {
   yv12->y_buffer = img->planes[AOM_PLANE_Y];
   yv12->u_buffer = img->planes[AOM_PLANE_U];
   yv12->v_buffer = img->planes[AOM_PLANE_V];
diff --git a/av1/common/alloccommon.c b/av1/common/alloccommon.c
index cd997cd..0b2d765 100644
--- a/av1/common/alloccommon.c
+++ b/av1/common/alloccommon.c
@@ -17,8 +17,10 @@
 #include "av1/common/alloccommon.h"
 #include "av1/common/av1_common_int.h"
 #include "av1/common/blockd.h"
+#include "av1/common/cdef_block.h"
 #include "av1/common/entropymode.h"
 #include "av1/common/entropymv.h"
+#include "av1/common/thread_common.h"
 
 int av1_get_MBs(int width, int height) {
   const int aligned_width = ALIGN_POWER_OF_TWO(width, 3);
@@ -51,6 +53,234 @@
   }
 }
 
+static INLINE void free_cdef_linebuf_conditional(
+    AV1_COMMON *const cm, const size_t *new_linebuf_size) {
+  CdefInfo *cdef_info = &cm->cdef_info;
+  for (int plane = 0; plane < MAX_MB_PLANE; plane++) {
+    if (new_linebuf_size[plane] != cdef_info->allocated_linebuf_size[plane]) {
+      aom_free(cdef_info->linebuf[plane]);
+      cdef_info->linebuf[plane] = NULL;
+    }
+  }
+}
+
+static INLINE void free_cdef_bufs_conditional(AV1_COMMON *const cm,
+                                              uint16_t **colbuf,
+                                              uint16_t **srcbuf,
+                                              const size_t *new_colbuf_size,
+                                              const size_t new_srcbuf_size) {
+  CdefInfo *cdef_info = &cm->cdef_info;
+  if (new_srcbuf_size != cdef_info->allocated_srcbuf_size) {
+    aom_free(*srcbuf);
+    *srcbuf = NULL;
+  }
+  for (int plane = 0; plane < MAX_MB_PLANE; plane++) {
+    if (new_colbuf_size[plane] != cdef_info->allocated_colbuf_size[plane]) {
+      aom_free(colbuf[plane]);
+      colbuf[plane] = NULL;
+    }
+  }
+}
+
+static INLINE void free_cdef_bufs(uint16_t **colbuf, uint16_t **srcbuf) {
+  aom_free(*srcbuf);
+  *srcbuf = NULL;
+  for (int plane = 0; plane < MAX_MB_PLANE; plane++) {
+    aom_free(colbuf[plane]);
+    colbuf[plane] = NULL;
+  }
+}
+
+static INLINE void free_cdef_row_sync(AV1CdefRowSync **cdef_row_mt,
+                                      const int num_mi_rows) {
+  if (*cdef_row_mt == NULL) return;
+#if CONFIG_MULTITHREAD
+  for (int row_idx = 0; row_idx < num_mi_rows; row_idx++) {
+    pthread_mutex_destroy((*cdef_row_mt)[row_idx].row_mutex_);
+    pthread_cond_destroy((*cdef_row_mt)[row_idx].row_cond_);
+    aom_free((*cdef_row_mt)[row_idx].row_mutex_);
+    aom_free((*cdef_row_mt)[row_idx].row_cond_);
+  }
+#else
+  (void)num_mi_rows;
+#endif  // CONFIG_MULTITHREAD
+  aom_free(*cdef_row_mt);
+  *cdef_row_mt = NULL;
+}
+
+void av1_free_cdef_buffers(AV1_COMMON *const cm,
+                           AV1CdefWorkerData **cdef_worker,
+                           AV1CdefSync *cdef_sync, int num_workers) {
+  CdefInfo *cdef_info = &cm->cdef_info;
+  const int num_mi_rows = cdef_info->allocated_mi_rows;
+
+  for (int plane = 0; plane < MAX_MB_PLANE; plane++) {
+    aom_free(cdef_info->linebuf[plane]);
+    cdef_info->linebuf[plane] = NULL;
+  }
+  // De-allocation of column buffer & source buffer (worker_0).
+  free_cdef_bufs(cdef_info->colbuf, &cdef_info->srcbuf);
+
+  if (num_workers < 2) return;
+  if (*cdef_worker != NULL) {
+    for (int idx = num_workers - 1; idx >= 1; idx--) {
+      // De-allocation of column buffer & source buffer for remaining workers.
+      free_cdef_bufs((*cdef_worker)[idx].colbuf, &(*cdef_worker)[idx].srcbuf);
+    }
+    aom_free(*cdef_worker);
+    *cdef_worker = NULL;
+  }
+  free_cdef_row_sync(&cdef_sync->cdef_row_mt, num_mi_rows);
+}
+
+static INLINE void alloc_cdef_linebuf(AV1_COMMON *const cm, uint16_t **linebuf,
+                                      const int num_planes) {
+  CdefInfo *cdef_info = &cm->cdef_info;
+  for (int plane = 0; plane < num_planes; plane++) {
+    if (linebuf[plane] == NULL)
+      CHECK_MEM_ERROR(cm, linebuf[plane],
+                      aom_malloc(cdef_info->allocated_linebuf_size[plane]));
+  }
+}
+
+static INLINE void alloc_cdef_bufs(AV1_COMMON *const cm, uint16_t **colbuf,
+                                   uint16_t **srcbuf, const int num_planes) {
+  CdefInfo *cdef_info = &cm->cdef_info;
+  if (*srcbuf == NULL)
+    CHECK_MEM_ERROR(cm, *srcbuf,
+                    aom_memalign(16, cdef_info->allocated_srcbuf_size));
+
+  for (int plane = 0; plane < num_planes; plane++) {
+    if (colbuf[plane] == NULL)
+      CHECK_MEM_ERROR(cm, colbuf[plane],
+                      aom_malloc(cdef_info->allocated_colbuf_size[plane]));
+  }
+}
+
+static INLINE void alloc_cdef_row_sync(AV1_COMMON *const cm,
+                                       AV1CdefRowSync **cdef_row_mt,
+                                       const int num_mi_rows) {
+  if (*cdef_row_mt != NULL) return;
+
+  CHECK_MEM_ERROR(cm, *cdef_row_mt,
+                  aom_malloc(sizeof(**cdef_row_mt) * num_mi_rows));
+#if CONFIG_MULTITHREAD
+  for (int row_idx = 0; row_idx < num_mi_rows; row_idx++) {
+    CHECK_MEM_ERROR(cm, (*cdef_row_mt)[row_idx].row_mutex_,
+                    aom_malloc(sizeof(*(*cdef_row_mt)[row_idx].row_mutex_)));
+    pthread_mutex_init((*cdef_row_mt)[row_idx].row_mutex_, NULL);
+
+    CHECK_MEM_ERROR(cm, (*cdef_row_mt)[row_idx].row_cond_,
+                    aom_malloc(sizeof(*(*cdef_row_mt)[row_idx].row_cond_)));
+    pthread_cond_init((*cdef_row_mt)[row_idx].row_cond_, NULL);
+
+    (*cdef_row_mt)[row_idx].is_row_done = 0;
+  }
+#endif  // CONFIG_MULTITHREAD
+}
+
+void av1_alloc_cdef_buffers(AV1_COMMON *const cm,
+                            AV1CdefWorkerData **cdef_worker,
+                            AV1CdefSync *cdef_sync, int num_workers,
+                            int init_worker) {
+  const int num_planes = av1_num_planes(cm);
+  size_t new_linebuf_size[MAX_MB_PLANE] = { 0 };
+  size_t new_colbuf_size[MAX_MB_PLANE] = { 0 };
+  size_t new_srcbuf_size = 0;
+  CdefInfo *const cdef_info = &cm->cdef_info;
+  // Check for configuration change
+  const int num_mi_rows =
+      (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+  const int is_num_workers_changed =
+      cdef_info->allocated_num_workers != num_workers;
+  const int is_cdef_enabled =
+      cm->seq_params->enable_cdef && !cm->tiles.large_scale;
+
+  // num-bufs=3 represents ping-pong buffers for top linebuf,
+  // followed by bottom linebuf.
+  // ping-pong is to avoid top linebuf over-write by consecutive row.
+  int num_bufs = 3;
+  if (num_workers > 1)
+    num_bufs = (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+
+  if (is_cdef_enabled) {
+    // Calculate src buffer size
+    new_srcbuf_size = sizeof(*cdef_info->srcbuf) * CDEF_INBUF_SIZE;
+    for (int plane = 0; plane < num_planes; plane++) {
+      const int shift =
+          plane == AOM_PLANE_Y ? 0 : cm->seq_params->subsampling_x;
+      // Calculate top and bottom line buffer size
+      const int luma_stride =
+          ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols << MI_SIZE_LOG2, 4);
+      new_linebuf_size[plane] = sizeof(*cdef_info->linebuf) * num_bufs *
+                                (CDEF_VBORDER << 1) * (luma_stride >> shift);
+      // Calculate column buffer size
+      const int block_height =
+          (CDEF_BLOCKSIZE << (MI_SIZE_LOG2 - shift)) * 2 * CDEF_VBORDER;
+      new_colbuf_size[plane] =
+          sizeof(*cdef_info->colbuf[plane]) * block_height * CDEF_HBORDER;
+    }
+  }
+
+  // Free src, line and column buffers for worker 0 in case of reallocation
+  free_cdef_linebuf_conditional(cm, new_linebuf_size);
+  free_cdef_bufs_conditional(cm, cdef_info->colbuf, &cdef_info->srcbuf,
+                             new_colbuf_size, new_srcbuf_size);
+
+  // The flag init_worker indicates if cdef_worker has to be allocated for the
+  // frame. This is passed as 1 always from decoder. At encoder side, it is 0
+  // when called for parallel frames during FPMT (where cdef_worker is shared
+  // across parallel frames) and 1 otherwise.
+  if (*cdef_worker != NULL && init_worker) {
+    if (is_num_workers_changed) {
+      // Free src and column buffers for remaining workers in case of change in
+      // num_workers
+      for (int idx = cdef_info->allocated_num_workers - 1; idx >= 1; idx--)
+        free_cdef_bufs((*cdef_worker)[idx].colbuf, &(*cdef_worker)[idx].srcbuf);
+    } else if (num_workers > 1) {
+      // Free src and column buffers for remaining workers in case of
+      // reallocation
+      for (int idx = num_workers - 1; idx >= 1; idx--)
+        free_cdef_bufs_conditional(cm, (*cdef_worker)[idx].colbuf,
+                                   &(*cdef_worker)[idx].srcbuf, new_colbuf_size,
+                                   new_srcbuf_size);
+    }
+  }
+
+  if (cdef_info->allocated_mi_rows != num_mi_rows)
+    free_cdef_row_sync(&cdef_sync->cdef_row_mt, cdef_info->allocated_mi_rows);
+
+  // Store allocated sizes for reallocation
+  cdef_info->allocated_srcbuf_size = new_srcbuf_size;
+  av1_copy(cdef_info->allocated_colbuf_size, new_colbuf_size);
+  av1_copy(cdef_info->allocated_linebuf_size, new_linebuf_size);
+  // Store configuration to check change in configuration
+  cdef_info->allocated_mi_rows = num_mi_rows;
+  cdef_info->allocated_num_workers = num_workers;
+
+  if (!is_cdef_enabled) return;
+
+  // Memory allocation of column buffer & source buffer (worker_0).
+  alloc_cdef_bufs(cm, cdef_info->colbuf, &cdef_info->srcbuf, num_planes);
+  alloc_cdef_linebuf(cm, cdef_info->linebuf, num_planes);
+
+  if (num_workers < 2) return;
+
+  if (init_worker) {
+    if (*cdef_worker == NULL)
+      CHECK_MEM_ERROR(cm, *cdef_worker,
+                      aom_calloc(num_workers, sizeof(**cdef_worker)));
+
+    // Memory allocation of column buffer & source buffer for remaining workers.
+    for (int idx = num_workers - 1; idx >= 1; idx--)
+      alloc_cdef_bufs(cm, (*cdef_worker)[idx].colbuf,
+                      &(*cdef_worker)[idx].srcbuf, num_planes);
+  }
+
+  alloc_cdef_row_sync(cm, &cdef_sync->cdef_row_mt,
+                      cdef_info->allocated_mi_rows);
+}
+
 #if !CONFIG_REALTIME_ONLY
 // Assumes cm->rst_info[p].restoration_unit_size is already initialized
 void av1_alloc_restoration_buffers(AV1_COMMON *cm) {
@@ -86,11 +316,11 @@
   // Now we need to allocate enough space to store the line buffers for the
   // stripes
   const int frame_w = cm->superres_upscaled_width;
-  const int use_highbd = cm->seq_params.use_highbitdepth;
+  const int use_highbd = cm->seq_params->use_highbitdepth;
 
   for (int p = 0; p < num_planes; ++p) {
     const int is_uv = p > 0;
-    const int ss_x = is_uv && cm->seq_params.subsampling_x;
+    const int ss_x = is_uv && cm->seq_params->subsampling_x;
     const int plane_w = ((frame_w + ss_x) >> ss_x) + 2 * RESTORATION_EXTRA_HORZ;
     const int stride = ALIGN_POWER_OF_TWO(plane_w, 5);
     const int buf_size = num_stripes * stride * RESTORATION_CTX_VERT
@@ -168,10 +398,6 @@
   cm->mi_params.free_mi(&cm->mi_params);
 
   av1_free_above_context_buffers(&cm->above_contexts);
-
-#if CONFIG_LPF_MASK
-  av1_free_loop_filter_mask(cm);
-#endif
 }
 
 int av1_alloc_above_context_buffers(CommonContexts *above_contexts,
@@ -275,37 +501,3 @@
 void av1_init_mi_buffers(CommonModeInfoParams *mi_params) {
   mi_params->setup_mi(mi_params);
 }
-
-#if CONFIG_LPF_MASK
-int av1_alloc_loop_filter_mask(AV1_COMMON *cm) {
-  aom_free(cm->lf.lfm);
-  cm->lf.lfm = NULL;
-
-  // Each lfm holds bit masks for all the 4x4 blocks in a max
-  // 64x64 (128x128 for ext_partitions) region.  The stride
-  // and rows are rounded up / truncated to a multiple of 16
-  // (32 for ext_partition).
-  cm->lf.lfm_stride =
-      (cm->mi_params.mi_cols + (MI_SIZE_64X64 - 1)) >> MIN_MIB_SIZE_LOG2;
-  cm->lf.lfm_num =
-      ((cm->mi_params.mi_rows + (MI_SIZE_64X64 - 1)) >> MIN_MIB_SIZE_LOG2) *
-      cm->lf.lfm_stride;
-  cm->lf.lfm =
-      (LoopFilterMask *)aom_calloc(cm->lf.lfm_num, sizeof(*cm->lf.lfm));
-  if (!cm->lf.lfm) return 1;
-
-  unsigned int i;
-  for (i = 0; i < cm->lf.lfm_num; ++i) av1_zero(cm->lf.lfm[i]);
-
-  return 0;
-}
-
-void av1_free_loop_filter_mask(AV1_COMMON *cm) {
-  if (cm->lf.lfm == NULL) return;
-
-  aom_free(cm->lf.lfm);
-  cm->lf.lfm = NULL;
-  cm->lf.lfm_num = 0;
-  cm->lf.lfm_stride = 0;
-}
-#endif
diff --git a/av1/common/alloccommon.h b/av1/common/alloccommon.h
index e75c226..147320b 100644
--- a/av1/common/alloccommon.h
+++ b/av1/common/alloccommon.h
@@ -24,6 +24,8 @@
 struct BufferPool;
 struct CommonContexts;
 struct CommonModeInfoParams;
+struct AV1CdefWorker;
+struct AV1CdefSyncData;
 
 void av1_remove_common(struct AV1Common *cm);
 
@@ -36,6 +38,13 @@
 void av1_free_context_buffers(struct AV1Common *cm);
 
 void av1_free_ref_frame_buffers(struct BufferPool *pool);
+void av1_alloc_cdef_buffers(struct AV1Common *const cm,
+                            struct AV1CdefWorker **cdef_worker,
+                            struct AV1CdefSyncData *cdef_sync, int num_workers,
+                            int init_worker);
+void av1_free_cdef_buffers(struct AV1Common *const cm,
+                           struct AV1CdefWorker **cdef_worker,
+                           struct AV1CdefSyncData *cdef_sync, int num_workers);
 #if !CONFIG_REALTIME_ONLY
 void av1_alloc_restoration_buffers(struct AV1Common *cm);
 void av1_free_restoration_buffers(struct AV1Common *cm);
@@ -46,11 +55,6 @@
 
 int av1_get_MBs(int width, int height);
 
-#if CONFIG_LPF_MASK
-int av1_alloc_loop_filter_mask(struct AV1Common *cm);
-void av1_free_loop_filter_mask(struct AV1Common *cm);
-#endif
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/av1/common/arm/av1_inv_txfm_neon.c b/av1/common/arm/av1_inv_txfm_neon.c
index 2f3567a..4bc9fa2 100644
--- a/av1/common/arm/av1_inv_txfm_neon.c
+++ b/av1/common/arm/av1_inv_txfm_neon.c
@@ -15,13 +15,13 @@
 #include "config/aom_dsp_rtcd.h"
 #include "config/av1_rtcd.h"
 
+#include "aom_dsp/arm/transpose_neon.h"
 #include "av1/common/av1_inv_txfm1d.h"
 #include "av1/common/av1_inv_txfm1d_cfg.h"
 #include "av1/common/av1_txfm.h"
 #include "av1/common/enums.h"
 #include "av1/common/idct.h"
 #include "av1/common/arm/av1_inv_txfm_neon.h"
-#include "av1/common/arm/transpose_neon.h"
 
 // 1D itx types
 typedef enum ATTRIBUTE_PACKED {
diff --git a/av1/common/arm/av1_txfm_neon.c b/av1/common/arm/av1_txfm_neon.c
index 7e3a05a..f955a37 100644
--- a/av1/common/arm/av1_txfm_neon.c
+++ b/av1/common/arm/av1_txfm_neon.c
@@ -14,8 +14,8 @@
 
 #include "config/av1_rtcd.h"
 
+#include "aom_dsp/arm/mem_neon.h"
 #include "aom_ports/mem.h"
-#include "av1/common/arm/mem_neon.h"
 
 void av1_round_shift_array_neon(int32_t *arr, int size, int bit) {
   assert(!(size % 4));
diff --git a/av1/common/arm/blend_a64_hmask_neon.c b/av1/common/arm/blend_a64_hmask_neon.c
index 7134f18..4639d4c 100644
--- a/av1/common/arm/blend_a64_hmask_neon.c
+++ b/av1/common/arm/blend_a64_hmask_neon.c
@@ -14,10 +14,10 @@
 #include <assert.h>
 
 #include "aom/aom_integer.h"
-#include "aom_dsp/blend.h"
-#include "aom_ports/mem.h"
-#include "av1/common/arm/mem_neon.h"
 #include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/blend.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_ports/mem.h"
 #include "config/aom_dsp_rtcd.h"
 
 void aom_blend_a64_hmask_neon(uint8_t *dst, uint32_t dst_stride,
diff --git a/av1/common/arm/blend_a64_vmask_neon.c b/av1/common/arm/blend_a64_vmask_neon.c
index 194e94c..061af74 100644
--- a/av1/common/arm/blend_a64_vmask_neon.c
+++ b/av1/common/arm/blend_a64_vmask_neon.c
@@ -14,10 +14,10 @@
 #include <assert.h>
 
 #include "aom/aom_integer.h"
-#include "aom_dsp/blend.h"
-#include "aom_ports/mem.h"
-#include "av1/common/arm/mem_neon.h"
 #include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/blend.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_ports/mem.h"
 #include "config/aom_dsp_rtcd.h"
 
 void aom_blend_a64_vmask_neon(uint8_t *dst, uint32_t dst_stride,
diff --git a/av1/common/arm/convolve_neon.c b/av1/common/arm/convolve_neon.c
index 278d72e..f0e4bed 100644
--- a/av1/common/arm/convolve_neon.c
+++ b/av1/common/arm/convolve_neon.c
@@ -16,12 +16,12 @@
 #include "config/av1_rtcd.h"
 
 #include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
 #include "aom_ports/mem.h"
 #include "av1/common/convolve.h"
 #include "av1/common/filter.h"
 #include "av1/common/arm/convolve_neon.h"
-#include "av1/common/arm/mem_neon.h"
-#include "av1/common/arm/transpose_neon.h"
 
 static INLINE int16x4_t convolve8_4x4(const int16x4_t s0, const int16x4_t s1,
                                       const int16x4_t s2, const int16x4_t s3,
diff --git a/av1/common/arm/jnt_convolve_neon.c b/av1/common/arm/jnt_convolve_neon.c
index 85a5eaa..e0b76a8 100644
--- a/av1/common/arm/jnt_convolve_neon.c
+++ b/av1/common/arm/jnt_convolve_neon.c
@@ -16,11 +16,11 @@
 #include "config/av1_rtcd.h"
 
 #include "aom_dsp/txfm_common.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
 #include "aom_ports/mem.h"
 #include "av1/common/common.h"
 #include "av1/common/arm/convolve_neon.h"
-#include "av1/common/arm/mem_neon.h"
-#include "av1/common/arm/transpose_neon.h"
 
 #if !defined(__aarch64__)
 static INLINE void compute_avg_4x1(
diff --git a/av1/common/arm/reconinter_neon.c b/av1/common/arm/reconinter_neon.c
index 44e0641..3694763 100644
--- a/av1/common/arm/reconinter_neon.c
+++ b/av1/common/arm/reconinter_neon.c
@@ -15,8 +15,8 @@
 
 #include "aom/aom_integer.h"
 #include "aom_dsp/blend.h"
+#include "aom_dsp/arm/mem_neon.h"
 #include "aom_ports/mem.h"
-#include "av1/common/arm/mem_neon.h"
 #include "av1/common/blockd.h"
 #include "config/av1_rtcd.h"
 
diff --git a/av1/common/arm/resize_neon.c b/av1/common/arm/resize_neon.c
index e42766e..190a3b2 100644
--- a/av1/common/arm/resize_neon.c
+++ b/av1/common/arm/resize_neon.c
@@ -12,10 +12,10 @@
 #include <arm_neon.h>
 #include <assert.h>
 
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
 #include "av1/common/resize.h"
-#include "av1/common/arm/mem_neon.h"
 #include "av1/common/arm/convolve_neon.h"
-#include "av1/common/arm/transpose_neon.h"
 #include "config/av1_rtcd.h"
 #include "config/aom_scale_rtcd.h"
 
diff --git a/av1/common/arm/selfguided_neon.c b/av1/common/arm/selfguided_neon.c
index fc404a6..f5eb36c 100644
--- a/av1/common/arm/selfguided_neon.c
+++ b/av1/common/arm/selfguided_neon.c
@@ -17,14 +17,14 @@
 
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/txfm_common.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
 #include "aom_mem/aom_mem.h"
 #include "aom_ports/mem.h"
 #include "av1/common/av1_common_int.h"
 #include "av1/common/common.h"
 #include "av1/common/resize.h"
 #include "av1/common/restoration.h"
-#include "av1/common/arm/mem_neon.h"
-#include "av1/common/arm/transpose_neon.h"
 
 // Constants used for right shift in final_filter calculation.
 #define NB_EVEN 5
diff --git a/av1/common/arm/wiener_convolve_neon.c b/av1/common/arm/wiener_convolve_neon.c
index a9bb5bc..06e7555 100644
--- a/av1/common/arm/wiener_convolve_neon.c
+++ b/av1/common/arm/wiener_convolve_neon.c
@@ -16,11 +16,11 @@
 #include "config/av1_rtcd.h"
 
 #include "aom_dsp/txfm_common.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
 #include "aom_ports/mem.h"
 #include "av1/common/common.h"
 #include "av1/common/arm/convolve_neon.h"
-#include "av1/common/arm/mem_neon.h"
-#include "av1/common/arm/transpose_neon.h"
 
 /* Wiener filter 2D
    Apply horizontal filter and store in a temporary buffer. When applying
diff --git a/av1/common/av1_common_int.h b/av1/common/av1_common_int.h
index 2acb500..980f31c 100644
--- a/av1/common/av1_common_int.h
+++ b/av1/common/av1_common_int.h
@@ -29,10 +29,9 @@
 #include "av1/common/restoration.h"
 #include "av1/common/tile_common.h"
 #include "av1/common/timing.h"
-#include "av1/common/odintrin.h"
-#include "av1/encoder/hash_motion.h"
 #include "aom_dsp/grain_synthesis.h"
 #include "aom_dsp/grain_table.h"
+#include "aom_dsp/odintrin.h"
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -135,7 +134,10 @@
   // distance when a very old frame is used as a reference.
   unsigned int display_order_hint;
   unsigned int ref_display_order_hint[INTER_REFS_PER_FRAME];
-
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  // Frame's level within the hierarchical structure.
+  unsigned int pyramid_level;
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
   MV_REF *mvs;
   uint8_t *seg_map;
   struct segmentation seg;
@@ -194,12 +196,32 @@
 
 /*!\brief Parameters related to CDEF */
 typedef struct {
-  int cdef_damping;                       /*!< CDEF damping factor */
-  int nb_cdef_strengths;                  /*!< Number of CDEF strength values */
-  int cdef_strengths[CDEF_MAX_STRENGTHS]; /*!< CDEF strength values for luma */
-  int cdef_uv_strengths[CDEF_MAX_STRENGTHS]; /*!< CDEF strength values for
-                                                chroma */
-  int cdef_bits; /*!< Number of CDEF strength values in bits */
+  //! CDEF column line buffer
+  uint16_t *colbuf[MAX_MB_PLANE];
+  //! CDEF top & bottom line buffer
+  uint16_t *linebuf[MAX_MB_PLANE];
+  //! CDEF intermediate buffer
+  uint16_t *srcbuf;
+  //! CDEF column line buffer sizes
+  size_t allocated_colbuf_size[MAX_MB_PLANE];
+  //! CDEF top and bottom line buffer sizes
+  size_t allocated_linebuf_size[MAX_MB_PLANE];
+  //! CDEF intermediate buffer size
+  size_t allocated_srcbuf_size;
+  //! CDEF damping factor
+  int cdef_damping;
+  //! Number of CDEF strength values
+  int nb_cdef_strengths;
+  //! CDEF strength values for luma
+  int cdef_strengths[CDEF_MAX_STRENGTHS];
+  //! CDEF strength values for chroma
+  int cdef_uv_strengths[CDEF_MAX_STRENGTHS];
+  //! Number of CDEF strength values in bits
+  int cdef_bits;
+  //! Number of rows in the frame in 4 pixel
+  int allocated_mi_rows;
+  //! Number of CDEF workers
+  int allocated_num_workers;
 } CdefInfo;
 
 /*!\cond */
@@ -322,6 +344,10 @@
 
   unsigned int order_hint;
   unsigned int display_order_hint;
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  // Frame's level within the hierarchical structure.
+  unsigned int pyramid_level;
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
   unsigned int frame_number;
   SkipModeInfo skip_mode_info;
   int refresh_frame_flags;  // Which ref frames are overwritten by this frame
@@ -604,12 +630,12 @@
 
   /*!
    * Delta of qindex (from base_qindex) for V plane DC coefficients.
-   * Same as those for U plane if cm->seq_params.separate_uv_delta_q == 0.
+   * Same as those for U plane if cm->seq_params->separate_uv_delta_q == 0.
    */
   int u_ac_delta_q;
   /*!
    * Delta of qindex (from base_qindex) for V plane AC coefficients.
-   * Same as those for U plane if cm->seq_params.separate_uv_delta_q == 0.
+   * Same as those for U plane if cm->seq_params->separate_uv_delta_q == 0.
    */
   int v_ac_delta_q;
 
@@ -730,7 +756,7 @@
   /*!
    * Code and details about current error status.
    */
-  struct aom_internal_error_info error;
+  struct aom_internal_error_info *error;
 
   /*!
    * AV1 allows two types of frame scaling operations:
@@ -782,10 +808,6 @@
   uint8_t superres_scale_denominator;
 
   /*!
-   * If true, buffer removal times are present.
-   */
-  bool buffer_removal_time_present;
-  /*!
    * buffer_removal_times[op_num] specifies the frame removal time in units of
    * DecCT clock ticks counted from the removal time of the last random access
    * point for operating point op_num.
@@ -952,7 +974,7 @@
    * Elements part of the sequence header, that are applicable for all the
    * frames in the video.
    */
-  SequenceHeader seq_params;
+  SequenceHeader *seq_params;
 
   /*!
    * Current CDFs of all the symbols for the current frame.
@@ -984,7 +1006,7 @@
   CommonContexts above_contexts;
 
   /**
-   * \name Signaled when cm->seq_params.frame_id_numbers_present_flag == 1
+   * \name Signaled when cm->seq_params->frame_id_numbers_present_flag == 1
    */
   /**@{*/
   int current_frame_id;         /*!< frame ID for the current frame. */
@@ -1016,20 +1038,12 @@
   int8_t ref_frame_side[REF_FRAMES];
 
   /*!
-   * Number of temporal layers: may be > 1 for SVC (scalable vector coding).
-   */
-  unsigned int number_temporal_layers;
-  /*!
    * Temporal layer ID of this frame
    * (in the range 0 ... (number_temporal_layers - 1)).
    */
   int temporal_layer_id;
 
   /*!
-   * Number of spatial layers: may be > 1 for SVC (scalable vector coding).
-   */
-  unsigned int number_spatial_layers;
-  /*!
    * Spatial layer ID of this frame
    * (in the range 0 ... (number_spatial_layers - 1)).
    */
@@ -1046,10 +1060,6 @@
   int64_t txcoeff_cost_timer;
   int64_t txcoeff_cost_count;
 #endif  // TXCOEFF_COST_TIMER
-
-#if CONFIG_LPF_MASK
-  int is_decoding;
-#endif  // CONFIG_LPF_MASK
 } AV1_COMMON;
 
 /*!\cond */
@@ -1194,15 +1204,15 @@
 // Returns 1 if this frame might allow mvs from some reference frame.
 static INLINE int frame_might_allow_ref_frame_mvs(const AV1_COMMON *cm) {
   return !cm->features.error_resilient_mode &&
-         cm->seq_params.order_hint_info.enable_ref_frame_mvs &&
-         cm->seq_params.order_hint_info.enable_order_hint &&
+         cm->seq_params->order_hint_info.enable_ref_frame_mvs &&
+         cm->seq_params->order_hint_info.enable_order_hint &&
          !frame_is_intra_only(cm);
 }
 
 // Returns 1 if this frame might use warped_motion
 static INLINE int frame_might_allow_warped_motion(const AV1_COMMON *cm) {
   return !cm->features.error_resilient_mode && !frame_is_intra_only(cm) &&
-         cm->seq_params.enable_warped_motion;
+         cm->seq_params->enable_warped_motion;
 }
 
 static INLINE void ensure_mv_buffer(RefCntBuffer *buf, AV1_COMMON *cm) {
@@ -1242,7 +1252,7 @@
 void cfl_init(CFL_CTX *cfl, const SequenceHeader *seq_params);
 
 static INLINE int av1_num_planes(const AV1_COMMON *cm) {
-  return cm->seq_params.monochrome ? 1 : MAX_MB_PLANE;
+  return cm->seq_params->monochrome ? 1 : MAX_MB_PLANE;
 }
 
 static INLINE void av1_init_above_context(CommonContexts *above_contexts,
@@ -1281,8 +1291,8 @@
     }
   }
   xd->mi_stride = cm->mi_params.mi_stride;
-  xd->error_info = &cm->error;
-  cfl_init(&xd->cfl, &cm->seq_params);
+  xd->error_info = cm->error;
+  cfl_init(&xd->cfl, cm->seq_params);
 }
 
 static INLINE void set_entropy_context(MACROBLOCKD *xd, int mi_row, int mi_col,
@@ -1564,7 +1574,7 @@
                                           const MACROBLOCKD *xd,
                                           int mi_col_start, int mi_col_end,
                                           const int tile_row) {
-  const SequenceHeader *const seq_params = &cm->seq_params;
+  const SequenceHeader *const seq_params = cm->seq_params;
   const int num_planes = av1_num_planes(cm);
   const int width = mi_col_end - mi_col_start;
   const int aligned_width =
diff --git a/av1/common/av1_loopfilter.c b/av1/common/av1_loopfilter.c
index caa15c2..705345c 100644
--- a/av1/common/av1_loopfilter.c
+++ b/av1/common/av1_loopfilter.c
@@ -106,8 +106,6 @@
   struct loopfilter *lf = &cm->lf;
   int lvl;
 
-  lf->combine_vert_horz_lf = 1;
-
   // init limits for given sharpness
   update_sharpness(lfi, lf->sharpness_level);
 
@@ -351,8 +349,14 @@
   const uint32_t scale_vert = plane_ptr->subsampling_y;
   uint8_t *const dst_ptr = plane_ptr->dst.buf;
   const int dst_stride = plane_ptr->dst.stride;
-  const int y_range = (MAX_MIB_SIZE >> scale_vert);
-  const int x_range = (MAX_MIB_SIZE >> scale_horz);
+  const int plane_mi_rows =
+      ROUND_POWER_OF_TWO(cm->mi_params.mi_rows, scale_vert);
+  const int plane_mi_cols =
+      ROUND_POWER_OF_TWO(cm->mi_params.mi_cols, scale_horz);
+  const int y_range = AOMMIN((int)(plane_mi_rows - (mi_row >> scale_vert)),
+                             (MAX_MIB_SIZE >> scale_vert));
+  const int x_range = AOMMIN((int)(plane_mi_cols - (mi_col >> scale_horz)),
+                             (MAX_MIB_SIZE >> scale_horz));
   for (int y = 0; y < y_range; y++) {
     uint8_t *p = dst_ptr + y * MI_SIZE * dst_stride;
     for (int x = 0; x < x_range;) {
@@ -376,8 +380,8 @@
       }
 
 #if CONFIG_AV1_HIGHBITDEPTH
-      const int use_highbitdepth = cm->seq_params.use_highbitdepth;
-      const aom_bit_depth_t bit_depth = cm->seq_params.bit_depth;
+      const int use_highbitdepth = cm->seq_params->use_highbitdepth;
+      const aom_bit_depth_t bit_depth = cm->seq_params->bit_depth;
       switch (params.filter_length) {
         // apply 4-tap filtering
         case 4:
@@ -456,6 +460,84 @@
   }
 }
 
+void av1_filter_block_plane_vert_rt(const AV1_COMMON *const cm,
+                                    const MACROBLOCKD *const xd,
+                                    const int plane,
+                                    const MACROBLOCKD_PLANE *const plane_ptr,
+                                    const uint32_t mi_row,
+                                    const uint32_t mi_col) {
+  const uint32_t scale_horz = plane_ptr->subsampling_x;
+  const uint32_t scale_vert = plane_ptr->subsampling_y;
+  uint8_t *const dst_ptr = plane_ptr->dst.buf;
+  const int dst_stride = plane_ptr->dst.stride;
+  const int plane_mi_rows =
+      ROUND_POWER_OF_TWO(cm->mi_params.mi_rows, scale_vert);
+  const int plane_mi_cols =
+      ROUND_POWER_OF_TWO(cm->mi_params.mi_cols, scale_horz);
+  const int y_range = AOMMIN((int)(plane_mi_rows - (mi_row >> scale_vert)),
+                             (MAX_MIB_SIZE >> scale_vert));
+  const int x_range = AOMMIN((int)(plane_mi_cols - (mi_col >> scale_horz)),
+                             (MAX_MIB_SIZE >> scale_horz));
+  assert(!plane);
+  assert(!(y_range % 2));
+  for (int y = 0; y < y_range; y += 2) {
+    uint8_t *p = dst_ptr + y * MI_SIZE * dst_stride;
+    for (int x = 0; x < x_range;) {
+      // inner loop always filter vertical edges in a MI block. If MI size
+      // is 8x8, it will filter the vertical edge aligned with a 8x8 block.
+      // If 4x4 transform is used, it will then filter the internal edge
+      //  aligned with a 4x4 block
+      const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE;
+      const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE;
+      uint32_t advance_units;
+      TX_SIZE tx_size;
+      AV1_DEBLOCKING_PARAMETERS params;
+      memset(&params, 0, sizeof(params));
+
+      tx_size =
+          set_lpf_parameters(&params, ((ptrdiff_t)1 << scale_horz), cm, xd,
+                             VERT_EDGE, curr_x, curr_y, plane, plane_ptr);
+      if (tx_size == TX_INVALID) {
+        params.filter_length = 0;
+        tx_size = TX_4X4;
+      }
+
+      switch (params.filter_length) {
+        // apply 4-tap filtering
+        case 4:
+          aom_lpf_vertical_4_dual(p, dst_stride, params.mblim, params.lim,
+                                  params.hev_thr, params.mblim, params.lim,
+                                  params.hev_thr);
+          break;
+        case 6:  // apply 6-tap filter for chroma plane only
+          assert(plane != 0);
+          aom_lpf_vertical_6_dual(p, dst_stride, params.mblim, params.lim,
+                                  params.hev_thr, params.mblim, params.lim,
+                                  params.hev_thr);
+          break;
+        // apply 8-tap filtering
+        case 8:
+          aom_lpf_vertical_8_dual(p, dst_stride, params.mblim, params.lim,
+                                  params.hev_thr, params.mblim, params.lim,
+                                  params.hev_thr);
+          break;
+        // apply 14-tap filtering
+        case 14:
+          aom_lpf_vertical_14_dual(p, dst_stride, params.mblim, params.lim,
+                                   params.hev_thr, params.mblim, params.lim,
+                                   params.hev_thr);
+          break;
+        // no filtering
+        default: break;
+      }
+      // advance the destination pointer
+      advance_units = tx_size_wide_unit[tx_size];
+      x += advance_units;
+      p += advance_units * MI_SIZE;
+    }
+  }
+}
+
 void av1_filter_block_plane_horz(const AV1_COMMON *const cm,
                                  const MACROBLOCKD *const xd, const int plane,
                                  const MACROBLOCKD_PLANE *const plane_ptr,
@@ -464,8 +546,14 @@
   const uint32_t scale_vert = plane_ptr->subsampling_y;
   uint8_t *const dst_ptr = plane_ptr->dst.buf;
   const int dst_stride = plane_ptr->dst.stride;
-  const int y_range = (MAX_MIB_SIZE >> scale_vert);
-  const int x_range = (MAX_MIB_SIZE >> scale_horz);
+  const int plane_mi_rows =
+      ROUND_POWER_OF_TWO(cm->mi_params.mi_rows, scale_vert);
+  const int plane_mi_cols =
+      ROUND_POWER_OF_TWO(cm->mi_params.mi_cols, scale_horz);
+  const int y_range = AOMMIN((int)(plane_mi_rows - (mi_row >> scale_vert)),
+                             (MAX_MIB_SIZE >> scale_vert));
+  const int x_range = AOMMIN((int)(plane_mi_cols - (mi_col >> scale_horz)),
+                             (MAX_MIB_SIZE >> scale_horz));
   for (int x = 0; x < x_range; x++) {
     uint8_t *p = dst_ptr + x * MI_SIZE;
     for (int y = 0; y < y_range;) {
@@ -489,8 +577,8 @@
       }
 
 #if CONFIG_AV1_HIGHBITDEPTH
-      const int use_highbitdepth = cm->seq_params.use_highbitdepth;
-      const aom_bit_depth_t bit_depth = cm->seq_params.bit_depth;
+      const int use_highbitdepth = cm->seq_params->use_highbitdepth;
+      const aom_bit_depth_t bit_depth = cm->seq_params->bit_depth;
       switch (params.filter_length) {
         // apply 4-tap filtering
         case 4:
@@ -572,61 +660,26 @@
   }
 }
 
-void av1_filter_block_plane_vert_test(const AV1_COMMON *const cm,
-                                      const MACROBLOCKD *const xd,
-                                      const int plane,
-                                      const MACROBLOCKD_PLANE *const plane_ptr,
-                                      const uint32_t mi_row,
-                                      const uint32_t mi_col) {
+void av1_filter_block_plane_horz_rt(const AV1_COMMON *const cm,
+                                    const MACROBLOCKD *const xd,
+                                    const int plane,
+                                    const MACROBLOCKD_PLANE *const plane_ptr,
+                                    const uint32_t mi_row,
+                                    const uint32_t mi_col) {
   const uint32_t scale_horz = plane_ptr->subsampling_x;
   const uint32_t scale_vert = plane_ptr->subsampling_y;
   uint8_t *const dst_ptr = plane_ptr->dst.buf;
   const int dst_stride = plane_ptr->dst.stride;
-  const int y_range = cm->mi_params.mi_rows >> scale_vert;
-  const int x_range = cm->mi_params.mi_cols >> scale_horz;
-  for (int y = 0; y < y_range; y++) {
-    uint8_t *p = dst_ptr + y * MI_SIZE * dst_stride;
-    for (int x = 0; x < x_range;) {
-      // inner loop always filter vertical edges in a MI block. If MI size
-      // is 8x8, it will filter the vertical edge aligned with a 8x8 block.
-      // If 4x4 transform is used, it will then filter the internal edge
-      //  aligned with a 4x4 block
-      const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE;
-      const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE;
-      uint32_t advance_units;
-      TX_SIZE tx_size;
-      AV1_DEBLOCKING_PARAMETERS params;
-      memset(&params, 0, sizeof(params));
-
-      tx_size =
-          set_lpf_parameters(&params, ((ptrdiff_t)1 << scale_horz), cm, xd,
-                             VERT_EDGE, curr_x, curr_y, plane, plane_ptr);
-      if (tx_size == TX_INVALID) {
-        params.filter_length = 0;
-        tx_size = TX_4X4;
-      }
-
-      // advance the destination pointer
-      advance_units = tx_size_wide_unit[tx_size];
-      x += advance_units;
-      p += advance_units * MI_SIZE;
-    }
-  }
-}
-
-void av1_filter_block_plane_horz_test(const AV1_COMMON *const cm,
-                                      const MACROBLOCKD *const xd,
-                                      const int plane,
-                                      const MACROBLOCKD_PLANE *const plane_ptr,
-                                      const uint32_t mi_row,
-                                      const uint32_t mi_col) {
-  const uint32_t scale_horz = plane_ptr->subsampling_x;
-  const uint32_t scale_vert = plane_ptr->subsampling_y;
-  uint8_t *const dst_ptr = plane_ptr->dst.buf;
-  const int dst_stride = plane_ptr->dst.stride;
-  const int y_range = cm->mi_params.mi_rows >> scale_vert;
-  const int x_range = cm->mi_params.mi_cols >> scale_horz;
-  for (int x = 0; x < x_range; x++) {
+  const int plane_mi_rows =
+      ROUND_POWER_OF_TWO(cm->mi_params.mi_rows, scale_vert);
+  const int plane_mi_cols =
+      ROUND_POWER_OF_TWO(cm->mi_params.mi_cols, scale_horz);
+  const int y_range = AOMMIN((int)(plane_mi_rows - (mi_row >> scale_vert)),
+                             (MAX_MIB_SIZE >> scale_vert));
+  const int x_range = AOMMIN((int)(plane_mi_cols - (mi_col >> scale_horz)),
+                             (MAX_MIB_SIZE >> scale_horz));
+  assert(!plane);
+  for (int x = 0; x < x_range; x += 2) {
     uint8_t *p = dst_ptr + x * MI_SIZE;
     for (int y = 0; y < y_range;) {
       // inner loop always filter vertical edges in a MI block. If MI size
@@ -648,6 +701,35 @@
         tx_size = TX_4X4;
       }
 
+      switch (params.filter_length) {
+        // apply 4-tap filtering
+        case 4:
+          aom_lpf_horizontal_4_dual(p, dst_stride, params.mblim, params.lim,
+                                    params.hev_thr, params.mblim, params.lim,
+                                    params.hev_thr);
+          break;
+        // apply 6-tap filtering
+        case 6:
+          assert(plane != 0);
+          aom_lpf_horizontal_6_dual(p, dst_stride, params.mblim, params.lim,
+                                    params.hev_thr, params.mblim, params.lim,
+                                    params.hev_thr);
+          break;
+        // apply 8-tap filtering
+        case 8:
+          aom_lpf_horizontal_8_dual(p, dst_stride, params.mblim, params.lim,
+                                    params.hev_thr, params.mblim, params.lim,
+                                    params.hev_thr);
+          break;
+        // apply 14-tap filtering
+        case 14:
+          aom_lpf_horizontal_14_dual(p, dst_stride, params.mblim, params.lim,
+                                     params.hev_thr, params.mblim, params.lim,
+                                     params.hev_thr);
+          break;
+        // no filtering
+        default: break;
+      }
       // advance the destination pointer
       advance_units = tx_size_high_unit[tx_size];
       y += advance_units;
@@ -655,136 +737,3 @@
     }
   }
 }
-
-static void loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm,
-                             MACROBLOCKD *xd, int start, int stop,
-#if CONFIG_LPF_MASK
-                             int is_decoding,
-#endif
-                             int plane_start, int plane_end) {
-  struct macroblockd_plane *pd = xd->plane;
-  const int col_start = 0;
-  const int col_end = cm->mi_params.mi_cols;
-  int mi_row, mi_col;
-  int plane;
-
-#if CONFIG_LPF_MASK
-  if (is_decoding) {
-    cm->is_decoding = is_decoding;
-    for (plane = plane_start; plane < plane_end; plane++) {
-      if (plane == 0 && !(cm->lf.filter_level[0]) && !(cm->lf.filter_level[1]))
-        break;
-      else if (plane == 1 && !(cm->lf.filter_level_u))
-        continue;
-      else if (plane == 2 && !(cm->lf.filter_level_v))
-        continue;
-
-      av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, 0, 0,
-                           plane, plane + 1);
-
-      av1_build_bitmask_vert_info(cm, &pd[plane], plane);
-      av1_build_bitmask_horz_info(cm, &pd[plane], plane);
-
-      // apply loop filtering which only goes through buffer once
-      for (mi_row = start; mi_row < stop; mi_row += MI_SIZE_64X64) {
-        for (mi_col = col_start; mi_col < col_end; mi_col += MI_SIZE_64X64) {
-          av1_setup_dst_planes(pd, BLOCK_64X64, frame_buffer, mi_row, mi_col,
-                               plane, plane + 1);
-          av1_filter_block_plane_bitmask_vert(cm, &pd[plane], plane, mi_row,
-                                              mi_col);
-          if (mi_col - MI_SIZE_64X64 >= 0) {
-            av1_setup_dst_planes(pd, BLOCK_64X64, frame_buffer, mi_row,
-                                 mi_col - MI_SIZE_64X64, plane, plane + 1);
-            av1_filter_block_plane_bitmask_horz(cm, &pd[plane], plane, mi_row,
-                                                mi_col - MI_SIZE_64X64);
-          }
-        }
-        av1_setup_dst_planes(pd, BLOCK_64X64, frame_buffer, mi_row,
-                             mi_col - MI_SIZE_64X64, plane, plane + 1);
-        av1_filter_block_plane_bitmask_horz(cm, &pd[plane], plane, mi_row,
-                                            mi_col - MI_SIZE_64X64);
-      }
-    }
-    return;
-  }
-#endif
-
-  for (plane = plane_start; plane < plane_end; plane++) {
-    if (plane == 0 && !(cm->lf.filter_level[0]) && !(cm->lf.filter_level[1]))
-      break;
-    else if (plane == 1 && !(cm->lf.filter_level_u))
-      continue;
-    else if (plane == 2 && !(cm->lf.filter_level_v))
-      continue;
-
-    if (cm->lf.combine_vert_horz_lf) {
-      // filter all vertical and horizontal edges in every 128x128 super block
-      for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
-        for (mi_col = col_start; mi_col < col_end; mi_col += MAX_MIB_SIZE) {
-          // filter vertical edges
-          av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, mi_row,
-                               mi_col, plane, plane + 1);
-          av1_filter_block_plane_vert(cm, xd, plane, &pd[plane], mi_row,
-                                      mi_col);
-          // filter horizontal edges
-          if (mi_col - MAX_MIB_SIZE >= 0) {
-            av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer,
-                                 mi_row, mi_col - MAX_MIB_SIZE, plane,
-                                 plane + 1);
-            av1_filter_block_plane_horz(cm, xd, plane, &pd[plane], mi_row,
-                                        mi_col - MAX_MIB_SIZE);
-          }
-        }
-        // filter horizontal edges
-        av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, mi_row,
-                             mi_col - MAX_MIB_SIZE, plane, plane + 1);
-        av1_filter_block_plane_horz(cm, xd, plane, &pd[plane], mi_row,
-                                    mi_col - MAX_MIB_SIZE);
-      }
-    } else {
-      // filter all vertical edges in every 128x128 super block
-      for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
-        for (mi_col = col_start; mi_col < col_end; mi_col += MAX_MIB_SIZE) {
-          av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, mi_row,
-                               mi_col, plane, plane + 1);
-          av1_filter_block_plane_vert(cm, xd, plane, &pd[plane], mi_row,
-                                      mi_col);
-        }
-      }
-
-      // filter all horizontal edges in every 128x128 super block
-      for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
-        for (mi_col = col_start; mi_col < col_end; mi_col += MAX_MIB_SIZE) {
-          av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, mi_row,
-                               mi_col, plane, plane + 1);
-          av1_filter_block_plane_horz(cm, xd, plane, &pd[plane], mi_row,
-                                      mi_col);
-        }
-      }
-    }
-  }
-}
-
-void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
-                           MACROBLOCKD *xd,
-#if CONFIG_LPF_MASK
-                           int is_decoding,
-#endif
-                           int plane_start, int plane_end, int partial_frame) {
-  int start_mi_row, end_mi_row, mi_rows_to_filter;
-
-  start_mi_row = 0;
-  mi_rows_to_filter = cm->mi_params.mi_rows;
-  if (partial_frame && cm->mi_params.mi_rows > 8) {
-    start_mi_row = cm->mi_params.mi_rows >> 1;
-    start_mi_row &= 0xfffffff8;
-    mi_rows_to_filter = AOMMAX(cm->mi_params.mi_rows / 8, 8);
-  }
-  end_mi_row = start_mi_row + mi_rows_to_filter;
-  av1_loop_filter_frame_init(cm, plane_start, plane_end);
-  loop_filter_rows(frame, cm, xd, start_mi_row, end_mi_row,
-#if CONFIG_LPF_MASK
-                   is_decoding,
-#endif
-                   plane_start, plane_end);
-}
diff --git a/av1/common/av1_loopfilter.h b/av1/common/av1_loopfilter.h
index ca16bbe..1bd00b4 100644
--- a/av1/common/av1_loopfilter.h
+++ b/av1/common/av1_loopfilter.h
@@ -39,47 +39,6 @@
   uint64_t bits[4];
 } FilterMask;
 
-#if CONFIG_LPF_MASK
-// This structure holds bit masks for all 4x4 blocks in a 64x64 region.
-// Each 1 bit represents a position in which we want to apply the loop filter.
-// For Y plane, 4x4 in 64x64 requires 16x16 = 256 bit, therefore we use 4
-// uint64_t; For U, V plane, for 420 format, plane size is 32x32, thus we use
-// a uint64_t to represent bitmask.
-// Left_ entries refer to whether we apply a filter on the border to the
-// left of the block.   Above_ entries refer to whether or not to apply a
-// filter on the above border.
-// Since each transform is accompanied by a potentially different type of
-// loop filter there is a different entry in the array for each transform size.
-typedef struct {
-  FilterMask left_y[TX_SIZES];
-  FilterMask above_y[TX_SIZES];
-  FilterMask left_u[TX_SIZES];
-  FilterMask above_u[TX_SIZES];
-  FilterMask left_v[TX_SIZES];
-  FilterMask above_v[TX_SIZES];
-
-  // Y plane vertical edge and horizontal edge filter level
-  uint8_t lfl_y_hor[MI_SIZE_64X64][MI_SIZE_64X64];
-  uint8_t lfl_y_ver[MI_SIZE_64X64][MI_SIZE_64X64];
-
-  // U plane filter level
-  uint8_t lfl_u_ver[MI_SIZE_64X64][MI_SIZE_64X64];
-  uint8_t lfl_u_hor[MI_SIZE_64X64][MI_SIZE_64X64];
-
-  // V plane filter level
-  uint8_t lfl_v_ver[MI_SIZE_64X64][MI_SIZE_64X64];
-  uint8_t lfl_v_hor[MI_SIZE_64X64][MI_SIZE_64X64];
-
-  // other info
-  FilterMask skip;
-  FilterMask is_vert_border;
-  FilterMask is_horz_border;
-  // Y or UV planes, 5 tx sizes: 4x4, 8x8, 16x16, 32x32, 64x64
-  FilterMask tx_size_ver[2][5];
-  FilterMask tx_size_hor[2][5];
-} LoopFilterMask;
-#endif  // CONFIG_LPF_MASK
-
 struct loopfilter {
   int filter_level[2];
   int filter_level_u;
@@ -96,14 +55,6 @@
 
   // 0 = ZERO_MV, MV
   int8_t mode_deltas[MAX_MODE_LF_DELTAS];
-
-  int combine_vert_horz_lf;
-
-#if CONFIG_LPF_MASK
-  LoopFilterMask *lfm;
-  size_t lfm_num;
-  int lfm_stride;
-#endif  // CONFIG_LPF_MASK
 };
 
 // Need to align this structure so when it is declared and
@@ -139,21 +90,6 @@
 void av1_loop_filter_frame_init(struct AV1Common *cm, int plane_start,
                                 int plane_end);
 
-/*!\brief Apply AV1 loop filter
- *
- * \ingroup in_loop_filter
- * \callgraph
- */
-#if CONFIG_LPF_MASK
-void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm,
-                           struct macroblockd *xd, int is_decoding,
-                           int plane_start, int plane_end, int partial_frame);
-#else
-void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm,
-                           struct macroblockd *xd, int plane_start,
-                           int plane_end, int partial_frame);
-#endif
-
 void av1_filter_block_plane_vert(const struct AV1Common *const cm,
                                  const MACROBLOCKD *const xd, const int plane,
                                  const MACROBLOCKD_PLANE *const plane_ptr,
@@ -164,49 +100,23 @@
                                  const MACROBLOCKD_PLANE *const plane_ptr,
                                  const uint32_t mi_row, const uint32_t mi_col);
 
+void av1_filter_block_plane_vert_rt(const struct AV1Common *const cm,
+                                    const MACROBLOCKD *const xd,
+                                    const int plane,
+                                    const MACROBLOCKD_PLANE *const plane_ptr,
+                                    const uint32_t mi_row,
+                                    const uint32_t mi_col);
+
+void av1_filter_block_plane_horz_rt(const struct AV1Common *const cm,
+                                    const MACROBLOCKD *const xd,
+                                    const int plane,
+                                    const MACROBLOCKD_PLANE *const plane_ptr,
+                                    const uint32_t mi_row,
+                                    const uint32_t mi_col);
+
 uint8_t av1_get_filter_level(const struct AV1Common *cm,
                              const loop_filter_info_n *lfi_n, const int dir_idx,
                              int plane, const MB_MODE_INFO *mbmi);
-#if CONFIG_LPF_MASK
-void av1_filter_block_plane_ver(struct AV1Common *const cm,
-                                struct macroblockd_plane *const plane_ptr,
-                                int pl, int mi_row, int mi_col);
-
-void av1_filter_block_plane_hor(struct AV1Common *const cm,
-                                struct macroblockd_plane *const plane, int pl,
-                                int mi_row, int mi_col);
-
-int get_index_shift(int mi_col, int mi_row, int *index);
-
-void av1_build_bitmask_vert_info(
-    struct AV1Common *const cm, const struct macroblockd_plane *const plane_ptr,
-    int plane);
-
-void av1_build_bitmask_horz_info(
-    struct AV1Common *const cm, const struct macroblockd_plane *const plane_ptr,
-    int plane);
-
-void av1_filter_block_plane_bitmask_vert(
-    struct AV1Common *const cm, struct macroblockd_plane *const plane_ptr,
-    int pl, int mi_row, int mi_col);
-
-void av1_filter_block_plane_bitmask_horz(
-    struct AV1Common *const cm, struct macroblockd_plane *const plane_ptr,
-    int pl, int mi_row, int mi_col);
-
-void av1_store_bitmask_univariant_tx(struct AV1Common *cm, int mi_row,
-                                     int mi_col, BLOCK_SIZE bsize,
-                                     MB_MODE_INFO *mbmi);
-
-void av1_store_bitmask_other_info(struct AV1Common *cm, int mi_row, int mi_col,
-                                  BLOCK_SIZE bsize, MB_MODE_INFO *mbmi,
-                                  int is_horz_coding_block_border,
-                                  int is_vert_coding_block_border);
-
-void av1_store_bitmask_vartx(struct AV1Common *cm, int mi_row, int mi_col,
-                             BLOCK_SIZE bsize, TX_SIZE tx_size,
-                             MB_MODE_INFO *mbmi);
-#endif  // CONFIG_LPF_MASK
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index ea16b43..7f2cd7e 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -15,6 +15,7 @@
  */
 
 #include "aom/aom_integer.h"
+#include "aom_dsp/odintrin.h"
 #include "aom_dsp/txfm_common.h"
 #include "av1/common/common.h"
 #include "av1/common/enums.h"
@@ -22,7 +23,6 @@
 #include "av1/common/filter.h"
 #include "av1/common/convolve.h"
 #include "av1/common/av1_txfm.h"
-#include "av1/common/odintrin.h"
 #include "av1/common/restoration.h"
 
 struct macroblockd;
@@ -214,6 +214,7 @@
 
 add_proto qw/void av1_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
 add_proto qw/void av1_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+specialize qw/av1_highbd_iwht4x4_16_add  sse4_1/;
 
 add_proto qw/void av1_inv_txfm2d_add_4x8/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
 add_proto qw/void av1_inv_txfm2d_add_8x4/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
@@ -274,6 +275,49 @@
 if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
 
   # ENCODEMB INVOKE
+  add_proto qw/void aom_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+                                          const MV *const mv, uint8_t *comp_pred, int width, int height, int subpel_x_q3,
+                                          int subpel_y_q3, const uint8_t *ref, int ref_stride, int subpel_search";
+  specialize qw/aom_upsampled_pred sse2/;
+  #
+  #
+  #
+  add_proto qw/void aom_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+                                                   const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+                                                   int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+                                                   int ref_stride, int subpel_search";
+  specialize qw/aom_comp_avg_upsampled_pred sse2/;
+
+  add_proto qw/void aom_dist_wtd_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+                                                       const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+                                                       int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+                                                       int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search";
+  specialize qw/aom_dist_wtd_comp_avg_upsampled_pred ssse3/;
+
+  add_proto qw/void aom_comp_mask_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+                                                       const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+                                                       int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+                                                       int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
+                                                       int subpel_search";
+  specialize qw/aom_comp_mask_upsampled_pred sse2/;
+
+  if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+    add_proto qw/void aom_highbd_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+                                                   const MV *const mv, uint8_t *comp_pred8, int width, int height, int subpel_x_q3,
+                                                   int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd, int subpel_search";
+    specialize qw/aom_highbd_upsampled_pred sse2/;
+
+    add_proto qw/void aom_highbd_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+                                                            const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+                                                            int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd, int subpel_search";
+    specialize qw/aom_highbd_comp_avg_upsampled_pred sse2/;
+
+    add_proto qw/void aom_highbd_dist_wtd_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+                                                                const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+                                                                int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+                                                                int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search";
+    specialize qw/aom_highbd_dist_wtd_comp_avg_upsampled_pred sse2/;
+  }
 
   # the transform coefficients are held in 32-bit
   # values, so the assembler code for  av1_block_error can no longer be used.
@@ -281,14 +325,14 @@
   specialize qw/av1_block_error sse2 avx2 neon/;
 
   add_proto qw/int64_t av1_block_error_lp/, "const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size";
-  specialize qw/av1_block_error_lp avx2 neon/;
+  specialize qw/av1_block_error_lp sse2 avx2 neon/;
 
   add_proto qw/void av1_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/av1_quantize_fp sse2 avx2 neon/;
 
-  add_proto qw/void av1_quantize_lp/, "const int16_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan";
-  specialize qw/av1_quantize_lp avx2 neon/;
-
+  # TODO(any): need to fix the bug in neon optimization and re-enable it.
+  add_proto qw/void av1_quantize_lp/, "const int16_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/av1_quantize_lp sse2 avx2/;
 
   add_proto qw/void av1_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/av1_quantize_fp_32x32 neon avx2/;
@@ -302,7 +346,7 @@
   # fdct functions
 
   add_proto qw/void av1_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/av1_fwht4x4 neon/;
+  specialize qw/av1_fwht4x4 sse4_1 neon/;
 
   #fwd txfm
   add_proto qw/void av1_lowbd_fwd_txfm/, "const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param";
@@ -383,7 +427,7 @@
   }
 
   add_proto qw/void av1_highbd_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/av1_highbd_fwht4x4 neon/;
+  specialize qw/av1_highbd_fwht4x4 sse4_1 neon/;
 
   # End av1_high encoder functions
 
@@ -430,8 +474,11 @@
   specialize qw/av1_get_horver_correlation_full sse4_1 avx2 neon/;
 
   add_proto qw/void av1_nn_predict/, " const float *input_nodes, const NN_CONFIG *const nn_config, int reduce_prec, float *const output";
+
+  add_proto qw/void av1_nn_fast_softmax_16/, " const float *input_nodes, float *output";
   if (aom_config("CONFIG_EXCLUDE_SIMD_MISMATCH") ne "yes") {
     specialize qw/av1_nn_predict sse3 neon/;
+    specialize qw/av1_nn_fast_softmax_16 sse3/;
   }
 
   # CNN functions
diff --git a/av1/common/blockd.c b/av1/common/blockd.c
index 00725ea..1d59750 100644
--- a/av1/common/blockd.c
+++ b/av1/common/blockd.c
@@ -11,8 +11,6 @@
 
 #include <math.h>
 
-#include "aom_ports/system_state.h"
-
 #include "av1/common/av1_common_int.h"
 #include "av1/common/blockd.h"
 
diff --git a/av1/common/blockd.h b/av1/common/blockd.h
index 35260bf..b2e72d2 100644
--- a/av1/common/blockd.h
+++ b/av1/common/blockd.h
@@ -39,6 +39,12 @@
 
 #define INTERINTRA_WEDGE_SIGN 0
 
+#define DEFAULT_INTER_TX_TYPE DCT_DCT
+
+#define MAX_PALETTE_BLOCK_WIDTH 64
+
+#define MAX_PALETTE_BLOCK_HEIGHT 64
+
 /*!\cond */
 
 // DIFFWTD_MASK_TYPES should not surpass 1 << MAX_DIFFWTD_MASK_BITS
@@ -320,6 +326,9 @@
   int8_t cdef_strength : 4;
   /**@}*/
 
+  /*! \brief Skip CDEF for this superblock */
+  uint8_t skip_cdef_curr_sb;
+
 #if CONFIG_RD_DEBUG
   /*! \brief RD info used for debugging */
   RD_STATS rd_stats;
@@ -801,7 +810,7 @@
   FRAME_CONTEXT *tile_ctx;
 
   /*!
-   * Bit depth: copied from cm->seq_params.bit_depth for convenience.
+   * Bit depth: copied from cm->seq_params->bit_depth for convenience.
    */
   int bd;
 
@@ -928,13 +937,42 @@
 /*!\cond */
 
 static INLINE int is_cur_buf_hbd(const MACROBLOCKD *xd) {
+#if CONFIG_AV1_HIGHBITDEPTH
   return xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH ? 1 : 0;
+#else
+  (void)xd;
+  return 0;
+#endif
 }
 
 static INLINE uint8_t *get_buf_by_bd(const MACROBLOCKD *xd, uint8_t *buf16) {
+#if CONFIG_AV1_HIGHBITDEPTH
   return (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
              ? CONVERT_TO_BYTEPTR(buf16)
              : buf16;
+#else
+  (void)xd;
+  return buf16;
+#endif
+}
+
+typedef struct BitDepthInfo {
+  int bit_depth;
+  /*! Is the image buffer high bit depth?
+   * Low bit depth buffer uses uint8_t.
+   * High bit depth buffer uses uint16_t.
+   * Equivalent to cm->seq_params->use_highbitdepth
+   */
+  int use_highbitdepth_buf;
+} BitDepthInfo;
+
+static INLINE BitDepthInfo get_bit_depth_info(const MACROBLOCKD *xd) {
+  BitDepthInfo bit_depth_info;
+  bit_depth_info.bit_depth = xd->bd;
+  bit_depth_info.use_highbitdepth_buf = is_cur_buf_hbd(xd);
+  assert(IMPLIES(!bit_depth_info.use_highbitdepth_buf,
+                 bit_depth_info.bit_depth == 8));
+  return bit_depth_info;
 }
 
 static INLINE int get_sqr_bsize_idx(BLOCK_SIZE bsize) {
@@ -1145,7 +1183,7 @@
   if (is_inter_block(mbmi) || plane_type != PLANE_TYPE_Y ||
       xd->lossless[mbmi->segment_id] || tx_size >= TX_32X32 ||
       use_screen_content_tools)
-    return DCT_DCT;
+    return DEFAULT_INTER_TX_TYPE;
 
   return intra_mode_to_tx_type(mbmi, plane_type);
 }
@@ -1467,8 +1505,10 @@
 static INLINE int av1_allow_palette(int allow_screen_content_tools,
                                     BLOCK_SIZE sb_type) {
   assert(sb_type < BLOCK_SIZES_ALL);
-  return allow_screen_content_tools && block_size_wide[sb_type] <= 64 &&
-         block_size_high[sb_type] <= 64 && sb_type >= BLOCK_8X8;
+  return allow_screen_content_tools &&
+         block_size_wide[sb_type] <= MAX_PALETTE_BLOCK_WIDTH &&
+         block_size_high[sb_type] <= MAX_PALETTE_BLOCK_HEIGHT &&
+         sb_type >= BLOCK_8X8;
 }
 
 // Returns sub-sampled dimensions of the given block.
diff --git a/av1/common/cdef.c b/av1/common/cdef.c
index d9b5a10..9ab7d4d 100644
--- a/av1/common/cdef.c
+++ b/av1/common/cdef.c
@@ -21,35 +21,6 @@
 #include "av1/common/cdef_block.h"
 #include "av1/common/reconinter.h"
 
-enum { TOP, LEFT, BOTTOM, RIGHT, BOUNDARIES } UENUM1BYTE(BOUNDARY);
-
-/*!\brief Parameters related to CDEF Block */
-typedef struct {
-  uint16_t *src;
-  uint8_t *dst;
-  uint16_t *colbuf[MAX_MB_PLANE];
-  cdef_list dlist[MI_SIZE_64X64 * MI_SIZE_64X64];
-
-  int xdec;
-  int ydec;
-  int mi_wide_l2;
-  int mi_high_l2;
-  int frame_boundary[BOUNDARIES];
-
-  int damping;
-  int coeff_shift;
-  int level;
-  int sec_strength;
-  int cdef_count;
-  int is_zero_level;
-  int dir[CDEF_NBLOCKS][CDEF_NBLOCKS];
-  int var[CDEF_NBLOCKS][CDEF_NBLOCKS];
-
-  int dst_stride;
-  int coffset;
-  int roffset;
-} CdefBlockInfo;
-
 static int is_8x8_block_skip(MB_MODE_INFO **grid, int mi_row, int mi_col,
                              int mi_stride) {
   MB_MODE_INFO **mbmi = grid + mi_row * mi_stride + mi_col;
@@ -116,10 +87,10 @@
   }
 }
 
-static void copy_sb8_16(AV1_COMMON *cm, uint16_t *dst, int dstride,
-                        const uint8_t *src, int src_voffset, int src_hoffset,
-                        int sstride, int vsize, int hsize) {
-  if (cm->seq_params.use_highbitdepth) {
+void av1_cdef_copy_sb8_16(const AV1_COMMON *const cm, uint16_t *const dst,
+                          int dstride, const uint8_t *src, int src_voffset,
+                          int src_hoffset, int sstride, int vsize, int hsize) {
+  if (cm->seq_params->use_highbitdepth) {
     const uint16_t *base =
         &CONVERT_TO_SHORTPTR(src)[src_voffset * sstride + src_hoffset];
     cdef_copy_rect8_16bit_to_16bit(dst, dstride, base, sstride, vsize, hsize);
@@ -151,29 +122,35 @@
 // Inputs:
 //   cm: Pointer to common structure.
 //   fb_info: Pointer to the CDEF block-level parameter structure.
-//   linebuf: Top feedback buffer for CDEF.
+//   colbuf: Left column buffer for CDEF.
 //   cdef_left: Left block is filtered or not.
 //   fbc, fbr: col and row index of a block.
 //   plane: plane index Y/CB/CR.
-//   prev_row_cdef: Top blocks are filtered or not.
 // Returns:
 //   Nothing will be returned.
-static void cdef_prepare_fb(AV1_COMMON *cm, CdefBlockInfo *fb_info,
-                            uint16_t **linebuf, const int *cdef_left, int fbc,
-                            int fbr, uint8_t plane,
-                            unsigned char *prev_row_cdef) {
+static void cdef_prepare_fb(const AV1_COMMON *const cm, CdefBlockInfo *fb_info,
+                            uint16_t **const colbuf, const int *cdef_left,
+                            int fbc, int fbr, int plane) {
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
   uint16_t *src = fb_info->src;
-  const int stride = (mi_params->mi_cols << MI_SIZE_LOG2) + 2 * CDEF_HBORDER;
+  const int luma_stride =
+      ALIGN_POWER_OF_TWO(mi_params->mi_cols << MI_SIZE_LOG2, 4);
   const int nvfb = (mi_params->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
   const int nhfb = (mi_params->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
   int cstart = 0;
   if (!*cdef_left) cstart = -CDEF_HBORDER;
   int rend, cend;
-  int nhb = AOMMIN(MI_SIZE_64X64, mi_params->mi_cols - MI_SIZE_64X64 * fbc);
-  int nvb = AOMMIN(MI_SIZE_64X64, mi_params->mi_rows - MI_SIZE_64X64 * fbr);
-  int hsize = nhb << fb_info->mi_wide_l2;
-  int vsize = nvb << fb_info->mi_high_l2;
+  const int nhb =
+      AOMMIN(MI_SIZE_64X64, mi_params->mi_cols - MI_SIZE_64X64 * fbc);
+  const int nvb =
+      AOMMIN(MI_SIZE_64X64, mi_params->mi_rows - MI_SIZE_64X64 * fbr);
+  const int hsize = nhb << fb_info->mi_wide_l2;
+  const int vsize = nvb << fb_info->mi_high_l2;
+  const uint16_t *top_linebuf = fb_info->top_linebuf[plane];
+  const uint16_t *bot_linebuf = fb_info->bot_linebuf[plane];
+  const int bot_offset = (vsize + CDEF_VBORDER) * CDEF_BSTRIDE;
+  const int stride =
+      luma_stride >> (plane == AOM_PLANE_Y ? 0 : cm->seq_params->subsampling_x);
 
   if (fbc == nhfb - 1)
     cend = hsize;
@@ -185,54 +162,55 @@
   else
     rend = vsize + CDEF_VBORDER;
 
-  if (fbc == nhfb - 1) {
-    /* On the last superblock column, fill in the right border with
-    CDEF_VERY_LARGE to avoid filtering with the outside. */
-    fill_rect(&src[cend + CDEF_HBORDER], CDEF_BSTRIDE, rend + CDEF_VBORDER,
-              hsize + CDEF_HBORDER - cend, CDEF_VERY_LARGE);
-  }
-  if (fbr == nvfb - 1) {
-    /* On the last superblock row, fill in the bottom border with
-    CDEF_VERY_LARGE to avoid filtering with the outside. */
-    fill_rect(&src[(rend + CDEF_VBORDER) * CDEF_BSTRIDE], CDEF_BSTRIDE,
-              CDEF_VBORDER, hsize + 2 * CDEF_HBORDER, CDEF_VERY_LARGE);
-  }
   /* Copy in the pixels we need from the current superblock for
   deringing.*/
-  copy_sb8_16(cm, &src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER + cstart],
-              CDEF_BSTRIDE, fb_info->dst, fb_info->roffset,
-              fb_info->coffset + cstart, fb_info->dst_stride, rend,
-              cend - cstart);
-  if (!prev_row_cdef[fbc]) {
-    copy_sb8_16(cm, &src[CDEF_HBORDER], CDEF_BSTRIDE, fb_info->dst,
-                fb_info->roffset - CDEF_VBORDER, fb_info->coffset,
-                fb_info->dst_stride, CDEF_VBORDER, hsize);
-  } else if (fbr > 0) {
-    copy_rect(&src[CDEF_HBORDER], CDEF_BSTRIDE,
-              &linebuf[plane][fb_info->coffset], stride, CDEF_VBORDER, hsize);
+  av1_cdef_copy_sb8_16(
+      cm, &src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER + cstart],
+      CDEF_BSTRIDE, fb_info->dst, fb_info->roffset, fb_info->coffset + cstart,
+      fb_info->dst_stride, vsize, cend - cstart);
+
+  /* Copy in the pixels we need for the current superblock from bottom buffer.*/
+  if (fbr < nvfb - 1) {
+    copy_rect(&src[bot_offset + CDEF_HBORDER], CDEF_BSTRIDE,
+              &bot_linebuf[fb_info->coffset], stride, CDEF_VBORDER, hsize);
+  } else {
+    fill_rect(&src[bot_offset + CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER,
+              hsize, CDEF_VERY_LARGE);
+  }
+  if (fbr < nvfb - 1 && fbc > 0) {
+    copy_rect(&src[bot_offset], CDEF_BSTRIDE,
+              &bot_linebuf[fb_info->coffset - CDEF_HBORDER], stride,
+              CDEF_VBORDER, CDEF_HBORDER);
+  } else {
+    fill_rect(&src[bot_offset], CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER,
+              CDEF_VERY_LARGE);
+  }
+  if (fbr < nvfb - 1 && fbc < nhfb - 1) {
+    copy_rect(&src[bot_offset + hsize + CDEF_HBORDER], CDEF_BSTRIDE,
+              &bot_linebuf[fb_info->coffset + hsize], stride, CDEF_VBORDER,
+              CDEF_HBORDER);
+  } else {
+    fill_rect(&src[bot_offset + hsize + CDEF_HBORDER], CDEF_BSTRIDE,
+              CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE);
+  }
+
+  /* Copy in the pixels we need from the current superblock from top buffer.*/
+  if (fbr > 0) {
+    copy_rect(&src[CDEF_HBORDER], CDEF_BSTRIDE, &top_linebuf[fb_info->coffset],
+              stride, CDEF_VBORDER, hsize);
   } else {
     fill_rect(&src[CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER, hsize,
               CDEF_VERY_LARGE);
   }
-  if (!prev_row_cdef[fbc - 1]) {
-    copy_sb8_16(cm, src, CDEF_BSTRIDE, fb_info->dst,
-                fb_info->roffset - CDEF_VBORDER,
-                fb_info->coffset - CDEF_HBORDER, fb_info->dst_stride,
-                CDEF_VBORDER, CDEF_HBORDER);
-  } else if (fbr > 0 && fbc > 0) {
-    copy_rect(src, CDEF_BSTRIDE,
-              &linebuf[plane][fb_info->coffset - CDEF_HBORDER], stride,
-              CDEF_VBORDER, CDEF_HBORDER);
+  if (fbr > 0 && fbc > 0) {
+    copy_rect(src, CDEF_BSTRIDE, &top_linebuf[fb_info->coffset - CDEF_HBORDER],
+              stride, CDEF_VBORDER, CDEF_HBORDER);
   } else {
     fill_rect(src, CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE);
   }
-  if (!prev_row_cdef[fbc + 1]) {
-    copy_sb8_16(cm, &src[CDEF_HBORDER + hsize], CDEF_BSTRIDE, fb_info->dst,
-                fb_info->roffset - CDEF_VBORDER, fb_info->coffset + hsize,
-                fb_info->dst_stride, CDEF_VBORDER, CDEF_HBORDER);
-  } else if (fbr > 0 && fbc < nhfb - 1) {
+  if (fbr > 0 && fbc < nhfb - 1) {
     copy_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE,
-              &linebuf[plane][fb_info->coffset + hsize], stride, CDEF_VBORDER,
+              &top_linebuf[fb_info->coffset + hsize], stride, CDEF_VBORDER,
               CDEF_HBORDER);
   } else {
     fill_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER,
@@ -241,36 +219,25 @@
   if (*cdef_left) {
     /* If we deringed the superblock on the left then we need to copy in
     saved pixels. */
-    copy_rect(src, CDEF_BSTRIDE, fb_info->colbuf[plane], CDEF_HBORDER,
+    copy_rect(src, CDEF_BSTRIDE, colbuf[plane], CDEF_HBORDER,
               rend + CDEF_VBORDER, CDEF_HBORDER);
   }
   /* Saving pixels in case we need to dering the superblock on the
   right. */
-  copy_rect(fb_info->colbuf[plane], CDEF_HBORDER, src + hsize, CDEF_BSTRIDE,
+  copy_rect(colbuf[plane], CDEF_HBORDER, src + hsize, CDEF_BSTRIDE,
             rend + CDEF_VBORDER, CDEF_HBORDER);
-  copy_sb8_16(cm, &linebuf[plane][fb_info->coffset], stride, fb_info->dst,
-              (MI_SIZE_64X64 << fb_info->mi_high_l2) * (fbr + 1) - CDEF_VBORDER,
-              fb_info->coffset, fb_info->dst_stride, CDEF_VBORDER, hsize);
 
-  if (fb_info->frame_boundary[TOP]) {
-    fill_rect(src, CDEF_BSTRIDE, CDEF_VBORDER, hsize + 2 * CDEF_HBORDER,
-              CDEF_VERY_LARGE);
-  }
   if (fb_info->frame_boundary[LEFT]) {
     fill_rect(src, CDEF_BSTRIDE, vsize + 2 * CDEF_VBORDER, CDEF_HBORDER,
               CDEF_VERY_LARGE);
   }
-  if (fb_info->frame_boundary[BOTTOM]) {
-    fill_rect(&src[(vsize + CDEF_VBORDER) * CDEF_BSTRIDE], CDEF_BSTRIDE,
-              CDEF_VBORDER, hsize + 2 * CDEF_HBORDER, CDEF_VERY_LARGE);
-  }
   if (fb_info->frame_boundary[RIGHT]) {
     fill_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE,
               vsize + 2 * CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE);
   }
 }
 
-static INLINE void cdef_filter_fb(CdefBlockInfo *fb_info, uint8_t plane,
+static INLINE void cdef_filter_fb(CdefBlockInfo *const fb_info, int plane,
                                   uint8_t use_highbitdepth) {
   int offset = fb_info->dst_stride * fb_info->roffset + fb_info->coffset;
   if (use_highbitdepth) {
@@ -291,11 +258,11 @@
 }
 
 // Initializes block-level parameters for CDEF.
-static INLINE void cdef_init_fb_col(MACROBLOCKD *xd,
+static INLINE void cdef_init_fb_col(const MACROBLOCKD *const xd,
                                     const CdefInfo *const cdef_info,
-                                    CdefBlockInfo *fb_info,
-                                    const int mbmi_cdef_strength, int fbc,
-                                    int fbr, uint8_t plane) {
+                                    CdefBlockInfo *const fb_info,
+                                    int mbmi_cdef_strength, int fbc, int fbr,
+                                    int plane) {
   if (plane == AOM_PLANE_Y) {
     fb_info->level =
         cdef_info->cdef_strengths[mbmi_cdef_strength] / CDEF_SEC_STRENGTHS;
@@ -328,9 +295,9 @@
   fb_info->coffset = MI_SIZE_64X64 * fbc << fb_info->mi_wide_l2;
 }
 
-static bool cdef_fb_col(AV1_COMMON *cm, MACROBLOCKD *xd, CdefBlockInfo *fb_info,
-                        int fbc, int fbr, int *cdef_left, uint16_t **linebuf,
-                        unsigned char *prev_row_cdef) {
+static void cdef_fb_col(const AV1_COMMON *const cm, const MACROBLOCKD *const xd,
+                        CdefBlockInfo *const fb_info, uint16_t **const colbuf,
+                        int *cdef_left, int fbc, int fbr) {
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
   const int mbmi_cdef_strength =
       mi_params
@@ -343,9 +310,9 @@
                               MI_SIZE_64X64 * fbc] == NULL ||
       mbmi_cdef_strength == -1) {
     *cdef_left = 0;
-    return 0;
+    return;
   }
-  for (uint8_t plane = 0; plane < num_planes; plane++) {
+  for (int plane = 0; plane < num_planes; plane++) {
     cdef_init_fb_col(xd, &cm->cdef_info, fb_info, mbmi_cdef_strength, fbc, fbr,
                      plane);
     if (fb_info->is_zero_level ||
@@ -353,20 +320,26 @@
              mi_params, fbr * MI_SIZE_64X64, fbc * MI_SIZE_64X64,
              fb_info->dlist, BLOCK_64X64)) == 0) {
       *cdef_left = 0;
-      return 0;
+      return;
     }
-    cdef_prepare_fb(cm, fb_info, linebuf, cdef_left, fbc, fbr, plane,
-                    prev_row_cdef);
-    cdef_filter_fb(fb_info, plane, cm->seq_params.use_highbitdepth);
+    cdef_prepare_fb(cm, fb_info, colbuf, cdef_left, fbc, fbr, plane);
+    cdef_filter_fb(fb_info, plane, cm->seq_params->use_highbitdepth);
   }
   *cdef_left = 1;
-  return 1;
 }
 
-static INLINE void cdef_init_fb_row(CdefBlockInfo *fb_info, int mi_rows,
-                                    int fbr) {
-  const int nvfb = (mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
-
+// Initializes row-level parameters for CDEF frame.
+void av1_cdef_init_fb_row(const AV1_COMMON *const cm,
+                          const MACROBLOCKD *const xd,
+                          CdefBlockInfo *const fb_info,
+                          uint16_t **const linebuf, uint16_t *const src,
+                          struct AV1CdefSyncData *const cdef_sync, int fbr) {
+  (void)cdef_sync;
+  const int num_planes = av1_num_planes(cm);
+  const int nvfb = (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+  const int luma_stride =
+      ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols << MI_SIZE_LOG2, 4);
+  const bool ping_pong = fbr & 1;
   // for the current filter block, it's top left corner mi structure (mi_tl)
   // is first accessed to check whether the top and left boundaries are
   // frame boundaries. Then bottom-left and top-right mi structures are
@@ -379,78 +352,58 @@
   fb_info->frame_boundary[TOP] = (MI_SIZE_64X64 * fbr == 0) ? 1 : 0;
   if (fbr != nvfb - 1)
     fb_info->frame_boundary[BOTTOM] =
-        (MI_SIZE_64X64 * (fbr + 1) == mi_rows) ? 1 : 0;
+        (MI_SIZE_64X64 * (fbr + 1) == cm->mi_params.mi_rows) ? 1 : 0;
   else
     fb_info->frame_boundary[BOTTOM] = 1;
-}
-
-static void cdef_fb_row(AV1_COMMON *cm, MACROBLOCKD *xd, CdefBlockInfo *fb_info,
-                        uint16_t **linebuf, int fbr,
-                        unsigned char *curr_row_cdef,
-                        unsigned char *prev_row_cdef) {
-  int cdef_left = 1;
-  const int nhfb = (cm->mi_params.mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
-
-  cdef_init_fb_row(fb_info, cm->mi_params.mi_rows, fbr);
-  for (int fbc = 0; fbc < nhfb; fbc++) {
-    fb_info->frame_boundary[LEFT] = (MI_SIZE_64X64 * fbc == 0) ? 1 : 0;
-    if (fbc != nhfb - 1)
-      fb_info->frame_boundary[RIGHT] =
-          (MI_SIZE_64X64 * (fbc + 1) == cm->mi_params.mi_cols) ? 1 : 0;
-    else
-      fb_info->frame_boundary[RIGHT] = 1;
-    curr_row_cdef[fbc] = cdef_fb_col(cm, xd, fb_info, fbc, fbr, &cdef_left,
-                                     linebuf, prev_row_cdef);
-  }
-}
-
-// Initialize the frame-level CDEF parameters.
-// Inputs:
-//   frame: Pointer to input frame buffer.
-//   cm: Pointer to common structure.
-//   xd: Pointer to common current coding block structure.
-//   fb_info: Pointer to the CDEF block-level parameter structure.
-//   src: Intermediate input buffer for CDEF.
-//   colbuf: Left feedback buffer for CDEF.
-//   linebuf: Top feedback buffer for CDEF.
-// Returns:
-//   Nothing will be returned.
-static void cdef_prepare_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
-                               MACROBLOCKD *xd, CdefBlockInfo *fb_info,
-                               uint16_t *src, uint16_t **colbuf,
-                               uint16_t **linebuf) {
-  const int num_planes = av1_num_planes(cm);
-  const int stride = (cm->mi_params.mi_cols << MI_SIZE_LOG2) + 2 * CDEF_HBORDER;
-  av1_setup_dst_planes(xd->plane, cm->seq_params.sb_size, frame, 0, 0, 0,
-                       num_planes);
-
-  for (uint8_t plane = 0; plane < num_planes; plane++) {
-    linebuf[plane] = aom_malloc(sizeof(*linebuf) * CDEF_VBORDER * stride);
-    const int mi_high_l2 = MI_SIZE_LOG2 - xd->plane[plane].subsampling_y;
-    const int block_height = (MI_SIZE_64X64 << mi_high_l2) + 2 * CDEF_VBORDER;
-    colbuf[plane] = aom_malloc(
-        sizeof(*colbuf) *
-        ((CDEF_BLOCKSIZE << (MI_SIZE_LOG2 - xd->plane[plane].subsampling_y)) +
-         2 * CDEF_VBORDER) *
-        CDEF_HBORDER);
-    fill_rect(colbuf[plane], CDEF_HBORDER, block_height, CDEF_HBORDER,
-              CDEF_VERY_LARGE);
-    fb_info->colbuf[plane] = colbuf[plane];
-  }
 
   fb_info->src = src;
   fb_info->damping = cm->cdef_info.cdef_damping;
-  fb_info->coeff_shift = AOMMAX(cm->seq_params.bit_depth - 8, 0);
-  memset(fb_info->dir, 0, sizeof(fb_info->dir));
-  memset(fb_info->var, 0, sizeof(fb_info->var));
+  fb_info->coeff_shift = AOMMAX(cm->seq_params->bit_depth - 8, 0);
+  av1_zero(fb_info->dir);
+  av1_zero(fb_info->var);
+
+  for (int plane = 0; plane < num_planes; plane++) {
+    const int mi_high_l2 = MI_SIZE_LOG2 - xd->plane[plane].subsampling_y;
+    const int offset = MI_SIZE_64X64 * (fbr + 1) << mi_high_l2;
+    const int stride = luma_stride >> xd->plane[plane].subsampling_x;
+    // here ping-pong buffers are maintained for top linebuf
+    // to avoid linebuf over-write by consecutive row.
+    uint16_t *const top_linebuf =
+        &linebuf[plane][ping_pong * CDEF_VBORDER * stride];
+    fb_info->bot_linebuf[plane] = &linebuf[plane][(CDEF_VBORDER << 1) * stride];
+
+    if (fbr != nvfb - 1)  // top line buffer copy
+      av1_cdef_copy_sb8_16(cm, top_linebuf, stride, xd->plane[plane].dst.buf,
+                           offset - CDEF_VBORDER, 0,
+                           xd->plane[plane].dst.stride, CDEF_VBORDER, stride);
+    fb_info->top_linebuf[plane] =
+        &linebuf[plane][(!ping_pong) * CDEF_VBORDER * stride];
+
+    if (fbr != nvfb - 1)  // bottom line buffer copy
+      av1_cdef_copy_sb8_16(cm, fb_info->bot_linebuf[plane], stride,
+                           xd->plane[plane].dst.buf, offset, 0,
+                           xd->plane[plane].dst.stride, CDEF_VBORDER, stride);
+  }
 }
 
-static void cdef_free(unsigned char *row_cdef, uint16_t **colbuf,
-                      uint16_t **linebuf, const int num_planes) {
-  aom_free(row_cdef);
-  for (uint8_t plane = 0; plane < num_planes; plane++) {
-    aom_free(colbuf[plane]);
-    aom_free(linebuf[plane]);
+void av1_cdef_fb_row(const AV1_COMMON *const cm, MACROBLOCKD *xd,
+                     uint16_t **const linebuf, uint16_t **const colbuf,
+                     uint16_t *const src, int fbr,
+                     cdef_init_fb_row_t cdef_init_fb_row_fn,
+                     struct AV1CdefSyncData *const cdef_sync) {
+  CdefBlockInfo fb_info;
+  int cdef_left = 1;
+  const int nhfb = (cm->mi_params.mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+
+  cdef_init_fb_row_fn(cm, xd, &fb_info, linebuf, src, cdef_sync, fbr);
+  for (int fbc = 0; fbc < nhfb; fbc++) {
+    fb_info.frame_boundary[LEFT] = (MI_SIZE_64X64 * fbc == 0) ? 1 : 0;
+    if (fbc != nhfb - 1)
+      fb_info.frame_boundary[RIGHT] =
+          (MI_SIZE_64X64 * (fbc + 1) == cm->mi_params.mi_cols) ? 1 : 0;
+    else
+      fb_info.frame_boundary[RIGHT] = 1;
+    cdef_fb_col(cm, xd, &fb_info, colbuf, &cdef_left, fbc, fbr);
   }
 }
 
@@ -461,29 +414,15 @@
 //   xd: Pointer to common current coding block structure.
 // Returns:
 //   Nothing will be returned.
-void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
-                    MACROBLOCKD *xd) {
-  DECLARE_ALIGNED(16, uint16_t, src[CDEF_INBUF_SIZE]);
-  uint16_t *colbuf[MAX_MB_PLANE] = { NULL };
-  uint16_t *linebuf[MAX_MB_PLANE] = { NULL };
-  CdefBlockInfo fb_info;
-  unsigned char *row_cdef, *prev_row_cdef, *curr_row_cdef;
+void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *const cm,
+                    MACROBLOCKD *xd, cdef_init_fb_row_t cdef_init_fb_row_fn) {
   const int num_planes = av1_num_planes(cm);
   const int nvfb = (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
-  const int nhfb = (cm->mi_params.mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
 
-  row_cdef = aom_malloc(sizeof(*row_cdef) * (nhfb + 2) * 2);
-  memset(row_cdef, 1, sizeof(*row_cdef) * (nhfb + 2) * 2);
-  prev_row_cdef = row_cdef + 1;
-  curr_row_cdef = prev_row_cdef + nhfb + 2;
-  cdef_prepare_frame(frame, cm, xd, &fb_info, src, colbuf, linebuf);
+  av1_setup_dst_planes(xd->plane, cm->seq_params->sb_size, frame, 0, 0, 0,
+                       num_planes);
 
-  for (int fbr = 0; fbr < nvfb; fbr++) {
-    unsigned char *tmp;
-    cdef_fb_row(cm, xd, &fb_info, linebuf, fbr, curr_row_cdef, prev_row_cdef);
-    tmp = prev_row_cdef;
-    prev_row_cdef = curr_row_cdef;
-    curr_row_cdef = tmp;
-  }
-  cdef_free(row_cdef, colbuf, linebuf, num_planes);
+  for (int fbr = 0; fbr < nvfb; fbr++)
+    av1_cdef_fb_row(cm, xd, cm->cdef_info.linebuf, cm->cdef_info.colbuf,
+                    cm->cdef_info.srcbuf, fbr, cdef_init_fb_row_fn, NULL);
 }
diff --git a/av1/common/cdef.h b/av1/common/cdef.h
index 4d6e600..1941178 100644
--- a/av1/common/cdef.h
+++ b/av1/common/cdef.h
@@ -23,6 +23,40 @@
 #include "av1/common/av1_common_int.h"
 #include "av1/common/cdef_block.h"
 
+enum { TOP, LEFT, BOTTOM, RIGHT, BOUNDARIES } UENUM1BYTE(BOUNDARY);
+
+struct AV1CdefSyncData;
+
+/*!\brief Parameters related to CDEF Block */
+typedef struct {
+  uint16_t *src;                       /*!< CDEF intermediate buffer */
+  uint16_t *top_linebuf[MAX_MB_PLANE]; /*!< CDEF top line buffer */
+  uint16_t *bot_linebuf[MAX_MB_PLANE]; /*!< CDEF bottom line buffer */
+  uint8_t *dst;                        /*!< CDEF destination buffer */
+  cdef_list
+      dlist[MI_SIZE_64X64 * MI_SIZE_64X64]; /*!< CDEF 8x8 block positions */
+
+  int xdec;                       /*!< Sub-sampling X */
+  int ydec;                       /*!< Sub-sampling X */
+  int mi_wide_l2;                 /*!< Pixels per mi unit in width */
+  int mi_high_l2;                 /*!< Pixels per mi unit in height */
+  int frame_boundary[BOUNDARIES]; /*!< frame boundaries */
+
+  int damping;     /*!< CDEF damping factor */
+  int coeff_shift; /*!< Bit-depth based shift for calculating filter strength */
+  int level;       /*!< CDEF filtering level */
+  int sec_strength;  /*!< CDEF secondary strength */
+  int cdef_count;    /*!< Number of CDEF sub-blocks in superblock */
+  int is_zero_level; /*!< CDEF filtering level ON/OFF */
+  int dir[CDEF_NBLOCKS]
+         [CDEF_NBLOCKS]; /*!< CDEF filter direction for all 8x8 sub-blocks*/
+  int var[CDEF_NBLOCKS][CDEF_NBLOCKS]; /*!< variance for all 8x8 sub-blocks */
+
+  int dst_stride; /*!< CDEF destination buffer stride */
+  int coffset;    /*!< current superblock offset in a row */
+  int roffset;    /*!< current row offset */
+} CdefBlockInfo;
+
 static INLINE int sign(int i) { return i < 0 ? -1 : 1; }
 
 static INLINE int constrain(int diff, int threshold, int damping) {
@@ -41,19 +75,36 @@
                              int mi_row, int mi_col, cdef_list *dlist,
                              BLOCK_SIZE bsize);
 
+typedef void (*cdef_init_fb_row_t)(
+    const AV1_COMMON *const cm, const MACROBLOCKD *const xd,
+    CdefBlockInfo *const fb_info, uint16_t **const linebuf, uint16_t *const src,
+    struct AV1CdefSyncData *const cdef_sync, int fbr);
+
 /*!\brief Function for applying CDEF to a frame
  *
  * \ingroup in_loop_cdef
  * This function applies CDEF to a frame.
  *
- * \param[in, out]  frame       Compressed frame buffer
- * \param[in, out]  cm          Pointer to top level common structure
- * \param[in]       xd          Pointer to common current coding block structure
+ * \param[in, out]  frame     Compressed frame buffer
+ * \param[in, out]  cm        Pointer to top level common structure
+ * \param[in]       xd        Pointer to common current coding block structure
+ * \param[in]       cdef_init_fb_row_fn   Function Pointer
  *
  * \return Nothing is returned. Instead, the filtered frame is output in
  * \c frame.
  */
-void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, MACROBLOCKD *xd);
+void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *const cm,
+                    MACROBLOCKD *xd, cdef_init_fb_row_t cdef_init_fb_row_fn);
+void av1_cdef_fb_row(const AV1_COMMON *const cm, MACROBLOCKD *xd,
+                     uint16_t **const linebuf, uint16_t **const colbuf,
+                     uint16_t *const src, int fbr,
+                     cdef_init_fb_row_t cdef_init_fb_row_fn,
+                     struct AV1CdefSyncData *const cdef_sync);
+void av1_cdef_init_fb_row(const AV1_COMMON *const cm,
+                          const MACROBLOCKD *const xd,
+                          CdefBlockInfo *const fb_info,
+                          uint16_t **const linebuf, uint16_t *const src,
+                          struct AV1CdefSyncData *const cdef_sync, int fbr);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/av1/common/cdef_block.h b/av1/common/cdef_block.h
index 574df2d..27c57dd 100644
--- a/av1/common/cdef_block.h
+++ b/av1/common/cdef_block.h
@@ -12,7 +12,7 @@
 #ifndef AOM_AV1_COMMON_CDEF_BLOCK_H_
 #define AOM_AV1_COMMON_CDEF_BLOCK_H_
 
-#include "av1/common/odintrin.h"
+#include "aom_dsp/odintrin.h"
 
 #define CDEF_BLOCKSIZE 64
 #define CDEF_BLOCKSIZE_LOG2 6
diff --git a/av1/common/cfl.h b/av1/common/cfl.h
index 0062e9f..0d53764 100644
--- a/av1/common/cfl.h
+++ b/av1/common/cfl.h
@@ -39,7 +39,7 @@
                                                   const MACROBLOCKD *xd) {
   const MB_MODE_INFO *mbmi = xd->mi[0];
 
-  if (cm->seq_params.monochrome) return CFL_DISALLOWED;
+  if (cm->seq_params->monochrome) return CFL_DISALLOWED;
 
   if (!xd->is_chroma_ref) {
     // For non-chroma-reference blocks, we should always store the luma pixels,
diff --git a/av1/common/common.h b/av1/common/common.h
index bed6083..bb74924 100644
--- a/av1/common/common.h
+++ b/av1/common/common.h
@@ -26,8 +26,6 @@
 extern "C" {
 #endif
 
-#define PI 3.141592653589793238462643383279502884
-
 // Only need this for fixed-size arrays, for structs just assign.
 #define av1_copy(dest, src)              \
   {                                      \
@@ -50,7 +48,7 @@
 }
 
 #define CHECK_MEM_ERROR(cm, lval, expr) \
-  AOM_CHECK_MEM_ERROR(&cm->error, lval, expr)
+  AOM_CHECK_MEM_ERROR(cm->error, lval, expr)
 
 #define AOM_FRAME_MARKER 0x2
 
diff --git a/av1/common/common_data.h b/av1/common/common_data.h
index 402845c..38e1471 100644
--- a/av1/common/common_data.h
+++ b/av1/common/common_data.h
@@ -434,9 +434,12 @@
 static const int quant_dist_weight[4][2] = {
   { 2, 3 }, { 2, 5 }, { 2, 7 }, { 1, MAX_FRAME_DISTANCE }
 };
-static const int quant_dist_lookup_table[2][4][2] = {
-  { { 9, 7 }, { 11, 5 }, { 12, 4 }, { 13, 3 } },
-  { { 7, 9 }, { 5, 11 }, { 4, 12 }, { 3, 13 } },
+
+static const int quant_dist_lookup_table[4][2] = {
+  { 9, 7 },
+  { 11, 5 },
+  { 12, 4 },
+  { 13, 3 },
 };
 
 #ifdef __cplusplus
diff --git a/av1/common/enums.h b/av1/common/enums.h
index a4ef929..eb655c9 100644
--- a/av1/common/enums.h
+++ b/av1/common/enums.h
@@ -16,6 +16,7 @@
 
 #include "aom/aom_codec.h"
 #include "aom/aom_integer.h"
+#include "aom_dsp/txfm_common.h"
 #include "aom_ports/mem.h"
 
 #ifdef __cplusplus
@@ -171,33 +172,6 @@
 #define PARTITION_BLOCK_SIZES 5
 #define PARTITION_CONTEXTS (PARTITION_BLOCK_SIZES * PARTITION_PLOFFSET)
 
-// block transform size
-enum {
-  TX_4X4,             // 4x4 transform
-  TX_8X8,             // 8x8 transform
-  TX_16X16,           // 16x16 transform
-  TX_32X32,           // 32x32 transform
-  TX_64X64,           // 64x64 transform
-  TX_4X8,             // 4x8 transform
-  TX_8X4,             // 8x4 transform
-  TX_8X16,            // 8x16 transform
-  TX_16X8,            // 16x8 transform
-  TX_16X32,           // 16x32 transform
-  TX_32X16,           // 32x16 transform
-  TX_32X64,           // 32x64 transform
-  TX_64X32,           // 64x32 transform
-  TX_4X16,            // 4x16 transform
-  TX_16X4,            // 16x4 transform
-  TX_8X32,            // 8x32 transform
-  TX_32X8,            // 32x8 transform
-  TX_16X64,           // 16x64 transform
-  TX_64X16,           // 64x16 transform
-  TX_SIZES_ALL,       // Includes rectangular transforms
-  TX_SIZES = TX_4X8,  // Does NOT include rectangular transforms
-  TX_SIZES_LARGEST = TX_64X64,
-  TX_INVALID = 255  // Invalid transform size
-} UENUM1BYTE(TX_SIZE);
-
 #define TX_SIZE_LUMA_MIN (TX_4X4)
 /* We don't need to code a transform size unless the allowed size is at least
    one more than the minimum. */
@@ -247,27 +221,6 @@
 } UENUM1BYTE(TX_TYPE_1D);
 
 enum {
-  DCT_DCT,            // DCT in both horizontal and vertical
-  ADST_DCT,           // ADST in vertical, DCT in horizontal
-  DCT_ADST,           // DCT in vertical, ADST in horizontal
-  ADST_ADST,          // ADST in both directions
-  FLIPADST_DCT,       // FLIPADST in vertical, DCT in horizontal
-  DCT_FLIPADST,       // DCT in vertical, FLIPADST in horizontal
-  FLIPADST_FLIPADST,  // FLIPADST in both directions
-  ADST_FLIPADST,      // ADST in vertical, FLIPADST in horizontal
-  FLIPADST_ADST,      // FLIPADST in vertical, ADST in horizontal
-  IDTX,               // Identity in both directions
-  V_DCT,              // DCT in vertical, identity in horizontal
-  H_DCT,              // Identity in vertical, DCT in horizontal
-  V_ADST,             // ADST in vertical, identity in horizontal
-  H_ADST,             // Identity in vertical, ADST in horizontal
-  V_FLIPADST,         // FLIPADST in vertical, identity in horizontal
-  H_FLIPADST,         // Identity in vertical, FLIPADST in horizontal
-  TX_TYPES,
-  DCT_ADST_TX_MASK = 0x000F,  // Either DCT or ADST in each direction
-} UENUM1BYTE(TX_TYPE);
-
-enum {
   REG_REG,
   REG_SMOOTH,
   REG_SHARP,
@@ -279,22 +232,6 @@
   SHARP_SHARP,
 } UENUM1BYTE(DUAL_FILTER_TYPE);
 
-enum {
-  // DCT only
-  EXT_TX_SET_DCTONLY,
-  // DCT + Identity only
-  EXT_TX_SET_DCT_IDTX,
-  // Discrete Trig transforms w/o flip (4) + Identity (1)
-  EXT_TX_SET_DTT4_IDTX,
-  // Discrete Trig transforms w/o flip (4) + Identity (1) + 1D Hor/vert DCT (2)
-  EXT_TX_SET_DTT4_IDTX_1DDCT,
-  // Discrete Trig transforms w/ flip (9) + Identity (1) + 1D Hor/Ver DCT (2)
-  EXT_TX_SET_DTT9_IDTX_1DDCT,
-  // Discrete Trig transforms w/ flip (9) + Identity (1) + 1D Hor/Ver (6)
-  EXT_TX_SET_ALL16,
-  EXT_TX_SET_TYPES
-} UENUM1BYTE(TxSetType);
-
 #define EXT_TX_SIZES 4       // number of sizes that use extended transforms
 #define EXT_TX_SETS_INTER 4  // Sets of transform selections for INTER
 #define EXT_TX_SETS_INTRA 3  // Sets of transform selections for INTRA
@@ -452,6 +389,13 @@
   UV_MODE_INVALID,  // For uv_mode in inter blocks
 } UENUM1BYTE(UV_PREDICTION_MODE);
 
+// Number of top model rd to store for pruning y modes in intra mode decision
+#define TOP_INTRA_MODEL_COUNT 4
+// Total number of luma intra prediction modes (include both directional and
+// non-directional modes)
+// Because there are 8 directional modes, each has additional 6 delta angles.
+#define LUMA_MODE_COUNT (PAETH_PRED - DC_PRED + 1 + 6 * 8)
+
 enum {
   SIMPLE_TRANSLATION,
   OBMC_CAUSAL,    // 2-sided OBMC
@@ -613,6 +557,9 @@
 #define FWD_RF_OFFSET(ref) (ref - LAST_FRAME)
 #define BWD_RF_OFFSET(ref) (ref - BWDREF_FRAME)
 
+// Select all the decoded frame buffer slots
+#define SELECT_ALL_BUF_SLOTS 0xFF
+
 enum {
   LAST_LAST2_FRAMES,      // { LAST_FRAME, LAST2_FRAME }
   LAST_LAST3_FRAMES,      // { LAST_FRAME, LAST3_FRAME }
diff --git a/av1/common/loopfiltermask.c b/av1/common/loopfiltermask.c
deleted file mode 100644
index 1ae0b11..0000000
--- a/av1/common/loopfiltermask.c
+++ /dev/null
@@ -1,1454 +0,0 @@
-/*
- * Copyright (c) 2019, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <math.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_mem/aom_mem.h"
-#include "aom_ports/mem.h"
-#include "av1/common/av1_common_int.h"
-#include "av1/common/av1_loopfilter.h"
-#include "av1/common/reconinter.h"
-#include "av1/common/seg_common.h"
-
-// 256 bit masks (64x64 / 4x4) for left transform size for Y plane.
-// We use 4 uint64_t to represent the 256 bit.
-// Each 1 represents a position where we should apply a loop filter
-// across the left border of an 4x4 block boundary.
-//
-// In the case of TX_8x8->  ( in low order byte first we end up with
-// a mask that looks like this (-- and | are used for better view)
-//
-//    10101010|10101010
-//    10101010|10101010
-//    10101010|10101010
-//    10101010|10101010
-//    10101010|10101010
-//    10101010|10101010
-//    10101010|10101010
-//    10101010|10101010
-//    -----------------
-//    10101010|10101010
-//    10101010|10101010
-//    10101010|10101010
-//    10101010|10101010
-//    10101010|10101010
-//    10101010|10101010
-//    10101010|10101010
-//    10101010|10101010
-//
-// A loopfilter should be applied to every other 4x4 horizontally.
-
-// 256 bit masks (64x64 / 4x4) for above transform size for Y plane.
-// We use 4 uint64_t to represent the 256 bit.
-// Each 1 represents a position where we should apply a loop filter
-// across the top border of an 4x4 block boundary.
-//
-// In the case of TX_8x8->  ( in low order byte first we end up with
-// a mask that looks like this
-//
-//    11111111|11111111
-//    00000000|00000000
-//    11111111|11111111
-//    00000000|00000000
-//    11111111|11111111
-//    00000000|00000000
-//    11111111|11111111
-//    00000000|00000000
-//    -----------------
-//    11111111|11111111
-//    00000000|00000000
-//    11111111|11111111
-//    00000000|00000000
-//    11111111|11111111
-//    00000000|00000000
-//    11111111|11111111
-//    00000000|00000000
-//
-// A loopfilter should be applied to every other 4x4 horizontally.
-#if CONFIG_LPF_MASK
-static const int mask_id_table_tx_4x4[BLOCK_SIZES_ALL] = {
-  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, -1, -1, 13, 14, 15, 16, 17, 18
-};
-
-static const int mask_id_table_tx_8x8[BLOCK_SIZES_ALL] = {
-  -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, 10, 11, 12, 13
-};
-
-static const int mask_id_table_tx_16x16[BLOCK_SIZES_ALL] = {
-  -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, -1, -1, -1, -1, -1, -1, -1, 7, 8
-};
-
-static const int mask_id_table_tx_32x32[BLOCK_SIZES_ALL] = {
-  -1, -1, -1, -1, -1, -1, -1, -1, -1, 0,  1,
-  2,  3,  -1, -1, -1, -1, -1, -1, -1, -1, -1
-};
-static const int mask_id_table_vert_border[BLOCK_SIZES_ALL] = {
-  0,  47, 49, 19, 51, 53, 33, 55, 57, 42, 59,
-  60, 46, -1, -1, -1, 61, 62, 63, 64, 65, 66
-};
-
-static const FilterMask left_mask_univariant_reordered[67] = {
-  // TX_4X4
-  { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 4X4, TX_4X4
-  { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 4X8, TX_4X4
-  { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X4, TX_4X4
-  { { 0x0000000000030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X8, TX_4X4
-  { { 0x0003000300030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X16, TX_4X4
-  { { 0x00000000000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X8, TX_4X4
-  { { 0x000f000f000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X16, TX_4X4
-  { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X32, TX_4X4
-  { { 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X16, TX_4X4
-  { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X32, TX_4X4
-  { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL,
-      0x00ff00ff00ff00ffULL } },  // block size 32X64, TX_4X4
-  { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X32, TX_4X4
-  { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL,
-      0xffffffffffffffffULL } },  // block size 64X64, TX_4X4
-  { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 4X16, TX_4X4
-  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X4, TX_4X4
-  { { 0x0003000300030003ULL, 0x0003000300030003ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X32, TX_4X4
-  { { 0x0000000000ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X8, TX_4X4
-  { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL,
-      0x000f000f000f000fULL } },  // block size 16X64, TX_4X4
-  { { 0xffffffffffffffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X16, TX_4X4
-  // TX_8X8
-  { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X8, TX_8X8
-  { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X16, TX_8X8
-  { { 0x0000000000050005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X8, TX_8X8
-  { { 0x0005000500050005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X16, TX_8X8
-  { { 0x0005000500050005ULL, 0x0005000500050005ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X32, TX_8X8
-  { { 0x0055005500550055ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X16, TX_8X8
-  { { 0x0055005500550055ULL, 0x0055005500550055ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X32, TX_8X8
-  { { 0x0055005500550055ULL, 0x0055005500550055ULL, 0x0055005500550055ULL,
-      0x0055005500550055ULL } },  // block size 32X64, TX_8X8
-  { { 0x5555555555555555ULL, 0x5555555555555555ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X32, TX_8X8
-  { { 0x5555555555555555ULL, 0x5555555555555555ULL, 0x5555555555555555ULL,
-      0x5555555555555555ULL } },  // block size 64X64, TX_8X8
-  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X32, TX_8X8
-  { { 0x0000000000550055ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X8, TX_8X8
-  { { 0x0005000500050005ULL, 0x0005000500050005ULL, 0x0005000500050005ULL,
-      0x0005000500050005ULL } },  // block size 16X64, TX_8X8
-  { { 0x5555555555555555ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X16, TX_8X8
-  // TX_16X16
-  { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X16, TX_16X16
-  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X32, TX_16X16
-  { { 0x0011001100110011ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X16, TX_16X16
-  { { 0x0011001100110011ULL, 0x0011001100110011ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X32, TX_16X16
-  { { 0x0011001100110011ULL, 0x0011001100110011ULL, 0x0011001100110011ULL,
-      0x0011001100110011ULL } },  // block size 32X64, TX_16X16
-  { { 0x1111111111111111ULL, 0x1111111111111111ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X32, TX_16X16
-  { { 0x1111111111111111ULL, 0x1111111111111111ULL, 0x1111111111111111ULL,
-      0x1111111111111111ULL } },  // block size 64X64, TX_16X16
-  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
-      0x0001000100010001ULL } },  // block size 16X64, TX_16X16
-  { { 0x1111111111111111ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X16, TX_16X16
-  // TX_32X32
-  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X32, TX_32X32
-  { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0101010101010101ULL,
-      0x0101010101010101ULL } },  // block size 32X64, TX_32X32
-  { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X32, TX_32X32
-  { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0101010101010101ULL,
-      0x0101010101010101ULL } },  // block size 64X64, TX_32X32
-  // TX_64X64
-  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
-      0x0001000100010001ULL } },  // block size 64X64, TX_64X64
-  // 2:1, 1:2 transform sizes.
-  { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 4X8, TX_4X8
-  { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 4X16, TX_4X8
-  { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X4, TX_8X4
-  { { 0x0000000000000005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X4, TX_8X4
-  { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X16, TX_8X16
-  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X32, TX_8X16
-  { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X8, TX_16X8
-  { { 0x0000000000110011ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X8, TX_16X8
-  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X32, TX_16X32
-  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
-      0x0001000100010001ULL } },  // block size 16X64, TX_16X32
-  { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X16, TX_32X16
-  { { 0x0101010101010101ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X16, TX_32X16
-  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
-      0x0001000100010001ULL } },  // block size 32X64, TX_32X64
-  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X32, TX_64X32
-  // 4:1, 1:4 transform sizes.
-  { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 4X16, TX_4X16
-  { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X4, TX_16X4
-  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X32, TX_8X32
-  { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X8, TX_32X8
-  { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
-      0x0001000100010001ULL } },  // block size 16X64, TX_16X64
-  { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X16, TX_64X16
-};
-
-static const FilterMask above_mask_univariant_reordered[67] = {
-  // TX_4X4
-  { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 4X4, TX_4X4
-  { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 4X8, TX_4X4
-  { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X4, TX_4X4
-  { { 0x0000000000030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X8, TX_4X4
-  { { 0x0003000300030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X16, TX_4X4
-  { { 0x00000000000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X8, TX_4X4
-  { { 0x000f000f000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X16, TX_4X4
-  { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X32, TX_4X4
-  { { 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X16, TX_4X4
-  { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X32, TX_4X4
-  { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL,
-      0x00ff00ff00ff00ffULL } },  // block size 32X64, TX_4X4
-  { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X32, TX_4X4
-  { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL,
-      0xffffffffffffffffULL } },  // block size 64X64, TX_4x4
-  { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 4X16, TX_4X4
-  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X4, TX_4X4
-  { { 0x0003000300030003ULL, 0x0003000300030003ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X32, TX_4X4
-  { { 0x0000000000ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X8, TX_4X4
-  { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL,
-      0x000f000f000f000fULL } },  // block size 16X64, TX_4X4
-  { { 0xffffffffffffffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X16, TX_4X4
-  // TX_8X8
-  { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X8, TX_8X8
-  { { 0x0000000300000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X16, TX_8X8
-  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X8, TX_8X8
-  { { 0x0000000f0000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X16, TX_8X8
-  { { 0x0000000f0000000fULL, 0x0000000f0000000fULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X32, TX_8X8
-  { { 0x000000ff000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X16, TX_8X8
-  { { 0x000000ff000000ffULL, 0x000000ff000000ffULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X32, TX_8X8
-  { { 0x000000ff000000ffULL, 0x000000ff000000ffULL, 0x000000ff000000ffULL,
-      0x000000ff000000ffULL } },  // block size 32X64, TX_8X8
-  { { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X32, TX_8X8
-  { { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL,
-      0x0000ffff0000ffffULL } },  // block size 64X64, TX_8X8
-  { { 0x0000000300000003ULL, 0x0000000300000003ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X32, TX_8X8
-  { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X8, TX_8X8
-  { { 0x0000000f0000000fULL, 0x0000000f0000000fULL, 0x0000000f0000000fULL,
-      0x0000000f0000000fULL } },  // block size 16X64, TX_8X8
-  { { 0x0000ffff0000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X16, TX_8X8
-  // TX_16X16
-  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X16, TX_16X16
-  { { 0x000000000000000fULL, 0x000000000000000fULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X32, TX_16X16
-  { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X16, TX_16X16
-  { { 0x00000000000000ffULL, 0x00000000000000ffULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X32, TX_16X16
-  { { 0x00000000000000ffULL, 0x00000000000000ffULL, 0x00000000000000ffULL,
-      0x00000000000000ffULL } },  // block size 32X64, TX_16X16
-  { { 0x000000000000ffffULL, 0x000000000000ffffULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X32, TX_16X16
-  { { 0x000000000000ffffULL, 0x000000000000ffffULL, 0x000000000000ffffULL,
-      0x000000000000ffffULL } },  // block size 64X64, TX_16X16
-  { { 0x000000000000000fULL, 0x000000000000000fULL, 0x000000000000000fULL,
-      0x000000000000000fULL } },  // block size 16X64, TX_16X16
-  { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X16, TX_16X16
-  // TX_32X32
-  { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X32, TX_32X32
-  { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x00000000000000ffULL,
-      0x0000000000000000ULL } },  // block size 32X64, TX_32X32
-  { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X32, TX_32X32
-  { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x000000000000ffffULL,
-      0x0000000000000000ULL } },  // block size 64X64, TX_32X32
-  // TX_64X64
-  { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X64, TX_64X64
-  // 2:1, 1:2 transform sizes.
-  { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 4X8, TX_4X8
-  { { 0x0000000100000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 4X16, TX_4X8
-  { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X4, TX_8X4
-  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X4, TX_8X4
-  { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X16, TX_8X16
-  { { 0x0000000000000003ULL, 0x0000000000000003ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X32, TX_8X16
-  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X8, TX_16X8
-  { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X8, TX_16X8
-  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X32, TX_16X32
-  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x000000000000000fULL,
-      0x0000000000000000ULL } },  // block size 16X64, TX_16X32
-  { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X16, TX_32X16
-  { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X16, TX_32X16
-  { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X64, TX_32X64
-  { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X32, TX_64X32
-  // 4:1, 1:4 transform sizes.
-  { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 4X16, TX_4X16
-  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X4, TX_16X4
-  { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 8X32, TX_8X32
-  { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 32X8, TX_32X8
-  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 16X64, TX_16X64
-  { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
-      0x0000000000000000ULL } },  // block size 64X16, TX_64X16
-};
-
-static LoopFilterMask *get_loop_filter_mask(const AV1_COMMON *const cm,
-                                            int mi_row, int mi_col) {
-  assert(cm->lf.lfm != NULL);
-  const int row = mi_row >> MIN_MIB_SIZE_LOG2;  // 64x64
-  const int col = mi_col >> MIN_MIB_SIZE_LOG2;
-  return &cm->lf.lfm[row * cm->lf.lfm_stride + col];
-}
-
-typedef void (*LpfFunc)(uint8_t *s, int p, const uint8_t *blimit,
-                        const uint8_t *limit, const uint8_t *thresh);
-
-typedef void (*LpfDualFunc)(uint8_t *s, int p, const uint8_t *blimit0,
-                            const uint8_t *limit0, const uint8_t *thresh0,
-                            const uint8_t *blimit1, const uint8_t *limit1,
-                            const uint8_t *thresh1);
-
-typedef void (*HbdLpfFunc)(uint16_t *s, int p, const uint8_t *blimit,
-                           const uint8_t *limit, const uint8_t *thresh, int bd);
-
-typedef void (*HbdLpfDualFunc)(uint16_t *s, int p, const uint8_t *blimit0,
-                               const uint8_t *limit0, const uint8_t *thresh0,
-                               const uint8_t *blimit1, const uint8_t *limit1,
-                               const uint8_t *thresh1, int bd);
-// A 64x64 tx block requires 256 bits to represent each 4x4 tx block.
-// Every 4 rows is represented by one uint64_t mask. Hence,
-// there are 4 uint64_t bitmask[4] to represent the 64x64 block.
-//
-// Given a location by (mi_col, mi_row), This function returns the index
-// 0, 1, 2, 3 to select which bitmask[] to use, and the shift value.
-//
-// For example, mi_row is the offset of pixels in mi size (4),
-// (mi_row / 4) returns which uint64_t.
-// After locating which uint64_t, mi_row % 4 is the
-// row offset, and each row has 16 = 1 << stride_log2 4x4 units.
-// Therefore, shift = (row << stride_log2) + mi_col;
-int get_index_shift(int mi_col, int mi_row, int *index) {
-  // *index = mi_row >> 2;
-  // rows = mi_row % 4;
-  // stride_log2 = 4;
-  // shift = (rows << stride_log2) + mi_col;
-  *index = mi_row >> 2;
-  return ((mi_row & 3) << 4) | mi_col;
-}
-
-static void filter_selectively_vert_row2(
-    int subsampling_factor, uint8_t *s, int pitch, int plane,
-    uint64_t mask_16x16_0, uint64_t mask_8x8_0, uint64_t mask_4x4_0,
-    uint64_t mask_16x16_1, uint64_t mask_8x8_1, uint64_t mask_4x4_1,
-    const loop_filter_info_n *lfi_n, uint8_t *lfl, uint8_t *lfl2) {
-  uint64_t mask;
-  const int step = 1 << subsampling_factor;
-
-  for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_16x16_1 |
-              mask_8x8_1 | mask_4x4_1;
-       mask; mask >>= step) {
-    const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl;
-    const loop_filter_thresh *lfi1 = lfi_n->lfthr + *lfl2;
-
-    if (mask & 1) {
-      if ((mask_16x16_0 | mask_16x16_1) & 1) {
-        // chroma plane filters less pixels introduced in deblock_13tap
-        // experiment
-        LpfFunc lpf_vertical = plane ? aom_lpf_vertical_6 : aom_lpf_vertical_14;
-
-        if ((mask_16x16_0 & mask_16x16_1) & 1) {
-          if (plane) {
-            aom_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                    lfi0->hev_thr, lfi1->mblim, lfi1->lim,
-                                    lfi1->hev_thr);
-          } else {
-            aom_lpf_vertical_14_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                     lfi0->hev_thr, lfi1->mblim, lfi1->lim,
-                                     lfi1->hev_thr);
-          }
-        } else if (mask_16x16_0 & 1) {
-          lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
-        } else {
-          lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
-                       lfi1->hev_thr);
-        }
-      }
-
-      if ((mask_8x8_0 | mask_8x8_1) & 1) {
-        // chroma plane filters less pixels introduced in deblock_13tap
-        // experiment
-        LpfFunc lpf_vertical = plane ? aom_lpf_vertical_6 : aom_lpf_vertical_8;
-
-        if ((mask_8x8_0 & mask_8x8_1) & 1) {
-          if (plane) {
-            aom_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                    lfi0->hev_thr, lfi1->mblim, lfi1->lim,
-                                    lfi1->hev_thr);
-          } else {
-            aom_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                    lfi0->hev_thr, lfi1->mblim, lfi1->lim,
-                                    lfi1->hev_thr);
-          }
-        } else if (mask_8x8_0 & 1) {
-          lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
-        } else {
-          lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
-                       lfi1->hev_thr);
-        }
-      }
-
-      if ((mask_4x4_0 | mask_4x4_1) & 1) {
-        if ((mask_4x4_0 & mask_4x4_1) & 1) {
-          aom_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                  lfi0->hev_thr, lfi1->mblim, lfi1->lim,
-                                  lfi1->hev_thr);
-        } else if (mask_4x4_0 & 1) {
-          aom_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
-        } else {
-          aom_lpf_vertical_4(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
-                             lfi1->hev_thr);
-        }
-      }
-    }
-
-    s += 4;
-    lfl += step;
-    lfl2 += step;
-    mask_16x16_0 >>= step;
-    mask_8x8_0 >>= step;
-    mask_4x4_0 >>= step;
-    mask_16x16_1 >>= step;
-    mask_8x8_1 >>= step;
-    mask_4x4_1 >>= step;
-  }
-}
-
-#if CONFIG_AV1_HIGHBITDEPTH
-static void highbd_filter_selectively_vert_row2(
-    int subsampling_factor, uint16_t *s, int pitch, int plane,
-    uint64_t mask_16x16_0, uint64_t mask_8x8_0, uint64_t mask_4x4_0,
-    uint64_t mask_16x16_1, uint64_t mask_8x8_1, uint64_t mask_4x4_1,
-    const loop_filter_info_n *lfi_n, uint8_t *lfl, uint8_t *lfl2, int bd) {
-  uint64_t mask;
-  const int step = 1 << subsampling_factor;
-
-  for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_16x16_1 |
-              mask_8x8_1 | mask_4x4_1;
-       mask; mask >>= step) {
-    const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl;
-    const loop_filter_thresh *lfi1 = lfi_n->lfthr + *lfl2;
-
-    if (mask & 1) {
-      if ((mask_16x16_0 | mask_16x16_1) & 1) {
-        // chroma plane filters less pixels introduced in deblock_13tap
-        // experiment
-        HbdLpfFunc highbd_lpf_vertical =
-            plane ? aom_highbd_lpf_vertical_6 : aom_highbd_lpf_vertical_14;
-
-        if ((mask_16x16_0 & mask_16x16_1) & 1) {
-          if (plane) {
-            aom_highbd_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                           lfi0->hev_thr, lfi1->mblim,
-                                           lfi1->lim, lfi1->hev_thr, bd);
-          } else {
-            aom_highbd_lpf_vertical_14_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                            lfi0->hev_thr, lfi1->mblim,
-                                            lfi1->lim, lfi1->hev_thr, bd);
-          }
-        } else if (mask_16x16_0 & 1) {
-          highbd_lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr,
-                              bd);
-        } else {
-          highbd_lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
-                              lfi1->hev_thr, bd);
-        }
-      }
-
-      if ((mask_8x8_0 | mask_8x8_1) & 1) {
-        HbdLpfFunc highbd_lpf_vertical =
-            plane ? aom_highbd_lpf_vertical_6 : aom_highbd_lpf_vertical_8;
-
-        if ((mask_8x8_0 & mask_8x8_1) & 1) {
-          if (plane) {
-            aom_highbd_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                           lfi0->hev_thr, lfi1->mblim,
-                                           lfi1->lim, lfi1->hev_thr, bd);
-          } else {
-            aom_highbd_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                           lfi0->hev_thr, lfi1->mblim,
-                                           lfi1->lim, lfi1->hev_thr, bd);
-          }
-        } else if (mask_8x8_0 & 1) {
-          highbd_lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr,
-                              bd);
-        } else {
-          highbd_lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
-                              lfi1->hev_thr, bd);
-        }
-      }
-
-      if ((mask_4x4_0 | mask_4x4_1) & 1) {
-        if ((mask_4x4_0 & mask_4x4_1) & 1) {
-          aom_highbd_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim,
-                                         lfi0->hev_thr, lfi1->mblim, lfi1->lim,
-                                         lfi1->hev_thr, bd);
-        } else if (mask_4x4_0 & 1) {
-          aom_highbd_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim,
-                                    lfi0->hev_thr, bd);
-        } else {
-          aom_highbd_lpf_vertical_4(s + 4 * pitch, pitch, lfi1->mblim,
-                                    lfi1->lim, lfi1->hev_thr, bd);
-        }
-      }
-    }
-
-    s += 4;
-    lfl += step;
-    lfl2 += step;
-    mask_16x16_0 >>= step;
-    mask_8x8_0 >>= step;
-    mask_4x4_0 >>= step;
-    mask_16x16_1 >>= step;
-    mask_8x8_1 >>= step;
-    mask_4x4_1 >>= step;
-  }
-}
-#endif  // CONFIG_AV1_HIGHBITDEPTH
-
-static void filter_selectively_horiz(uint8_t *s, int pitch, int plane,
-                                     int subsampling, uint64_t mask_16x16,
-                                     uint64_t mask_8x8, uint64_t mask_4x4,
-                                     const loop_filter_info_n *lfi_n,
-                                     const uint8_t *lfl) {
-  uint64_t mask;
-  int count;
-  const int step = 1 << subsampling;
-  const unsigned int two_block_mask = subsampling ? 5 : 3;
-  int offset = 0;
-
-  for (mask = mask_16x16 | mask_8x8 | mask_4x4; mask; mask >>= step * count) {
-    const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
-    // Next block's thresholds, when it is within current 64x64 block.
-    // If it is out of bound, its mask is zero, and it points to current edge's
-    // filter parameters, instead of next edge's.
-    int next_edge = step;
-    if (offset + next_edge >= MI_SIZE_64X64) next_edge = 0;
-    const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + next_edge);
-
-    count = 1;
-    if (mask & 1) {
-      if (mask_16x16 & 1) {
-        // chroma plane filters less pixels introduced in deblock_13tap
-        // experiment
-        LpfFunc lpf_horizontal =
-            plane ? aom_lpf_horizontal_6 : aom_lpf_horizontal_14;
-
-        if ((mask_16x16 & two_block_mask) == two_block_mask) {
-          if (plane) {
-            aom_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim,
-                                      lfi->hev_thr, lfin->mblim, lfin->lim,
-                                      lfin->hev_thr);
-          } else {
-            aom_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim,
-                                       lfi->hev_thr, lfin->mblim, lfin->lim,
-                                       lfin->hev_thr);
-          }
-          count = 2;
-        } else {
-          lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
-        }
-      } else if (mask_8x8 & 1) {
-        // chroma plane filters less pixels introduced in deblock_13tap
-        // experiment
-        LpfFunc lpf_horizontal =
-            plane ? aom_lpf_horizontal_6 : aom_lpf_horizontal_8;
-
-        if ((mask_8x8 & two_block_mask) == two_block_mask) {
-          if (plane) {
-            aom_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim,
-                                      lfi->hev_thr, lfin->mblim, lfin->lim,
-                                      lfin->hev_thr);
-          } else {
-            aom_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
-                                      lfi->hev_thr, lfin->mblim, lfin->lim,
-                                      lfin->hev_thr);
-          }
-          count = 2;
-        } else {
-          lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
-        }
-      } else if (mask_4x4 & 1) {
-        if ((mask_4x4 & two_block_mask) == two_block_mask) {
-          aom_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
-                                    lfi->hev_thr, lfin->mblim, lfin->lim,
-                                    lfin->hev_thr);
-          count = 2;
-        } else {
-          aom_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
-        }
-      }
-    }
-
-    s += 4 * count;
-    lfl += step * count;
-    mask_16x16 >>= step * count;
-    mask_8x8 >>= step * count;
-    mask_4x4 >>= step * count;
-    offset += step * count;
-  }
-}
-
-#if CONFIG_AV1_HIGHBITDEPTH
-static void highbd_filter_selectively_horiz(
-    uint16_t *s, int pitch, int plane, int subsampling, uint64_t mask_16x16,
-    uint64_t mask_8x8, uint64_t mask_4x4, const loop_filter_info_n *lfi_n,
-    uint8_t *lfl, int bd) {
-  uint64_t mask;
-  int count;
-  const int step = 1 << subsampling;
-  const unsigned int two_block_mask = subsampling ? 5 : 3;
-  int offset = 0;
-
-  for (mask = mask_16x16 | mask_8x8 | mask_4x4; mask; mask >>= step * count) {
-    const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
-    // Next block's thresholds, when it is within current 64x64 block.
-    // If it is out of bound, its mask is zero, and it points to current edge's
-    // filter parameters, instead of next edge's.
-    int next_edge = step;
-    if (offset + next_edge >= MI_SIZE_64X64) next_edge = 0;
-    const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + next_edge);
-
-    count = 1;
-    if (mask & 1) {
-      if (mask_16x16 & 1) {
-        HbdLpfFunc highbd_lpf_horizontal =
-            plane ? aom_highbd_lpf_horizontal_6 : aom_highbd_lpf_horizontal_14;
-
-        if ((mask_16x16 & two_block_mask) == two_block_mask) {
-          if (plane) {
-            aom_highbd_lpf_horizontal_6_dual_c(s, pitch, lfi->mblim, lfi->lim,
-                                               lfi->hev_thr, lfin->mblim,
-                                               lfin->lim, lfin->hev_thr, bd);
-          } else {
-            aom_highbd_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim,
-                                              lfi->hev_thr, lfin->mblim,
-                                              lfin->lim, lfin->hev_thr, bd);
-          }
-          count = 2;
-        } else {
-          highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr,
-                                bd);
-        }
-      } else if (mask_8x8 & 1) {
-        HbdLpfFunc highbd_lpf_horizontal =
-            plane ? aom_highbd_lpf_horizontal_6 : aom_highbd_lpf_horizontal_8;
-
-        if ((mask_8x8 & two_block_mask) == two_block_mask) {
-          if (plane) {
-            aom_highbd_lpf_horizontal_6_dual_c(s, pitch, lfi->mblim, lfi->lim,
-                                               lfi->hev_thr, lfin->mblim,
-                                               lfin->lim, lfin->hev_thr, bd);
-          } else {
-            aom_highbd_lpf_horizontal_8_dual_c(s, pitch, lfi->mblim, lfi->lim,
-                                               lfi->hev_thr, lfin->mblim,
-                                               lfin->lim, lfin->hev_thr, bd);
-          }
-          count = 2;
-        } else {
-          highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr,
-                                bd);
-        }
-      } else if (mask_4x4 & 1) {
-        if ((mask_4x4 & two_block_mask) == two_block_mask) {
-          aom_highbd_lpf_horizontal_4_dual_c(s, pitch, lfi->mblim, lfi->lim,
-                                             lfi->hev_thr, lfin->mblim,
-                                             lfin->lim, lfin->hev_thr, bd);
-          count = 2;
-        } else {
-          aom_highbd_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim,
-                                      lfi->hev_thr, bd);
-        }
-      }
-    }
-
-    s += 4 * count;
-    lfl += step * count;
-    mask_16x16 >>= step * count;
-    mask_8x8 >>= step * count;
-    mask_4x4 >>= step * count;
-    offset += step * count;
-  }
-}
-#endif  // CONFIG_AV1_HIGHBITDEPTH
-
-void av1_build_bitmask_vert_info(
-    AV1_COMMON *const cm, const struct macroblockd_plane *const plane_ptr,
-    int plane) {
-  const int subsampling_x = plane_ptr->subsampling_x;
-  const int subsampling_y = plane_ptr->subsampling_y;
-  const int is_uv = plane > 0;
-  TX_SIZE tx_size = TX_16X16, prev_tx_size = TX_16X16;
-  uint8_t level, prev_level = 1;
-  uint64_t skip, prev_skip = 0;
-  uint64_t is_coding_block_border;
-
-  for (int r = 0; (r << MI_SIZE_LOG2) < plane_ptr->dst.height; r++) {
-    const int mi_row = r << subsampling_y;
-    const int row = mi_row % MI_SIZE_64X64;
-    const int row_uv = row | subsampling_y;
-    int index = 0;
-    const int shift = get_index_shift(0, row, &index);
-
-    for (int c = 0; (c << MI_SIZE_LOG2) < plane_ptr->dst.width;
-         c += (tx_size_wide_unit[TX_64X64] >> subsampling_x)) {
-      const int mi_col = c << subsampling_x;
-      LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
-
-      for (int col_in_unit = 0;
-           col_in_unit < (tx_size_wide_unit[TX_64X64] >> subsampling_x);) {
-        const int x = (c + col_in_unit) << MI_SIZE_LOG2;
-        if (x >= plane_ptr->dst.width) break;
-        const int col = col_in_unit << subsampling_x;
-        const int col_uv = col | subsampling_x;
-        const uint64_t mask = ((uint64_t)1 << (shift | col));
-        skip = lfm->skip.bits[index] & mask;
-        is_coding_block_border = lfm->is_vert_border.bits[index] & mask;
-        switch (plane) {
-          case 0: level = lfm->lfl_y_ver[row_uv][col_uv]; break;
-          case 1: level = lfm->lfl_u_ver[row_uv][col_uv]; break;
-          case 2: level = lfm->lfl_v_ver[row_uv][col_uv]; break;
-          default: assert(plane >= 0 && plane <= 2); return;
-        }
-        for (TX_SIZE ts = TX_4X4; ts <= TX_64X64; ++ts) {
-          if (is_uv && ts == TX_64X64) continue;
-          if (lfm->tx_size_ver[is_uv][ts].bits[index] & mask) {
-            tx_size = ts;
-            break;
-          }
-        }
-        if ((c + col_in_unit > 0) && (level || prev_level) &&
-            (!prev_skip || !skip || is_coding_block_border)) {
-          const TX_SIZE min_tx_size =
-              AOMMIN(TX_16X16, AOMMIN(tx_size, prev_tx_size));
-          const int shift_1 = get_index_shift(col_uv, row_uv, &index);
-          const uint64_t mask_1 = ((uint64_t)1 << shift_1);
-          switch (plane) {
-            case 0: lfm->left_y[min_tx_size].bits[index] |= mask_1; break;
-            case 1: lfm->left_u[min_tx_size].bits[index] |= mask_1; break;
-            case 2: lfm->left_v[min_tx_size].bits[index] |= mask_1; break;
-            default: assert(plane >= 0 && plane <= 2); return;
-          }
-          if (level == 0 && prev_level != 0) {
-            switch (plane) {
-              case 0: lfm->lfl_y_ver[row_uv][col_uv] = prev_level; break;
-              case 1: lfm->lfl_u_ver[row_uv][col_uv] = prev_level; break;
-              case 2: lfm->lfl_v_ver[row_uv][col_uv] = prev_level; break;
-              default: assert(plane >= 0 && plane <= 2); return;
-            }
-          }
-        }
-
-        // update prev info
-        prev_level = level;
-        prev_skip = skip;
-        prev_tx_size = tx_size;
-        // advance
-        col_in_unit += tx_size_wide_unit[tx_size];
-      }
-    }
-  }
-}
-
-void av1_build_bitmask_horz_info(
-    AV1_COMMON *const cm, const struct macroblockd_plane *const plane_ptr,
-    int plane) {
-  const int subsampling_x = plane_ptr->subsampling_x;
-  const int subsampling_y = plane_ptr->subsampling_y;
-  const int is_uv = plane > 0;
-  TX_SIZE tx_size = TX_16X16, prev_tx_size = TX_16X16;
-  uint8_t level, prev_level = 1;
-  uint64_t skip, prev_skip = 0;
-  uint64_t is_coding_block_border;
-
-  for (int c = 0; (c << MI_SIZE_LOG2) < plane_ptr->dst.width; c++) {
-    const int mi_col = c << subsampling_x;
-    const int col = mi_col % MI_SIZE_64X64;
-    const int col_uv = col | subsampling_x;
-
-    for (int r = 0; (r << MI_SIZE_LOG2) < plane_ptr->dst.height;
-         r += (tx_size_high_unit[TX_64X64] >> subsampling_y)) {
-      const int mi_row = r << subsampling_y;
-      LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
-
-      for (int r_in_unit = 0;
-           r_in_unit < (tx_size_high_unit[TX_64X64] >> subsampling_y);) {
-        const int y = (r + r_in_unit) << MI_SIZE_LOG2;
-        if (y >= plane_ptr->dst.height) break;
-        const int row = r_in_unit << subsampling_y;
-        const int row_uv = row | subsampling_y;
-        int index = 0;
-        const int shift = get_index_shift(col, row, &index);
-        const uint64_t mask = ((uint64_t)1 << shift);
-        skip = lfm->skip.bits[index] & mask;
-        is_coding_block_border = lfm->is_horz_border.bits[index] & mask;
-        switch (plane) {
-          case 0: level = lfm->lfl_y_hor[row_uv][col_uv]; break;
-          case 1: level = lfm->lfl_u_hor[row_uv][col_uv]; break;
-          case 2: level = lfm->lfl_v_hor[row_uv][col_uv]; break;
-          default: assert(plane >= 0 && plane <= 2); return;
-        }
-        for (TX_SIZE ts = TX_4X4; ts <= TX_64X64; ++ts) {
-          if (is_uv && ts == TX_64X64) continue;
-          if (lfm->tx_size_hor[is_uv][ts].bits[index] & mask) {
-            tx_size = ts;
-            break;
-          }
-        }
-        if ((r + r_in_unit > 0) && (level || prev_level) &&
-            (!prev_skip || !skip || is_coding_block_border)) {
-          const TX_SIZE min_tx_size =
-              AOMMIN(TX_16X16, AOMMIN(tx_size, prev_tx_size));
-          const int shift_1 = get_index_shift(col_uv, row_uv, &index);
-          const uint64_t mask_1 = ((uint64_t)1 << shift_1);
-
-          switch (plane) {
-            case 0: lfm->above_y[min_tx_size].bits[index] |= mask_1; break;
-            case 1: lfm->above_u[min_tx_size].bits[index] |= mask_1; break;
-            case 2: lfm->above_v[min_tx_size].bits[index] |= mask_1; break;
-            default: assert(plane >= 0 && plane <= 2); return;
-          }
-          if (level == 0 && prev_level != 0) {
-            switch (plane) {
-              case 0: lfm->lfl_y_hor[row_uv][col_uv] = prev_level; break;
-              case 1: lfm->lfl_u_hor[row_uv][col_uv] = prev_level; break;
-              case 2: lfm->lfl_v_hor[row_uv][col_uv] = prev_level; break;
-              default: assert(plane >= 0 && plane <= 2); return;
-            }
-          }
-        }
-
-        // update prev info
-        prev_level = level;
-        prev_skip = skip;
-        prev_tx_size = tx_size;
-        // advance
-        r_in_unit += tx_size_high_unit[tx_size];
-      }
-    }
-  }
-}
-
-void av1_filter_block_plane_bitmask_vert(
-    AV1_COMMON *const cm, struct macroblockd_plane *const plane_ptr, int pl,
-    int mi_row, int mi_col) {
-  struct buf_2d *const dst = &plane_ptr->dst;
-  uint8_t *const buf0 = dst->buf;
-  const int ssx = plane_ptr->subsampling_x;
-  const int ssy = plane_ptr->subsampling_y;
-  const int mask_cutoff = 0xffff;
-  const int row_step = 1 << ssy;
-  const int two_row_step = 2 << ssy;
-  const int row_stride = dst->stride << MI_SIZE_LOG2;
-  const int two_row_stride = row_stride << 1;
-  uint64_t mask_16x16 = 0;
-  uint64_t mask_8x8 = 0;
-  uint64_t mask_4x4 = 0;
-  uint8_t *lfl;
-  uint8_t *lfl2;
-  LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
-  assert(lfm);
-
-  // 1. vertical filtering. filter two rows at a time
-  for (int r = 0;
-       ((mi_row + r) << MI_SIZE_LOG2) < cm->height && r < MI_SIZE_64X64;
-       r += two_row_step) {
-    const int row = r | ssy;
-    const int row_next = row + row_step;
-    const int col = ssx;
-    int index = 0;
-    const int shift = get_index_shift(col, row, &index);
-    int index_next = 0;
-    const int shift_next = get_index_shift(col, row_next, &index_next);
-    const int has_next_row = row_next < cm->mi_params.mi_rows;
-    switch (pl) {
-      case 0:
-        mask_16x16 = lfm->left_y[TX_16X16].bits[index];
-        mask_8x8 = lfm->left_y[TX_8X8].bits[index];
-        mask_4x4 = lfm->left_y[TX_4X4].bits[index];
-        lfl = &lfm->lfl_y_ver[row][col];
-        lfl2 = &lfm->lfl_y_ver[row_next][col];
-        break;
-      case 1:
-        mask_16x16 = lfm->left_u[TX_16X16].bits[index];
-        mask_8x8 = lfm->left_u[TX_8X8].bits[index];
-        mask_4x4 = lfm->left_u[TX_4X4].bits[index];
-        lfl = &lfm->lfl_u_ver[row][col];
-        lfl2 = &lfm->lfl_u_ver[row_next][col];
-        break;
-      case 2:
-        mask_16x16 = lfm->left_v[TX_16X16].bits[index];
-        mask_8x8 = lfm->left_v[TX_8X8].bits[index];
-        mask_4x4 = lfm->left_v[TX_4X4].bits[index];
-        lfl = &lfm->lfl_v_ver[row][col];
-        lfl2 = &lfm->lfl_v_ver[row_next][col];
-        break;
-      default: assert(pl >= 0 && pl <= 2); return;
-    }
-    uint64_t mask_16x16_0 = (mask_16x16 >> shift) & mask_cutoff;
-    uint64_t mask_8x8_0 = (mask_8x8 >> shift) & mask_cutoff;
-    uint64_t mask_4x4_0 = (mask_4x4 >> shift) & mask_cutoff;
-    uint64_t mask_16x16_1 = (mask_16x16 >> shift_next) & mask_cutoff;
-    uint64_t mask_8x8_1 = (mask_8x8 >> shift_next) & mask_cutoff;
-    uint64_t mask_4x4_1 = (mask_4x4 >> shift_next) & mask_cutoff;
-    if (!has_next_row) {
-      mask_16x16_1 = 0;
-      mask_8x8_1 = 0;
-      mask_4x4_1 = 0;
-    }
-
-#if CONFIG_AV1_HIGHBITDEPTH
-    if (cm->seq_params.use_highbitdepth)
-      highbd_filter_selectively_vert_row2(
-          ssx, CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, mask_16x16_0,
-          mask_8x8_0, mask_4x4_0, mask_16x16_1, mask_8x8_1, mask_4x4_1,
-          &cm->lf_info, lfl, lfl2, (int)cm->seq_params.bit_depth);
-    else
-      filter_selectively_vert_row2(
-          ssx, dst->buf, dst->stride, pl, mask_16x16_0, mask_8x8_0, mask_4x4_0,
-          mask_16x16_1, mask_8x8_1, mask_4x4_1, &cm->lf_info, lfl, lfl2);
-#else
-    filter_selectively_vert_row2(
-        ssx, dst->buf, dst->stride, pl, mask_16x16_0, mask_8x8_0, mask_4x4_0,
-        mask_16x16_1, mask_8x8_1, mask_4x4_1, &cm->lf_info, lfl, lfl2);
-#endif
-    dst->buf += two_row_stride;
-  }
-  // reset buf pointer for horizontal filtering
-  dst->buf = buf0;
-}
-
-void av1_filter_block_plane_bitmask_horz(
-    AV1_COMMON *const cm, struct macroblockd_plane *const plane_ptr, int pl,
-    int mi_row, int mi_col) {
-  struct buf_2d *const dst = &plane_ptr->dst;
-  uint8_t *const buf0 = dst->buf;
-  const int ssx = plane_ptr->subsampling_x;
-  const int ssy = plane_ptr->subsampling_y;
-  const int mask_cutoff = 0xffff;
-  const int row_step = 1 << ssy;
-  const int row_stride = dst->stride << MI_SIZE_LOG2;
-  uint64_t mask_16x16 = 0;
-  uint64_t mask_8x8 = 0;
-  uint64_t mask_4x4 = 0;
-  uint8_t *lfl;
-  LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
-  assert(lfm);
-  for (int r = 0;
-       ((mi_row + r) << MI_SIZE_LOG2) < cm->height && r < MI_SIZE_64X64;
-       r += row_step) {
-    if (mi_row + r == 0) {
-      dst->buf += row_stride;
-      continue;
-    }
-    const int row = r | ssy;
-    const int col = ssx;
-    int index = 0;
-    const int shift = get_index_shift(col, row, &index);
-    switch (pl) {
-      case 0:
-        mask_16x16 = lfm->above_y[TX_16X16].bits[index];
-        mask_8x8 = lfm->above_y[TX_8X8].bits[index];
-        mask_4x4 = lfm->above_y[TX_4X4].bits[index];
-        lfl = &lfm->lfl_y_hor[row][col];
-        break;
-      case 1:
-        mask_16x16 = lfm->above_u[TX_16X16].bits[index];
-        mask_8x8 = lfm->above_u[TX_8X8].bits[index];
-        mask_4x4 = lfm->above_u[TX_4X4].bits[index];
-        lfl = &lfm->lfl_u_hor[row][col];
-        break;
-      case 2:
-        mask_16x16 = lfm->above_v[TX_16X16].bits[index];
-        mask_8x8 = lfm->above_v[TX_8X8].bits[index];
-        mask_4x4 = lfm->above_v[TX_4X4].bits[index];
-        lfl = &lfm->lfl_v_hor[row][col];
-        break;
-      default: assert(pl >= 0 && pl <= 2); return;
-    }
-    mask_16x16 = (mask_16x16 >> shift) & mask_cutoff;
-    mask_8x8 = (mask_8x8 >> shift) & mask_cutoff;
-    mask_4x4 = (mask_4x4 >> shift) & mask_cutoff;
-
-#if CONFIG_AV1_HIGHBITDEPTH
-    if (cm->seq_params.use_highbitdepth)
-      highbd_filter_selectively_horiz(
-          CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, ssx, mask_16x16,
-          mask_8x8, mask_4x4, &cm->lf_info, lfl, (int)cm->seq_params.bit_depth);
-    else
-      filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16,
-                               mask_8x8, mask_4x4, &cm->lf_info, lfl);
-#else
-    filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16,
-                             mask_8x8, mask_4x4, &cm->lf_info, lfl);
-#endif
-    dst->buf += row_stride;
-  }
-  // reset buf pointer for next block
-  dst->buf = buf0;
-}
-
-void av1_filter_block_plane_ver(AV1_COMMON *const cm,
-                                struct macroblockd_plane *const plane_ptr,
-                                int pl, int mi_row, int mi_col) {
-  struct buf_2d *const dst = &plane_ptr->dst;
-  int r, c;
-  const int ssx = plane_ptr->subsampling_x;
-  const int ssy = plane_ptr->subsampling_y;
-  const int mask_cutoff = 0xffff;
-  const int single_step = 1 << ssy;
-  const int r_step = 2 << ssy;
-  uint64_t mask_16x16 = 0;
-  uint64_t mask_8x8 = 0;
-  uint64_t mask_4x4 = 0;
-  uint8_t *lfl;
-  uint8_t *lfl2;
-
-  // filter two rows at a time
-  for (r = 0; r < cm->seq_params.mib_size &&
-              ((mi_row + r) << MI_SIZE_LOG2 < cm->height);
-       r += r_step) {
-    for (c = 0; c < cm->seq_params.mib_size &&
-                ((mi_col + c) << MI_SIZE_LOG2 < cm->width);
-         c += MI_SIZE_64X64) {
-      dst->buf += ((c << MI_SIZE_LOG2) >> ssx);
-      LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row + r, mi_col + c);
-      assert(lfm);
-      const int row = ((mi_row + r) | ssy) % MI_SIZE_64X64;
-      const int col = ((mi_col + c) | ssx) % MI_SIZE_64X64;
-      int index = 0;
-      const int shift = get_index_shift(col, row, &index);
-      // current and next row should belong to the same mask_idx and index
-      // next row's shift
-      const int row_next = row + single_step;
-      int index_next = 0;
-      const int shift_next = get_index_shift(col, row_next, &index_next);
-      switch (pl) {
-        case 0:
-          mask_16x16 = lfm->left_y[TX_16X16].bits[index];
-          mask_8x8 = lfm->left_y[TX_8X8].bits[index];
-          mask_4x4 = lfm->left_y[TX_4X4].bits[index];
-          lfl = &lfm->lfl_y_ver[row][col];
-          lfl2 = &lfm->lfl_y_ver[row_next][col];
-          break;
-        case 1:
-          mask_16x16 = lfm->left_u[TX_16X16].bits[index];
-          mask_8x8 = lfm->left_u[TX_8X8].bits[index];
-          mask_4x4 = lfm->left_u[TX_4X4].bits[index];
-          lfl = &lfm->lfl_u_ver[row][col];
-          lfl2 = &lfm->lfl_u_ver[row_next][col];
-          break;
-        case 2:
-          mask_16x16 = lfm->left_v[TX_16X16].bits[index];
-          mask_8x8 = lfm->left_v[TX_8X8].bits[index];
-          mask_4x4 = lfm->left_v[TX_4X4].bits[index];
-          lfl = &lfm->lfl_v_ver[row][col];
-          lfl2 = &lfm->lfl_v_ver[row_next][col];
-          break;
-        default: assert(pl >= 0 && pl <= 2); return;
-      }
-      uint64_t mask_16x16_0 = (mask_16x16 >> shift) & mask_cutoff;
-      uint64_t mask_8x8_0 = (mask_8x8 >> shift) & mask_cutoff;
-      uint64_t mask_4x4_0 = (mask_4x4 >> shift) & mask_cutoff;
-      uint64_t mask_16x16_1 = (mask_16x16 >> shift_next) & mask_cutoff;
-      uint64_t mask_8x8_1 = (mask_8x8 >> shift_next) & mask_cutoff;
-      uint64_t mask_4x4_1 = (mask_4x4 >> shift_next) & mask_cutoff;
-
-#if CONFIG_AV1_HIGHBITDEPTH
-      if (cm->seq_params.use_highbitdepth)
-        highbd_filter_selectively_vert_row2(
-            ssx, CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, mask_16x16_0,
-            mask_8x8_0, mask_4x4_0, mask_16x16_1, mask_8x8_1, mask_4x4_1,
-            &cm->lf_info, lfl, lfl2, (int)cm->seq_params.bit_depth);
-      else
-        filter_selectively_vert_row2(ssx, dst->buf, dst->stride, pl,
-                                     mask_16x16_0, mask_8x8_0, mask_4x4_0,
-                                     mask_16x16_1, mask_8x8_1, mask_4x4_1,
-                                     &cm->lf_info, lfl, lfl2);
-#else
-      filter_selectively_vert_row2(
-          ssx, dst->buf, dst->stride, pl, mask_16x16_0, mask_8x8_0, mask_4x4_0,
-          mask_16x16_1, mask_8x8_1, mask_4x4_1, &cm->lf_info, lfl, lfl2);
-#endif
-      dst->buf -= ((c << MI_SIZE_LOG2) >> ssx);
-    }
-    dst->buf += 2 * MI_SIZE * dst->stride;
-  }
-}
-
-void av1_filter_block_plane_hor(AV1_COMMON *const cm,
-                                struct macroblockd_plane *const plane_ptr,
-                                int pl, int mi_row, int mi_col) {
-  struct buf_2d *const dst = &plane_ptr->dst;
-  int r, c;
-  const int ssx = plane_ptr->subsampling_x;
-  const int ssy = plane_ptr->subsampling_y;
-  const int mask_cutoff = 0xffff;
-  const int r_step = 1 << ssy;
-  uint64_t mask_16x16 = 0;
-  uint64_t mask_8x8 = 0;
-  uint64_t mask_4x4 = 0;
-  uint8_t *lfl;
-
-  for (r = 0; r < cm->seq_params.mib_size &&
-              ((mi_row + r) << MI_SIZE_LOG2 < cm->height);
-       r += r_step) {
-    for (c = 0; c < cm->seq_params.mib_size &&
-                ((mi_col + c) << MI_SIZE_LOG2 < cm->width);
-         c += MI_SIZE_64X64) {
-      if (mi_row + r == 0) continue;
-
-      dst->buf += ((c << MI_SIZE_LOG2) >> ssx);
-      LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row + r, mi_col + c);
-      assert(lfm);
-      const int row = ((mi_row + r) | ssy) % MI_SIZE_64X64;
-      const int col = ((mi_col + c) | ssx) % MI_SIZE_64X64;
-      int index = 0;
-      const int shift = get_index_shift(col, row, &index);
-      switch (pl) {
-        case 0:
-          mask_16x16 = lfm->above_y[TX_16X16].bits[index];
-          mask_8x8 = lfm->above_y[TX_8X8].bits[index];
-          mask_4x4 = lfm->above_y[TX_4X4].bits[index];
-          lfl = &lfm->lfl_y_hor[row][col];
-          break;
-        case 1:
-          mask_16x16 = lfm->above_u[TX_16X16].bits[index];
-          mask_8x8 = lfm->above_u[TX_8X8].bits[index];
-          mask_4x4 = lfm->above_u[TX_4X4].bits[index];
-          lfl = &lfm->lfl_u_hor[row][col];
-          break;
-        case 2:
-          mask_16x16 = lfm->above_v[TX_16X16].bits[index];
-          mask_8x8 = lfm->above_v[TX_8X8].bits[index];
-          mask_4x4 = lfm->above_v[TX_4X4].bits[index];
-          lfl = &lfm->lfl_v_hor[row][col];
-          break;
-        default: assert(pl >= 0 && pl <= 2); return;
-      }
-      mask_16x16 = (mask_16x16 >> shift) & mask_cutoff;
-      mask_8x8 = (mask_8x8 >> shift) & mask_cutoff;
-      mask_4x4 = (mask_4x4 >> shift) & mask_cutoff;
-
-#if CONFIG_AV1_HIGHBITDEPTH
-      if (cm->seq_params.use_highbitdepth)
-        highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf),
-                                        dst->stride, pl, ssx, mask_16x16,
-                                        mask_8x8, mask_4x4, &cm->lf_info, lfl,
-                                        (int)cm->seq_params.bit_depth);
-      else
-        filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16,
-                                 mask_8x8, mask_4x4, &cm->lf_info, lfl);
-#else
-      filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16,
-                               mask_8x8, mask_4x4, &cm->lf_info, lfl);
-#endif
-      dst->buf -= ((c << MI_SIZE_LOG2) >> ssx);
-    }
-    dst->buf += MI_SIZE * dst->stride;
-  }
-}
-
-void av1_store_bitmask_vartx(AV1_COMMON *cm, int mi_row, int mi_col,
-                             BLOCK_SIZE bsize, TX_SIZE tx_size,
-                             MB_MODE_INFO *mbmi) {
-  LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
-  const TX_SIZE tx_size_y_vert = txsize_vert_map[tx_size];
-  const TX_SIZE tx_size_y_horz = txsize_horz_map[tx_size];
-  const TX_SIZE tx_size_uv_vert = txsize_vert_map[av1_get_max_uv_txsize(
-      mbmi->bsize, cm->seq_params.subsampling_x, cm->seq_params.subsampling_y)];
-  const TX_SIZE tx_size_uv_horz = txsize_horz_map[av1_get_max_uv_txsize(
-      mbmi->bsize, cm->seq_params.subsampling_x, cm->seq_params.subsampling_y)];
-  const int is_square_transform_size = tx_size <= TX_64X64;
-  int mask_id = 0;
-  int offset = 0;
-  const int half_ratio_tx_size_max32 =
-      (tx_size > TX_64X64) & (tx_size <= TX_32X16);
-  if (is_square_transform_size) {
-    switch (tx_size) {
-      case TX_4X4: mask_id = mask_id_table_tx_4x4[bsize]; break;
-      case TX_8X8:
-        mask_id = mask_id_table_tx_8x8[bsize];
-        offset = 19;
-        break;
-      case TX_16X16:
-        mask_id = mask_id_table_tx_16x16[bsize];
-        offset = 33;
-        break;
-      case TX_32X32:
-        mask_id = mask_id_table_tx_32x32[bsize];
-        offset = 42;
-        break;
-      case TX_64X64: mask_id = 46; break;
-      default: assert(!is_square_transform_size); return;
-    }
-    mask_id += offset;
-  } else if (half_ratio_tx_size_max32) {
-    int tx_size_equal_block_size = bsize == txsize_to_bsize[tx_size];
-    mask_id = 47 + 2 * (tx_size - TX_4X8) + (tx_size_equal_block_size ? 0 : 1);
-  } else if (tx_size == TX_32X64) {
-    mask_id = 59;
-  } else if (tx_size == TX_64X32) {
-    mask_id = 60;
-  } else {  // quarter ratio tx size
-    mask_id = 61 + (tx_size - TX_4X16);
-  }
-  int index = 0;
-  const int row = mi_row % MI_SIZE_64X64;
-  const int col = mi_col % MI_SIZE_64X64;
-  const int shift = get_index_shift(col, row, &index);
-  const int vert_shift = tx_size_y_vert <= TX_8X8 ? shift : col;
-  for (int i = 0; i + index < 4; ++i) {
-    // y vertical.
-    lfm->tx_size_ver[0][tx_size_y_horz].bits[i + index] |=
-        (left_mask_univariant_reordered[mask_id].bits[i] << vert_shift);
-    // y horizontal.
-    lfm->tx_size_hor[0][tx_size_y_vert].bits[i + index] |=
-        (above_mask_univariant_reordered[mask_id].bits[i] << shift);
-    // u/v vertical.
-    lfm->tx_size_ver[1][tx_size_uv_horz].bits[i + index] |=
-        (left_mask_univariant_reordered[mask_id].bits[i] << vert_shift);
-    // u/v horizontal.
-    lfm->tx_size_hor[1][tx_size_uv_vert].bits[i + index] |=
-        (above_mask_univariant_reordered[mask_id].bits[i] << shift);
-  }
-}
-
-void av1_store_bitmask_univariant_tx(AV1_COMMON *cm, int mi_row, int mi_col,
-                                     BLOCK_SIZE bsize, MB_MODE_INFO *mbmi) {
-  // Use a lookup table that provides one bitmask for a given block size and
-  // a univariant transform size.
-  int index;
-  int shift;
-  int row;
-  int col;
-  LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
-  const TX_SIZE tx_size_y_vert = txsize_vert_map[mbmi->tx_size];
-  const TX_SIZE tx_size_y_horz = txsize_horz_map[mbmi->tx_size];
-  const TX_SIZE tx_size_uv_vert = txsize_vert_map[av1_get_max_uv_txsize(
-      mbmi->bsize, cm->seq_params.subsampling_x, cm->seq_params.subsampling_y)];
-  const TX_SIZE tx_size_uv_horz = txsize_horz_map[av1_get_max_uv_txsize(
-      mbmi->bsize, cm->seq_params.subsampling_x, cm->seq_params.subsampling_y)];
-  const int is_square_transform_size = mbmi->tx_size <= TX_64X64;
-  int mask_id = 0;
-  int offset = 0;
-  const int half_ratio_tx_size_max32 =
-      (mbmi->tx_size > TX_64X64) & (mbmi->tx_size <= TX_32X16);
-  if (is_square_transform_size) {
-    switch (mbmi->tx_size) {
-      case TX_4X4: mask_id = mask_id_table_tx_4x4[bsize]; break;
-      case TX_8X8:
-        mask_id = mask_id_table_tx_8x8[bsize];
-        offset = 19;
-        break;
-      case TX_16X16:
-        mask_id = mask_id_table_tx_16x16[bsize];
-        offset = 33;
-        break;
-      case TX_32X32:
-        mask_id = mask_id_table_tx_32x32[bsize];
-        offset = 42;
-        break;
-      case TX_64X64: mask_id = 46; break;
-      default: assert(!is_square_transform_size); return;
-    }
-    mask_id += offset;
-  } else if (half_ratio_tx_size_max32) {
-    int tx_size_equal_block_size = bsize == txsize_to_bsize[mbmi->tx_size];
-    mask_id =
-        47 + 2 * (mbmi->tx_size - TX_4X8) + (tx_size_equal_block_size ? 0 : 1);
-  } else if (mbmi->tx_size == TX_32X64) {
-    mask_id = 59;
-  } else if (mbmi->tx_size == TX_64X32) {
-    mask_id = 60;
-  } else {  // quarter ratio tx size
-    mask_id = 61 + (mbmi->tx_size - TX_4X16);
-  }
-  row = mi_row % MI_SIZE_64X64;
-  col = mi_col % MI_SIZE_64X64;
-  shift = get_index_shift(col, row, &index);
-  const int vert_shift = tx_size_y_vert <= TX_8X8 ? shift : col;
-  for (int i = 0; i + index < 4; ++i) {
-    // y vertical.
-    lfm->tx_size_ver[0][tx_size_y_horz].bits[i + index] |=
-        (left_mask_univariant_reordered[mask_id].bits[i] << vert_shift);
-    // y horizontal.
-    lfm->tx_size_hor[0][tx_size_y_vert].bits[i + index] |=
-        (above_mask_univariant_reordered[mask_id].bits[i] << shift);
-    // u/v vertical.
-    lfm->tx_size_ver[1][tx_size_uv_horz].bits[i + index] |=
-        (left_mask_univariant_reordered[mask_id].bits[i] << vert_shift);
-    // u/v horizontal.
-    lfm->tx_size_hor[1][tx_size_uv_vert].bits[i + index] |=
-        (above_mask_univariant_reordered[mask_id].bits[i] << shift);
-  }
-}
-
-void av1_store_bitmask_other_info(AV1_COMMON *cm, int mi_row, int mi_col,
-                                  BLOCK_SIZE bsize, MB_MODE_INFO *mbmi,
-                                  int is_horz_coding_block_border,
-                                  int is_vert_coding_block_border) {
-  int index;
-  int shift;
-  int row;
-  LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
-  const int row_start = mi_row % MI_SIZE_64X64;
-  const int col_start = mi_col % MI_SIZE_64X64;
-  shift = get_index_shift(col_start, row_start, &index);
-  if (is_horz_coding_block_border) {
-    const int block_shift = shift + mi_size_wide[bsize];
-    assert(block_shift <= 64);
-    const uint64_t right_edge_shift =
-        (block_shift == 64) ? 0xffffffffffffffff : ((uint64_t)1 << block_shift);
-    const uint64_t left_edge_shift = (block_shift == 64)
-                                         ? (((uint64_t)1 << shift) - 1)
-                                         : ((uint64_t)1 << shift);
-    assert(right_edge_shift > left_edge_shift);
-    const uint64_t top_edge_mask = right_edge_shift - left_edge_shift;
-    lfm->is_horz_border.bits[index] |= top_edge_mask;
-  }
-  if (is_vert_coding_block_border) {
-    const int is_vert_border = mask_id_table_vert_border[bsize];
-    const int vert_shift = block_size_high[bsize] <= 8 ? shift : col_start;
-    for (int i = 0; i + index < 4; ++i) {
-      lfm->is_vert_border.bits[i + index] |=
-          (left_mask_univariant_reordered[is_vert_border].bits[i]
-           << vert_shift);
-    }
-  }
-  const int is_skip = is_inter_block(mbmi) && mbmi->skip_mode;
-  if (is_skip) {
-    const int is_skip_mask = mask_id_table_tx_4x4[bsize];
-    for (int i = 0; i + index < 4; ++i) {
-      lfm->skip.bits[i + index] |=
-          (above_mask_univariant_reordered[is_skip_mask].bits[i] << shift);
-    }
-  }
-  const uint8_t level_vert_y =
-      av1_get_filter_level(cm, &cm->lf_info, 0, 0, mbmi);
-  const uint8_t level_horz_y =
-      av1_get_filter_level(cm, &cm->lf_info, 1, 0, mbmi);
-  const uint8_t level_u = av1_get_filter_level(cm, &cm->lf_info, 0, 1, mbmi);
-  const uint8_t level_v = av1_get_filter_level(cm, &cm->lf_info, 0, 2, mbmi);
-  for (int r = mi_row; r < mi_row + mi_size_high[bsize]; r++) {
-    index = 0;
-    row = r % MI_SIZE_64X64;
-    memset(&lfm->lfl_y_ver[row][col_start], level_vert_y,
-           sizeof(uint8_t) * mi_size_wide[bsize]);
-    memset(&lfm->lfl_y_hor[row][col_start], level_horz_y,
-           sizeof(uint8_t) * mi_size_wide[bsize]);
-    memset(&lfm->lfl_u_ver[row][col_start], level_u,
-           sizeof(uint8_t) * mi_size_wide[bsize]);
-    memset(&lfm->lfl_u_hor[row][col_start], level_u,
-           sizeof(uint8_t) * mi_size_wide[bsize]);
-    memset(&lfm->lfl_v_ver[row][col_start], level_v,
-           sizeof(uint8_t) * mi_size_wide[bsize]);
-    memset(&lfm->lfl_v_hor[row][col_start], level_v,
-           sizeof(uint8_t) * mi_size_wide[bsize]);
-  }
-}
-#endif  // CONFIG_LPF_MASK
diff --git a/av1/common/mv.h b/av1/common/mv.h
index be539e8..3203bf7 100644
--- a/av1/common/mv.h
+++ b/av1/common/mv.h
@@ -12,6 +12,8 @@
 #ifndef AOM_AV1_COMMON_MV_H_
 #define AOM_AV1_COMMON_MV_H_
 
+#include <stdlib.h>
+
 #include "av1/common/common.h"
 #include "av1/common/common_data.h"
 #include "aom_dsp/aom_filter.h"
diff --git a/av1/common/mvref_common.c b/av1/common/mvref_common.c
index 04e050a..837df2c 100644
--- a/av1/common/mvref_common.c
+++ b/av1/common/mvref_common.c
@@ -258,7 +258,7 @@
 
 static int has_top_right(const AV1_COMMON *cm, const MACROBLOCKD *xd,
                          int mi_row, int mi_col, int bs) {
-  const int sb_mi_size = mi_size_wide[cm->seq_params.sb_size];
+  const int sb_mi_size = mi_size_wide[cm->seq_params->sb_size];
   const int mask_row = mi_row & (sb_mi_size - 1);
   const int mask_col = mi_col & (sb_mi_size - 1);
 
@@ -347,7 +347,7 @@
   const int cur_frame_index = cm->cur_frame->order_hint;
   const RefCntBuffer *const buf_0 = get_ref_frame_buf(cm, rf[0]);
   const int frame0_index = buf_0->order_hint;
-  const int cur_offset_0 = get_relative_dist(&cm->seq_params.order_hint_info,
+  const int cur_offset_0 = get_relative_dist(&cm->seq_params->order_hint_info,
                                              cur_frame_index, frame0_index);
   int idx;
   const int allow_high_precision_mv = cm->features.allow_high_precision_mv;
@@ -380,7 +380,7 @@
     // Process compound inter mode
     const RefCntBuffer *const buf_1 = get_ref_frame_buf(cm, rf[1]);
     const int frame1_index = buf_1->order_hint;
-    const int cur_offset_1 = get_relative_dist(&cm->seq_params.order_hint_info,
+    const int cur_offset_1 = get_relative_dist(&cm->seq_params->order_hint_info,
                                                cur_frame_index, frame1_index);
     int_mv comp_refmv;
     get_mv_projection(&comp_refmv.as_mv, prev_frame_mvs->mfmv0.as_mv,
@@ -838,7 +838,9 @@
 void av1_setup_frame_buf_refs(AV1_COMMON *cm) {
   cm->cur_frame->order_hint = cm->current_frame.order_hint;
   cm->cur_frame->display_order_hint = cm->current_frame.display_order_hint;
-
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  cm->cur_frame->pyramid_level = cm->current_frame.pyramid_level;
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
   MV_REFERENCE_FRAME ref_frame;
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
     const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
@@ -854,10 +856,10 @@
   MV_REFERENCE_FRAME ref_frame;
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
     const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
-    if (cm->seq_params.order_hint_info.enable_order_hint && buf != NULL) {
+    if (cm->seq_params->order_hint_info.enable_order_hint && buf != NULL) {
       const int ref_order_hint = buf->order_hint;
       cm->ref_frame_sign_bias[ref_frame] =
-          (get_relative_dist(&cm->seq_params.order_hint_info, ref_order_hint,
+          (get_relative_dist(&cm->seq_params->order_hint_info, ref_order_hint,
                              (int)cm->current_frame.order_hint) <= 0)
               ? 0
               : 1;
@@ -930,10 +932,10 @@
       &start_frame_buf->ref_order_hints[0];
   const int cur_order_hint = cm->cur_frame->order_hint;
   int start_to_current_frame_offset = get_relative_dist(
-      &cm->seq_params.order_hint_info, start_frame_order_hint, cur_order_hint);
+      &cm->seq_params->order_hint_info, start_frame_order_hint, cur_order_hint);
 
   for (MV_REFERENCE_FRAME rf = LAST_FRAME; rf <= INTER_REFS_PER_FRAME; ++rf) {
-    ref_offset[rf] = get_relative_dist(&cm->seq_params.order_hint_info,
+    ref_offset[rf] = get_relative_dist(&cm->seq_params->order_hint_info,
                                        start_frame_order_hint,
                                        ref_order_hints[rf - LAST_FRAME]);
   }
@@ -980,12 +982,34 @@
   return 1;
 }
 
-void av1_setup_motion_field(AV1_COMMON *cm) {
-  const OrderHintInfo *const order_hint_info = &cm->seq_params.order_hint_info;
+// cm->ref_frame_side is calculated here, and will be used in
+// av1_copy_frame_mvs() to affect how mvs are copied.
+void av1_calculate_ref_frame_side(AV1_COMMON *cm) {
+  const OrderHintInfo *const order_hint_info = &cm->seq_params->order_hint_info;
 
   memset(cm->ref_frame_side, 0, sizeof(cm->ref_frame_side));
   if (!order_hint_info->enable_order_hint) return;
 
+  const int cur_order_hint = cm->cur_frame->order_hint;
+
+  for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
+    const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
+    int order_hint = 0;
+
+    if (buf != NULL) order_hint = buf->order_hint;
+
+    if (get_relative_dist(order_hint_info, order_hint, cur_order_hint) > 0)
+      cm->ref_frame_side[ref_frame] = 1;
+    else if (order_hint == cur_order_hint)
+      cm->ref_frame_side[ref_frame] = -1;
+  }
+}
+
+void av1_setup_motion_field(AV1_COMMON *cm) {
+  const OrderHintInfo *const order_hint_info = &cm->seq_params->order_hint_info;
+
+  if (!order_hint_info->enable_order_hint) return;
+
   TPL_MV_REF *tpl_mvs_base = cm->tpl_mvs;
   int size = ((cm->mi_params.mi_rows + MAX_MIB_SIZE) >> 1) *
              (cm->mi_params.mi_stride >> 1);
@@ -995,7 +1019,6 @@
   }
 
   const int cur_order_hint = cm->cur_frame->order_hint;
-
   const RefCntBuffer *ref_buf[INTER_REFS_PER_FRAME];
   int ref_order_hint[INTER_REFS_PER_FRAME];
 
@@ -1008,11 +1031,6 @@
 
     ref_buf[ref_idx] = buf;
     ref_order_hint[ref_idx] = order_hint;
-
-    if (get_relative_dist(order_hint_info, order_hint, cur_order_hint) > 0)
-      cm->ref_frame_side[ref_frame] = 1;
-    else if (order_hint == cur_order_hint)
-      cm->ref_frame_side[ref_frame] = -1;
   }
 
   int ref_stamp = MFMV_STACK_SIZE - 1;
@@ -1219,7 +1237,7 @@
 }
 
 void av1_setup_skip_mode_allowed(AV1_COMMON *cm) {
-  const OrderHintInfo *const order_hint_info = &cm->seq_params.order_hint_info;
+  const OrderHintInfo *const order_hint_info = &cm->seq_params->order_hint_info;
   SkipModeInfo *const skip_mode_info = &cm->current_frame.skip_mode_info;
 
   skip_mode_info->skip_mode_allowed = 0;
@@ -1323,11 +1341,11 @@
   int lst_frame_sort_idx = -1;
   int gld_frame_sort_idx = -1;
 
-  assert(cm->seq_params.order_hint_info.enable_order_hint);
-  assert(cm->seq_params.order_hint_info.order_hint_bits_minus_1 >= 0);
+  assert(cm->seq_params->order_hint_info.enable_order_hint);
+  assert(cm->seq_params->order_hint_info.order_hint_bits_minus_1 >= 0);
   const int cur_order_hint = (int)cm->current_frame.order_hint;
   const int cur_frame_sort_idx =
-      1 << cm->seq_params.order_hint_info.order_hint_bits_minus_1;
+      1 << cm->seq_params->order_hint_info.order_hint_bits_minus_1;
 
   REF_FRAME_INFO ref_frame_info[REF_FRAMES];
   int ref_flag_list[INTER_REFS_PER_FRAME] = { 0, 0, 0, 0, 0, 0, 0 };
@@ -1349,7 +1367,7 @@
     ref_frame_info[i].sort_idx =
         (offset == -1) ? -1
                        : cur_frame_sort_idx +
-                             get_relative_dist(&cm->seq_params.order_hint_info,
+                             get_relative_dist(&cm->seq_params->order_hint_info,
                                                offset, cur_order_hint);
     assert(ref_frame_info[i].sort_idx >= -1);
 
@@ -1360,11 +1378,11 @@
   // Confirm both LAST_FRAME and GOLDEN_FRAME are valid forward reference
   // frames.
   if (lst_frame_sort_idx == -1 || lst_frame_sort_idx >= cur_frame_sort_idx) {
-    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+    aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME,
                        "Inter frame requests a look-ahead frame as LAST");
   }
   if (gld_frame_sort_idx == -1 || gld_frame_sort_idx >= cur_frame_sort_idx) {
-    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+    aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME,
                        "Inter frame requests a look-ahead frame as GOLDEN");
   }
 
diff --git a/av1/common/mvref_common.h b/av1/common/mvref_common.h
index 05a0dbc..3ab784c 100644
--- a/av1/common/mvref_common.h
+++ b/av1/common/mvref_common.h
@@ -201,6 +201,7 @@
 void av1_setup_frame_buf_refs(AV1_COMMON *cm);
 void av1_setup_frame_sign_bias(AV1_COMMON *cm);
 void av1_setup_skip_mode_allowed(AV1_COMMON *cm);
+void av1_calculate_ref_frame_side(AV1_COMMON *cm);
 void av1_setup_motion_field(AV1_COMMON *cm);
 void av1_set_frame_refs(AV1_COMMON *const cm, int *remapped_ref_idx,
                         int lst_map_idx, int gld_map_idx);
diff --git a/av1/common/pred_common.h b/av1/common/pred_common.h
index 12bcce8..3db9dd6 100644
--- a/av1/common/pred_common.h
+++ b/av1/common/pred_common.h
@@ -107,9 +107,9 @@
   if (bck_buf != NULL) bck_frame_index = bck_buf->order_hint;
   if (fwd_buf != NULL) fwd_frame_index = fwd_buf->order_hint;
 
-  int fwd = abs(get_relative_dist(&cm->seq_params.order_hint_info,
+  int fwd = abs(get_relative_dist(&cm->seq_params->order_hint_info,
                                   fwd_frame_index, cur_frame_index));
-  int bck = abs(get_relative_dist(&cm->seq_params.order_hint_info,
+  int bck = abs(get_relative_dist(&cm->seq_params->order_hint_info,
                                   cur_frame_index, bck_frame_index));
 
   const MB_MODE_INFO *const above_mi = xd->above_mbmi;
diff --git a/av1/common/reconinter.c b/av1/common/reconinter.c
index ad155b2..2f0850b 100644
--- a/av1/common/reconinter.c
+++ b/av1/common/reconinter.c
@@ -104,8 +104,13 @@
 
   if (av1_allow_warp(mi, warp_types, &xd->global_motion[mi->ref_frame[ref]], 0,
                      inter_pred_params->scale_factors,
-                     &inter_pred_params->warp_params))
+                     &inter_pred_params->warp_params)) {
+#if CONFIG_REALTIME_ONLY
+    aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_FEATURE,
+                       "Warped motion is disabled in realtime only build.");
+#endif
     inter_pred_params->mode = WARP_PRED;
+  }
 }
 
 void av1_make_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
@@ -154,6 +159,9 @@
         inter_pred_params->subsampling_y, &inter_pred_params->conv_params);
   }
 #endif
+  else {
+    assert(0 && "Unsupported inter_pred_params->mode");
+  }
 }
 
 static const uint8_t wedge_master_oblique_odd[MASK_MASTER_SIZE] = {
@@ -713,8 +721,8 @@
 }
 
 void av1_dist_wtd_comp_weight_assign(const AV1_COMMON *cm,
-                                     const MB_MODE_INFO *mbmi, int order_idx,
-                                     int *fwd_offset, int *bck_offset,
+                                     const MB_MODE_INFO *mbmi, int *fwd_offset,
+                                     int *bck_offset,
                                      int *use_dist_wtd_comp_avg,
                                      int is_compound) {
   assert(fwd_offset != NULL && bck_offset != NULL);
@@ -734,18 +742,18 @@
   if (bck_buf != NULL) bck_frame_index = bck_buf->order_hint;
   if (fwd_buf != NULL) fwd_frame_index = fwd_buf->order_hint;
 
-  int d0 = clamp(abs(get_relative_dist(&cm->seq_params.order_hint_info,
+  int d0 = clamp(abs(get_relative_dist(&cm->seq_params->order_hint_info,
                                        fwd_frame_index, cur_frame_index)),
                  0, MAX_FRAME_DISTANCE);
-  int d1 = clamp(abs(get_relative_dist(&cm->seq_params.order_hint_info,
+  int d1 = clamp(abs(get_relative_dist(&cm->seq_params->order_hint_info,
                                        cur_frame_index, bck_frame_index)),
                  0, MAX_FRAME_DISTANCE);
 
   const int order = d0 <= d1;
 
   if (d0 == 0 || d1 == 0) {
-    *fwd_offset = quant_dist_lookup_table[order_idx][3][order];
-    *bck_offset = quant_dist_lookup_table[order_idx][3][1 - order];
+    *fwd_offset = quant_dist_lookup_table[3][order];
+    *bck_offset = quant_dist_lookup_table[3][1 - order];
     return;
   }
 
@@ -758,8 +766,8 @@
     if ((d0 > d1 && d0_c0 < d1_c1) || (d0 <= d1 && d0_c0 > d1_c1)) break;
   }
 
-  *fwd_offset = quant_dist_lookup_table[order_idx][i][order];
-  *bck_offset = quant_dist_lookup_table[order_idx][i][1 - order];
+  *fwd_offset = quant_dist_lookup_table[i][order];
+  *bck_offset = quant_dist_lookup_table[i][1 - order];
 }
 
 // True if the following hold:
@@ -911,7 +919,7 @@
         ref, plane, xd->tmp_conv_dst, MAX_SB_SIZE, is_compound, xd->bd);
 
     av1_dist_wtd_comp_weight_assign(
-        cm, mi, 0, &inter_pred_params.conv_params.fwd_offset,
+        cm, mi, &inter_pred_params.conv_params.fwd_offset,
         &inter_pred_params.conv_params.bck_offset,
         &inter_pred_params.conv_params.use_dist_wtd_comp_avg, is_compound);
 
@@ -1189,7 +1197,6 @@
 
 void av1_setup_obmc_dst_bufs(MACROBLOCKD *xd, uint8_t **dst_buf1,
                              uint8_t **dst_buf2) {
-#if CONFIG_AV1_HIGHBITDEPTH
   if (is_cur_buf_hbd(xd)) {
     int len = sizeof(uint16_t);
     dst_buf1[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0]);
@@ -1203,16 +1210,13 @@
     dst_buf2[2] =
         CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2 * len);
   } else {
-#endif  // CONFIG_AV1_HIGHBITDEPTH
     dst_buf1[0] = xd->tmp_obmc_bufs[0];
     dst_buf1[1] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE;
     dst_buf1[2] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * 2;
     dst_buf2[0] = xd->tmp_obmc_bufs[1];
     dst_buf2[1] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE;
     dst_buf2[2] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * 2;
-#if CONFIG_AV1_HIGHBITDEPTH
   }
-#endif  // CONFIG_AV1_HIGHBITDEPTH
 }
 
 void av1_setup_build_prediction_by_above_pred(
@@ -1363,10 +1367,12 @@
   assert(xd->mi[0]->angle_delta[PLANE_TYPE_UV] == 0);
   assert(xd->mi[0]->filter_intra_mode_info.use_filter_intra == 0);
   assert(xd->mi[0]->use_intrabc == 0);
+  const SequenceHeader *seq_params = cm->seq_params;
 
-  av1_predict_intra_block(cm, xd, pd->width, pd->height,
-                          max_txsize_rect_lookup[plane_bsize], mode, 0, 0,
-                          FILTER_INTRA_MODES, ctx->plane[plane],
+  av1_predict_intra_block(xd, seq_params->sb_size,
+                          seq_params->enable_intra_edge_filter, pd->width,
+                          pd->height, max_txsize_rect_lookup[plane_bsize], mode,
+                          0, 0, FILTER_INTRA_MODES, ctx->plane[plane],
                           ctx->stride[plane], dst, dst_stride, 0, 0, plane);
 }
 
diff --git a/av1/common/reconinter.h b/av1/common/reconinter.h
index c869616..056dc67 100644
--- a/av1/common/reconinter.h
+++ b/av1/common/reconinter.h
@@ -368,8 +368,8 @@
 }
 
 void av1_dist_wtd_comp_weight_assign(const AV1_COMMON *cm,
-                                     const MB_MODE_INFO *mbmi, int order_idx,
-                                     int *fwd_offset, int *bck_offset,
+                                     const MB_MODE_INFO *mbmi, int *fwd_offset,
+                                     int *bck_offset,
                                      int *use_dist_wtd_comp_avg,
                                      int is_compound);
 
diff --git a/av1/common/reconintra.c b/av1/common/reconintra.c
index 0c01f92..9832d4f 100644
--- a/av1/common/reconintra.c
+++ b/av1/common/reconintra.c
@@ -19,7 +19,6 @@
 #include "aom_mem/aom_mem.h"
 #include "aom_ports/aom_once.h"
 #include "aom_ports/mem.h"
-#include "aom_ports/system_state.h"
 #include "av1/common/av1_common_int.h"
 #include "av1/common/cfl.h"
 #include "av1/common/reconintra.h"
@@ -193,7 +192,7 @@
   return ret;
 }
 
-static int has_top_right(const AV1_COMMON *cm, BLOCK_SIZE bsize, int mi_row,
+static int has_top_right(BLOCK_SIZE sb_size, BLOCK_SIZE bsize, int mi_row,
                          int mi_col, int top_available, int right_available,
                          PARTITION_TYPE partition, TX_SIZE txsz, int row_off,
                          int col_off, int ss_x, int ss_y) {
@@ -223,7 +222,7 @@
 
     const int bw_in_mi_log2 = mi_size_wide_log2[bsize];
     const int bh_in_mi_log2 = mi_size_high_log2[bsize];
-    const int sb_mi_size = mi_size_high[cm->seq_params.sb_size];
+    const int sb_mi_size = mi_size_high[sb_size];
     const int blk_row_in_sb = (mi_row & (sb_mi_size - 1)) >> bh_in_mi_log2;
     const int blk_col_in_sb = (mi_col & (sb_mi_size - 1)) >> bw_in_mi_log2;
 
@@ -378,7 +377,7 @@
   return ret;
 }
 
-static int has_bottom_left(const AV1_COMMON *cm, BLOCK_SIZE bsize, int mi_row,
+static int has_bottom_left(BLOCK_SIZE sb_size, BLOCK_SIZE bsize, int mi_row,
                            int mi_col, int bottom_available, int left_available,
                            PARTITION_TYPE partition, TX_SIZE txsz, int row_off,
                            int col_off, int ss_x, int ss_y) {
@@ -415,7 +414,7 @@
 
     const int bw_in_mi_log2 = mi_size_wide_log2[bsize];
     const int bh_in_mi_log2 = mi_size_high_log2[bsize];
-    const int sb_mi_size = mi_size_high[cm->seq_params.sb_size];
+    const int sb_mi_size = mi_size_high[sb_size];
     const int blk_row_in_sb = (mi_row & (sb_mi_size - 1)) >> bh_in_mi_log2;
     const int blk_col_in_sb = (mi_col & (sb_mi_size - 1)) >> bw_in_mi_log2;
 
@@ -971,7 +970,7 @@
   }
 }
 
-static int get_filt_type(const MACROBLOCKD *xd, int plane) {
+static int get_intra_edge_filter_type(const MACROBLOCKD *xd, int plane) {
   int ab_sm, le_sm;
 
   if (plane == 0) {
@@ -1144,11 +1143,11 @@
 }
 #if CONFIG_AV1_HIGHBITDEPTH
 static void build_intra_predictors_high(
-    const MACROBLOCKD *xd, const uint8_t *ref8, int ref_stride, uint8_t *dst8,
-    int dst_stride, PREDICTION_MODE mode, int angle_delta,
-    FILTER_INTRA_MODE filter_intra_mode, TX_SIZE tx_size,
-    int disable_edge_filter, int n_top_px, int n_topright_px, int n_left_px,
-    int n_bottomleft_px, int plane) {
+    const uint8_t *ref8, int ref_stride, uint8_t *dst8, int dst_stride,
+    PREDICTION_MODE mode, int angle_delta, FILTER_INTRA_MODE filter_intra_mode,
+    TX_SIZE tx_size, int disable_edge_filter, int n_top_px, int n_topright_px,
+    int n_left_px, int n_bottomleft_px, int intra_edge_filter_type,
+    int bit_depth) {
   int i;
   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
@@ -1166,7 +1165,7 @@
   int p_angle = 0;
   const int is_dr_mode = av1_is_directional_mode(mode);
   const int use_filter_intra = filter_intra_mode != FILTER_INTRA_MODES;
-  int base = 128 << (xd->bd - 8);
+  int base = 128 << (bit_depth - 8);
   // The left_data, above_data buffers must be zeroed to fix some intermittent
   // valgrind errors. Uninitialized reads in intra pred modules (e.g. width = 4
   // path in av1_highbd_dr_prediction_z2_avx2()) from left_data, above_data are
@@ -1270,7 +1269,7 @@
 
   if (use_filter_intra) {
     highbd_filter_intra_predictor(dst, dst_stride, tx_size, above_row, left_col,
-                                  filter_intra_mode, xd->bd);
+                                  filter_intra_mode, bit_depth);
     return;
   }
 
@@ -1280,61 +1279,57 @@
     if (!disable_edge_filter) {
       const int need_right = p_angle < 90;
       const int need_bottom = p_angle > 180;
-      const int filt_type = get_filt_type(xd, plane);
       if (p_angle != 90 && p_angle != 180) {
         const int ab_le = need_above_left ? 1 : 0;
         if (need_above && need_left && (txwpx + txhpx >= 24)) {
           filter_intra_edge_corner_high(above_row, left_col);
         }
         if (need_above && n_top_px > 0) {
-          const int strength =
-              intra_edge_filter_strength(txwpx, txhpx, p_angle - 90, filt_type);
+          const int strength = intra_edge_filter_strength(
+              txwpx, txhpx, p_angle - 90, intra_edge_filter_type);
           const int n_px = n_top_px + ab_le + (need_right ? txhpx : 0);
           av1_filter_intra_edge_high(above_row - ab_le, n_px, strength);
         }
         if (need_left && n_left_px > 0) {
           const int strength = intra_edge_filter_strength(
-              txhpx, txwpx, p_angle - 180, filt_type);
+              txhpx, txwpx, p_angle - 180, intra_edge_filter_type);
           const int n_px = n_left_px + ab_le + (need_bottom ? txwpx : 0);
           av1_filter_intra_edge_high(left_col - ab_le, n_px, strength);
         }
       }
-      upsample_above =
-          av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90, filt_type);
+      upsample_above = av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90,
+                                                   intra_edge_filter_type);
       if (need_above && upsample_above) {
         const int n_px = txwpx + (need_right ? txhpx : 0);
-        av1_upsample_intra_edge_high(above_row, n_px, xd->bd);
+        av1_upsample_intra_edge_high(above_row, n_px, bit_depth);
       }
-      upsample_left =
-          av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180, filt_type);
+      upsample_left = av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180,
+                                                  intra_edge_filter_type);
       if (need_left && upsample_left) {
         const int n_px = txhpx + (need_bottom ? txwpx : 0);
-        av1_upsample_intra_edge_high(left_col, n_px, xd->bd);
+        av1_upsample_intra_edge_high(left_col, n_px, bit_depth);
       }
     }
     highbd_dr_predictor(dst, dst_stride, tx_size, above_row, left_col,
-                        upsample_above, upsample_left, p_angle, xd->bd);
+                        upsample_above, upsample_left, p_angle, bit_depth);
     return;
   }
 
   // predict
   if (mode == DC_PRED) {
     dc_pred_high[n_left_px > 0][n_top_px > 0][tx_size](
-        dst, dst_stride, above_row, left_col, xd->bd);
+        dst, dst_stride, above_row, left_col, bit_depth);
   } else {
-    pred_high[mode][tx_size](dst, dst_stride, above_row, left_col, xd->bd);
+    pred_high[mode][tx_size](dst, dst_stride, above_row, left_col, bit_depth);
   }
 }
 #endif  // CONFIG_AV1_HIGHBITDEPTH
 
-static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref,
-                                   int ref_stride, uint8_t *dst, int dst_stride,
-                                   PREDICTION_MODE mode, int angle_delta,
-                                   FILTER_INTRA_MODE filter_intra_mode,
-                                   TX_SIZE tx_size, int disable_edge_filter,
-                                   int n_top_px, int n_topright_px,
-                                   int n_left_px, int n_bottomleft_px,
-                                   int plane) {
+static void build_intra_predictors(
+    const uint8_t *ref, int ref_stride, uint8_t *dst, int dst_stride,
+    PREDICTION_MODE mode, int angle_delta, FILTER_INTRA_MODE filter_intra_mode,
+    TX_SIZE tx_size, int disable_edge_filter, int n_top_px, int n_topright_px,
+    int n_left_px, int n_bottomleft_px, int intra_edge_filter_type) {
   int i;
   const uint8_t *above_ref = ref - ref_stride;
   const uint8_t *left_ref = ref - 1;
@@ -1462,33 +1457,32 @@
     if (!disable_edge_filter) {
       const int need_right = p_angle < 90;
       const int need_bottom = p_angle > 180;
-      const int filt_type = get_filt_type(xd, plane);
       if (p_angle != 90 && p_angle != 180) {
         const int ab_le = need_above_left ? 1 : 0;
         if (need_above && need_left && (txwpx + txhpx >= 24)) {
           filter_intra_edge_corner(above_row, left_col);
         }
         if (need_above && n_top_px > 0) {
-          const int strength =
-              intra_edge_filter_strength(txwpx, txhpx, p_angle - 90, filt_type);
+          const int strength = intra_edge_filter_strength(
+              txwpx, txhpx, p_angle - 90, intra_edge_filter_type);
           const int n_px = n_top_px + ab_le + (need_right ? txhpx : 0);
           av1_filter_intra_edge(above_row - ab_le, n_px, strength);
         }
         if (need_left && n_left_px > 0) {
           const int strength = intra_edge_filter_strength(
-              txhpx, txwpx, p_angle - 180, filt_type);
+              txhpx, txwpx, p_angle - 180, intra_edge_filter_type);
           const int n_px = n_left_px + ab_le + (need_bottom ? txwpx : 0);
           av1_filter_intra_edge(left_col - ab_le, n_px, strength);
         }
       }
-      upsample_above =
-          av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90, filt_type);
+      upsample_above = av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90,
+                                                   intra_edge_filter_type);
       if (need_above && upsample_above) {
         const int n_px = txwpx + (need_right ? txhpx : 0);
         av1_upsample_intra_edge(above_row, n_px);
       }
-      upsample_left =
-          av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180, filt_type);
+      upsample_left = av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180,
+                                                  intra_edge_filter_type);
       if (need_left && upsample_left) {
         const int n_px = txhpx + (need_bottom ? txwpx : 0);
         av1_upsample_intra_edge(left_col, n_px);
@@ -1559,11 +1553,14 @@
   return bs;
 }
 
-void av1_predict_intra_block(
-    const AV1_COMMON *cm, const MACROBLOCKD *xd, int wpx, int hpx,
-    TX_SIZE tx_size, PREDICTION_MODE mode, int angle_delta, int use_palette,
-    FILTER_INTRA_MODE filter_intra_mode, const uint8_t *ref, int ref_stride,
-    uint8_t *dst, int dst_stride, int col_off, int row_off, int plane) {
+void av1_predict_intra_block(const MACROBLOCKD *xd, BLOCK_SIZE sb_size,
+                             int enable_intra_edge_filter, int wpx, int hpx,
+                             TX_SIZE tx_size, PREDICTION_MODE mode,
+                             int angle_delta, int use_palette,
+                             FILTER_INTRA_MODE filter_intra_mode,
+                             const uint8_t *ref, int ref_stride, uint8_t *dst,
+                             int dst_stride, int col_off, int row_off,
+                             int plane) {
   const MB_MODE_INFO *const mbmi = xd->mi[0];
   const int txwpx = tx_size_wide[tx_size];
   const int txhpx = tx_size_high[tx_size];
@@ -1626,32 +1623,32 @@
   }
 
   const int have_top_right =
-      has_top_right(cm, bsize, mi_row, mi_col, have_top, right_available,
+      has_top_right(sb_size, bsize, mi_row, mi_col, have_top, right_available,
                     partition, tx_size, row_off, col_off, ss_x, ss_y);
-  const int have_bottom_left =
-      has_bottom_left(cm, bsize, mi_row, mi_col, bottom_available, have_left,
-                      partition, tx_size, row_off, col_off, ss_x, ss_y);
+  const int have_bottom_left = has_bottom_left(
+      sb_size, bsize, mi_row, mi_col, bottom_available, have_left, partition,
+      tx_size, row_off, col_off, ss_x, ss_y);
 
-  const int disable_edge_filter = !cm->seq_params.enable_intra_edge_filter;
+  const int disable_edge_filter = !enable_intra_edge_filter;
+  const int intra_edge_filter_type = get_intra_edge_filter_type(xd, plane);
 #if CONFIG_AV1_HIGHBITDEPTH
   if (is_cur_buf_hbd(xd)) {
     build_intra_predictors_high(
-        xd, ref, ref_stride, dst, dst_stride, mode, angle_delta,
-        filter_intra_mode, tx_size, disable_edge_filter,
-        have_top ? AOMMIN(txwpx, xr + txwpx) : 0,
+        ref, ref_stride, dst, dst_stride, mode, angle_delta, filter_intra_mode,
+        tx_size, disable_edge_filter, have_top ? AOMMIN(txwpx, xr + txwpx) : 0,
         have_top_right ? AOMMIN(txwpx, xr) : 0,
         have_left ? AOMMIN(txhpx, yd + txhpx) : 0,
-        have_bottom_left ? AOMMIN(txhpx, yd) : 0, plane);
+        have_bottom_left ? AOMMIN(txhpx, yd) : 0, intra_edge_filter_type,
+        xd->bd);
     return;
   }
 #endif
-  build_intra_predictors(xd, ref, ref_stride, dst, dst_stride, mode,
-                         angle_delta, filter_intra_mode, tx_size,
-                         disable_edge_filter,
-                         have_top ? AOMMIN(txwpx, xr + txwpx) : 0,
-                         have_top_right ? AOMMIN(txwpx, xr) : 0,
-                         have_left ? AOMMIN(txhpx, yd + txhpx) : 0,
-                         have_bottom_left ? AOMMIN(txhpx, yd) : 0, plane);
+  build_intra_predictors(
+      ref, ref_stride, dst, dst_stride, mode, angle_delta, filter_intra_mode,
+      tx_size, disable_edge_filter, have_top ? AOMMIN(txwpx, xr + txwpx) : 0,
+      have_top_right ? AOMMIN(txwpx, xr) : 0,
+      have_left ? AOMMIN(txhpx, yd + txhpx) : 0,
+      have_bottom_left ? AOMMIN(txhpx, yd) : 0, intra_edge_filter_type);
 }
 
 void av1_predict_intra_block_facade(const AV1_COMMON *cm, MACROBLOCKD *xd,
@@ -1669,6 +1666,7 @@
           ? mbmi->filter_intra_mode_info.filter_intra_mode
           : FILTER_INTRA_MODES;
   const int angle_delta = mbmi->angle_delta[plane != AOM_PLANE_Y] * ANGLE_STEP;
+  const SequenceHeader *seq_params = cm->seq_params;
 
   if (plane != AOM_PLANE_Y && mbmi->uv_mode == UV_CFL_PRED) {
 #if CONFIG_DEBUG
@@ -1687,10 +1685,11 @@
     CFL_CTX *const cfl = &xd->cfl;
     CFL_PRED_TYPE pred_plane = get_cfl_pred_type(plane);
     if (cfl->dc_pred_is_cached[pred_plane] == 0) {
-      av1_predict_intra_block(cm, xd, pd->width, pd->height, tx_size, mode,
-                              angle_delta, use_palette, filter_intra_mode, dst,
-                              dst_stride, dst, dst_stride, blk_col, blk_row,
-                              plane);
+      av1_predict_intra_block(xd, seq_params->sb_size,
+                              seq_params->enable_intra_edge_filter, pd->width,
+                              pd->height, tx_size, mode, angle_delta,
+                              use_palette, filter_intra_mode, dst, dst_stride,
+                              dst, dst_stride, blk_col, blk_row, plane);
       if (cfl->use_dc_pred_cache) {
         cfl_store_dc_pred(xd, dst, pred_plane, tx_size_wide[tx_size]);
         cfl->dc_pred_is_cached[pred_plane] = 1;
@@ -1701,9 +1700,10 @@
     cfl_predict_block(xd, dst, dst_stride, tx_size, plane);
     return;
   }
-  av1_predict_intra_block(cm, xd, pd->width, pd->height, tx_size, mode,
-                          angle_delta, use_palette, filter_intra_mode, dst,
-                          dst_stride, dst, dst_stride, blk_col, blk_row, plane);
+  av1_predict_intra_block(
+      xd, seq_params->sb_size, seq_params->enable_intra_edge_filter, pd->width,
+      pd->height, tx_size, mode, angle_delta, use_palette, filter_intra_mode,
+      dst, dst_stride, dst, dst_stride, blk_col, blk_row, plane);
 }
 
 void av1_init_intra_predictors(void) {
diff --git a/av1/common/reconintra.h b/av1/common/reconintra.h
index 907db5d..fa66ccd 100644
--- a/av1/common/reconintra.h
+++ b/av1/common/reconintra.h
@@ -26,11 +26,14 @@
 void av1_predict_intra_block_facade(const AV1_COMMON *cm, MACROBLOCKD *xd,
                                     int plane, int blk_col, int blk_row,
                                     TX_SIZE tx_size);
-void av1_predict_intra_block(
-    const AV1_COMMON *cm, const MACROBLOCKD *xd, int wpx, int hpx,
-    TX_SIZE tx_size, PREDICTION_MODE mode, int angle_delta, int use_palette,
-    FILTER_INTRA_MODE filter_intra_mode, const uint8_t *ref, int ref_stride,
-    uint8_t *dst, int dst_stride, int col_off, int row_off, int plane);
+void av1_predict_intra_block(const MACROBLOCKD *xd, BLOCK_SIZE sb_size,
+                             int enable_intra_edge_filter, int wpx, int hpx,
+                             TX_SIZE tx_size, PREDICTION_MODE mode,
+                             int angle_delta, int use_palette,
+                             FILTER_INTRA_MODE filter_intra_mode,
+                             const uint8_t *ref, int ref_stride, uint8_t *dst,
+                             int dst_stride, int col_off, int row_off,
+                             int plane);
 
 // Mapping of interintra to intra mode for use in the intra component
 static const PREDICTION_MODE interintra_to_intra_mode[INTERINTRA_MODES] = {
@@ -64,7 +67,7 @@
 
 static INLINE int av1_filter_intra_allowed_bsize(const AV1_COMMON *const cm,
                                                  BLOCK_SIZE bs) {
-  if (!cm->seq_params.enable_filter_intra || bs == BLOCK_INVALID) return 0;
+  if (!cm->seq_params->enable_filter_intra || bs == BLOCK_INVALID) return 0;
 
   return block_size_wide[bs] <= 32 && block_size_high[bs] <= 32;
 }
diff --git a/av1/common/resize.c b/av1/common/resize.c
index 0cfb5a2..112a08a 100644
--- a/av1/common/resize.c
+++ b/av1/common/resize.c
@@ -1263,7 +1263,7 @@
                                 int src_stride, uint8_t *dst, int dst_stride,
                                 int plane, int rows) {
   const int is_uv = (plane > 0);
-  const int ss_x = is_uv && cm->seq_params.subsampling_x;
+  const int ss_x = is_uv && cm->seq_params->subsampling_x;
   const int downscaled_plane_width = ROUND_POWER_OF_TWO(cm->width, ss_x);
   const int upscaled_plane_width =
       ROUND_POWER_OF_TWO(cm->superres_upscaled_width, ss_x);
@@ -1305,11 +1305,11 @@
     const int pad_right = (j == cm->tiles.cols - 1);
 
 #if CONFIG_AV1_HIGHBITDEPTH
-    if (cm->seq_params.use_highbitdepth)
+    if (cm->seq_params->use_highbitdepth)
       highbd_upscale_normative_rect(src_ptr, rows, src_width, src_stride,
                                     dst_ptr, rows, dst_width, dst_stride,
                                     x_step_qn, x0_qn, pad_left, pad_right,
-                                    cm->seq_params.bit_depth);
+                                    cm->seq_params->bit_depth);
     else
       upscale_normative_rect(src_ptr, rows, src_width, src_stride, dst_ptr,
                              rows, dst_width, dst_stride, x_step_qn, x0_qn,
@@ -1354,18 +1354,18 @@
   if (scaling_required) {
     const int num_planes = av1_num_planes(cm);
 #if CONFIG_AV1_HIGHBITDEPTH
-    if (use_optimized_scaler && cm->seq_params.bit_depth == AOM_BITS_8) {
+    if (use_optimized_scaler && cm->seq_params->bit_depth == AOM_BITS_8) {
       av1_resize_and_extend_frame(unscaled, scaled, filter, phase, num_planes);
     } else {
       av1_resize_and_extend_frame_nonnormative(
-          unscaled, scaled, (int)cm->seq_params.bit_depth, num_planes);
+          unscaled, scaled, (int)cm->seq_params->bit_depth, num_planes);
     }
 #else
     if (use_optimized_scaler) {
       av1_resize_and_extend_frame(unscaled, scaled, filter, phase, num_planes);
     } else {
       av1_resize_and_extend_frame_nonnormative(
-          unscaled, scaled, (int)cm->seq_params.bit_depth, num_planes);
+          unscaled, scaled, (int)cm->seq_params->bit_depth, num_planes);
     }
 #endif
     return scaled;
@@ -1432,7 +1432,7 @@
 void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool) {
   const int num_planes = av1_num_planes(cm);
   if (!av1_superres_scaled(cm)) return;
-  const SequenceHeader *const seq_params = &cm->seq_params;
+  const SequenceHeader *const seq_params = cm->seq_params;
   const int byte_alignment = cm->features.byte_alignment;
 
   YV12_BUFFER_CONFIG copy_buffer;
@@ -1445,7 +1445,7 @@
           &copy_buffer, aligned_width, cm->height, seq_params->subsampling_x,
           seq_params->subsampling_y, seq_params->use_highbitdepth,
           AOM_BORDER_IN_PIXELS, byte_alignment))
-    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate copy buffer for superres upscaling");
 
   // Copy function assumes the frames are the same size.
@@ -1468,7 +1468,7 @@
     if (release_fb_cb(cb_priv, fb)) {
       unlock_buffer_pool(pool);
       aom_internal_error(
-          &cm->error, AOM_CODEC_MEM_ERROR,
+          cm->error, AOM_CODEC_MEM_ERROR,
           "Failed to free current frame buffer before superres upscaling");
     }
     // aom_realloc_frame_buffer() leaves config data for frame_to_show intact
@@ -1479,7 +1479,7 @@
             AOM_BORDER_IN_PIXELS, byte_alignment, fb, cb, cb_priv, 0)) {
       unlock_buffer_pool(pool);
       aom_internal_error(
-          &cm->error, AOM_CODEC_MEM_ERROR,
+          cm->error, AOM_CODEC_MEM_ERROR,
           "Failed to allocate current frame buffer for superres upscaling");
     }
     unlock_buffer_pool(pool);
@@ -1495,7 +1495,7 @@
             seq_params->subsampling_y, seq_params->use_highbitdepth,
             AOM_BORDER_IN_PIXELS, byte_alignment))
       aom_internal_error(
-          &cm->error, AOM_CODEC_MEM_ERROR,
+          cm->error, AOM_CODEC_MEM_ERROR,
           "Failed to reallocate current frame buffer for superres upscaling");
 
     // Restore config data back to frame_to_show
diff --git a/av1/common/restoration.c b/av1/common/restoration.c
index 41d0e22..202953c 100644
--- a/av1/common/restoration.c
+++ b/av1/common/restoration.c
@@ -42,8 +42,8 @@
 AV1PixelRect av1_whole_frame_rect(const AV1_COMMON *cm, int is_uv) {
   AV1PixelRect rect;
 
-  int ss_x = is_uv && cm->seq_params.subsampling_x;
-  int ss_y = is_uv && cm->seq_params.subsampling_y;
+  int ss_x = is_uv && cm->seq_params->subsampling_x;
+  int ss_y = is_uv && cm->seq_params->subsampling_y;
 
   rect.top = 0;
   rect.bottom = ROUND_POWER_OF_TWO(cm->height, ss_y);
@@ -1107,7 +1107,7 @@
                                             YV12_BUFFER_CONFIG *frame,
                                             AV1_COMMON *cm, int optimized_lr,
                                             int num_planes) {
-  const SequenceHeader *const seq_params = &cm->seq_params;
+  const SequenceHeader *const seq_params = cm->seq_params;
   const int bit_depth = seq_params->bit_depth;
   const int highbd = seq_params->use_highbitdepth;
   lr_ctxt->dst = &cm->rst_frame;
@@ -1118,7 +1118,7 @@
           lr_ctxt->dst, frame_width, frame_height, seq_params->subsampling_x,
           seq_params->subsampling_y, highbd, AOM_RESTORATION_FRAME_BORDER,
           cm->features.byte_alignment, NULL, NULL, NULL, 0) < 0)
-    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate restoration dst buffer");
 
   lr_ctxt->on_rest_unit = filter_frame_on_unit;
@@ -1299,7 +1299,7 @@
                                     int32_t *tmpbuf,
                                     RestorationLineBuffers *rlbs) {
   const int is_uv = plane > 0;
-  const int ss_y = is_uv && cm->seq_params.subsampling_y;
+  const int ss_y = is_uv && cm->seq_params->subsampling_y;
 
   const RestorationInfo *rsi = &cm->rst_info[plane];
 
@@ -1315,7 +1315,7 @@
                                        int *rrow1) {
   assert(rcol0 && rcol1 && rrow0 && rrow1);
 
-  if (bsize != cm->seq_params.sb_size) return 0;
+  if (bsize != cm->seq_params->sb_size) return 0;
   if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) return 0;
 
   assert(!cm->features.all_lossless);
@@ -1345,8 +1345,8 @@
   const int vert_units = av1_lr_count_units_in_tile(size, tile_h);
 
   // The size of an MI-unit on this plane of the image
-  const int ss_x = is_uv && cm->seq_params.subsampling_x;
-  const int ss_y = is_uv && cm->seq_params.subsampling_y;
+  const int ss_x = is_uv && cm->seq_params->subsampling_x;
+  const int ss_y = is_uv && cm->seq_params->subsampling_y;
   const int mi_size_x = MI_SIZE >> ss_x;
   const int mi_size_y = MI_SIZE >> ss_y;
 
@@ -1427,7 +1427,7 @@
   int upscaled_width;
   int line_bytes;
   if (av1_superres_scaled(cm)) {
-    const int ss_x = is_uv && cm->seq_params.subsampling_x;
+    const int ss_x = is_uv && cm->seq_params->subsampling_x;
     upscaled_width = (cm->superres_upscaled_width + ss_x) >> ss_x;
     line_bytes = upscaled_width << use_highbd;
     if (use_highbd)
@@ -1474,7 +1474,7 @@
   // At the point where this function is called, we've already applied
   // superres. So we don't need to extend the lines here, we can just
   // pull directly from the topmost row of the upscaled frame.
-  const int ss_x = is_uv && cm->seq_params.subsampling_x;
+  const int ss_x = is_uv && cm->seq_params->subsampling_x;
   const int upscaled_width = av1_superres_scaled(cm)
                                  ? (cm->superres_upscaled_width + ss_x) >> ss_x
                                  : src_width;
@@ -1494,7 +1494,7 @@
                                          int use_highbd, int plane,
                                          AV1_COMMON *cm, int after_cdef) {
   const int is_uv = plane > 0;
-  const int ss_y = is_uv && cm->seq_params.subsampling_y;
+  const int ss_y = is_uv && cm->seq_params->subsampling_y;
   const int stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
   const int stripe_off = RESTORATION_UNIT_OFFSET >> ss_y;
 
@@ -1559,7 +1559,7 @@
 void av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG *frame,
                                               AV1_COMMON *cm, int after_cdef) {
   const int num_planes = av1_num_planes(cm);
-  const int use_highbd = cm->seq_params.use_highbitdepth;
+  const int use_highbd = cm->seq_params->use_highbitdepth;
   for (int p = 0; p < num_planes; ++p) {
     save_tile_row_boundary_lines(frame, use_highbd, p, cm, after_cdef);
   }
diff --git a/av1/common/thread_common.c b/av1/common/thread_common.c
index 638dc4c..518a15a 100644
--- a/av1/common/thread_common.c
+++ b/av1/common/thread_common.c
@@ -54,8 +54,8 @@
 #endif
 
 // Allocate memory for lf row synchronization
-static void loop_filter_alloc(AV1LfSync *lf_sync, AV1_COMMON *cm, int rows,
-                              int width, int num_workers) {
+void av1_loop_filter_alloc(AV1LfSync *lf_sync, AV1_COMMON *cm, int rows,
+                           int width, int num_workers) {
   lf_sync->rows = rows;
 #if CONFIG_MULTITHREAD
   {
@@ -152,6 +152,61 @@
   }
 }
 
+void av1_alloc_cdef_sync(AV1_COMMON *const cm, AV1CdefSync *cdef_sync,
+                         int num_workers) {
+  if (num_workers < 1) return;
+#if CONFIG_MULTITHREAD
+  if (cdef_sync->mutex_ == NULL) {
+    CHECK_MEM_ERROR(cm, cdef_sync->mutex_,
+                    aom_malloc(sizeof(*(cdef_sync->mutex_))));
+    if (cdef_sync->mutex_) pthread_mutex_init(cdef_sync->mutex_, NULL);
+  }
+#else
+  (void)cm;
+  (void)cdef_sync;
+#endif  // CONFIG_MULTITHREAD
+}
+
+void av1_free_cdef_sync(AV1CdefSync *cdef_sync) {
+  if (cdef_sync == NULL) return;
+#if CONFIG_MULTITHREAD
+  if (cdef_sync->mutex_ != NULL) {
+    pthread_mutex_destroy(cdef_sync->mutex_);
+    aom_free(cdef_sync->mutex_);
+  }
+#endif  // CONFIG_MULTITHREAD
+}
+
+static INLINE void cdef_row_mt_sync_read(AV1CdefSync *const cdef_sync,
+                                         int row) {
+  if (!row) return;
+#if CONFIG_MULTITHREAD
+  AV1CdefRowSync *const cdef_row_mt = cdef_sync->cdef_row_mt;
+  pthread_mutex_lock(cdef_row_mt[row - 1].row_mutex_);
+  while (cdef_row_mt[row - 1].is_row_done != 1)
+    pthread_cond_wait(cdef_row_mt[row - 1].row_cond_,
+                      cdef_row_mt[row - 1].row_mutex_);
+  cdef_row_mt[row - 1].is_row_done = 0;
+  pthread_mutex_unlock(cdef_row_mt[row - 1].row_mutex_);
+#else
+  (void)cdef_sync;
+#endif  // CONFIG_MULTITHREAD
+}
+
+static INLINE void cdef_row_mt_sync_write(AV1CdefSync *const cdef_sync,
+                                          int row) {
+#if CONFIG_MULTITHREAD
+  AV1CdefRowSync *const cdef_row_mt = cdef_sync->cdef_row_mt;
+  pthread_mutex_lock(cdef_row_mt[row].row_mutex_);
+  pthread_cond_signal(cdef_row_mt[row].row_cond_);
+  cdef_row_mt[row].is_row_done = 1;
+  pthread_mutex_unlock(cdef_row_mt[row].row_mutex_);
+#else
+  (void)cdef_sync;
+  (void)row;
+#endif  // CONFIG_MULTITHREAD
+}
+
 static INLINE void sync_read(AV1LfSync *const lf_sync, int r, int c,
                              int plane) {
 #if CONFIG_MULTITHREAD
@@ -206,38 +261,24 @@
 #endif  // CONFIG_MULTITHREAD
 }
 
-static void enqueue_lf_jobs(AV1LfSync *lf_sync, AV1_COMMON *cm, int start,
-                            int stop,
-#if CONFIG_LPF_MASK
-                            int is_decoding,
-#endif
-                            int plane_start, int plane_end) {
+static void enqueue_lf_jobs(AV1LfSync *lf_sync, int start, int stop,
+                            const int planes_to_lf[3], int is_realtime) {
   int mi_row, plane, dir;
   AV1LfMTInfo *lf_job_queue = lf_sync->job_queue;
   lf_sync->jobs_enqueued = 0;
   lf_sync->jobs_dequeued = 0;
 
-  for (dir = 0; dir < 2; dir++) {
-    for (plane = plane_start; plane < plane_end; plane++) {
-      if (plane == 0 && !(cm->lf.filter_level[0]) && !(cm->lf.filter_level[1]))
-        break;
-      else if (plane == 1 && !(cm->lf.filter_level_u))
-        continue;
-      else if (plane == 2 && !(cm->lf.filter_level_v))
-        continue;
-#if CONFIG_LPF_MASK
-      int step = MAX_MIB_SIZE;
-      if (is_decoding) {
-        step = MI_SIZE_64X64;
-      }
-      for (mi_row = start; mi_row < stop; mi_row += step)
-#else
-      for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE)
-#endif
-      {
+  // Launch all vertical jobs first, as they are blocking the horizontal ones.
+  // Launch top row jobs for all planes first, in case the output can be
+  // partially reconstructed row by row.
+  for (dir = 0; dir < 2; ++dir) {
+    for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
+      for (plane = 0; plane < 3; ++plane) {
+        if (!planes_to_lf[plane]) continue;
         lf_job_queue->mi_row = mi_row;
         lf_job_queue->plane = plane;
         lf_job_queue->dir = dir;
+        lf_job_queue->is_realtime = is_realtime;
         lf_job_queue++;
         lf_sync->jobs_enqueued++;
       }
@@ -264,59 +305,67 @@
   return cur_job_info;
 }
 
-// Implement row loopfiltering for each thread.
+// One job of row loopfiltering.
 static INLINE void thread_loop_filter_rows(
     const YV12_BUFFER_CONFIG *const frame_buffer, AV1_COMMON *const cm,
-    struct macroblockd_plane *planes, MACROBLOCKD *xd,
-    AV1LfSync *const lf_sync) {
+    struct macroblockd_plane *planes, MACROBLOCKD *xd, int mi_row, int plane,
+    int dir, int is_realtime, AV1LfSync *const lf_sync) {
   const int sb_cols =
       ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols, MAX_MIB_SIZE_LOG2) >>
       MAX_MIB_SIZE_LOG2;
-  int mi_row, mi_col, plane, dir;
-  int r, c;
+  const int r = mi_row >> MAX_MIB_SIZE_LOG2;
+  int mi_col, c;
 
-  while (1) {
-    AV1LfMTInfo *cur_job_info = get_lf_job_info(lf_sync);
+  if (dir == 0) {
+    for (mi_col = 0; mi_col < cm->mi_params.mi_cols; mi_col += MAX_MIB_SIZE) {
+      c = mi_col >> MAX_MIB_SIZE_LOG2;
 
-    if (cur_job_info != NULL) {
-      mi_row = cur_job_info->mi_row;
-      plane = cur_job_info->plane;
-      dir = cur_job_info->dir;
-      r = mi_row >> MAX_MIB_SIZE_LOG2;
+      av1_setup_dst_planes(planes, cm->seq_params->sb_size, frame_buffer,
+                           mi_row, mi_col, plane, plane + 1);
+#if CONFIG_AV1_HIGHBITDEPTH
+      (void)is_realtime;
+      av1_filter_block_plane_vert(cm, xd, plane, &planes[plane], mi_row,
+                                  mi_col);
+#else
+      if (is_realtime) {
+        av1_filter_block_plane_vert_rt(cm, xd, plane, &planes[plane], mi_row,
+                                       mi_col);
 
-      if (dir == 0) {
-        for (mi_col = 0; mi_col < cm->mi_params.mi_cols;
-             mi_col += MAX_MIB_SIZE) {
-          c = mi_col >> MAX_MIB_SIZE_LOG2;
-
-          av1_setup_dst_planes(planes, cm->seq_params.sb_size, frame_buffer,
-                               mi_row, mi_col, plane, plane + 1);
-
-          av1_filter_block_plane_vert(cm, xd, plane, &planes[plane], mi_row,
-                                      mi_col);
-          sync_write(lf_sync, r, c, sb_cols, plane);
-        }
-      } else if (dir == 1) {
-        for (mi_col = 0; mi_col < cm->mi_params.mi_cols;
-             mi_col += MAX_MIB_SIZE) {
-          c = mi_col >> MAX_MIB_SIZE_LOG2;
-
-          // Wait for vertical edge filtering of the top-right block to be
-          // completed
-          sync_read(lf_sync, r, c, plane);
-
-          // Wait for vertical edge filtering of the right block to be
-          // completed
-          sync_read(lf_sync, r + 1, c, plane);
-
-          av1_setup_dst_planes(planes, cm->seq_params.sb_size, frame_buffer,
-                               mi_row, mi_col, plane, plane + 1);
-          av1_filter_block_plane_horz(cm, xd, plane, &planes[plane], mi_row,
-                                      mi_col);
-        }
+      } else {
+        av1_filter_block_plane_vert(cm, xd, plane, &planes[plane], mi_row,
+                                    mi_col);
       }
-    } else {
-      break;
+#endif
+      if (lf_sync != NULL) sync_write(lf_sync, r, c, sb_cols, plane);
+    }
+  } else if (dir == 1) {
+    for (mi_col = 0; mi_col < cm->mi_params.mi_cols; mi_col += MAX_MIB_SIZE) {
+      c = mi_col >> MAX_MIB_SIZE_LOG2;
+
+      if (lf_sync != NULL) {
+        // Wait for vertical edge filtering of the top-right block to be
+        // completed
+        sync_read(lf_sync, r, c, plane);
+
+        // Wait for vertical edge filtering of the right block to be completed
+        sync_read(lf_sync, r + 1, c, plane);
+      }
+
+      av1_setup_dst_planes(planes, cm->seq_params->sb_size, frame_buffer,
+                           mi_row, mi_col, plane, plane + 1);
+#if CONFIG_AV1_HIGHBITDEPTH
+      (void)is_realtime;
+      av1_filter_block_plane_horz(cm, xd, plane, &planes[plane], mi_row,
+                                  mi_col);
+#else
+      if (is_realtime) {
+        av1_filter_block_plane_horz_rt(cm, xd, plane, &planes[plane], mi_row,
+                                       mi_col);
+      } else {
+        av1_filter_block_plane_horz(cm, xd, plane, &planes[plane], mi_row,
+                                    mi_col);
+      }
+#endif
     }
   }
 }
@@ -325,110 +374,33 @@
 static int loop_filter_row_worker(void *arg1, void *arg2) {
   AV1LfSync *const lf_sync = (AV1LfSync *)arg1;
   LFWorkerData *const lf_data = (LFWorkerData *)arg2;
-  thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
-                          lf_data->xd, lf_sync);
-  return 1;
-}
-
-#if CONFIG_LPF_MASK
-static INLINE void thread_loop_filter_bitmask_rows(
-    const YV12_BUFFER_CONFIG *const frame_buffer, AV1_COMMON *const cm,
-    struct macroblockd_plane *planes, MACROBLOCKD *xd,
-    AV1LfSync *const lf_sync) {
-  const int sb_cols =
-      ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols, MIN_MIB_SIZE_LOG2) >>
-      MIN_MIB_SIZE_LOG2;
-  int mi_row, mi_col, plane, dir;
-  int r, c;
-  (void)xd;
-
-  while (1) {
-    AV1LfMTInfo *cur_job_info = get_lf_job_info(lf_sync);
-
-    if (cur_job_info != NULL) {
-      mi_row = cur_job_info->mi_row;
-      plane = cur_job_info->plane;
-      dir = cur_job_info->dir;
-      r = mi_row >> MIN_MIB_SIZE_LOG2;
-
-      if (dir == 0) {
-        for (mi_col = 0; mi_col < cm->mi_params.mi_cols;
-             mi_col += MI_SIZE_64X64) {
-          c = mi_col >> MIN_MIB_SIZE_LOG2;
-
-          av1_setup_dst_planes(planes, BLOCK_64X64, frame_buffer, mi_row,
-                               mi_col, plane, plane + 1);
-
-          av1_filter_block_plane_bitmask_vert(cm, &planes[plane], plane, mi_row,
-                                              mi_col);
-          sync_write(lf_sync, r, c, sb_cols, plane);
-        }
-      } else if (dir == 1) {
-        for (mi_col = 0; mi_col < cm->mi_params.mi_cols;
-             mi_col += MI_SIZE_64X64) {
-          c = mi_col >> MIN_MIB_SIZE_LOG2;
-
-          // Wait for vertical edge filtering of the top-right block to be
-          // completed
-          sync_read(lf_sync, r, c, plane);
-
-          // Wait for vertical edge filtering of the right block to be
-          // completed
-          sync_read(lf_sync, r + 1, c, plane);
-
-          av1_setup_dst_planes(planes, BLOCK_64X64, frame_buffer, mi_row,
-                               mi_col, plane, plane + 1);
-          av1_filter_block_plane_bitmask_horz(cm, &planes[plane], plane, mi_row,
-                                              mi_col);
-        }
-      }
-    } else {
-      break;
-    }
+  AV1LfMTInfo *cur_job_info;
+  while ((cur_job_info = get_lf_job_info(lf_sync)) != NULL) {
+    const int is_realtime = cur_job_info->is_realtime && !cur_job_info->plane;
+    thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
+                            lf_data->xd, cur_job_info->mi_row,
+                            cur_job_info->plane, cur_job_info->dir, is_realtime,
+                            lf_sync);
   }
-}
-
-// Row-based multi-threaded loopfilter hook
-static int loop_filter_bitmask_row_worker(void *arg1, void *arg2) {
-  AV1LfSync *const lf_sync = (AV1LfSync *)arg1;
-  LFWorkerData *const lf_data = (LFWorkerData *)arg2;
-  thread_loop_filter_bitmask_rows(lf_data->frame_buffer, lf_data->cm,
-                                  lf_data->planes, lf_data->xd, lf_sync);
   return 1;
 }
-#endif  // CONFIG_LPF_MASK
 
 static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
                                 MACROBLOCKD *xd, int start, int stop,
-                                int plane_start, int plane_end,
-#if CONFIG_LPF_MASK
-                                int is_decoding,
-#endif
-                                AVxWorker *workers, int nworkers,
-                                AV1LfSync *lf_sync) {
+                                const int planes_to_lf[3], AVxWorker *workers,
+                                int num_workers, AV1LfSync *lf_sync,
+                                int is_realtime) {
   const AVxWorkerInterface *const winterface = aom_get_worker_interface();
-#if CONFIG_LPF_MASK
-  int sb_rows;
-  if (is_decoding) {
-    sb_rows = ALIGN_POWER_OF_TWO(cm->mi_params.mi_rows, MIN_MIB_SIZE_LOG2) >>
-              MIN_MIB_SIZE_LOG2;
-  } else {
-    sb_rows = ALIGN_POWER_OF_TWO(cm->mi_params.mi_rows, MAX_MIB_SIZE_LOG2) >>
-              MAX_MIB_SIZE_LOG2;
-  }
-#else
   // Number of superblock rows and cols
   const int sb_rows =
       ALIGN_POWER_OF_TWO(cm->mi_params.mi_rows, MAX_MIB_SIZE_LOG2) >>
       MAX_MIB_SIZE_LOG2;
-#endif
-  const int num_workers = nworkers;
   int i;
 
   if (!lf_sync->sync_range || sb_rows != lf_sync->rows ||
       num_workers > lf_sync->num_workers) {
     av1_loop_filter_dealloc(lf_sync);
-    loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers);
+    av1_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers);
   }
 
   // Initialize cur_sb_col to -1 for all SB rows.
@@ -437,26 +409,14 @@
            sizeof(*(lf_sync->cur_sb_col[i])) * sb_rows);
   }
 
-  enqueue_lf_jobs(lf_sync, cm, start, stop,
-#if CONFIG_LPF_MASK
-                  is_decoding,
-#endif
-                  plane_start, plane_end);
+  enqueue_lf_jobs(lf_sync, start, stop, planes_to_lf, is_realtime);
 
   // Set up loopfilter thread data.
   for (i = num_workers - 1; i >= 0; --i) {
     AVxWorker *const worker = &workers[i];
     LFWorkerData *const lf_data = &lf_sync->lfdata[i];
 
-#if CONFIG_LPF_MASK
-    if (is_decoding) {
-      worker->hook = loop_filter_bitmask_row_worker;
-    } else {
-      worker->hook = loop_filter_row_worker;
-    }
-#else
     worker->hook = loop_filter_row_worker;
-#endif
     worker->data1 = lf_sync;
     worker->data2 = lf_data;
 
@@ -472,20 +432,45 @@
   }
 
   // Wait till all rows are finished
-  for (i = 0; i < num_workers; ++i) {
+  for (i = 1; i < num_workers; ++i) {
     winterface->sync(&workers[i]);
   }
 }
 
+static void loop_filter_rows(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
+                             MACROBLOCKD *xd, int start, int stop,
+                             const int planes_to_lf[3], int is_realtime) {
+  // Filter top rows of all planes first, in case the output can be partially
+  // reconstructed row by row.
+  int mi_row, plane, dir;
+  for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
+    for (plane = 0; plane < 3; ++plane) {
+      if (!planes_to_lf[plane]) continue;
+      for (dir = 0; dir < 2; ++dir) {
+        thread_loop_filter_rows(frame, cm, xd->plane, xd, mi_row, plane, dir,
+                                is_realtime && !plane, /*lf_sync=*/NULL);
+      }
+    }
+  }
+}
+
 void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
                               MACROBLOCKD *xd, int plane_start, int plane_end,
-                              int partial_frame,
-#if CONFIG_LPF_MASK
-                              int is_decoding,
-#endif
-                              AVxWorker *workers, int num_workers,
-                              AV1LfSync *lf_sync) {
+                              int partial_frame, AVxWorker *workers,
+                              int num_workers, AV1LfSync *lf_sync,
+                              int is_realtime) {
   int start_mi_row, end_mi_row, mi_rows_to_filter;
+  int planes_to_lf[3];
+
+  // For each luma and chroma plane, whether to filter it or not.
+  planes_to_lf[0] = (cm->lf.filter_level[0] || cm->lf.filter_level[1]) &&
+                    plane_start <= 0 && 0 < plane_end;
+  planes_to_lf[1] = cm->lf.filter_level_u && plane_start <= 1 && 1 < plane_end;
+  planes_to_lf[2] = cm->lf.filter_level_v && plane_start <= 2 && 2 < plane_end;
+  // If the luma plane is purposely not filtered, neither are the chroma planes.
+  if (!planes_to_lf[0] && plane_start <= 0 && 0 < plane_end) return;
+  // Early exit.
+  if (!planes_to_lf[0] && !planes_to_lf[1] && !planes_to_lf[2]) return;
 
   start_mi_row = 0;
   mi_rows_to_filter = cm->mi_params.mi_rows;
@@ -497,37 +482,15 @@
   end_mi_row = start_mi_row + mi_rows_to_filter;
   av1_loop_filter_frame_init(cm, plane_start, plane_end);
 
-#if CONFIG_LPF_MASK
-  if (is_decoding) {
-    cm->is_decoding = is_decoding;
-    // TODO(chengchen): currently use one thread to build bitmasks for the
-    // frame. Make it support multi-thread later.
-    for (int plane = plane_start; plane < plane_end; plane++) {
-      if (plane == 0 && !(cm->lf.filter_level[0]) && !(cm->lf.filter_level[1]))
-        break;
-      else if (plane == 1 && !(cm->lf.filter_level_u))
-        continue;
-      else if (plane == 2 && !(cm->lf.filter_level_v))
-        continue;
-
-      // TODO(chengchen): can we remove this?
-      struct macroblockd_plane *pd = xd->plane;
-      av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame, 0, 0, plane,
-                           plane + 1);
-
-      av1_build_bitmask_vert_info(cm, &pd[plane], plane);
-      av1_build_bitmask_horz_info(cm, &pd[plane], plane);
-    }
-    loop_filter_rows_mt(frame, cm, xd, start_mi_row, end_mi_row, plane_start,
-                        plane_end, 1, workers, num_workers, lf_sync);
+  if (num_workers > 1) {
+    // Enqueue and execute loopfiltering jobs.
+    loop_filter_rows_mt(frame, cm, xd, start_mi_row, end_mi_row, planes_to_lf,
+                        workers, num_workers, lf_sync, is_realtime);
   } else {
-    loop_filter_rows_mt(frame, cm, xd, start_mi_row, end_mi_row, plane_start,
-                        plane_end, 0, workers, num_workers, lf_sync);
+    // Directly filter in the main thread.
+    loop_filter_rows(frame, cm, xd, start_mi_row, end_mi_row, planes_to_lf,
+                     is_realtime);
   }
-#else
-  loop_filter_rows_mt(frame, cm, xd, start_mi_row, end_mi_row, plane_start,
-                      plane_end, workers, num_workers, lf_sync);
-#endif
 }
 
 #if !CONFIG_REALTIME_ONLY
@@ -587,9 +550,9 @@
 }
 
 // Allocate memory for loop restoration row synchronization
-static void loop_restoration_alloc(AV1LrSync *lr_sync, AV1_COMMON *cm,
-                                   int num_workers, int num_rows_lr,
-                                   int num_planes, int width) {
+void av1_loop_restoration_alloc(AV1LrSync *lr_sync, AV1_COMMON *cm,
+                                int num_workers, int num_rows_lr,
+                                int num_planes, int width) {
   lr_sync->rows = num_rows_lr;
   lr_sync->num_planes = num_planes;
 #if CONFIG_MULTITHREAD
@@ -720,7 +683,7 @@
   for (int plane = 0; plane < num_planes; plane++) {
     if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
     const int is_uv = plane > 0;
-    const int ss_y = is_uv && cm->seq_params.subsampling_y;
+    const int ss_y = is_uv && cm->seq_params->subsampling_y;
 
     AV1PixelRect tile_rect = ctxt[plane].tile_rect;
     const int unit_size = ctxt[plane].rsi->restoration_unit_size;
@@ -878,11 +841,11 @@
   int i;
   assert(MAX_MB_PLANE == 3);
 
-  if (!lr_sync->sync_range || num_rows_lr != lr_sync->rows ||
-      num_workers > lr_sync->num_workers || num_planes != lr_sync->num_planes) {
+  if (!lr_sync->sync_range || num_rows_lr > lr_sync->rows ||
+      num_workers > lr_sync->num_workers || num_planes > lr_sync->num_planes) {
     av1_loop_restoration_dealloc(lr_sync, num_workers);
-    loop_restoration_alloc(lr_sync, cm, num_workers, num_rows_lr, num_planes,
-                           cm->width);
+    av1_loop_restoration_alloc(lr_sync, cm, num_workers, num_rows_lr,
+                               num_planes, cm->width);
   }
 
   // Initialize cur_sb_col to -1 for all SB rows.
@@ -910,7 +873,7 @@
   }
 
   // Wait till all rows are finished
-  for (i = 0; i < num_workers; ++i) {
+  for (i = 1; i < num_workers; ++i) {
     winterface->sync(&workers[i]);
   }
 }
@@ -932,3 +895,198 @@
                                  cm);
 }
 #endif
+
+// Initializes cdef_sync parameters.
+static AOM_INLINE void reset_cdef_job_info(AV1CdefSync *const cdef_sync) {
+  cdef_sync->end_of_frame = 0;
+  cdef_sync->fbr = 0;
+  cdef_sync->fbc = 0;
+}
+
+static AOM_INLINE void launch_cdef_workers(AVxWorker *const workers,
+                                           int num_workers) {
+  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+  for (int i = num_workers - 1; i >= 0; i--) {
+    AVxWorker *const worker = &workers[i];
+    if (i == 0)
+      winterface->execute(worker);
+    else
+      winterface->launch(worker);
+  }
+}
+
+static AOM_INLINE void sync_cdef_workers(AVxWorker *const workers,
+                                         AV1_COMMON *const cm,
+                                         int num_workers) {
+  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+  int had_error = 0;
+
+  // Wait for completion of Cdef frame.
+  for (int i = num_workers - 1; i > 0; i--) {
+    AVxWorker *const worker = &workers[i];
+    had_error |= !winterface->sync(worker);
+  }
+  if (had_error)
+    aom_internal_error(cm->error, AOM_CODEC_ERROR,
+                       "Failed to process cdef frame");
+}
+
+// Updates the row index of the next job to be processed.
+// Also updates end_of_frame flag when the processing of all rows is complete.
+static void update_cdef_row_next_job_info(AV1CdefSync *const cdef_sync,
+                                          const int nvfb) {
+  cdef_sync->fbr++;
+  if (cdef_sync->fbr == nvfb) {
+    cdef_sync->end_of_frame = 1;
+  }
+}
+
+// Checks if a job is available. If job is available,
+// populates next job information and returns 1, else returns 0.
+static AOM_INLINE int get_cdef_row_next_job(AV1CdefSync *const cdef_sync,
+                                            int *cur_fbr, const int nvfb) {
+#if CONFIG_MULTITHREAD
+  pthread_mutex_lock(cdef_sync->mutex_);
+#endif  // CONFIG_MULTITHREAD
+  int do_next_row = 0;
+  // Populates information needed for current job and update the row
+  // index of the next row to be processed.
+  if (cdef_sync->end_of_frame == 0) {
+    do_next_row = 1;
+    *cur_fbr = cdef_sync->fbr;
+    update_cdef_row_next_job_info(cdef_sync, nvfb);
+  }
+#if CONFIG_MULTITHREAD
+  pthread_mutex_unlock(cdef_sync->mutex_);
+#endif  // CONFIG_MULTITHREAD
+  return do_next_row;
+}
+
+// Hook function for each thread in CDEF multi-threading.
+static int cdef_sb_row_worker_hook(void *arg1, void *arg2) {
+  AV1CdefSync *const cdef_sync = (AV1CdefSync *)arg1;
+  AV1CdefWorkerData *const cdef_worker = (AV1CdefWorkerData *)arg2;
+  const int nvfb =
+      (cdef_worker->cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+  int cur_fbr;
+  while (get_cdef_row_next_job(cdef_sync, &cur_fbr, nvfb)) {
+    av1_cdef_fb_row(cdef_worker->cm, cdef_worker->xd, cdef_worker->linebuf,
+                    cdef_worker->colbuf, cdef_worker->srcbuf, cur_fbr,
+                    cdef_worker->cdef_init_fb_row_fn, cdef_sync);
+  }
+  return 1;
+}
+
+// Assigns CDEF hook function and thread data to each worker.
+static void prepare_cdef_frame_workers(
+    AV1_COMMON *const cm, MACROBLOCKD *xd, AV1CdefWorkerData *const cdef_worker,
+    AVxWorkerHook hook, AVxWorker *const workers, AV1CdefSync *const cdef_sync,
+    int num_workers, cdef_init_fb_row_t cdef_init_fb_row_fn) {
+  const int num_planes = av1_num_planes(cm);
+
+  cdef_worker[0].srcbuf = cm->cdef_info.srcbuf;
+  for (int plane = 0; plane < num_planes; plane++)
+    cdef_worker[0].colbuf[plane] = cm->cdef_info.colbuf[plane];
+  for (int i = num_workers - 1; i >= 0; i--) {
+    AVxWorker *const worker = &workers[i];
+    cdef_worker[i].cm = cm;
+    cdef_worker[i].xd = xd;
+    cdef_worker[i].cdef_init_fb_row_fn = cdef_init_fb_row_fn;
+    for (int plane = 0; plane < num_planes; plane++)
+      cdef_worker[i].linebuf[plane] = cm->cdef_info.linebuf[plane];
+
+    worker->hook = hook;
+    worker->data1 = cdef_sync;
+    worker->data2 = &cdef_worker[i];
+  }
+}
+
+// Initializes row-level parameters for CDEF frame.
+void av1_cdef_init_fb_row_mt(const AV1_COMMON *const cm,
+                             const MACROBLOCKD *const xd,
+                             CdefBlockInfo *const fb_info,
+                             uint16_t **const linebuf, uint16_t *const src,
+                             struct AV1CdefSyncData *const cdef_sync, int fbr) {
+  const int num_planes = av1_num_planes(cm);
+  const int nvfb = (cm->mi_params.mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+  const int luma_stride =
+      ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols << MI_SIZE_LOG2, 4);
+
+  // for the current filter block, it's top left corner mi structure (mi_tl)
+  // is first accessed to check whether the top and left boundaries are
+  // frame boundaries. Then bottom-left and top-right mi structures are
+  // accessed to check whether the bottom and right boundaries
+  // (respectively) are frame boundaries.
+  //
+  // Note that we can't just check the bottom-right mi structure - eg. if
+  // we're at the right-hand edge of the frame but not the bottom, then
+  // the bottom-right mi is NULL but the bottom-left is not.
+  fb_info->frame_boundary[TOP] = (MI_SIZE_64X64 * fbr == 0) ? 1 : 0;
+  if (fbr != nvfb - 1)
+    fb_info->frame_boundary[BOTTOM] =
+        (MI_SIZE_64X64 * (fbr + 1) == cm->mi_params.mi_rows) ? 1 : 0;
+  else
+    fb_info->frame_boundary[BOTTOM] = 1;
+
+  fb_info->src = src;
+  fb_info->damping = cm->cdef_info.cdef_damping;
+  fb_info->coeff_shift = AOMMAX(cm->seq_params->bit_depth - 8, 0);
+  av1_zero(fb_info->dir);
+  av1_zero(fb_info->var);
+
+  for (int plane = 0; plane < num_planes; plane++) {
+    const int stride = luma_stride >> xd->plane[plane].subsampling_x;
+    uint16_t *top_linebuf = &linebuf[plane][0];
+    uint16_t *bot_linebuf = &linebuf[plane][nvfb * CDEF_VBORDER * stride];
+    {
+      const int mi_high_l2 = MI_SIZE_LOG2 - xd->plane[plane].subsampling_y;
+      const int top_offset = MI_SIZE_64X64 * (fbr + 1) << mi_high_l2;
+      const int bot_offset = MI_SIZE_64X64 * (fbr + 1) << mi_high_l2;
+
+      if (fbr != nvfb - 1)  // if (fbr != 0)  // top line buffer copy
+        av1_cdef_copy_sb8_16(
+            cm, &top_linebuf[(fbr + 1) * CDEF_VBORDER * stride], stride,
+            xd->plane[plane].dst.buf, top_offset - CDEF_VBORDER, 0,
+            xd->plane[plane].dst.stride, CDEF_VBORDER, stride);
+      if (fbr != nvfb - 1)  // bottom line buffer copy
+        av1_cdef_copy_sb8_16(cm, &bot_linebuf[fbr * CDEF_VBORDER * stride],
+                             stride, xd->plane[plane].dst.buf, bot_offset, 0,
+                             xd->plane[plane].dst.stride, CDEF_VBORDER, stride);
+    }
+
+    fb_info->top_linebuf[plane] = &linebuf[plane][fbr * CDEF_VBORDER * stride];
+    fb_info->bot_linebuf[plane] =
+        &linebuf[plane]
+                [nvfb * CDEF_VBORDER * stride + (fbr * CDEF_VBORDER * stride)];
+  }
+
+  cdef_row_mt_sync_write(cdef_sync, fbr);
+  cdef_row_mt_sync_read(cdef_sync, fbr);
+}
+
+// Implements multi-threading for CDEF.
+// Perform CDEF on input frame.
+// Inputs:
+//   frame: Pointer to input frame buffer.
+//   cm: Pointer to common structure.
+//   xd: Pointer to common current coding block structure.
+// Returns:
+//   Nothing will be returned.
+void av1_cdef_frame_mt(AV1_COMMON *const cm, MACROBLOCKD *const xd,
+                       AV1CdefWorkerData *const cdef_worker,
+                       AVxWorker *const workers, AV1CdefSync *const cdef_sync,
+                       int num_workers,
+                       cdef_init_fb_row_t cdef_init_fb_row_fn) {
+  YV12_BUFFER_CONFIG *frame = &cm->cur_frame->buf;
+  const int num_planes = av1_num_planes(cm);
+
+  av1_setup_dst_planes(xd->plane, cm->seq_params->sb_size, frame, 0, 0, 0,
+                       num_planes);
+
+  reset_cdef_job_info(cdef_sync);
+  prepare_cdef_frame_workers(cm, xd, cdef_worker, cdef_sb_row_worker_hook,
+                             workers, cdef_sync, num_workers,
+                             cdef_init_fb_row_fn);
+  launch_cdef_workers(workers, num_workers);
+  sync_cdef_workers(workers, cm, num_workers);
+}
diff --git a/av1/common/thread_common.h b/av1/common/thread_common.h
index 97b8abc..ab6e4df 100644
--- a/av1/common/thread_common.h
+++ b/av1/common/thread_common.h
@@ -15,6 +15,7 @@
 #include "config/aom_config.h"
 
 #include "av1/common/av1_loopfilter.h"
+#include "av1/common/cdef.h"
 #include "aom_util/aom_thread.h"
 
 #ifdef __cplusplus
@@ -27,6 +28,7 @@
   int mi_row;
   int plane;
   int dir;
+  int is_realtime;
 } AV1LfMTInfo;
 
 // Loopfilter row synchronization
@@ -97,17 +99,65 @@
   int jobs_dequeued;
 } AV1LrSync;
 
+typedef struct AV1CdefWorker {
+  AV1_COMMON *cm;
+  MACROBLOCKD *xd;
+  uint16_t *colbuf[MAX_MB_PLANE];
+  uint16_t *srcbuf;
+  uint16_t *linebuf[MAX_MB_PLANE];
+  cdef_init_fb_row_t cdef_init_fb_row_fn;
+} AV1CdefWorkerData;
+
+typedef struct AV1CdefRowSync {
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t *row_mutex_;
+  pthread_cond_t *row_cond_;
+#endif  // CONFIG_MULTITHREAD
+  int is_row_done;
+} AV1CdefRowSync;
+
+// Data related to CDEF search multi-thread synchronization.
+typedef struct AV1CdefSyncData {
+#if CONFIG_MULTITHREAD
+  // Mutex lock used while dispatching jobs.
+  pthread_mutex_t *mutex_;
+#endif  // CONFIG_MULTITHREAD
+  // Data related to CDEF row mt sync information
+  AV1CdefRowSync *cdef_row_mt;
+  // Flag to indicate all blocks are processed and end of frame is reached
+  int end_of_frame;
+  // Row index in units of 64x64 block
+  int fbr;
+  // Column index in units of 64x64 block
+  int fbc;
+} AV1CdefSync;
+
+void av1_cdef_frame_mt(AV1_COMMON *const cm, MACROBLOCKD *const xd,
+                       AV1CdefWorkerData *const cdef_worker,
+                       AVxWorker *const workers, AV1CdefSync *const cdef_sync,
+                       int num_workers, cdef_init_fb_row_t cdef_init_fb_row_fn);
+void av1_cdef_init_fb_row_mt(const AV1_COMMON *const cm,
+                             const MACROBLOCKD *const xd,
+                             CdefBlockInfo *const fb_info,
+                             uint16_t **const linebuf, uint16_t *const src,
+                             struct AV1CdefSyncData *const cdef_sync, int fbr);
+void av1_cdef_copy_sb8_16(const AV1_COMMON *const cm, uint16_t *const dst,
+                          int dstride, const uint8_t *src, int src_voffset,
+                          int src_hoffset, int sstride, int vsize, int hsize);
+void av1_alloc_cdef_sync(AV1_COMMON *const cm, AV1CdefSync *cdef_sync,
+                         int num_workers);
+void av1_free_cdef_sync(AV1CdefSync *cdef_sync);
+
 // Deallocate loopfilter synchronization related mutex and data.
 void av1_loop_filter_dealloc(AV1LfSync *lf_sync);
+void av1_loop_filter_alloc(AV1LfSync *lf_sync, AV1_COMMON *cm, int rows,
+                           int width, int num_workers);
 
 void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm,
                               struct macroblockd *xd, int plane_start,
                               int plane_end, int partial_frame,
-#if CONFIG_LPF_MASK
-                              int is_decoding,
-#endif
                               AVxWorker *workers, int num_workers,
-                              AV1LfSync *lf_sync);
+                              AV1LfSync *lf_sync, int is_realtime);
 
 #if !CONFIG_REALTIME_ONLY
 void av1_loop_restoration_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
@@ -116,6 +166,9 @@
                                           int num_workers, AV1LrSync *lr_sync,
                                           void *lr_ctxt);
 void av1_loop_restoration_dealloc(AV1LrSync *lr_sync, int num_workers);
+void av1_loop_restoration_alloc(AV1LrSync *lr_sync, AV1_COMMON *cm,
+                                int num_workers, int num_rows_lr,
+                                int num_planes, int width);
 #endif
 
 #ifdef __cplusplus
diff --git a/av1/common/tile_common.c b/av1/common/tile_common.c
index 1b11bd7..8f5d2a6 100644
--- a/av1/common/tile_common.c
+++ b/av1/common/tile_common.c
@@ -28,7 +28,7 @@
 }
 
 void av1_get_tile_limits(AV1_COMMON *const cm) {
-  const SequenceHeader *const seq_params = &cm->seq_params;
+  const SequenceHeader *const seq_params = cm->seq_params;
   CommonTileParams *const tiles = &cm->tiles;
   const int mi_cols =
       ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols, seq_params->mib_size_log2);
@@ -130,9 +130,9 @@
 void av1_tile_set_row(TileInfo *tile, const AV1_COMMON *cm, int row) {
   assert(row < cm->tiles.rows);
   int mi_row_start = cm->tiles.row_start_sb[row]
-                     << cm->seq_params.mib_size_log2;
+                     << cm->seq_params->mib_size_log2;
   int mi_row_end = cm->tiles.row_start_sb[row + 1]
-                   << cm->seq_params.mib_size_log2;
+                   << cm->seq_params->mib_size_log2;
   tile->tile_row = row;
   tile->mi_row_start = mi_row_start;
   tile->mi_row_end = AOMMIN(mi_row_end, cm->mi_params.mi_rows);
@@ -142,9 +142,9 @@
 void av1_tile_set_col(TileInfo *tile, const AV1_COMMON *cm, int col) {
   assert(col < cm->tiles.cols);
   int mi_col_start = cm->tiles.col_start_sb[col]
-                     << cm->seq_params.mib_size_log2;
+                     << cm->seq_params->mib_size_log2;
   int mi_col_end = cm->tiles.col_start_sb[col + 1]
-                   << cm->seq_params.mib_size_log2;
+                   << cm->seq_params->mib_size_log2;
   tile->tile_col = col;
   tile->mi_col_start = mi_col_start;
   tile->mi_col_end = AOMMIN(mi_col_end, cm->mi_params.mi_cols);
@@ -153,16 +153,16 @@
 
 int av1_get_sb_rows_in_tile(AV1_COMMON *cm, TileInfo tile) {
   int mi_rows_aligned_to_sb = ALIGN_POWER_OF_TWO(
-      tile.mi_row_end - tile.mi_row_start, cm->seq_params.mib_size_log2);
-  int sb_rows = mi_rows_aligned_to_sb >> cm->seq_params.mib_size_log2;
+      tile.mi_row_end - tile.mi_row_start, cm->seq_params->mib_size_log2);
+  int sb_rows = mi_rows_aligned_to_sb >> cm->seq_params->mib_size_log2;
 
   return sb_rows;
 }
 
 int av1_get_sb_cols_in_tile(AV1_COMMON *cm, TileInfo tile) {
   int mi_cols_aligned_to_sb = ALIGN_POWER_OF_TWO(
-      tile.mi_col_end - tile.mi_col_start, cm->seq_params.mib_size_log2);
-  int sb_cols = mi_cols_aligned_to_sb >> cm->seq_params.mib_size_log2;
+      tile.mi_col_end - tile.mi_col_start, cm->seq_params->mib_size_log2);
+  int sb_cols = mi_cols_aligned_to_sb >> cm->seq_params->mib_size_log2;
 
   return sb_cols;
 }
@@ -195,8 +195,8 @@
   r.bottom = AOMMIN(r.bottom, frame_h);
 
   // Convert to coordinates in the appropriate plane
-  const int ss_x = is_uv && cm->seq_params.subsampling_x;
-  const int ss_y = is_uv && cm->seq_params.subsampling_y;
+  const int ss_x = is_uv && cm->seq_params->subsampling_x;
+  const int ss_y = is_uv && cm->seq_params->subsampling_y;
 
   r.left = ROUND_POWER_OF_TWO(r.left, ss_x);
   r.right = ROUND_POWER_OF_TWO(r.right, ss_x);
@@ -215,7 +215,7 @@
     for (int i = 0; i < tiles->cols; ++i) {
       const int tile_width_sb =
           tiles->col_start_sb[i + 1] - tiles->col_start_sb[i];
-      const int tile_w = tile_width_sb * cm->seq_params.mib_size;
+      const int tile_w = tile_width_sb * cm->seq_params->mib_size;
       assert(i == 0 || tile_w == *w);  // ensure all tiles have same dimension
       *w = tile_w;
     }
@@ -223,7 +223,7 @@
     for (int i = 0; i < tiles->rows; ++i) {
       const int tile_height_sb =
           tiles->row_start_sb[i + 1] - tiles->row_start_sb[i];
-      const int tile_h = tile_height_sb * cm->seq_params.mib_size;
+      const int tile_h = tile_height_sb * cm->seq_params->mib_size;
       assert(i == 0 || tile_h == *h);  // ensure all tiles have same dimension
       *h = tile_h;
     }
diff --git a/av1/common/x86/highbd_inv_txfm_avx2.c b/av1/common/x86/highbd_inv_txfm_avx2.c
index 93e98e4..87613e8 100644
--- a/av1/common/x86/highbd_inv_txfm_avx2.c
+++ b/av1/common/x86/highbd_inv_txfm_avx2.c
@@ -4146,8 +4146,8 @@
       transpose_8x8_avx2(&buf0_cur[0], &buf0_cur[0]);
     }
     if (rect_type == 1 || rect_type == -1) {
-      av1_round_shift_rect_array_32_avx2(
-          buf0, buf0, buf_size_nonzero_w_div8 << 3, 0, NewInvSqrt2);
+      round_shift_rect_array_32_avx2(buf0, buf0, buf_size_nonzero_w_div8 << 3,
+                                     0, NewInvSqrt2);
     }
     row_txfm(buf0, buf0, av1_inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
              -shift[0]);
@@ -4169,9 +4169,9 @@
     col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
              av1_inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
 
-    av1_round_shift_array_32_avx2(buf1 + i * txfm_size_row,
-                                  buf1 + i * txfm_size_row, txfm_size_row,
-                                  -shift[1]);
+    round_shift_array_32_avx2(buf1 + i * txfm_size_row,
+                              buf1 + i * txfm_size_row, txfm_size_row,
+                              -shift[1]);
   }
 
   // write to buffer
diff --git a/av1/common/x86/highbd_inv_txfm_sse4.c b/av1/common/x86/highbd_inv_txfm_sse4.c
index 03eaef8..568ee5c 100644
--- a/av1/common/x86/highbd_inv_txfm_sse4.c
+++ b/av1/common/x86/highbd_inv_txfm_sse4.c
@@ -145,6 +145,74 @@
   in[3] = _mm_load_si128((const __m128i *)(coeff + 12));
 }
 
+void av1_highbd_iwht4x4_16_add_sse4_1(const tran_low_t *input, uint8_t *dest8,
+                                      int stride, int bd) {
+  /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
+     0.5 shifts per pixel. */
+  __m128i op[4];
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  load_buffer_4x4(input, op);
+
+  // Shift before-hand.
+  op[0] = _mm_srai_epi32(op[0], UNIT_QUANT_SHIFT);
+  op[1] = _mm_srai_epi32(op[1], UNIT_QUANT_SHIFT);
+  op[2] = _mm_srai_epi32(op[2], UNIT_QUANT_SHIFT);
+  op[3] = _mm_srai_epi32(op[3], UNIT_QUANT_SHIFT);
+
+  for (int i = 0; i < 2; ++i) {
+    transpose_32bit_4x4(op, op);
+
+    __m128i a1 = op[0];
+    __m128i c1 = op[1];
+    __m128i d1 = op[2];
+    __m128i b1 = op[3];
+    a1 = _mm_add_epi32(a1, c1);          // a1 += c1
+    d1 = _mm_sub_epi32(d1, b1);          // d1 -= b1
+    __m128i e1 = _mm_sub_epi32(a1, d1);  // e1 = (a1 - d1) >> 1
+    e1 = _mm_srai_epi32(e1, 1);
+    b1 = _mm_sub_epi32(e1, b1);  // b1 = e1 - b1
+    c1 = _mm_sub_epi32(e1, c1);  // c1 = e1 - c1
+    a1 = _mm_sub_epi32(a1, b1);  // a1 -= b1
+    d1 = _mm_add_epi32(d1, c1);  // d1 += c1
+
+    op[0] = a1;
+    op[1] = b1;
+    op[2] = c1;
+    op[3] = d1;
+  }
+
+  // Convert to int16_t. The C code checks that we are in range.
+  op[0] = _mm_packs_epi32(op[0], op[1]);
+  op[1] = _mm_packs_epi32(op[2], op[3]);
+
+  // Load uint16_t.
+  __m128i dst[2];
+  __m128i tmp[4];
+  tmp[0] = _mm_loadl_epi64((const __m128i *)(dest + 0 * stride));
+  tmp[1] = _mm_loadl_epi64((const __m128i *)(dest + 1 * stride));
+  dst[0] = _mm_unpacklo_epi64(tmp[0], tmp[1]);
+  tmp[2] = _mm_loadl_epi64((const __m128i *)(dest + 2 * stride));
+  tmp[3] = _mm_loadl_epi64((const __m128i *)(dest + 3 * stride));
+  dst[1] = _mm_unpacklo_epi64(tmp[2], tmp[3]);
+
+  // Add to the previous results.
+  dst[0] = _mm_add_epi16(dst[0], op[0]);
+  dst[1] = _mm_add_epi16(dst[1], op[1]);
+
+  // Clamp.
+  dst[0] = highbd_clamp_epi16(dst[0], bd);
+  dst[1] = highbd_clamp_epi16(dst[1], bd);
+
+  // Store.
+  _mm_storel_epi64((__m128i *)(dest + 0 * stride), dst[0]);
+  dst[0] = _mm_srli_si128(dst[0], 8);
+  _mm_storel_epi64((__m128i *)(dest + 1 * stride), dst[0]);
+  _mm_storel_epi64((__m128i *)(dest + 2 * stride), dst[1]);
+  dst[1] = _mm_srli_si128(dst[1], 8);
+  _mm_storel_epi64((__m128i *)(dest + 3 * stride), dst[1]);
+}
+
 static void addsub_sse4_1(const __m128i in0, const __m128i in1, __m128i *out0,
                           __m128i *out1, const __m128i *clamp_lo,
                           const __m128i *clamp_hi) {
diff --git a/av1/common/x86/resize_ssse3.c b/av1/common/x86/resize_ssse3.c
index 6e3780f..0d871de 100644
--- a/av1/common/x86/resize_ssse3.c
+++ b/av1/common/x86/resize_ssse3.c
@@ -11,14 +11,14 @@
  */
 
 #include <tmmintrin.h>  // SSSE3
+#include "config/av1_rtcd.h"
+#include "config/aom_scale_rtcd.h"
 
 #include "aom_dsp/x86/convolve_sse2.h"
 #include "aom_dsp/x86/convolve_ssse3.h"
 #include "aom_dsp/x86/mem_sse2.h"
 #include "aom_dsp/x86/transpose_sse2.h"
 #include "av1/common/resize.h"
-#include "config/av1_rtcd.h"
-#include "config/aom_scale_rtcd.h"
 
 static INLINE __m128i scale_plane_2_to_1_phase_0_kernel(
     const uint8_t *const src, const __m128i *const mask) {
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index 58629e0..08f81da 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -76,12 +76,11 @@
 // Checks that the remaining bits start with a 1 and ends with 0s.
 // It consumes an additional byte, if already byte aligned before the check.
 int av1_check_trailing_bits(AV1Decoder *pbi, struct aom_read_bit_buffer *rb) {
-  AV1_COMMON *const cm = &pbi->common;
   // bit_offset is set to 0 (mod 8) when the reader is already byte aligned
   int bits_before_alignment = 8 - rb->bit_offset % 8;
   int trailing = aom_rb_read_literal(rb, bits_before_alignment);
   if (trailing != (1 << (bits_before_alignment - 1))) {
-    cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+    pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
     return -1;
   }
   return 0;
@@ -364,7 +363,7 @@
                                          PARTITION_TYPE partition,
                                          BLOCK_SIZE bsize) {
   AV1_COMMON *const cm = &pbi->common;
-  const SequenceHeader *const seq_params = &cm->seq_params;
+  const SequenceHeader *const seq_params = cm->seq_params;
   const int bw = mi_size_wide[bsize];
   const int bh = mi_size_high[bsize];
   const int x_mis = AOMMIN(bw, cm->mi_params.mi_cols - mi_col);
@@ -916,6 +915,14 @@
           if (plane && !xd->is_chroma_ref) break;
           const struct macroblockd_plane *const pd = &xd->plane[plane];
           const TX_SIZE tx_size = av1_get_tx_size(plane, xd);
+#if CONFIG_REALTIME_ONLY
+          // Realtime only build doesn't support 4x rectangular txfm sizes.
+          if (tx_size >= TX_4X16) {
+            aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_FEATURE,
+                               "Realtime only build doesn't support 4x "
+                               "rectangular txfm sizes");
+          }
+#endif
           const int stepr = tx_size_high_unit[tx_size];
           const int stepc = tx_size_wide_unit[tx_size];
 
@@ -1017,10 +1024,6 @@
 
 static AOM_INLINE void read_tx_size_vartx(MACROBLOCKD *xd, MB_MODE_INFO *mbmi,
                                           TX_SIZE tx_size, int depth,
-#if CONFIG_LPF_MASK
-                                          AV1_COMMON *cm, int mi_row,
-                                          int mi_col, int store_bitmask,
-#endif
                                           int blk_row, int blk_col,
                                           aom_reader *r) {
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
@@ -1063,32 +1066,15 @@
       mbmi->tx_size = sub_txs;
       txfm_partition_update(xd->above_txfm_context + blk_col,
                             xd->left_txfm_context + blk_row, sub_txs, tx_size);
-#if CONFIG_LPF_MASK
-      if (store_bitmask) {
-        av1_store_bitmask_vartx(cm, mi_row + blk_row, mi_col + blk_col,
-                                txsize_to_bsize[tx_size], TX_4X4, mbmi);
-      }
-#endif
       return;
     }
-#if CONFIG_LPF_MASK
-    if (depth + 1 == MAX_VARTX_DEPTH && store_bitmask) {
-      av1_store_bitmask_vartx(cm, mi_row + blk_row, mi_col + blk_col,
-                              txsize_to_bsize[tx_size], sub_txs, mbmi);
-      store_bitmask = 0;
-    }
-#endif
 
     assert(bsw > 0 && bsh > 0);
     for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) {
       for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
         int offsetr = blk_row + row;
         int offsetc = blk_col + col;
-        read_tx_size_vartx(xd, mbmi, sub_txs, depth + 1,
-#if CONFIG_LPF_MASK
-                           cm, mi_row, mi_col, store_bitmask,
-#endif
-                           offsetr, offsetc, r);
+        read_tx_size_vartx(xd, mbmi, sub_txs, depth + 1, offsetr, offsetc, r);
       }
     }
   } else {
@@ -1097,12 +1083,6 @@
     mbmi->tx_size = tx_size;
     txfm_partition_update(xd->above_txfm_context + blk_col,
                           xd->left_txfm_context + blk_row, tx_size, tx_size);
-#if CONFIG_LPF_MASK
-    if (store_bitmask) {
-      av1_store_bitmask_vartx(cm, mi_row + blk_row, mi_col + blk_col,
-                              txsize_to_bsize[tx_size], tx_size, mbmi);
-    }
-#endif
   }
 }
 
@@ -1166,11 +1146,7 @@
 
     for (int idy = 0; idy < height; idy += bh)
       for (int idx = 0; idx < width; idx += bw)
-        read_tx_size_vartx(xd, mbmi, max_tx_size, 0,
-#if CONFIG_LPF_MASK
-                           cm, mi_row, mi_col, 1,
-#endif
-                           idy, idx, r);
+        read_tx_size_vartx(xd, mbmi, max_tx_size, 0, idy, idx, r);
   } else {
     mbmi->tx_size = read_tx_size(xd, cm->features.tx_mode, inter_block_tx,
                                  !mbmi->skip_txfm, r);
@@ -1178,35 +1154,7 @@
       memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size));
     set_txfm_ctxs(mbmi->tx_size, xd->width, xd->height,
                   mbmi->skip_txfm && is_inter_block(mbmi), xd);
-#if CONFIG_LPF_MASK
-    const int w = mi_size_wide[bsize];
-    const int h = mi_size_high[bsize];
-    if (w <= mi_size_wide[BLOCK_64X64] && h <= mi_size_high[BLOCK_64X64]) {
-      av1_store_bitmask_univariant_tx(cm, mi_row, mi_col, bsize, mbmi);
-    } else {
-      for (int row = 0; row < h; row += mi_size_high[BLOCK_64X64]) {
-        for (int col = 0; col < w; col += mi_size_wide[BLOCK_64X64]) {
-          av1_store_bitmask_univariant_tx(cm, mi_row + row, mi_col + col,
-                                          BLOCK_64X64, mbmi);
-        }
-      }
-    }
-#endif
   }
-#if CONFIG_LPF_MASK
-  const int w = mi_size_wide[bsize];
-  const int h = mi_size_high[bsize];
-  if (w <= mi_size_wide[BLOCK_64X64] && h <= mi_size_high[BLOCK_64X64]) {
-    av1_store_bitmask_other_info(cm, mi_row, mi_col, bsize, mbmi, 1, 1);
-  } else {
-    for (int row = 0; row < h; row += mi_size_high[BLOCK_64X64]) {
-      for (int col = 0; col < w; col += mi_size_wide[BLOCK_64X64]) {
-        av1_store_bitmask_other_info(cm, mi_row + row, mi_col + col,
-                                     BLOCK_64X64, mbmi, row == 0, col == 0);
-      }
-    }
-  }
-#endif
 
   if (cm->delta_q_info.delta_q_present_flag) {
     for (int i = 0; i < MAX_SEGMENTS; i++) {
@@ -1221,9 +1169,9 @@
                                       : (j == 1 ? quant_params->u_ac_delta_q
                                                 : quant_params->v_ac_delta_q);
         xd->plane[j].seg_dequant_QTX[i][0] = av1_dc_quant_QTX(
-            current_qindex, dc_delta_q, cm->seq_params.bit_depth);
+            current_qindex, dc_delta_q, cm->seq_params->bit_depth);
         xd->plane[j].seg_dequant_QTX[i][1] = av1_ac_quant_QTX(
-            current_qindex, ac_delta_q, cm->seq_params.bit_depth);
+            current_qindex, ac_delta_q, cm->seq_params->bit_depth);
       }
     }
   }
@@ -1332,9 +1280,11 @@
                                                      parse_decode_block };
 
   if (parse_decode_flag & 1) {
-#if !CONFIG_REALTIME_ONLY
     const int num_planes = av1_num_planes(cm);
     for (int plane = 0; plane < num_planes; ++plane) {
+#if CONFIG_REALTIME_ONLY
+      assert(cm->rst_info[plane].frame_restoration_type == RESTORE_NONE);
+#else
       int rcol0, rcol1, rrow0, rrow1;
       if (av1_loop_restoration_corners_in_sb(cm, plane, mi_row, mi_col, bsize,
                                              &rcol0, &rcol1, &rrow0, &rrow1)) {
@@ -1346,8 +1296,8 @@
           }
         }
       }
-    }
 #endif
+    }
 
     partition = (bsize < BLOCK_8X8) ? PARTITION_NONE
                                     : read_partition(xd, mi_row, mi_col, reader,
@@ -1556,9 +1506,13 @@
     }
   }
   if (!all_none) {
-    assert(cm->seq_params.sb_size == BLOCK_64X64 ||
-           cm->seq_params.sb_size == BLOCK_128X128);
-    const int sb_size = cm->seq_params.sb_size == BLOCK_128X128 ? 128 : 64;
+#if CONFIG_REALTIME_ONLY
+    aom_internal_error(cm->error, AOM_CODEC_UNSUP_FEATURE,
+                       "Realtime only build doesn't support loop restoration");
+#endif
+    assert(cm->seq_params->sb_size == BLOCK_64X64 ||
+           cm->seq_params->sb_size == BLOCK_128X128);
+    const int sb_size = cm->seq_params->sb_size == BLOCK_128X128 ? 128 : 64;
 
     for (int p = 0; p < num_planes; ++p)
       cm->rst_info[p].restoration_unit_size = sb_size;
@@ -1578,7 +1532,8 @@
   }
 
   if (num_planes > 1) {
-    int s = AOMMIN(cm->seq_params.subsampling_x, cm->seq_params.subsampling_y);
+    int s =
+        AOMMIN(cm->seq_params->subsampling_x, cm->seq_params->subsampling_y);
     if (s && !chroma_none) {
       cm->rst_info[1].restoration_unit_size =
           cm->rst_info[0].restoration_unit_size >> (aom_rb_read_bit(rb) * s);
@@ -1849,7 +1804,7 @@
 // Build y/uv dequant values based on segmentation.
 static AOM_INLINE void setup_segmentation_dequant(AV1_COMMON *const cm,
                                                   MACROBLOCKD *const xd) {
-  const int bit_depth = cm->seq_params.bit_depth;
+  const int bit_depth = cm->seq_params->bit_depth;
   // When segmentation is disabled, only the first value is used.  The
   // remaining are don't cares.
   const int max_segments = cm->seg.enabled ? MAX_SEGMENTS : 1;
@@ -1911,7 +1866,7 @@
   cm->superres_upscaled_width = *width;
   cm->superres_upscaled_height = *height;
 
-  const SequenceHeader *const seq_params = &cm->seq_params;
+  const SequenceHeader *const seq_params = cm->seq_params;
   if (!seq_params->enable_superres) return;
 
   if (aom_rb_read_bit(rb)) {
@@ -1932,7 +1887,7 @@
                                               int height) {
 #if CONFIG_SIZE_LIMIT
   if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT)
-    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+    aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME,
                        "Dimensions of %dx%d beyond allowed size of %dx%d.",
                        width, height, DECODE_WIDTH_LIMIT, DECODE_HEIGHT_LIMIT);
 #endif
@@ -1952,7 +1907,7 @@
         // consistent and to force a realloc next time.
         cm->width = 0;
         cm->height = 0;
-        aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+        aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
                            "Failed to allocate context buffers");
       }
     } else {
@@ -1970,7 +1925,7 @@
 
 static AOM_INLINE void setup_buffer_pool(AV1_COMMON *cm) {
   BufferPool *const pool = cm->buffer_pool;
-  const SequenceHeader *const seq_params = &cm->seq_params;
+  const SequenceHeader *const seq_params = cm->seq_params;
 
   lock_buffer_pool(pool);
   if (aom_realloc_frame_buffer(
@@ -1980,7 +1935,7 @@
           &cm->cur_frame->raw_frame_buffer, pool->get_fb_cb, pool->cb_priv,
           0)) {
     unlock_buffer_pool(pool);
-    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate frame buffer");
   }
   unlock_buffer_pool(pool);
@@ -2001,7 +1956,7 @@
 static AOM_INLINE void setup_frame_size(AV1_COMMON *cm,
                                         int frame_size_override_flag,
                                         struct aom_read_bit_buffer *rb) {
-  const SequenceHeader *const seq_params = &cm->seq_params;
+  const SequenceHeader *const seq_params = cm->seq_params;
   int width, height;
 
   if (frame_size_override_flag) {
@@ -2010,7 +1965,7 @@
     av1_read_frame_size(rb, num_bits_width, num_bits_height, &width, &height);
     if (width > seq_params->max_frame_width ||
         height > seq_params->max_frame_height) {
-      aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+      aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME,
                          "Frame dimensions are larger than the maximum values");
     }
   } else {
@@ -2051,7 +2006,7 @@
       // the middle of a stream, and static analysis will error if we don't do
       // a null check here.
       if (ref_buf == NULL) {
-        aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+        aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME,
                            "Invalid condition: invalid reference buffer");
       } else {
         const YV12_BUFFER_CONFIG *const buf = &ref_buf->buf;
@@ -2067,7 +2022,7 @@
     }
   }
 
-  const SequenceHeader *const seq_params = &cm->seq_params;
+  const SequenceHeader *const seq_params = cm->seq_params;
   if (!found) {
     int num_bits_width = seq_params->num_bits_width;
     int num_bits_height = seq_params->num_bits_height;
@@ -2079,7 +2034,7 @@
   }
 
   if (width <= 0 || height <= 0)
-    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+    aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME,
                        "Invalid frame size");
 
   // Check to make sure at least one of frames that this frame references
@@ -2091,7 +2046,7 @@
                              ref_frame->buf.y_crop_height, width, height);
   }
   if (!has_valid_ref_frame)
-    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+    aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME,
                        "Referenced frame has invalid size");
   for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
     const RefCntBuffer *const ref_frame = get_ref_frame_buf(cm, i);
@@ -2099,7 +2054,7 @@
             ref_frame->buf.bit_depth, ref_frame->buf.subsampling_x,
             ref_frame->buf.subsampling_y, seq_params->bit_depth,
             seq_params->subsampling_x, seq_params->subsampling_y))
-      aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+      aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME,
                          "Referenced frame has incompatible color format");
   }
   setup_buffer_pool(cm);
@@ -2119,7 +2074,7 @@
 
 static AOM_INLINE void read_tile_info_max_tile(
     AV1_COMMON *const cm, struct aom_read_bit_buffer *const rb) {
-  const SequenceHeader *const seq_params = &cm->seq_params;
+  const SequenceHeader *const seq_params = cm->seq_params;
   CommonTileParams *const tiles = &cm->tiles;
   int width_mi =
       ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols, seq_params->mib_size_log2);
@@ -2215,7 +2170,7 @@
     pbi->context_update_tile_id =
         aom_rb_read_literal(rb, cm->tiles.log2_rows + cm->tiles.log2_cols);
     if (pbi->context_update_tile_id >= cm->tiles.rows * cm->tiles.cols) {
-      aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+      aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
                          "Invalid context_update_tile_id");
     }
     // tile size magnitude
@@ -2368,7 +2323,7 @@
 
       // Get the whole of the last column, otherwise stop at the required tile.
       for (int r = 0; r < (is_last ? tile_rows : tile_rows_end); ++r) {
-        get_ls_tile_buffer(tile_col_data_end[c], &pbi->common.error, &data,
+        get_ls_tile_buffer(tile_col_data_end[c], &pbi->error, &data,
                            tile_buffers, tile_size_bytes, c, r, tile_copy_mode);
       }
     }
@@ -2380,7 +2335,7 @@
       data = tile_col_data_end[c - 1];
 
       for (int r = 0; r < tile_rows; ++r) {
-        get_ls_tile_buffer(tile_col_data_end[c], &pbi->common.error, &data,
+        get_ls_tile_buffer(tile_col_data_end[c], &pbi->error, &data,
                            tile_buffers, tile_size_bytes, c, r, tile_copy_mode);
       }
     }
@@ -2448,11 +2403,11 @@
       if (tc < start_tile || tc > end_tile) continue;
 
       if (data + hdr_offset >= data_end)
-        aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+        aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
                            "Data ended before all tiles were read.");
       data += hdr_offset;
-      get_tile_buffer(data_end, pbi->tile_size_bytes, is_last,
-                      &pbi->common.error, &data, buf);
+      get_tile_buffer(data_end, pbi->tile_size_bytes, is_last, &pbi->error,
+                      &data, buf);
     }
   }
 }
@@ -2462,7 +2417,7 @@
                                      const int num_planes, int mi_row,
                                      int mi_col) {
   AV1_COMMON *const cm = &pbi->common;
-  int mib_size_log2 = cm->seq_params.mib_size_log2;
+  int mib_size_log2 = cm->seq_params->mib_size_log2;
   int stride = (cm->mi_params.mi_cols >> mib_size_log2) + 1;
   int offset = (mi_row >> mib_size_log2) * stride + (mi_col >> mib_size_log2);
   CB_BUFFER *cb_buffer = cb_buffer_base + offset;
@@ -2631,11 +2586,11 @@
       pbi->tile_data + tile_info.tile_row * cm->tiles.cols + tile_info.tile_col;
   const int sb_cols_in_tile = av1_get_sb_cols_in_tile(cm, tile_info);
   const int sb_row_in_tile =
-      (mi_row - tile_info.mi_row_start) >> cm->seq_params.mib_size_log2;
+      (mi_row - tile_info.mi_row_start) >> cm->seq_params->mib_size_log2;
   int sb_col_in_tile = 0;
 
   for (int mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end;
-       mi_col += cm->seq_params.mib_size, sb_col_in_tile++) {
+       mi_col += cm->seq_params->mib_size, sb_col_in_tile++) {
     set_cb_buffer(pbi, &td->dcb, pbi->cb_buffer_base, num_planes, mi_row,
                   mi_col);
 
@@ -2643,7 +2598,7 @@
 
     // Decoding of the super-block
     decode_partition(pbi, td, mi_row, mi_col, td->bit_reader,
-                     cm->seq_params.sb_size, 0x2);
+                     cm->seq_params->sb_size, 0x2);
 
     sync_write(&tile_data->dec_row_mt_sync, sb_row_in_tile, sb_col_in_tile,
                sb_cols_in_tile);
@@ -2713,16 +2668,16 @@
   av1_reset_loop_restoration(xd, num_planes);
 
   for (int mi_row = tile_info.mi_row_start; mi_row < tile_info.mi_row_end;
-       mi_row += cm->seq_params.mib_size) {
+       mi_row += cm->seq_params->mib_size) {
     av1_zero_left_context(xd);
 
     for (int mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end;
-         mi_col += cm->seq_params.mib_size) {
+         mi_col += cm->seq_params->mib_size) {
       set_cb_buffer(pbi, dcb, &td->cb_buffer_base, num_planes, 0, 0);
 
       // Bit-stream parsing and decoding of the superblock
       decode_partition(pbi, td, mi_row, mi_col, td->bit_reader,
-                       cm->seq_params.sb_size, 0x3);
+                       cm->seq_params->sb_size, 0x3);
 
       if (aom_reader_has_overflowed(td->bit_reader)) {
         aom_merge_corrupted_flag(&dcb->corrupted, 1);
@@ -2843,7 +2798,7 @@
       av1_tile_init(&td->dcb.xd.tile, cm, row, col);
       td->dcb.xd.current_base_qindex = cm->quant_params.base_qindex;
       setup_bool_decoder(tile_bs_buf->data, data_end, tile_bs_buf->size,
-                         &cm->error, td->bit_reader, allow_update_cdf);
+                         &pbi->error, td->bit_reader, allow_update_cdf);
 #if CONFIG_ACCOUNTING
       if (pbi->acct_enabled) {
         td->bit_reader->accounting = &pbi->accounting;
@@ -2865,7 +2820,7 @@
       decode_tile(pbi, td, row, col);
       aom_merge_corrupted_flag(&pbi->dcb.corrupted, td->dcb.corrupted);
       if (pbi->dcb.corrupted)
-        aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+        aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
                            "Failed to decode tile data");
     }
   }
@@ -3023,7 +2978,7 @@
   const int tile_cols_end = frame_row_mt_info->tile_cols_end;
   const int start_tile = frame_row_mt_info->start_tile;
   const int end_tile = frame_row_mt_info->end_tile;
-  const int sb_mi_size = mi_size_wide[cm->seq_params.sb_size];
+  const int sb_mi_size = mi_size_wide[cm->seq_params->sb_size];
   int num_mis_to_decode, num_threads_working;
   int num_mis_waiting_for_decode;
   int min_threads_working = INT_MAX;
@@ -3141,7 +3096,7 @@
 static AOM_INLINE void parse_tile_row_mt(AV1Decoder *pbi, ThreadData *const td,
                                          TileDataDec *const tile_data) {
   AV1_COMMON *const cm = &pbi->common;
-  const int sb_mi_size = mi_size_wide[cm->seq_params.sb_size];
+  const int sb_mi_size = mi_size_wide[cm->seq_params->sb_size];
   const int num_planes = av1_num_planes(cm);
   TileInfo tile_info = tile_data->tile_info;
   int tile_row = tile_info.tile_row;
@@ -3154,16 +3109,16 @@
   av1_reset_loop_restoration(xd, num_planes);
 
   for (int mi_row = tile_info.mi_row_start; mi_row < tile_info.mi_row_end;
-       mi_row += cm->seq_params.mib_size) {
+       mi_row += cm->seq_params->mib_size) {
     av1_zero_left_context(xd);
 
     for (int mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end;
-         mi_col += cm->seq_params.mib_size) {
+         mi_col += cm->seq_params->mib_size) {
       set_cb_buffer(pbi, dcb, pbi->cb_buffer_base, num_planes, mi_row, mi_col);
 
       // Bit-stream parsing of the superblock
       decode_partition(pbi, td, mi_row, mi_col, td->bit_reader,
-                       cm->seq_params.sb_size, 0x1);
+                       cm->seq_params->sb_size, 0x1);
 
       if (aom_reader_has_overflowed(td->bit_reader)) {
         aom_merge_corrupted_flag(&dcb->corrupted, 1);
@@ -3495,7 +3450,7 @@
       winterface->init(worker);
       worker->thread_name = "aom tile worker";
       if (worker_idx != 0 && !winterface->reset(worker)) {
-        aom_internal_error(&cm->error, AOM_CODEC_ERROR,
+        aom_internal_error(&pbi->error, AOM_CODEC_ERROR,
                            "Tile decoder thread creation failed");
       }
 
@@ -3512,7 +3467,7 @@
       thread_data->error_info.setjmp = 0;
     }
   }
-  const int use_highbd = cm->seq_params.use_highbitdepth;
+  const int use_highbd = cm->seq_params->use_highbitdepth;
   const int buf_size = MC_TEMP_BUF_PELS << use_highbd;
   for (worker_idx = 1; worker_idx < pbi->max_threads; ++worker_idx) {
     DecWorkerData *const thread_data = pbi->thread_data + worker_idx;
@@ -3624,7 +3579,7 @@
   sync_dec_workers(pbi, num_workers);
 
   if (pbi->dcb.corrupted)
-    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
                        "Failed to decode tile data");
 
   if (tiles->large_scale) {
@@ -3642,8 +3597,8 @@
 
 static AOM_INLINE void dec_alloc_cb_buf(AV1Decoder *pbi) {
   AV1_COMMON *const cm = &pbi->common;
-  int size = ((cm->mi_params.mi_rows >> cm->seq_params.mib_size_log2) + 1) *
-             ((cm->mi_params.mi_cols >> cm->seq_params.mib_size_log2) + 1);
+  int size = ((cm->mi_params.mi_rows >> cm->seq_params->mib_size_log2) + 1) *
+             ((cm->mi_params.mi_cols >> cm->seq_params->mib_size_log2) + 1);
 
   if (pbi->cb_buffer_alloc_size < size) {
     av1_dec_free_cb_buf(pbi);
@@ -3687,10 +3642,10 @@
       tile_data->dec_row_mt_sync.num_threads_working = 0;
       tile_data->dec_row_mt_sync.mi_rows =
           ALIGN_POWER_OF_TWO(tile_info.mi_row_end - tile_info.mi_row_start,
-                             cm->seq_params.mib_size_log2);
+                             cm->seq_params->mib_size_log2);
       tile_data->dec_row_mt_sync.mi_cols =
           ALIGN_POWER_OF_TWO(tile_info.mi_col_end - tile_info.mi_col_start,
-                             cm->seq_params.mib_size_log2);
+                             cm->seq_params->mib_size_log2);
 
       frame_row_mt_info->mi_rows_to_decode +=
           tile_data->dec_row_mt_sync.mi_rows;
@@ -3833,7 +3788,7 @@
   sync_dec_workers(pbi, num_workers);
 
   if (pbi->dcb.corrupted)
-    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
                        "Failed to decode tile data");
 
   if (tiles->large_scale) {
@@ -3851,7 +3806,7 @@
 
 static AOM_INLINE void error_handler(void *data) {
   AV1_COMMON *const cm = (AV1_COMMON *)data;
-  aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, "Truncated packet");
+  aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME, "Truncated packet");
 }
 
 // Reads the high_bitdepth and twelve_bit fields in color_config() and sets
@@ -3882,7 +3837,7 @@
 void av1_read_film_grain_params(AV1_COMMON *cm,
                                 struct aom_read_bit_buffer *rb) {
   aom_film_grain_t *pars = &cm->film_grain_params;
-  const SequenceHeader *const seq_params = &cm->seq_params;
+  const SequenceHeader *const seq_params = cm->seq_params;
 
   pars->apply_grain = aom_rb_read_bit(rb);
   if (!pars->apply_grain) {
@@ -3912,7 +3867,7 @@
       }
     }
     if (!found) {
-      aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+      aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
                          "Invalid film grain reference idx %d. ref_frame_idx = "
                          "{%d, %d, %d, %d, %d, %d, %d}",
                          film_grain_params_ref_idx, cm->remapped_ref_idx[0],
@@ -3922,11 +3877,11 @@
     }
     RefCntBuffer *const buf = cm->ref_frame_map[film_grain_params_ref_idx];
     if (buf == NULL) {
-      aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+      aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
                          "Invalid Film grain reference idx");
     }
     if (!buf->film_grain_params_present) {
-      aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+      aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
                          "Film grain reference parameters not available");
     }
     uint16_t random_seed = pars->random_seed;
@@ -3938,13 +3893,13 @@
   // Scaling functions parameters
   pars->num_y_points = aom_rb_read_literal(rb, 4);  // max 14
   if (pars->num_y_points > 14)
-    aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+    aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
                        "Number of points for film grain luma scaling function "
                        "exceeds the maximum value.");
   for (int i = 0; i < pars->num_y_points; i++) {
     pars->scaling_points_y[i][0] = aom_rb_read_literal(rb, 8);
     if (i && pars->scaling_points_y[i - 1][0] >= pars->scaling_points_y[i][0])
-      aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+      aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
                          "First coordinate of the scaling function points "
                          "shall be increasing.");
     pars->scaling_points_y[i][1] = aom_rb_read_literal(rb, 8);
@@ -3963,14 +3918,14 @@
   } else {
     pars->num_cb_points = aom_rb_read_literal(rb, 4);  // max 10
     if (pars->num_cb_points > 10)
-      aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+      aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
                          "Number of points for film grain cb scaling function "
                          "exceeds the maximum value.");
     for (int i = 0; i < pars->num_cb_points; i++) {
       pars->scaling_points_cb[i][0] = aom_rb_read_literal(rb, 8);
       if (i &&
           pars->scaling_points_cb[i - 1][0] >= pars->scaling_points_cb[i][0])
-        aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+        aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
                            "First coordinate of the scaling function points "
                            "shall be increasing.");
       pars->scaling_points_cb[i][1] = aom_rb_read_literal(rb, 8);
@@ -3978,14 +3933,14 @@
 
     pars->num_cr_points = aom_rb_read_literal(rb, 4);  // max 10
     if (pars->num_cr_points > 10)
-      aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+      aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
                          "Number of points for film grain cr scaling function "
                          "exceeds the maximum value.");
     for (int i = 0; i < pars->num_cr_points; i++) {
       pars->scaling_points_cr[i][0] = aom_rb_read_literal(rb, 8);
       if (i &&
           pars->scaling_points_cr[i - 1][0] >= pars->scaling_points_cr[i][0])
-        aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+        aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
                            "First coordinate of the scaling function points "
                            "shall be increasing.");
       pars->scaling_points_cr[i][1] = aom_rb_read_literal(rb, 8);
@@ -3994,7 +3949,7 @@
     if ((seq_params->subsampling_x == 1) && (seq_params->subsampling_y == 1) &&
         (((pars->num_cb_points == 0) && (pars->num_cr_points != 0)) ||
          ((pars->num_cb_points != 0) && (pars->num_cr_points == 0))))
-      aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+      aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
                          "In YCbCr 4:2:0, film grain shall be applied "
                          "to both chroma components or neither.");
   }
@@ -4046,13 +4001,13 @@
 
 static AOM_INLINE void read_film_grain(AV1_COMMON *cm,
                                        struct aom_read_bit_buffer *rb) {
-  if (cm->seq_params.film_grain_params_present &&
+  if (cm->seq_params->film_grain_params_present &&
       (cm->show_frame || cm->showable_frame)) {
     av1_read_film_grain_params(cm, rb);
   } else {
     memset(&cm->film_grain_params, 0, sizeof(cm->film_grain_params));
   }
-  cm->film_grain_params.bit_depth = cm->seq_params.bit_depth;
+  cm->film_grain_params.bit_depth = cm->seq_params->bit_depth;
   memcpy(&cm->cur_frame->film_grain_params, &cm->film_grain_params,
          sizeof(aom_film_grain_t));
 }
@@ -4186,7 +4141,7 @@
 static AOM_INLINE void read_temporal_point_info(
     AV1_COMMON *const cm, struct aom_read_bit_buffer *rb) {
   cm->frame_presentation_time = aom_rb_read_unsigned_literal(
-      rb, cm->seq_params.decoder_model_info.frame_presentation_time_length);
+      rb, cm->seq_params->decoder_model_info.frame_presentation_time_length);
 }
 
 void av1_read_sequence_header(AV1_COMMON *cm, struct aom_read_bit_buffer *rb,
@@ -4214,7 +4169,7 @@
     seq_params->frame_id_length =
         aom_rb_read_literal(rb, 3) + seq_params->delta_frame_id_length + 1;
     if (seq_params->frame_id_length > 16)
-      aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+      aom_internal_error(cm->error, AOM_CODEC_CORRUPT_FRAME,
                          "Invalid frame_id_length");
   }
 
@@ -4468,7 +4423,7 @@
 static int read_uncompressed_header(AV1Decoder *pbi,
                                     struct aom_read_bit_buffer *rb) {
   AV1_COMMON *const cm = &pbi->common;
-  const SequenceHeader *const seq_params = &cm->seq_params;
+  const SequenceHeader *const seq_params = cm->seq_params;
   CurrentFrame *const current_frame = &cm->current_frame;
   FeatureFlags *const features = &cm->features;
   MACROBLOCKD *const xd = &pbi->dcb.xd;
@@ -4479,7 +4434,7 @@
   sframe_info->is_s_frame_at_altref = 0;
 
   if (!pbi->sequence_header_ready) {
-    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
                        "No sequence header");
   }
 
@@ -4501,14 +4456,14 @@
     if (cm->show_existing_frame) {
       if (pbi->sequence_header_changed) {
         aom_internal_error(
-            &cm->error, AOM_CODEC_CORRUPT_FRAME,
+            &pbi->error, AOM_CODEC_CORRUPT_FRAME,
             "New sequence header starts with a show_existing_frame.");
       }
       // Show an existing frame directly.
       const int existing_frame_idx = aom_rb_read_literal(rb, 3);
       RefCntBuffer *const frame_to_show = cm->ref_frame_map[existing_frame_idx];
       if (frame_to_show == NULL) {
-        aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+        aom_internal_error(&pbi->error, AOM_CODEC_UNSUP_BITSTREAM,
                            "Buffer does not contain a decoded frame");
       }
       if (seq_params->decoder_model_info_present_flag &&
@@ -4522,7 +4477,7 @@
          * referencing */
         if (display_frame_id != cm->ref_frame_id[existing_frame_idx] ||
             pbi->valid_for_referencing[existing_frame_idx] == 0)
-          aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+          aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
                              "Reference buffer frame ID mismatch");
       }
       lock_buffer_pool(pool);
@@ -4543,12 +4498,13 @@
       cm->lf.filter_level[0] = 0;
       cm->lf.filter_level[1] = 0;
       cm->show_frame = 1;
+      current_frame->order_hint = frame_to_show->order_hint;
 
       // Section 6.8.2: It is a requirement of bitstream conformance that when
       // show_existing_frame is used to show a previous frame, that the value
       // of showable_frame for the previous frame was equal to 1.
       if (!frame_to_show->showable_frame) {
-        aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+        aom_internal_error(&pbi->error, AOM_CODEC_UNSUP_BITSTREAM,
                            "Buffer does not contain a showable frame");
       }
       // Section 6.8.2: It is a requirement of bitstream conformance that when
@@ -4576,7 +4532,7 @@
         pbi->decoding_first_frame = 1;
         reset_frame_buffers(cm);
       } else {
-        aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+        aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
                            "Sequence header has changed without a keyframe.");
       }
     }
@@ -4591,7 +4547,7 @@
     }
     if (seq_params->still_picture &&
         (current_frame->frame_type != KEY_FRAME || !cm->show_frame)) {
-      aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+      aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
                          "Still pictures must be coded as shown keyframes");
     }
     cm->showable_frame = current_frame->frame_type != KEY_FRAME;
@@ -4663,7 +4619,7 @@
         /* Check current_frame_id for conformance */
         if (prev_frame_id == cm->current_frame_id ||
             diff_frame_id >= (1 << (frame_id_length - 1))) {
-          aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+          aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
                              "Invalid value of current_frame_id");
         }
       }
@@ -4694,18 +4650,18 @@
   }
 
   if (seq_params->decoder_model_info_present_flag) {
-    cm->buffer_removal_time_present = aom_rb_read_bit(rb);
-    if (cm->buffer_removal_time_present) {
+    pbi->buffer_removal_time_present = aom_rb_read_bit(rb);
+    if (pbi->buffer_removal_time_present) {
       for (int op_num = 0;
            op_num < seq_params->operating_points_cnt_minus_1 + 1; op_num++) {
         if (seq_params->op_params[op_num].decoder_model_param_present_flag) {
-          if ((((seq_params->operating_point_idc[op_num] >>
+          if (seq_params->operating_point_idc[op_num] == 0 ||
+              (((seq_params->operating_point_idc[op_num] >>
                  cm->temporal_layer_id) &
                 0x1) &&
                ((seq_params->operating_point_idc[op_num] >>
                  (cm->spatial_layer_id + 8)) &
-                0x1)) ||
-              seq_params->operating_point_idc[op_num] == 0) {
+                0x1))) {
             cm->buffer_removal_times[op_num] = aom_rb_read_unsigned_literal(
                 rb, seq_params->decoder_model_info.buffer_removal_time_length);
           } else {
@@ -4735,7 +4691,7 @@
     if (current_frame->frame_type == INTRA_ONLY_FRAME) {
       current_frame->refresh_frame_flags = aom_rb_read_literal(rb, REF_FRAMES);
       if (current_frame->refresh_frame_flags == 0xFF) {
-        aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+        aom_internal_error(&pbi->error, AOM_CODEC_UNSUP_BITSTREAM,
                            "Intra only frames cannot have refresh flags 0xFF");
       }
       if (pbi->need_resync) {
@@ -4769,7 +4725,7 @@
           // pixels set to neutral grey.
           int buf_idx = get_free_fb(cm);
           if (buf_idx == INVALID_IDX) {
-            aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+            aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR,
                                "Unable to find free frame buffer");
           }
           buf = &frame_bufs[buf_idx];
@@ -4782,7 +4738,7 @@
                   &buf->raw_frame_buffer, pool->get_fb_cb, pool->cb_priv, 0)) {
             decrease_ref_count(buf, pool);
             unlock_buffer_pool(pool);
-            aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+            aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR,
                                "Failed to allocate frame buffer");
           }
           unlock_buffer_pool(pool);
@@ -4849,10 +4805,10 @@
         // reference to a slot that hasn't been set yet. That's what we are
         // checking here.
         if (lst_buf == NULL)
-          aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+          aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
                              "Inter frame requests nonexistent reference");
         if (gld_buf == NULL)
-          aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+          aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
                              "Inter frame requests nonexistent reference");
 
         av1_set_frame_refs(cm, cm->remapped_ref_idx, lst_ref, gld_ref);
@@ -4870,7 +4826,7 @@
           // reference to a slot that hasn't been set yet. That's what we are
           // checking here.
           if (cm->ref_frame_map[ref] == NULL)
-            aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+            aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
                                "Inter frame requests nonexistent reference");
           cm->remapped_ref_idx[i] = ref;
         } else {
@@ -4878,7 +4834,7 @@
         }
         // Check valid for referencing
         if (pbi->valid_for_referencing[ref] == 0)
-          aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+          aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
                              "Reference frame not valid for referencing");
 
         cm->ref_frame_sign_bias[LAST_FRAME + i] = 0;
@@ -4894,7 +4850,7 @@
           // Compare values derived from delta_frame_id_minus_1 and
           // refresh_frame_flags.
           if (ref_frame_id != cm->ref_frame_id[ref])
-            aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+            aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
                                "Reference buffer frame ID mismatch");
         }
       }
@@ -4917,7 +4873,7 @@
     cm->prev_frame = get_primary_ref_frame_buf(cm);
     if (features->primary_ref_frame != PRIMARY_REF_NONE &&
         get_primary_ref_frame_buf(cm) == NULL) {
-      aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+      aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
                          "Reference frame containing this frame's initial "
                          "frame context is unavailable.");
     }
@@ -4937,7 +4893,7 @@
             ref_scale_factors, ref_buf->buf.y_crop_width,
             ref_buf->buf.y_crop_height, cm->width, cm->height);
         if ((!av1_is_valid_scale(ref_scale_factors)))
-          aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+          aom_internal_error(&pbi->error, AOM_CODEC_UNSUP_BITSTREAM,
                              "Reference frame has invalid dimensions");
       }
     }
@@ -4974,7 +4930,7 @@
   cm->cur_frame->buf.render_height = cm->render_height;
 
   if (pbi->need_resync) {
-    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
                        "Keyframe / intra-only frame required to reset decoder"
                        " state");
   }
@@ -4995,13 +4951,13 @@
 
   read_tile_info(pbi, rb);
   if (!av1_is_min_tile_width_satisfied(cm)) {
-    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
                        "Minimum tile width requirement not satisfied");
   }
 
   CommonQuantParams *const quant_params = &cm->quant_params;
   setup_quantization(quant_params, av1_num_planes(cm),
-                     cm->seq_params.separate_uv_delta_q, rb);
+                     cm->seq_params->separate_uv_delta_q, rb);
   xd->bd = (int)seq_params->bit_depth;
 
   CommonContexts *const above_contexts = &cm->above_contexts;
@@ -5012,7 +4968,7 @@
     if (av1_alloc_above_context_buffers(above_contexts, cm->tiles.rows,
                                         cm->mi_params.mi_cols,
                                         av1_num_planes(cm))) {
-      aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+      aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR,
                          "Failed to allocate context buffers");
     }
   }
@@ -5092,7 +5048,7 @@
   features->reduced_tx_set_used = aom_rb_read_bit(rb);
 
   if (features->allow_ref_frame_mvs && !frame_might_allow_ref_frame_mvs(cm)) {
-    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
                        "Frame wrongly requests reference frame MVs");
   }
 
@@ -5192,7 +5148,7 @@
       // Use the default frame context values.
       *cm->fc = *cm->default_frame_context;
       if (!cm->fc->initialized)
-        aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+        aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
                            "Uninitialized entropy context.");
     }
     return uncomp_hdr_size;
@@ -5200,10 +5156,11 @@
 
   cm->mi_params.setup_mi(&cm->mi_params);
 
-  av1_setup_motion_field(cm);
+  av1_calculate_ref_frame_side(cm);
+  if (cm->features.allow_ref_frame_mvs) av1_setup_motion_field(cm);
 
-  av1_setup_block_planes(xd, cm->seq_params.subsampling_x,
-                         cm->seq_params.subsampling_y, num_planes);
+  av1_setup_block_planes(xd, cm->seq_params->subsampling_x,
+                         cm->seq_params->subsampling_y, num_planes);
   if (cm->features.primary_ref_frame == PRIMARY_REF_NONE) {
     // use the default frame context values
     *cm->fc = *cm->default_frame_context;
@@ -5211,7 +5168,7 @@
     *cm->fc = get_primary_ref_frame_buf(cm)->frame_context;
   }
   if (!cm->fc->initialized)
-    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
                        "Uninitialized entropy context.");
 
   pbi->dcb.corrupted = 0;
@@ -5229,7 +5186,7 @@
     av1_alloc_restoration_buffers(cm);
   }
 #endif
-  const int use_highbd = cm->seq_params.use_highbitdepth;
+  const int use_highbd = cm->seq_params->use_highbitdepth;
   const int buf_size = MC_TEMP_BUF_PELS << use_highbd;
   if (pbi->td.mc_buf_size != buf_size) {
     av1_free_mc_tmp_buf(&pbi->td);
@@ -5248,9 +5205,6 @@
 
   if (initialize_flag) setup_frame_info(pbi);
   const int num_planes = av1_num_planes(cm);
-#if CONFIG_LPF_MASK
-  av1_loop_filter_frame_init(cm, 0, num_planes);
-#endif
 
   if (pbi->max_threads > 1 && !(tiles->large_scale && !pbi->ext_tile_debug) &&
       pbi->row_mt)
@@ -5264,29 +5218,22 @@
 
   // If the bit stream is monochrome, set the U and V buffers to a constant.
   if (num_planes < 3) {
-    set_planes_to_neutral_grey(&cm->seq_params, xd->cur_buf, 1);
+    set_planes_to_neutral_grey(cm->seq_params, xd->cur_buf, 1);
   }
 
   if (end_tile != tiles->rows * tiles->cols - 1) {
     return;
   }
 
+  av1_alloc_cdef_buffers(cm, &pbi->cdef_worker, &pbi->cdef_sync,
+                         pbi->num_workers, 1);
+  av1_alloc_cdef_sync(cm, &pbi->cdef_sync, pbi->num_workers);
+
   if (!cm->features.allow_intrabc && !tiles->single_tile_decoding) {
     if (cm->lf.filter_level[0] || cm->lf.filter_level[1]) {
-      if (pbi->num_workers > 1) {
-        av1_loop_filter_frame_mt(
-            &cm->cur_frame->buf, cm, &pbi->dcb.xd, 0, num_planes, 0,
-#if CONFIG_LPF_MASK
-            1,
-#endif
-            pbi->tile_workers, pbi->num_workers, &pbi->lf_row_sync);
-      } else {
-        av1_loop_filter_frame(&cm->cur_frame->buf, cm, &pbi->dcb.xd,
-#if CONFIG_LPF_MASK
-                              1,
-#endif
-                              0, num_planes, 0);
-      }
+      av1_loop_filter_frame_mt(&cm->cur_frame->buf, cm, &pbi->dcb.xd, 0,
+                               num_planes, 0, pbi->tile_workers,
+                               pbi->num_workers, &pbi->lf_row_sync, 0);
     }
 
     const int do_cdef =
@@ -5307,7 +5254,14 @@
                                                  cm, 0);
 
       if (do_cdef) {
-        av1_cdef_frame(&pbi->common.cur_frame->buf, cm, &pbi->dcb.xd);
+        if (pbi->num_workers > 1) {
+          av1_cdef_frame_mt(cm, &pbi->dcb.xd, pbi->cdef_worker,
+                            pbi->tile_workers, &pbi->cdef_sync,
+                            pbi->num_workers, av1_cdef_init_fb_row_mt);
+        } else {
+          av1_cdef_frame(&pbi->common.cur_frame->buf, cm, &pbi->dcb.xd,
+                         av1_cdef_init_fb_row);
+        }
       }
 
       superres_post_decode(pbi);
@@ -5345,14 +5299,18 @@
 #else
     if (!optimized_loop_restoration) {
       if (do_cdef) {
-        av1_cdef_frame(&pbi->common.cur_frame->buf, cm, &pbi->dcb.xd);
+        if (pbi->num_workers > 1) {
+          av1_cdef_frame_mt(cm, &pbi->dcb.xd, pbi->cdef_worker,
+                            pbi->tile_workers, &pbi->cdef_sync,
+                            pbi->num_workers, av1_cdef_init_fb_row_mt);
+        } else {
+          av1_cdef_frame(&pbi->common.cur_frame->buf, cm, &pbi->dcb.xd,
+                         av1_cdef_init_fb_row);
+        }
       }
     }
 #endif  // !CONFIG_REALTIME_ONLY
   }
-#if CONFIG_LPF_MASK
-  av1_zero_array(cm->lf.lfm, cm->lf.lfm_num);
-#endif
 
   if (!pbi->dcb.corrupted) {
     if (cm->features.refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
@@ -5361,7 +5319,7 @@
       av1_reset_cdf_symbol_counters(cm->fc);
     }
   } else {
-    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
                        "Decode failed. Frame data is corrupted.");
   }
 
diff --git a/av1/decoder/decodemv.c b/av1/decoder/decodemv.c
index 412be86..839bda2 100644
--- a/av1/decoder/decodemv.c
+++ b/av1/decoder/decodemv.c
@@ -46,7 +46,7 @@
 
   // At the start of a superblock, mark that we haven't yet read CDEF strengths
   // for any of the CDEF units contained in this superblock.
-  const int sb_mask = (cm->seq_params.mib_size - 1);
+  const int sb_mask = (cm->seq_params->mib_size - 1);
   const int mi_row_in_sb = (xd->mi_row & sb_mask);
   const int mi_col_in_sb = (xd->mi_col & sb_mask);
   if (mi_row_in_sb == 0 && mi_col_in_sb == 0) {
@@ -61,7 +61,7 @@
   const int index_mask = cdef_size;
   const int cdef_unit_row_in_sb = ((xd->mi_row & index_mask) != 0);
   const int cdef_unit_col_in_sb = ((xd->mi_col & index_mask) != 0);
-  const int index = (cm->seq_params.sb_size == BLOCK_128X128)
+  const int index = (cm->seq_params->sb_size == BLOCK_128X128)
                         ? cdef_unit_col_in_sb + 2 * cdef_unit_row_in_sb
                         : 0;
 
@@ -85,12 +85,12 @@
                              aom_reader *r, MB_MODE_INFO *const mbmi) {
   int sign, abs, reduced_delta_qindex = 0;
   BLOCK_SIZE bsize = mbmi->bsize;
-  const int b_col = xd->mi_col & (cm->seq_params.mib_size - 1);
-  const int b_row = xd->mi_row & (cm->seq_params.mib_size - 1);
+  const int b_col = xd->mi_col & (cm->seq_params->mib_size - 1);
+  const int b_row = xd->mi_row & (cm->seq_params->mib_size - 1);
   const int read_delta_q_flag = (b_col == 0 && b_row == 0);
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
 
-  if ((bsize != cm->seq_params.sb_size || mbmi->skip_txfm == 0) &&
+  if ((bsize != cm->seq_params->sb_size || mbmi->skip_txfm == 0) &&
       read_delta_q_flag) {
     abs = aom_read_symbol(r, ec_ctx->delta_q_cdf, DELTA_Q_PROBS + 1, ACCT_STR);
     const int smallval = (abs < DELTA_Q_SMALL);
@@ -117,11 +117,11 @@
                               int mi_row) {
   int reduced_delta_lflevel = 0;
   const BLOCK_SIZE bsize = mbmi->bsize;
-  const int b_col = mi_col & (cm->seq_params.mib_size - 1);
-  const int b_row = mi_row & (cm->seq_params.mib_size - 1);
+  const int b_col = mi_col & (cm->seq_params->mib_size - 1);
+  const int b_row = mi_row & (cm->seq_params->mib_size - 1);
   const int read_delta_lf_flag = (b_col == 0 && b_row == 0);
 
-  if ((bsize != cm->seq_params.sb_size || mbmi->skip_txfm == 0) &&
+  if ((bsize != cm->seq_params->sb_size || mbmi->skip_txfm == 0) &&
       read_delta_lf_flag) {
     int abs = aom_read_symbol(r, cdf, DELTA_LF_PROBS + 1, ACCT_STR);
     const int smallval = (abs < DELTA_LF_SMALL);
@@ -579,7 +579,7 @@
           aom_read_symbol(r, xd->tile_ctx->palette_y_size_cdf[bsize_ctx],
                           PALETTE_SIZES, ACCT_STR) +
           2;
-      read_palette_colors_y(xd, cm->seq_params.bit_depth, pmi, r);
+      read_palette_colors_y(xd, cm->seq_params->bit_depth, pmi, r);
     }
   }
   if (num_planes > 1 && mbmi->uv_mode == UV_DC_PRED && xd->is_chroma_ref) {
@@ -591,7 +591,7 @@
           aom_read_symbol(r, xd->tile_ctx->palette_uv_size_cdf[bsize_ctx],
                           PALETTE_SIZES, ACCT_STR) +
           2;
-      read_palette_colors_uv(xd, cm->seq_params.bit_depth, pmi, r);
+      read_palette_colors_uv(xd, cm->seq_params->bit_depth, pmi, r);
     }
   }
 }
@@ -682,7 +682,7 @@
   mv->as_mv.row = (mv->as_mv.row >> 3) * 8;
   int valid = is_mv_valid(&mv->as_mv) &&
               av1_is_dv_valid(mv->as_mv, cm, xd, mi_row, mi_col, bsize,
-                              cm->seq_params.mib_size_log2);
+                              cm->seq_params->mib_size_log2);
   return valid;
 }
 
@@ -711,7 +711,7 @@
     av1_find_best_ref_mvs(0, ref_mvs[INTRA_FRAME], &nearestmv, &nearmv, 0);
     int_mv dv_ref = nearestmv.as_int == 0 ? nearmv : nearestmv;
     if (dv_ref.as_int == 0)
-      av1_find_ref_dv(&dv_ref, &xd->tile, cm->seq_params.mib_size, xd->mi_row);
+      av1_find_ref_dv(&dv_ref, &xd->tile, cm->seq_params->mib_size, xd->mi_row);
     // Ref DV should not have sub-pel.
     int valid_dv = (dv_ref.as_mv.col & 7) == 0 && (dv_ref.as_mv.row & 7) == 0;
     dv_ref.as_mv.col = (dv_ref.as_mv.col >> 3) * 8;
@@ -816,7 +816,7 @@
           ? read_angle_delta(r, ec_ctx->angle_delta_cdf[mbmi->mode - V_PRED])
           : 0;
 
-  if (!cm->seq_params.monochrome && xd->is_chroma_ref) {
+  if (!cm->seq_params->monochrome && xd->is_chroma_ref) {
     mbmi->uv_mode =
         read_intra_mode_uv(ec_ctx, r, is_cfl_allowed(xd), mbmi->mode);
     if (mbmi->uv_mode == UV_CFL_PRED) {
@@ -1076,7 +1076,7 @@
       use_angle_delta && av1_is_directional_mode(mbmi->mode)
           ? read_angle_delta(r, ec_ctx->angle_delta_cdf[mbmi->mode - V_PRED])
           : 0;
-  if (!cm->seq_params.monochrome && xd->is_chroma_ref) {
+  if (!cm->seq_params->monochrome && xd->is_chroma_ref) {
     mbmi->uv_mode =
         read_intra_mode_uv(ec_ctx, r, is_cfl_allowed(xd), mbmi->mode);
     if (mbmi->uv_mode == UV_CFL_PRED) {
@@ -1375,7 +1375,7 @@
   aom_merge_corrupted_flag(&dcb->corrupted, mv_corrupted_flag);
 
   mbmi->use_wedge_interintra = 0;
-  if (cm->seq_params.enable_interintra_compound && !mbmi->skip_mode &&
+  if (cm->seq_params->enable_interintra_compound && !mbmi->skip_mode &&
       is_interintra_allowed(mbmi)) {
     const int bsize_group = size_group_lookup[bsize];
     const int interintra =
@@ -1423,7 +1423,7 @@
   if (has_second_ref(mbmi) && !mbmi->skip_mode) {
     // Read idx to indicate current compound inter prediction mode group
     const int masked_compound_used = is_any_masked_compound_used(bsize) &&
-                                     cm->seq_params.enable_masked_compound;
+                                     cm->seq_params->enable_masked_compound;
 
     if (masked_compound_used) {
       const int ctx_comp_group_idx = get_comp_group_idx_context(xd);
@@ -1432,7 +1432,7 @@
     }
 
     if (mbmi->comp_group_idx == 0) {
-      if (cm->seq_params.order_hint_info.enable_dist_wtd_comp) {
+      if (cm->seq_params->order_hint_info.enable_dist_wtd_comp) {
         const int comp_index_ctx = get_comp_index_context(cm, xd);
         mbmi->compound_idx = (uint8_t)aom_read_symbol(
             r, ec_ctx->compound_index_cdf[comp_index_ctx], 2, ACCT_STR);
@@ -1473,7 +1473,7 @@
   }
 
   read_mb_interp_filter(xd, features->interp_filter,
-                        cm->seq_params.enable_dual_filter, mbmi, r);
+                        cm->seq_params->enable_dual_filter, mbmi, r);
 
 #if !CONFIG_REALTIME_ONLY
   if (mbmi->motion_mode == WARPED_CAUSAL) {
@@ -1573,11 +1573,11 @@
 
   if (frame_is_intra_only(cm)) {
     read_intra_frame_mode_info(cm, dcb, r);
-    if (cm->seq_params.order_hint_info.enable_ref_frame_mvs)
+    if (cm->seq_params->order_hint_info.enable_ref_frame_mvs)
       intra_copy_frame_mvs(cm, xd->mi_row, xd->mi_col, x_mis, y_mis);
   } else {
     read_inter_frame_mode_info(pbi, dcb, r);
-    if (cm->seq_params.order_hint_info.enable_ref_frame_mvs)
+    if (cm->seq_params->order_hint_info.enable_ref_frame_mvs)
       av1_copy_frame_mvs(cm, mi, xd->mi_row, xd->mi_col, x_mis, y_mis);
   }
 }
diff --git a/av1/decoder/decoder.c b/av1/decoder/decoder.c
index 48d0e56..0ec85ee 100644
--- a/av1/decoder/decoder.c
+++ b/av1/decoder/decoder.c
@@ -19,7 +19,6 @@
 
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_mem/aom_mem.h"
-#include "aom_ports/system_state.h"
 #include "aom_ports/aom_once.h"
 #include "aom_ports/aom_timer.h"
 #include "aom_scale/aom_scale.h"
@@ -68,10 +67,6 @@
 
   assert(mi_size_wide[mi_params->mi_alloc_bsize] ==
          mi_size_high[mi_params->mi_alloc_bsize]);
-
-#if CONFIG_LPF_MASK
-  av1_alloc_loop_filter_mask(mi_params);
-#endif
 }
 
 static void dec_setup_mi(CommonModeInfoParams *mi_params) {
@@ -97,17 +92,19 @@
   av1_zero(*pbi);
 
   AV1_COMMON *volatile const cm = &pbi->common;
+  cm->seq_params = &pbi->seq_params;
+  cm->error = &pbi->error;
 
   // The jmp_buf is valid only for the duration of the function that calls
   // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
   // before it returns.
-  if (setjmp(cm->error.jmp)) {
-    cm->error.setjmp = 0;
+  if (setjmp(pbi->error.jmp)) {
+    pbi->error.setjmp = 0;
     av1_decoder_remove(pbi);
     return NULL;
   }
 
-  cm->error.setjmp = 1;
+  pbi->error.setjmp = 1;
 
   CHECK_MEM_ERROR(cm, cm->fc,
                   (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->fc)));
@@ -129,7 +126,7 @@
   pbi->decoding_first_frame = 1;
   pbi->common.buffer_pool = pool;
 
-  cm->seq_params.bit_depth = AOM_BITS_8;
+  cm->seq_params->bit_depth = AOM_BITS_8;
 
   cm->mi_params.free_mi = dec_free_mi;
   cm->mi_params.setup_mi = dec_setup_mi;
@@ -146,7 +143,7 @@
   aom_accounting_init(&pbi->accounting);
 #endif
 
-  cm->error.setjmp = 0;
+  pbi->error.setjmp = 0;
 
   aom_get_worker_interface()->init(&pbi->lf_worker);
   pbi->lf_worker.thread_name = "aom lf worker";
@@ -262,16 +259,16 @@
 
   const YV12_BUFFER_CONFIG *const cfg = get_ref_frame(cm, idx);
   if (cfg == NULL) {
-    aom_internal_error(&cm->error, AOM_CODEC_ERROR, "No reference frame");
+    aom_internal_error(&pbi->error, AOM_CODEC_ERROR, "No reference frame");
     return AOM_CODEC_ERROR;
   }
   if (!equal_dimensions(cfg, sd))
-    aom_internal_error(&cm->error, AOM_CODEC_ERROR,
+    aom_internal_error(&pbi->error, AOM_CODEC_ERROR,
                        "Incorrect buffer dimensions");
   else
     aom_yv12_copy_frame(cfg, sd, num_planes);
 
-  return cm->error.error_code;
+  return pbi->error.error_code;
 }
 
 static int equal_dimensions_and_border(const YV12_BUFFER_CONFIG *a,
@@ -294,13 +291,13 @@
   ref_buf = get_ref_frame(cm, idx);
 
   if (ref_buf == NULL) {
-    aom_internal_error(&cm->error, AOM_CODEC_ERROR, "No reference frame");
+    aom_internal_error(cm->error, AOM_CODEC_ERROR, "No reference frame");
     return AOM_CODEC_ERROR;
   }
 
   if (!use_external_ref) {
     if (!equal_dimensions(ref_buf, sd)) {
-      aom_internal_error(&cm->error, AOM_CODEC_ERROR,
+      aom_internal_error(cm->error, AOM_CODEC_ERROR,
                          "Incorrect buffer dimensions");
     } else {
       // Overwrite the reference frame buffer.
@@ -308,7 +305,7 @@
     }
   } else {
     if (!equal_dimensions_and_border(ref_buf, sd)) {
-      aom_internal_error(&cm->error, AOM_CODEC_ERROR,
+      aom_internal_error(cm->error, AOM_CODEC_ERROR,
                          "Incorrect buffer dimensions");
     } else {
       // Overwrite the reference frame buffer pointers.
@@ -324,7 +321,7 @@
     }
   }
 
-  return cm->error.error_code;
+  return cm->error->error_code;
 }
 
 aom_codec_err_t av1_copy_new_frame_dec(AV1_COMMON *cm,
@@ -333,12 +330,12 @@
   const int num_planes = av1_num_planes(cm);
 
   if (!equal_dimensions_and_border(new_frame, sd))
-    aom_internal_error(&cm->error, AOM_CODEC_ERROR,
+    aom_internal_error(cm->error, AOM_CODEC_ERROR,
                        "Incorrect buffer dimensions");
   else
     aom_yv12_copy_frame(new_frame, sd, num_planes);
 
-  return cm->error.error_code;
+  return cm->error->error_code;
 }
 
 static void release_current_frame(AV1Decoder *pbi) {
@@ -356,7 +353,7 @@
 // Consumes a reference to cm->cur_frame.
 //
 // This functions returns void. It reports failure by setting
-// cm->error.error_code.
+// pbi->error.error_code.
 static void update_frame_buffers(AV1Decoder *pbi, int frame_decoded) {
   int ref_index = 0, mask;
   AV1_COMMON *const cm = &pbi->common;
@@ -389,7 +386,7 @@
           // error
           cm->cur_frame->buf.corrupted = 1;
           decrease_ref_count(cm->cur_frame, pool);
-          cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
+          pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
         } else {
           pbi->output_frames[pbi->num_output_frames] = cm->cur_frame;
           pbi->num_output_frames++;
@@ -428,8 +425,8 @@
                                 const uint8_t **psource) {
   AV1_COMMON *volatile const cm = &pbi->common;
   const uint8_t *source = *psource;
-  cm->error.error_code = AOM_CODEC_OK;
-  cm->error.has_detail = 0;
+  pbi->error.error_code = AOM_CODEC_OK;
+  pbi->error.has_detail = 0;
 
   if (size == 0) {
     // This is used to signal that we are missing frames.
@@ -445,18 +442,18 @@
   }
 
   if (assign_cur_frame_new_fb(cm) == NULL) {
-    cm->error.error_code = AOM_CODEC_MEM_ERROR;
+    pbi->error.error_code = AOM_CODEC_MEM_ERROR;
     return 1;
   }
 
   // The jmp_buf is valid only for the duration of the function that calls
   // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
   // before it returns.
-  if (setjmp(cm->error.jmp)) {
+  if (setjmp(pbi->error.jmp)) {
     const AVxWorkerInterface *const winterface = aom_get_worker_interface();
     int i;
 
-    cm->error.setjmp = 0;
+    pbi->error.setjmp = 0;
 
     // Synchronize all threads immediately as a subsequent decode call may
     // cause a resize invalidating some allocations.
@@ -466,19 +463,18 @@
     }
 
     release_current_frame(pbi);
-    aom_clear_system_state();
     return -1;
   }
 
-  cm->error.setjmp = 1;
+  pbi->error.setjmp = 1;
 
   int frame_decoded =
       aom_decode_frame_from_obus(pbi, source, source + size, psource);
 
   if (frame_decoded < 0) {
-    assert(cm->error.error_code != AOM_CODEC_OK);
+    assert(pbi->error.error_code != AOM_CODEC_OK);
     release_current_frame(pbi);
-    cm->error.setjmp = 0;
+    pbi->error.setjmp = 0;
     return 1;
   }
 
@@ -499,13 +495,11 @@
     pbi->decoding_first_frame = 0;
   }
 
-  if (cm->error.error_code != AOM_CODEC_OK) {
-    cm->error.setjmp = 0;
+  if (pbi->error.error_code != AOM_CODEC_OK) {
+    pbi->error.setjmp = 0;
     return 1;
   }
 
-  aom_clear_system_state();
-
   if (!cm->show_existing_frame) {
     if (cm->seg.enabled) {
       if (cm->prev_frame &&
@@ -519,7 +513,7 @@
   }
 
   // Update progress in frame parallel decode.
-  cm->error.setjmp = 0;
+  pbi->error.setjmp = 0;
 
   return 0;
 }
@@ -530,7 +524,6 @@
   if (index >= pbi->num_output_frames) return -1;
   *sd = &pbi->output_frames[index]->buf;
   *grain_params = &pbi->output_frames[index]->film_grain_params;
-  aom_clear_system_state();
   return 0;
 }
 
diff --git a/av1/decoder/decoder.h b/av1/decoder/decoder.h
index 61147f9..226b9dc 100644
--- a/av1/decoder/decoder.h
+++ b/av1/decoder/decoder.h
@@ -229,6 +229,8 @@
   AV1LfSync lf_row_sync;
   AV1LrSync lr_row_sync;
   AV1LrStruct lr_ctxt;
+  AV1CdefSync cdef_sync;
+  AV1CdefWorkerData *cdef_worker;
   AVxWorker *tile_workers;
   int num_workers;
   DecWorkerData *thread_data;
@@ -332,6 +334,32 @@
   int is_arf_frame_present;
   int num_tile_groups;
   aom_s_frame_info sframe_info;
+
+  /*!
+   * Elements part of the sequence header, that are applicable for all the
+   * frames in the video.
+   */
+  SequenceHeader seq_params;
+
+  /*!
+   * If true, buffer removal times are present.
+   */
+  bool buffer_removal_time_present;
+
+  /*!
+   * Code and details about current error status.
+   */
+  struct aom_internal_error_info error;
+
+  /*!
+   * Number of temporal layers: may be > 1 for SVC (scalable video coding).
+   */
+  unsigned int number_temporal_layers;
+
+  /*!
+   * Number of spatial layers: may be > 1 for SVC (scalable video coding).
+   */
+  unsigned int number_spatial_layers;
 } AV1Decoder;
 
 // Returns 0 on success. Sets pbi->common.error.error_code to a nonzero error
diff --git a/av1/decoder/obu.c b/av1/decoder/obu.c
index f9753cb..6011daf 100644
--- a/av1/decoder/obu.c
+++ b/av1/decoder/obu.c
@@ -69,7 +69,7 @@
                           struct aom_read_bit_buffer *const rb) {
   while (rb->bit_offset & 7) {
     if (aom_rb_read_bit(rb)) {
-      cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+      cm->error->error_code = AOM_CODEC_CORRUPT_FRAME;
       return -1;
     }
   }
@@ -110,12 +110,12 @@
 
   // Use a local variable to store the information as we decode. At the end,
   // if no errors have occurred, cm->seq_params is updated.
-  SequenceHeader sh = cm->seq_params;
+  SequenceHeader sh = *cm->seq_params;
   SequenceHeader *const seq_params = &sh;
 
   seq_params->profile = av1_read_profile(rb);
   if (seq_params->profile > CONFIG_MAX_DECODE_PROFILE) {
-    cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
+    pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
     return 0;
   }
 
@@ -124,7 +124,7 @@
   seq_params->reduced_still_picture_hdr = aom_rb_read_bit(rb);
   // Video must have reduced_still_picture_hdr = 0
   if (!seq_params->still_picture && seq_params->reduced_still_picture_hdr) {
-    cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
+    pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
     return 0;
   }
 
@@ -135,7 +135,7 @@
     seq_params->operating_points_cnt_minus_1 = 0;
     seq_params->operating_point_idc[0] = 0;
     if (!read_bitstream_level(&seq_params->seq_level_idx[0], rb)) {
-      cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
+      pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
       return 0;
     }
     seq_params->tier[0] = 0;
@@ -144,7 +144,7 @@
   } else {
     seq_params->timing_info_present = aom_rb_read_bit(rb);
     if (seq_params->timing_info_present) {
-      av1_read_timing_info_header(&seq_params->timing_info, &cm->error, rb);
+      av1_read_timing_info_header(&seq_params->timing_info, &pbi->error, rb);
 
       seq_params->decoder_model_info_present_flag = aom_rb_read_bit(rb);
       if (seq_params->decoder_model_info_present_flag)
@@ -159,7 +159,7 @@
       seq_params->operating_point_idc[i] =
           aom_rb_read_literal(rb, OP_POINTS_IDC_BITS);
       if (!read_bitstream_level(&seq_params->seq_level_idx[i], rb)) {
-        cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
+        pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
         return 0;
       }
       // This is the seq_level_idx[i] > 7 check in the spec. seq_level_idx 7
@@ -188,7 +188,7 @@
         // Level with seq_level_idx = 31 returns a high "dummy" bitrate to pass
         // the check
         if (seq_params->op_params[i].bitrate == 0)
-          aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+          aom_internal_error(&pbi->error, AOM_CODEC_UNSUP_BITSTREAM,
                              "AV1 does not support this combination of "
                              "profile, level, and tier.");
         // Buffer size in bits/s is bitrate in bits/s * 1 s
@@ -212,7 +212,7 @@
               aom_rb_read_literal(rb, 4) + 1;
           if (seq_params->op_params[i].initial_display_delay > 10)
             aom_internal_error(
-                &cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+                &pbi->error, AOM_CODEC_UNSUP_BITSTREAM,
                 "AV1 does not support more than 10 decoded frames delay");
         } else {
           seq_params->op_params[i].initial_display_delay = 10;
@@ -232,19 +232,19 @@
   pbi->current_operating_point =
       seq_params->operating_point_idc[operating_point];
   if (aom_get_num_layers_from_operating_point_idc(
-          pbi->current_operating_point, &cm->number_spatial_layers,
-          &cm->number_temporal_layers) != AOM_CODEC_OK) {
-    cm->error.error_code = AOM_CODEC_ERROR;
+          pbi->current_operating_point, &pbi->number_spatial_layers,
+          &pbi->number_temporal_layers) != AOM_CODEC_OK) {
+    pbi->error.error_code = AOM_CODEC_ERROR;
     return 0;
   }
 
   av1_read_sequence_header(cm, rb, seq_params);
 
-  av1_read_color_config(rb, pbi->allow_lowbitdepth, seq_params, &cm->error);
+  av1_read_color_config(rb, pbi->allow_lowbitdepth, seq_params, &pbi->error);
   if (!(seq_params->subsampling_x == 0 && seq_params->subsampling_y == 0) &&
       !(seq_params->subsampling_x == 1 && seq_params->subsampling_y == 1) &&
       !(seq_params->subsampling_x == 1 && seq_params->subsampling_y == 0)) {
-    aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+    aom_internal_error(&pbi->error, AOM_CODEC_UNSUP_BITSTREAM,
                        "Only 4:4:4, 4:2:2 and 4:2:0 are currently supported, "
                        "%d %d subsampling is not supported.\n",
                        seq_params->subsampling_x, seq_params->subsampling_y);
@@ -253,18 +253,18 @@
   seq_params->film_grain_params_present = aom_rb_read_bit(rb);
 
   if (av1_check_trailing_bits(pbi, rb) != 0) {
-    // cm->error.error_code is already set.
+    // pbi->error.error_code is already set.
     return 0;
   }
 
   // If a sequence header has been decoded before, we check if the new
   // one is consistent with the old one.
   if (pbi->sequence_header_ready) {
-    if (!are_seq_headers_consistent(&cm->seq_params, seq_params))
+    if (!are_seq_headers_consistent(cm->seq_params, seq_params))
       pbi->sequence_header_changed = 1;
   }
 
-  cm->seq_params = *seq_params;
+  *cm->seq_params = *seq_params;
   pbi->sequence_header_ready = 1;
 
   return ((rb->bit_offset - saved_bit_offset + 7) >> 3);
@@ -303,7 +303,7 @@
     tile_start_and_end_present_flag = aom_rb_read_bit(rb);
     if (tile_start_implicit && tile_start_and_end_present_flag) {
       aom_internal_error(
-          &cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+          &pbi->error, AOM_CODEC_UNSUP_BITSTREAM,
           "For OBU_FRAME type obu tile_start_and_end_present_flag must be 0");
       return -1;
     }
@@ -318,20 +318,20 @@
     *end_tile = aom_rb_read_literal(rb, tile_bits);
   }
   if (*start_tile != pbi->next_start_tile) {
-    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
                        "tg_start (%d) must be equal to %d", *start_tile,
                        pbi->next_start_tile);
     return -1;
   }
   if (*start_tile > *end_tile) {
     aom_internal_error(
-        &cm->error, AOM_CODEC_CORRUPT_FRAME,
+        &pbi->error, AOM_CODEC_CORRUPT_FRAME,
         "tg_end (%d) must be greater than or equal to tg_start (%d)", *end_tile,
         *start_tile);
     return -1;
   }
   if (*end_tile >= num_tiles) {
-    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
                        "tg_end (%d) must be less than NumTiles (%d)", *end_tile,
                        num_tiles);
     return -1;
@@ -388,15 +388,16 @@
              (pbi->output_frame_height_in_tiles_minus_1 + 1));
 
   // Allocate the tile list output buffer.
-  // Note: if cm->seq_params.use_highbitdepth is 1 and cm->seq_params.bit_depth
-  // is 8, we could allocate less memory, namely, 8 bits/pixel.
+  // Note: if cm->seq_params->use_highbitdepth is 1 and
+  // cm->seq_params->bit_depth is 8, we could allocate less memory, namely, 8
+  // bits/pixel.
   if (aom_alloc_frame_buffer(&pbi->tile_list_outbuf, output_frame_width,
-                             output_frame_height, cm->seq_params.subsampling_x,
-                             cm->seq_params.subsampling_y,
-                             (cm->seq_params.use_highbitdepth &&
-                              (cm->seq_params.bit_depth > AOM_BITS_8)),
+                             output_frame_height, cm->seq_params->subsampling_x,
+                             cm->seq_params->subsampling_y,
+                             (cm->seq_params->use_highbitdepth &&
+                              (cm->seq_params->bit_depth > AOM_BITS_8)),
                              0, cm->features.byte_alignment))
-    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+    aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate the tile list output buffer");
 }
 
@@ -430,8 +431,8 @@
   av1_get_uniform_tile_size(cm, &tile_width, &tile_height);
   const int tile_width_in_pixels = tile_width * MI_SIZE;
   const int tile_height_in_pixels = tile_height * MI_SIZE;
-  const int ssy = cm->seq_params.subsampling_y;
-  const int ssx = cm->seq_params.subsampling_x;
+  const int ssy = cm->seq_params->subsampling_y;
+  const int ssx = cm->seq_params->subsampling_x;
   const int num_planes = av1_num_planes(cm);
 
   YV12_BUFFER_CONFIG *cur_frame = &cm->cur_frame->buf;
@@ -455,8 +456,8 @@
     int vstart2 = tr * h;
     int hstart2 = tc * w;
 
-    if (cm->seq_params.use_highbitdepth &&
-        cm->seq_params.bit_depth == AOM_BITS_8) {
+    if (cm->seq_params->use_highbitdepth &&
+        cm->seq_params->bit_depth == AOM_BITS_8) {
       yv12_tile_copy(cur_frame, hstart1, hend1, vstart1, vend1,
                      &pbi->tile_list_outbuf, hstart2, vstart2, plane);
     } else {
@@ -501,7 +502,7 @@
   pbi->output_frame_height_in_tiles_minus_1 = aom_rb_read_literal(rb, 8);
   pbi->tile_count_minus_1 = aom_rb_read_literal(rb, 16);
   if (pbi->tile_count_minus_1 > MAX_TILES - 1) {
-    cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+    pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
     return 0;
   }
 
@@ -524,7 +525,7 @@
     // Set reference for each tile.
     int ref_idx = aom_rb_read_literal(rb, 8);
     if (ref_idx >= MAX_EXTERNAL_REFERENCES) {
-      cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+      pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
       return 0;
     }
     av1_set_reference_dec(cm, cm->remapped_ref_idx[0], 1,
@@ -535,14 +536,14 @@
     if (pbi->dec_tile_row < 0 || pbi->dec_tile_col < 0 ||
         pbi->dec_tile_row >= cm->tiles.rows ||
         pbi->dec_tile_col >= cm->tiles.cols) {
-      cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+      pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
       return 0;
     }
 
     pbi->coded_tile_data_size = aom_rb_read_literal(rb, 16) + 1;
     data += tile_info_bytes;
     if ((size_t)(data_end - data) < pbi->coded_tile_data_size) {
-      cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+      pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
       return 0;
     }
 
@@ -581,18 +582,17 @@
                                 OBU_METADATA_TYPE metadata_type,
                                 const uint8_t *data, size_t sz,
                                 aom_metadata_insert_flags_t insert_flag) {
-  AV1_COMMON *const cm = &pbi->common;
   if (!pbi->metadata) {
     pbi->metadata = aom_img_metadata_array_alloc(0);
     if (!pbi->metadata) {
-      aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+      aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR,
                          "Failed to allocate metadata array");
     }
   }
   aom_metadata_t *metadata =
       aom_img_metadata_alloc(metadata_type, data, sz, insert_flag);
   if (!metadata) {
-    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+    aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR,
                        "Error allocating metadata");
   }
   aom_metadata_t **metadata_array =
@@ -600,7 +600,7 @@
                                  (pbi->metadata->sz + 1) * sizeof(metadata));
   if (!metadata_array) {
     aom_img_metadata_free(metadata);
-    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+    aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR,
                        "Error growing metadata array");
   }
   pbi->metadata->metadata_array = metadata_array;
@@ -611,22 +611,21 @@
 // On failure, calls aom_internal_error() and does not return.
 static void read_metadata_itut_t35(AV1Decoder *const pbi, const uint8_t *data,
                                    size_t sz) {
-  AV1_COMMON *const cm = &pbi->common;
   if (sz == 0) {
-    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
                        "itu_t_t35_country_code is missing");
   }
   int country_code_size = 1;
   if (*data == 0xFF) {
     if (sz == 1) {
-      aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+      aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
                          "itu_t_t35_country_code_extension_byte is missing");
     }
     ++country_code_size;
   }
   int end_index = get_last_nonzero_byte_index(data, sz);
   if (end_index < country_code_size) {
-    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
                        "No trailing bits found in ITU-T T.35 metadata OBU");
   }
   // itu_t_t35_payload_bytes is byte aligned. Section 6.7.2 of the spec says:
@@ -634,7 +633,7 @@
   //   specified in Recommendation ITU-T T.35.
   // Therefore the first trailing byte should be 0x80.
   if (data[end_index] != 0x80) {
-    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
                        "The last nonzero byte of the ITU-T T.35 metadata OBU "
                        "is 0x%02x, should be 0x80.",
                        data[end_index]);
@@ -648,9 +647,8 @@
 static size_t read_metadata_hdr_cll(AV1Decoder *const pbi, const uint8_t *data,
                                     size_t sz) {
   const size_t kHdrCllPayloadSize = 4;
-  AV1_COMMON *const cm = &pbi->common;
   if (sz < kHdrCllPayloadSize) {
-    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
                        "Incorrect HDR CLL metadata payload size");
   }
   alloc_read_metadata(pbi, OBU_METADATA_TYPE_HDR_CLL, data, kHdrCllPayloadSize,
@@ -663,9 +661,8 @@
 static size_t read_metadata_hdr_mdcv(AV1Decoder *const pbi, const uint8_t *data,
                                      size_t sz) {
   const size_t kMdcvPayloadSize = 24;
-  AV1_COMMON *const cm = &pbi->common;
   if (sz < kMdcvPayloadSize) {
-    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+    aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME,
                        "Incorrect HDR MDCV metadata payload size");
   }
   alloc_read_metadata(pbi, OBU_METADATA_TYPE_HDR_MDCV, data, kMdcvPayloadSize,
@@ -770,11 +767,10 @@
 // pbi->common.error.error_code and returns 0, or calls aom_internal_error()
 // and does not return.
 static size_t read_metadata(AV1Decoder *pbi, const uint8_t *data, size_t sz) {
-  AV1_COMMON *const cm = &pbi->common;
   size_t type_length;
   uint64_t type_value;
   if (aom_uleb_decode(data, sz, &type_value, &type_length) < 0) {
-    cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+    pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
     return 0;
   }
   const OBU_METADATA_TYPE metadata_type = (OBU_METADATA_TYPE)type_value;
@@ -782,7 +778,7 @@
     // If metadata_type is reserved for future use or a user private value,
     // ignore the entire OBU and just check trailing bits.
     if (get_last_nonzero_byte(data + type_length, sz - type_length) == 0) {
-      cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+      pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
       return 0;
     }
     return sz;
@@ -796,7 +792,7 @@
         type_length +
         read_metadata_hdr_cll(pbi, data + type_length, sz - type_length);
     if (get_last_nonzero_byte(data + bytes_read, sz - bytes_read) != 0x80) {
-      cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+      pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
       return 0;
     }
     return sz;
@@ -805,7 +801,7 @@
         type_length +
         read_metadata_hdr_mdcv(pbi, data + type_length, sz - type_length);
     if (get_last_nonzero_byte(data + bytes_read, sz - bytes_read) != 0x80) {
-      cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+      pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
       return 0;
     }
     return sz;
@@ -820,7 +816,7 @@
     read_metadata_timecode(&rb);
   }
   if (av1_check_trailing_bits(pbi, &rb) != 0) {
-    // cm->error.error_code is already set.
+    // pbi->error.error_code is already set.
     return 0;
   }
   assert((rb.bit_offset & 7) == 0);
@@ -838,7 +834,7 @@
    // trailing byte should be 0x80. See https://crbug.com/aomedia/2393.
    const uint8_t last_nonzero_byte = get_last_nonzero_byte(data, sz);
    if (last_nonzero_byte != 0x80) {
-      cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+      pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
      return 0;
    }
  }
@@ -846,7 +842,7 @@
 }
 
 // On success, returns a boolean that indicates whether the decoding of the
-// current frame is finished. On failure, sets cm->error.error_code and
+// current frame is finished. On failure, sets pbi->error.error_code and
 // returns -1.
 int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data,
                                const uint8_t *data_end,
@@ -872,7 +868,7 @@
   pbi->num_tile_groups = 0;
 
   if (data_end < data) {
-    cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+    pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
     return -1;
   }
 
@@ -880,7 +876,7 @@
   if (!cm->tiles.large_scale) pbi->camera_frame_header_ready = 0;
 
   // decode frame as a series of OBUs
-  while (!frame_decoding_finished && cm->error.error_code == AOM_CODEC_OK) {
+  while (!frame_decoding_finished && pbi->error.error_code == AOM_CODEC_OK) {
     struct aom_read_bit_buffer rb;
     size_t payload_size = 0;
     size_t decoded_payload_size = 0;
@@ -890,7 +886,7 @@
 
     if (bytes_available == 0 && !pbi->seen_frame_header) {
       *p_data_end = data;
-      cm->error.error_code = AOM_CODEC_OK;
+      pbi->error.error_code = AOM_CODEC_OK;
       break;
     }
 
@@ -899,7 +895,7 @@
                                      &obu_header, &payload_size, &bytes_read);
 
     if (status != AOM_CODEC_OK) {
-      cm->error.error_code = status;
+      pbi->error.error_code = status;
       return -1;
     }
 
@@ -912,7 +908,7 @@
     data += bytes_read;
 
     if ((size_t)(data_end - data) < payload_size) {
-      cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+      pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
       return -1;
     }
 
@@ -936,16 +932,16 @@
         if (pbi->seen_frame_header) {
           // A new temporal unit has started, but the frame in the previous
           // temporal unit is incomplete.
-          cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+          pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
           return -1;
         }
         break;
       case OBU_SEQUENCE_HEADER:
         decoded_payload_size = read_sequence_header_obu(pbi, &rb);
-        if (cm->error.error_code != AOM_CODEC_OK) return -1;
+        if (pbi->error.error_code != AOM_CODEC_OK) return -1;
         // The sequence header should not change in the middle of a frame.
         if (pbi->sequence_header_changed && pbi->seen_frame_header) {
-          cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+          pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
           return -1;
         }
         break;
@@ -954,13 +950,13 @@
       case OBU_FRAME:
         if (obu_header.type == OBU_REDUNDANT_FRAME_HEADER) {
           if (!pbi->seen_frame_header) {
-            cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+            pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
             return -1;
           }
         } else {
           // OBU_FRAME_HEADER or OBU_FRAME.
           if (pbi->seen_frame_header) {
-            cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+            pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
             return -1;
           }
         }
@@ -978,7 +974,7 @@
           // frame_header_obu.
           if (frame_header_size > payload_size ||
               memcmp(data, frame_header, frame_header_size) != 0) {
-            cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+            pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
             return -1;
           }
           assert(rb.bit_offset == 0);
@@ -992,7 +988,7 @@
 
         if (cm->show_existing_frame) {
           if (obu_header.type == OBU_FRAME) {
-            cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
+            pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
             return -1;
           }
           frame_decoding_finished = 1;
@@ -1014,23 +1010,23 @@
         if (obu_header.type != OBU_FRAME) break;
         obu_payload_offset = frame_header_size;
         // Byte align the reader before reading the tile group.
-        // byte_alignment() has set cm->error.error_code if it returns -1.
+        // byte_alignment() has set pbi->error.error_code if it returns -1.
         if (byte_alignment(cm, &rb)) return -1;
         AOM_FALLTHROUGH_INTENDED;  // fall through to read tile group.
       case OBU_TILE_GROUP:
         if (!pbi->seen_frame_header) {
-          cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+          pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
           return -1;
         }
         if (obu_payload_offset > payload_size) {
-          cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+          pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
           return -1;
         }
         decoded_payload_size += read_one_tile_group_obu(
             pbi, &rb, is_first_tg_obu_received, data + obu_payload_offset,
             data + payload_size, p_data_end, &frame_decoding_finished,
             obu_header.type == OBU_FRAME);
-        if (cm->error.error_code != AOM_CODEC_OK) return -1;
+        if (pbi->error.error_code != AOM_CODEC_OK) return -1;
         is_first_tg_obu_received = 0;
         if (frame_decoding_finished) {
           pbi->seen_frame_header = 0;
@@ -1040,18 +1036,18 @@
         break;
       case OBU_METADATA:
         decoded_payload_size = read_metadata(pbi, data, payload_size);
-        if (cm->error.error_code != AOM_CODEC_OK) return -1;
+        if (pbi->error.error_code != AOM_CODEC_OK) return -1;
         break;
       case OBU_TILE_LIST:
         if (CONFIG_NORMAL_TILE_MODE) {
-          cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
+          pbi->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
           return -1;
         }
 
         // This OBU type is purely for the large scale tile coding mode.
         // The common camera frame header has to be already decoded.
         if (!pbi->camera_frame_header_ready) {
-          cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+          pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
           return -1;
         }
 
@@ -1060,17 +1056,17 @@
         decoded_payload_size =
             read_and_decode_one_tile_list(pbi, &rb, data, data + payload_size,
                                           p_data_end, &frame_decoding_finished);
-        if (cm->error.error_code != AOM_CODEC_OK) return -1;
+        if (pbi->error.error_code != AOM_CODEC_OK) return -1;
         break;
       case OBU_PADDING:
         decoded_payload_size = read_padding(cm, data, payload_size);
-        if (cm->error.error_code != AOM_CODEC_OK) return -1;
+        if (pbi->error.error_code != AOM_CODEC_OK) return -1;
         break;
       default:
         // Skip unrecognized OBUs
         if (payload_size > 0 &&
             get_last_nonzero_byte(data, payload_size) == 0) {
-          cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+          pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
           return -1;
         }
         decoded_payload_size = payload_size;
@@ -1079,7 +1075,7 @@
 
     // Check that the signalled OBU size matches the actual amount of data read
     if (decoded_payload_size > payload_size) {
-      cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+      pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
       return -1;
     }
 
@@ -1087,7 +1083,7 @@
     while (decoded_payload_size < payload_size) {
       uint8_t padding_byte = data[decoded_payload_size++];
       if (padding_byte != 0) {
-        cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+        pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME;
         return -1;
       }
     }
@@ -1095,6 +1091,6 @@
     data += payload_size;
   }
 
-  if (cm->error.error_code != AOM_CODEC_OK) return -1;
+  if (pbi->error.error_code != AOM_CODEC_OK) return -1;
   return frame_decoding_finished;
 }
diff --git a/av1/encoder/allintra_vis.c b/av1/encoder/allintra_vis.c
new file mode 100644
index 0000000..61fea0e
--- /dev/null
+++ b/av1/encoder/allintra_vis.c
@@ -0,0 +1,587 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/common_data.h"
+#include "av1/common/enums.h"
+#include "av1/common/idct.h"
+
+#include "av1/common/reconinter.h"
+#include "av1/encoder/allintra_vis.h"
+#include "av1/encoder/hybrid_fwd_txfm.h"
+#include "av1/encoder/rdopt_utils.h"
+
+// Process the wiener variance in 16x16 block basis.
+static int qsort_comp(const void *elem1, const void *elem2) {
+  int a = *((const int *)elem1);
+  int b = *((const int *)elem2);
+  if (a > b) return 1;
+  if (a < b) return -1;
+  return 0;
+}
+
+void av1_init_mb_wiener_var_buffer(AV1_COMP *cpi) {
+  AV1_COMMON *cm = &cpi->common;
+
+  cpi->weber_bsize = BLOCK_8X8;
+
+  if (cpi->mb_weber_stats) return;
+
+  CHECK_MEM_ERROR(cm, cpi->mb_weber_stats,
+                  aom_calloc(cpi->frame_info.mi_rows * cpi->frame_info.mi_cols,
+                             sizeof(*cpi->mb_weber_stats)));
+}
+
+static int64_t get_satd(AV1_COMP *const cpi, BLOCK_SIZE bsize, int mi_row,
+                        int mi_col) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int mi_wide = mi_size_wide[bsize];
+  const int mi_high = mi_size_high[bsize];
+
+  const int mi_step = mi_size_wide[cpi->weber_bsize];
+  int mb_stride = cpi->frame_info.mi_cols;
+  int mb_count = 0;
+  int64_t satd = 0;
+
+  for (int row = mi_row; row < mi_row + mi_high; row += mi_step) {
+    for (int col = mi_col; col < mi_col + mi_wide; col += mi_step) {
+      if (row >= cm->mi_params.mi_rows || col >= cm->mi_params.mi_cols)
+        continue;
+
+      satd += cpi->mb_weber_stats[(row / mi_step) * mb_stride + (col / mi_step)]
+                  .satd;
+      ++mb_count;
+    }
+  }
+
+  if (mb_count) satd = (int)(satd / mb_count);
+  satd = AOMMAX(1, satd);
+
+  return (int)satd;
+}
+
+static int64_t get_sse(AV1_COMP *const cpi, BLOCK_SIZE bsize, int mi_row,
+                       int mi_col) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int mi_wide = mi_size_wide[bsize];
+  const int mi_high = mi_size_high[bsize];
+
+  const int mi_step = mi_size_wide[cpi->weber_bsize];
+  int mb_stride = cpi->frame_info.mi_cols;
+  int mb_count = 0;
+  int64_t distortion = 0;
+
+  for (int row = mi_row; row < mi_row + mi_high; row += mi_step) {
+    for (int col = mi_col; col < mi_col + mi_wide; col += mi_step) {
+      if (row >= cm->mi_params.mi_rows || col >= cm->mi_params.mi_cols)
+        continue;
+
+      distortion +=
+          cpi->mb_weber_stats[(row / mi_step) * mb_stride + (col / mi_step)]
+              .distortion;
+      ++mb_count;
+    }
+  }
+
+  if (mb_count) distortion = (int)(distortion / mb_count);
+  distortion = AOMMAX(1, distortion);
+
+  return (int)distortion;
+}
+
+static double get_max_scale(AV1_COMP *const cpi, BLOCK_SIZE bsize, int mi_row,
+                            int mi_col) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int mi_wide = mi_size_wide[bsize];
+  const int mi_high = mi_size_high[bsize];
+  const int mi_step = mi_size_wide[cpi->weber_bsize];
+  int mb_stride = cpi->frame_info.mi_cols;
+  double min_max_scale = 10.0;
+
+  for (int row = mi_row; row < mi_row + mi_high; row += mi_step) {
+    for (int col = mi_col; col < mi_col + mi_wide; col += mi_step) {
+      if (row >= cm->mi_params.mi_rows || col >= cm->mi_params.mi_cols)
+        continue;
+      WeberStats *weber_stats =
+          &cpi->mb_weber_stats[(row / mi_step) * mb_stride + (col / mi_step)];
+      if (weber_stats->max_scale < 1.0) continue;
+      if (weber_stats->max_scale < min_max_scale)
+        min_max_scale = weber_stats->max_scale;
+    }
+  }
+  return min_max_scale;
+}
+
+static int get_window_wiener_var(AV1_COMP *const cpi, BLOCK_SIZE bsize,
+                                 int mi_row, int mi_col) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int mi_wide = mi_size_wide[bsize];
+  const int mi_high = mi_size_high[bsize];
+
+  const int mi_step = mi_size_wide[cpi->weber_bsize];
+  int sb_wiener_var = 0;
+  int mb_stride = cpi->frame_info.mi_cols;
+  int mb_count = 0;
+  double base_num = 1;
+  double base_den = 1;
+  double base_reg = 1;
+
+  for (int row = mi_row; row < mi_row + mi_high; row += mi_step) {
+    for (int col = mi_col; col < mi_col + mi_wide; col += mi_step) {
+      if (row >= cm->mi_params.mi_rows || col >= cm->mi_params.mi_cols)
+        continue;
+
+      WeberStats *weber_stats =
+          &cpi->mb_weber_stats[(row / mi_step) * mb_stride + (col / mi_step)];
+
+      base_num += ((double)weber_stats->distortion) *
+                  sqrt((double)weber_stats->src_variance) *
+                  weber_stats->rec_pix_max;
+
+      base_den += fabs(
+          weber_stats->rec_pix_max * sqrt((double)weber_stats->src_variance) -
+          weber_stats->src_pix_max * sqrt((double)weber_stats->rec_variance));
+
+      base_reg += sqrt((double)weber_stats->distortion) *
+                  sqrt((double)weber_stats->src_pix_max) * 0.1;
+      ++mb_count;
+    }
+  }
+
+  sb_wiener_var = (int)((base_num + base_reg) / (base_den + base_reg));
+  sb_wiener_var = AOMMAX(1, sb_wiener_var);
+
+  return (int)sb_wiener_var;
+}
+
+static int get_var_perceptual_ai(AV1_COMP *const cpi, BLOCK_SIZE bsize,
+                                 int mi_row, int mi_col) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int mi_wide = mi_size_wide[bsize];
+  const int mi_high = mi_size_high[bsize];
+
+  int sb_wiener_var = get_window_wiener_var(cpi, bsize, mi_row, mi_col);
+
+  if (mi_row >= (mi_high / 2)) {
+    sb_wiener_var =
+        AOMMIN(sb_wiener_var,
+               get_window_wiener_var(cpi, bsize, mi_row - mi_high / 2, mi_col));
+  }
+  if (mi_row <= (cm->mi_params.mi_rows - mi_high - (mi_high / 2))) {
+    sb_wiener_var =
+        AOMMIN(sb_wiener_var,
+               get_window_wiener_var(cpi, bsize, mi_row + mi_high / 2, mi_col));
+  }
+  if (mi_col >= (mi_wide / 2)) {
+    sb_wiener_var =
+        AOMMIN(sb_wiener_var,
+               get_window_wiener_var(cpi, bsize, mi_row, mi_col - mi_wide / 2));
+  }
+  if (mi_col <= (cm->mi_params.mi_cols - mi_wide - (mi_wide / 2))) {
+    sb_wiener_var =
+        AOMMIN(sb_wiener_var,
+               get_window_wiener_var(cpi, bsize, mi_row, mi_col + mi_wide / 2));
+  }
+
+  return sb_wiener_var;
+}
+
+void av1_set_mb_wiener_variance(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  uint8_t *buffer = cpi->source->y_buffer;
+  int buf_stride = cpi->source->y_stride;
+  ThreadData *td = &cpi->td;
+  MACROBLOCK *x = &td->mb;
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO mbmi;
+  memset(&mbmi, 0, sizeof(mbmi));
+  MB_MODE_INFO *mbmi_ptr = &mbmi;
+  xd->mi = &mbmi_ptr;
+  xd->cur_buf = cpi->source;
+
+  const SequenceHeader *const seq_params = cm->seq_params;
+  if (aom_realloc_frame_buffer(
+          &cm->cur_frame->buf, cm->width, cm->height, seq_params->subsampling_x,
+          seq_params->subsampling_y, seq_params->use_highbitdepth,
+          cpi->oxcf.border_in_pixels, cm->features.byte_alignment, NULL, NULL,
+          NULL, cpi->oxcf.tool_cfg.enable_global_motion))
+    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
+                       "Failed to allocate frame buffer");
+
+  cm->quant_params.base_qindex = cpi->oxcf.rc_cfg.cq_level;
+  av1_frame_init_quantizer(cpi);
+
+  DECLARE_ALIGNED(32, int16_t, src_diff[32 * 32]);
+  DECLARE_ALIGNED(32, tran_low_t, coeff[32 * 32]);
+  DECLARE_ALIGNED(32, tran_low_t, qcoeff[32 * 32]);
+  DECLARE_ALIGNED(32, tran_low_t, dqcoeff[32 * 32]);
+
+  int mi_row, mi_col;
+
+  BLOCK_SIZE bsize = cpi->weber_bsize;
+  const TX_SIZE tx_size = max_txsize_lookup[bsize];
+  const int block_size = tx_size_wide[tx_size];
+  const int coeff_count = block_size * block_size;
+
+  const BitDepthInfo bd_info = get_bit_depth_info(xd);
+  cpi->norm_wiener_variance = 0;
+  int mb_step = mi_size_wide[bsize];
+
+  for (mi_row = 0; mi_row < cpi->frame_info.mi_rows; mi_row += mb_step) {
+    for (mi_col = 0; mi_col < cpi->frame_info.mi_cols; mi_col += mb_step) {
+      PREDICTION_MODE best_mode = DC_PRED;
+      int best_intra_cost = INT_MAX;
+
+      xd->up_available = mi_row > 0;
+      xd->left_available = mi_col > 0;
+
+      const int mi_width = mi_size_wide[bsize];
+      const int mi_height = mi_size_high[bsize];
+      set_mode_info_offsets(&cpi->common.mi_params, &cpi->mbmi_ext_info, x, xd,
+                            mi_row, mi_col);
+      set_mi_row_col(xd, &xd->tile, mi_row, mi_height, mi_col, mi_width,
+                     cm->mi_params.mi_rows, cm->mi_params.mi_cols);
+      set_plane_n4(xd, mi_size_wide[bsize], mi_size_high[bsize],
+                   av1_num_planes(cm));
+      xd->mi[0]->bsize = bsize;
+      xd->mi[0]->motion_mode = SIMPLE_TRANSLATION;
+
+      av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row,
+                           mi_col, 0, av1_num_planes(cm));
+
+      int dst_buffer_stride = xd->plane[0].dst.stride;
+      uint8_t *dst_buffer = xd->plane[0].dst.buf;
+      uint8_t *mb_buffer =
+          buffer + mi_row * MI_SIZE * buf_stride + mi_col * MI_SIZE;
+
+      for (PREDICTION_MODE mode = INTRA_MODE_START; mode < INTRA_MODE_END;
+           ++mode) {
+        av1_predict_intra_block(
+            xd, cm->seq_params->sb_size,
+            cm->seq_params->enable_intra_edge_filter, block_size, block_size,
+            tx_size, mode, 0, 0, FILTER_INTRA_MODES, dst_buffer,
+            dst_buffer_stride, dst_buffer, dst_buffer_stride, 0, 0, 0);
+
+        av1_subtract_block(bd_info, block_size, block_size, src_diff,
+                           block_size, mb_buffer, buf_stride, dst_buffer,
+                           dst_buffer_stride);
+        av1_quick_txfm(0, tx_size, bd_info, src_diff, block_size, coeff);
+        int intra_cost = aom_satd(coeff, coeff_count);
+        if (intra_cost < best_intra_cost) {
+          best_intra_cost = intra_cost;
+          best_mode = mode;
+        }
+      }
+
+      int idx;
+      av1_predict_intra_block(xd, cm->seq_params->sb_size,
+                              cm->seq_params->enable_intra_edge_filter,
+                              block_size, block_size, tx_size, best_mode, 0, 0,
+                              FILTER_INTRA_MODES, dst_buffer, dst_buffer_stride,
+                              dst_buffer, dst_buffer_stride, 0, 0, 0);
+      av1_subtract_block(bd_info, block_size, block_size, src_diff, block_size,
+                         mb_buffer, buf_stride, dst_buffer, dst_buffer_stride);
+      av1_quick_txfm(0, tx_size, bd_info, src_diff, block_size, coeff);
+
+      const struct macroblock_plane *const p = &x->plane[0];
+      uint16_t eob;
+      const SCAN_ORDER *const scan_order = &av1_scan_orders[tx_size][DCT_DCT];
+      QUANT_PARAM quant_param;
+      int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]];
+      av1_setup_quant(tx_size, 0, AV1_XFORM_QUANT_FP, 0, &quant_param);
+#if CONFIG_AV1_HIGHBITDEPTH
+      if (is_cur_buf_hbd(xd)) {
+        av1_highbd_quantize_fp_facade(coeff, pix_num, p, qcoeff, dqcoeff, &eob,
+                                      scan_order, &quant_param);
+      } else {
+        av1_quantize_fp_facade(coeff, pix_num, p, qcoeff, dqcoeff, &eob,
+                               scan_order, &quant_param);
+      }
+#else
+      av1_quantize_fp_facade(coeff, pix_num, p, qcoeff, dqcoeff, &eob,
+                             scan_order, &quant_param);
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+      av1_inverse_transform_block(xd, dqcoeff, 0, DCT_DCT, tx_size, dst_buffer,
+                                  dst_buffer_stride, eob, 0);
+      WeberStats *weber_stats =
+          &cpi->mb_weber_stats[(mi_row / mb_step) * cpi->frame_info.mi_cols +
+                               (mi_col / mb_step)];
+
+      weber_stats->rec_pix_max = 1;
+      weber_stats->rec_variance = 0;
+      weber_stats->src_pix_max = 1;
+      weber_stats->src_variance = 0;
+      weber_stats->distortion = 0;
+
+      int64_t src_mean = 0;
+      int64_t rec_mean = 0;
+      int64_t dist_mean = 0;
+
+      for (int pix_row = 0; pix_row < block_size; ++pix_row) {
+        for (int pix_col = 0; pix_col < block_size; ++pix_col) {
+          int src_pix, rec_pix;
+#if CONFIG_AV1_HIGHBITDEPTH
+          if (is_cur_buf_hbd(xd)) {
+            uint16_t *src = CONVERT_TO_SHORTPTR(mb_buffer);
+            uint16_t *rec = CONVERT_TO_SHORTPTR(dst_buffer);
+            src_pix = src[pix_row * buf_stride + pix_col];
+            rec_pix = rec[pix_row * dst_buffer_stride + pix_col];
+          } else {
+            src_pix = mb_buffer[pix_row * buf_stride + pix_col];
+            rec_pix = dst_buffer[pix_row * dst_buffer_stride + pix_col];
+          }
+#else
+          src_pix = mb_buffer[pix_row * buf_stride + pix_col];
+          rec_pix = dst_buffer[pix_row * dst_buffer_stride + pix_col];
+#endif
+          src_mean += src_pix;
+          rec_mean += rec_pix;
+          dist_mean += src_pix - rec_pix;
+          weber_stats->src_variance += src_pix * src_pix;
+          weber_stats->rec_variance += rec_pix * rec_pix;
+          weber_stats->src_pix_max = AOMMAX(weber_stats->src_pix_max, src_pix);
+          weber_stats->rec_pix_max = AOMMAX(weber_stats->rec_pix_max, rec_pix);
+          weber_stats->distortion += (src_pix - rec_pix) * (src_pix - rec_pix);
+        }
+      }
+
+      weber_stats->src_variance -= (src_mean * src_mean) / pix_num;
+      weber_stats->rec_variance -= (rec_mean * rec_mean) / pix_num;
+      weber_stats->distortion -= (dist_mean * dist_mean) / pix_num;
+      weber_stats->satd = best_intra_cost;
+
+      qcoeff[0] = 0;
+      for (idx = 1; idx < coeff_count; ++idx) qcoeff[idx] = abs(qcoeff[idx]);
+      qsort(qcoeff, coeff_count, sizeof(*coeff), qsort_comp);
+
+      weber_stats->max_scale = (double)qcoeff[coeff_count - 1];
+    }
+  }
+
+  int sb_step = mi_size_wide[cm->seq_params->sb_size];
+  double sb_wiener_log = 0;
+  double sb_count = 0;
+
+  for (mi_row = 0; mi_row < cm->mi_params.mi_rows; mi_row += sb_step) {
+    for (mi_col = 0; mi_col < cm->mi_params.mi_cols; mi_col += sb_step) {
+      int sb_wiener_var =
+          get_var_perceptual_ai(cpi, cm->seq_params->sb_size, mi_row, mi_col);
+      int64_t satd = get_satd(cpi, cm->seq_params->sb_size, mi_row, mi_col);
+      int64_t sse = get_sse(cpi, cm->seq_params->sb_size, mi_row, mi_col);
+      double scaled_satd = (double)satd / sqrt((double)sse);
+      sb_wiener_log += scaled_satd * log(sb_wiener_var);
+      sb_count += scaled_satd;
+    }
+  }
+
+  if (sb_count > 0)
+    cpi->norm_wiener_variance = (int64_t)(exp(sb_wiener_log / sb_count));
+  cpi->norm_wiener_variance = AOMMAX(1, cpi->norm_wiener_variance);
+
+  for (int its_cnt = 0; its_cnt < 2; ++its_cnt) {
+    sb_wiener_log = 0;
+    sb_count = 0;
+    for (mi_row = 0; mi_row < cm->mi_params.mi_rows; mi_row += sb_step) {
+      for (mi_col = 0; mi_col < cm->mi_params.mi_cols; mi_col += sb_step) {
+        int sb_wiener_var =
+            get_var_perceptual_ai(cpi, cm->seq_params->sb_size, mi_row, mi_col);
+
+        double beta = (double)cpi->norm_wiener_variance / sb_wiener_var;
+        double min_max_scale = AOMMAX(
+            1.0, get_max_scale(cpi, cm->seq_params->sb_size, mi_row, mi_col));
+        beta = 1.0 / AOMMIN(1.0 / beta, min_max_scale);
+        beta = AOMMIN(beta, 4);
+        beta = AOMMAX(beta, 0.25);
+
+        sb_wiener_var = (int)(cpi->norm_wiener_variance / beta);
+
+        int64_t satd = get_satd(cpi, cm->seq_params->sb_size, mi_row, mi_col);
+        int64_t sse = get_sse(cpi, cm->seq_params->sb_size, mi_row, mi_col);
+        double scaled_satd = (double)satd / sqrt((double)sse);
+        sb_wiener_log += scaled_satd * log(sb_wiener_var);
+        sb_count += scaled_satd;
+      }
+    }
+
+    if (sb_count > 0)
+      cpi->norm_wiener_variance = (int64_t)(exp(sb_wiener_log / sb_count));
+    cpi->norm_wiener_variance = AOMMAX(1, cpi->norm_wiener_variance);
+  }
+
+  aom_free_frame_buffer(&cm->cur_frame->buf);
+}
+
+int av1_get_sbq_perceptual_ai(AV1_COMP *const cpi, BLOCK_SIZE bsize, int mi_row,
+                              int mi_col) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int base_qindex = cm->quant_params.base_qindex;
+  int sb_wiener_var = get_var_perceptual_ai(cpi, bsize, mi_row, mi_col);
+  int offset = 0;
+  double beta = (double)cpi->norm_wiener_variance / sb_wiener_var;
+  double min_max_scale = AOMMAX(1.0, get_max_scale(cpi, bsize, mi_row, mi_col));
+  beta = 1.0 / AOMMIN(1.0 / beta, min_max_scale);
+
+  // Cap beta such that the delta q value is not much far away from the base q.
+  beta = AOMMIN(beta, 4);
+  beta = AOMMAX(beta, 0.25);
+  offset = av1_get_deltaq_offset(cm->seq_params->bit_depth, base_qindex, beta);
+  const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
+  offset = AOMMIN(offset, delta_q_info->delta_q_res * 20 - 1);
+  offset = AOMMAX(offset, -delta_q_info->delta_q_res * 20 + 1);
+  int qindex = cm->quant_params.base_qindex + offset;
+  qindex = AOMMIN(qindex, MAXQ);
+  qindex = AOMMAX(qindex, MINQ);
+  if (base_qindex > MINQ) qindex = AOMMAX(qindex, MINQ + 1);
+
+  return qindex;
+}
+
+void av1_init_mb_ur_var_buffer(AV1_COMP *cpi) {
+  AV1_COMMON *cm = &cpi->common;
+
+  if (cpi->mb_delta_q) return;
+
+  CHECK_MEM_ERROR(cm, cpi->mb_delta_q,
+                  aom_calloc(cpi->frame_info.mb_rows * cpi->frame_info.mb_cols,
+                             sizeof(*cpi->mb_delta_q)));
+}
+
+void av1_set_mb_ur_variance(AV1_COMP *cpi) {
+  const AV1_COMMON *cm = &cpi->common;
+  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  ThreadData *td = &cpi->td;
+  MACROBLOCK *x = &td->mb;
+  MACROBLOCKD *xd = &x->e_mbd;
+  uint8_t *y_buffer = cpi->source->y_buffer;
+  const int y_stride = cpi->source->y_stride;
+  const int block_size = cpi->common.seq_params->sb_size;
+
+  const int num_mi_w = mi_size_wide[block_size];
+  const int num_mi_h = mi_size_high[block_size];
+  const int num_cols = (mi_params->mi_cols + num_mi_w - 1) / num_mi_w;
+  const int num_rows = (mi_params->mi_rows + num_mi_h - 1) / num_mi_h;
+  const int use_hbd = cpi->source->flags & YV12_FLAG_HIGHBITDEPTH;
+
+  int *mb_delta_q[2];
+  CHECK_MEM_ERROR(cm, mb_delta_q[0],
+                  aom_calloc(num_rows * num_cols, sizeof(*mb_delta_q[0])));
+  CHECK_MEM_ERROR(cm, mb_delta_q[1],
+                  aom_calloc(num_rows * num_cols, sizeof(*mb_delta_q[1])));
+
+  // Approximates the model change between current version (Spet 2021) and the
+  // baseline (July 2021).
+  const double model_change[] = { 3.0, 3.0 };
+  // The following parameters are fitted from user labeled data.
+  const double a[] = { -24.50 * 4.0, -17.20 * 4.0 };
+  const double b[] = { 0.004898, 0.003093 };
+  const double c[] = { (29.932 + model_change[0]) * 4.0,
+                       (42.100 + model_change[1]) * 4.0 };
+  int delta_q_avg[2] = { 0, 0 };
+  // Loop through each SB block.
+  for (int row = 0; row < num_rows; ++row) {
+    for (int col = 0; col < num_cols; ++col) {
+      double var = 0.0, num_of_var = 0.0;
+      const int index = row * num_cols + col;
+
+      // Loop through each 8x8 block.
+      for (int mi_row = row * num_mi_h;
+           mi_row < mi_params->mi_rows && mi_row < (row + 1) * num_mi_h;
+           mi_row += 2) {
+        for (int mi_col = col * num_mi_w;
+             mi_col < mi_params->mi_cols && mi_col < (col + 1) * num_mi_w;
+             mi_col += 2) {
+          struct buf_2d buf;
+          const int row_offset_y = mi_row << 2;
+          const int col_offset_y = mi_col << 2;
+
+          buf.buf = y_buffer + row_offset_y * y_stride + col_offset_y;
+          buf.stride = y_stride;
+
+          unsigned int block_variance;
+          if (use_hbd) {
+            block_variance = av1_high_get_sby_perpixel_variance(
+                cpi, &buf, BLOCK_8X8, xd->bd);
+          } else {
+            block_variance =
+                av1_get_sby_perpixel_variance(cpi, &buf, BLOCK_8X8);
+          }
+
+          block_variance = AOMMAX(block_variance, 1);
+          var += log((double)block_variance);
+          num_of_var += 1.0;
+        }
+      }
+      var = exp(var / num_of_var);
+      mb_delta_q[0][index] = (int)(a[0] * exp(-b[0] * var) + c[0] + 0.5);
+      mb_delta_q[1][index] = (int)(a[1] * exp(-b[1] * var) + c[1] + 0.5);
+      delta_q_avg[0] += mb_delta_q[0][index];
+      delta_q_avg[1] += mb_delta_q[1][index];
+    }
+  }
+
+  delta_q_avg[0] = RINT((double)delta_q_avg[0] / (num_rows * num_cols));
+  delta_q_avg[1] = RINT((double)delta_q_avg[1] / (num_rows * num_cols));
+
+  int model_idx;
+  double scaling_factor;
+  const int cq_level = cpi->oxcf.rc_cfg.cq_level;
+  if (cq_level < delta_q_avg[0]) {
+    model_idx = 0;
+    scaling_factor = 1.0;
+  } else if (cq_level < delta_q_avg[1]) {
+    model_idx = 2;
+    scaling_factor =
+        (double)(cq_level - delta_q_avg[0]) / (delta_q_avg[1] - delta_q_avg[0]);
+  } else {
+    model_idx = 1;
+    scaling_factor = (double)(MAXQ - cq_level) / (MAXQ - delta_q_avg[1]);
+  }
+
+  const double new_delta_q_avg =
+      delta_q_avg[0] + scaling_factor * (delta_q_avg[1] - delta_q_avg[0]);
+  for (int row = 0; row < num_rows; ++row) {
+    for (int col = 0; col < num_cols; ++col) {
+      const int index = row * num_cols + col;
+      if (model_idx == 2) {
+        const double delta_q =
+            mb_delta_q[0][index] +
+            scaling_factor * (mb_delta_q[1][index] - mb_delta_q[0][index]);
+        cpi->mb_delta_q[index] = RINT(delta_q - new_delta_q_avg);
+      } else {
+        cpi->mb_delta_q[index] =
+            RINT(scaling_factor *
+                 (mb_delta_q[model_idx][index] - delta_q_avg[model_idx]));
+      }
+    }
+  }
+
+  aom_free(mb_delta_q[0]);
+  aom_free(mb_delta_q[1]);
+}
+
+int av1_get_sbq_user_rating_based(AV1_COMP *const cpi, int mi_row, int mi_col) {
+  const BLOCK_SIZE bsize = cpi->common.seq_params->sb_size;
+  const CommonModeInfoParams *const mi_params = &cpi->common.mi_params;
+  AV1_COMMON *const cm = &cpi->common;
+  const int base_qindex = cm->quant_params.base_qindex;
+  if (base_qindex == MINQ || base_qindex == MAXQ) return base_qindex;
+
+  const int num_mi_w = mi_size_wide[bsize];
+  const int num_mi_h = mi_size_high[bsize];
+  const int num_cols = (mi_params->mi_cols + num_mi_w - 1) / num_mi_w;
+  const int index = (mi_row / num_mi_h) * num_cols + (mi_col / num_mi_w);
+  const int delta_q = cpi->mb_delta_q[index];
+
+  int qindex = base_qindex + delta_q;
+  qindex = AOMMIN(qindex, MAXQ);
+  qindex = AOMMAX(qindex, MINQ + 1);
+
+  return qindex;
+}
diff --git a/av1/encoder/allintra_vis.h b/av1/encoder/allintra_vis.h
new file mode 100644
index 0000000..6f60cdb
--- /dev/null
+++ b/av1/encoder/allintra_vis.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ALLINTRA_VIS_H_
+#define AOM_AV1_ENCODER_ALLINTRA_VIS_H_
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "av1/common/enums.h"
+#include "av1/common/reconintra.h"
+
+#include "av1/encoder/block.h"
+#include "av1/encoder/encoder.h"
+
+void av1_init_mb_wiener_var_buffer(AV1_COMP *cpi);
+
+void av1_set_mb_wiener_variance(AV1_COMP *cpi);
+
+int av1_get_sbq_perceptual_ai(AV1_COMP *const cpi, BLOCK_SIZE bsize, int mi_row,
+                              int mi_col);
+
+// User rating based mode
+void av1_init_mb_ur_var_buffer(AV1_COMP *cpi);
+
+void av1_set_mb_ur_variance(AV1_COMP *cpi);
+
+int av1_get_sbq_user_rating_based(AV1_COMP *const cpi, int mi_row, int mi_col);
+
+#endif  // AOM_AV1_ENCODER_ALLINTRA_VIS_H_
diff --git a/av1/encoder/aq_complexity.c b/av1/encoder/aq_complexity.c
index 3ea5f63..37bc309 100644
--- a/av1/encoder/aq_complexity.c
+++ b/av1/encoder/aq_complexity.c
@@ -18,7 +18,6 @@
 #include "av1/common/seg_common.h"
 #include "av1/encoder/segmentation.h"
 #include "aom_dsp/aom_dsp_common.h"
-#include "aom_ports/system_state.h"
 
 #define AQ_C_SEGMENTS 5
 #define DEFAULT_AQ2_SEG 3  // Neutral Q segment
@@ -47,11 +46,11 @@
 
 static bool is_frame_aq_enabled(const AV1_COMP *const cpi) {
   const AV1_COMMON *const cm = &cpi->common;
-  const RefreshFrameFlagsInfo *const refresh_frame_flags = &cpi->refresh_frame;
+  const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
 
   return frame_is_intra_only(cm) || cm->features.error_resilient_mode ||
-         refresh_frame_flags->alt_ref_frame ||
-         (refresh_frame_flags->golden_frame && !cpi->rc.is_src_frame_alt_ref);
+         refresh_frame->alt_ref_frame ||
+         (refresh_frame->golden_frame && !cpi->rc.is_src_frame_alt_ref);
 }
 
 // Segmentation only makes sense if the target bits per SB is above a threshold.
@@ -69,7 +68,6 @@
                          cm->height != cm->prev_frame->height);
 
   // Make SURE use of floating point in this function is safe.
-  aom_clear_system_state();
 
   if (resolution_change) {
     memset(cpi->enc_seg.map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols);
@@ -81,7 +79,7 @@
   if (is_frame_aq_enabled(cpi)) {
     int segment;
     const int aq_strength =
-        get_aq_c_strength(base_qindex, cm->seq_params.bit_depth);
+        get_aq_c_strength(base_qindex, cm->seq_params->bit_depth);
 
     // Clear down the segment map.
     memset(cpi->enc_seg.map, DEFAULT_AQ2_SEG,
@@ -108,7 +106,7 @@
       qindex_delta = av1_compute_qdelta_by_rate(
           &cpi->rc, cm->current_frame.frame_type, base_qindex,
           aq_c_q_adj_factor[aq_strength][segment], cpi->is_screen_content_type,
-          cm->seq_params.bit_depth);
+          cm->seq_params->bit_depth);
 
       // For AQ complexity mode, we dont allow Q0 in a segment if the base
       // Q is not 0. Q0 (lossless) implies 4x4 only and in AQ mode 2 a segment
@@ -150,18 +148,17 @@
     // It is converted to bits << AV1_PROB_COST_SHIFT units.
     const int64_t num = (int64_t)(cpi->rc.sb64_target_rate * xmis * ymis)
                         << AV1_PROB_COST_SHIFT;
-    const int denom = cm->seq_params.mib_size * cm->seq_params.mib_size;
+    const int denom = cm->seq_params->mib_size * cm->seq_params->mib_size;
     const int target_rate = (int)(num / denom);
     double logvar;
     double low_var_thresh;
     const int aq_strength = get_aq_c_strength(cm->quant_params.base_qindex,
-                                              cm->seq_params.bit_depth);
+                                              cm->seq_params->bit_depth);
 
-    aom_clear_system_state();
-    low_var_thresh =
-        (is_stat_consumption_stage_twopass(cpi))
-            ? AOMMAX(exp(cpi->twopass.mb_av_energy), MIN_DEFAULT_LV_THRESH)
-            : DEFAULT_LV_THRESH;
+    low_var_thresh = (is_stat_consumption_stage_twopass(cpi))
+                         ? AOMMAX(exp(cpi->twopass_frame.mb_av_energy),
+                                  MIN_DEFAULT_LV_THRESH)
+                         : DEFAULT_LV_THRESH;
 
     av1_setup_src_planes(mb, cpi->source, mi_row, mi_col, num_planes, bs);
     logvar = av1_log_block_var(cpi, mb, bs);
diff --git a/av1/encoder/aq_cyclicrefresh.c b/av1/encoder/aq_cyclicrefresh.c
index c7abe43..452a66f 100644
--- a/av1/encoder/aq_cyclicrefresh.c
+++ b/av1/encoder/aq_cyclicrefresh.c
@@ -12,16 +12,15 @@
 #include <limits.h>
 #include <math.h>
 
+#include "av1/common/pred_common.h"
 #include "av1/common/seg_common.h"
 #include "av1/encoder/aq_cyclicrefresh.h"
 #include "av1/encoder/ratectrl.h"
 #include "av1/encoder/segmentation.h"
 #include "av1/encoder/tokenize.h"
 #include "aom_dsp/aom_dsp_common.h"
-#include "aom_ports/system_state.h"
 
 CYCLIC_REFRESH *av1_cyclic_refresh_alloc(int mi_rows, int mi_cols) {
-  size_t last_coded_q_map_size;
   CYCLIC_REFRESH *const cr = aom_calloc(1, sizeof(*cr));
   if (cr == NULL) return NULL;
 
@@ -30,21 +29,12 @@
     av1_cyclic_refresh_free(cr);
     return NULL;
   }
-  last_coded_q_map_size = mi_rows * mi_cols * sizeof(*cr->last_coded_q_map);
-  cr->last_coded_q_map = aom_malloc(last_coded_q_map_size);
-  if (cr->last_coded_q_map == NULL) {
-    av1_cyclic_refresh_free(cr);
-    return NULL;
-  }
-  assert(MAXQ <= 255);
-  memset(cr->last_coded_q_map, MAXQ, last_coded_q_map_size);
   return cr;
 }
 
 void av1_cyclic_refresh_free(CYCLIC_REFRESH *cr) {
   if (cr != NULL) {
     aom_free(cr->map);
-    aom_free(cr->last_coded_q_map);
     aom_free(cr);
   }
 }
@@ -57,19 +47,21 @@
                                 const MB_MODE_INFO *mbmi, int64_t rate,
                                 int64_t dist, int bsize) {
   MV mv = mbmi->mv[0].as_mv;
-  // Reject the block for lower-qp coding if projected distortion
-  // is above the threshold, and any of the following is true:
+  int is_compound = has_second_ref(mbmi);
+  // Reject the block for lower-qp coding for non-compound mode if
+  // projected distortion is above the threshold, and any of the following
+  // is true:
   // 1) mode uses large mv
   // 2) mode is an intra-mode
   // Otherwise accept for refresh.
-  if (dist > cr->thresh_dist_sb &&
+  if (!is_compound && dist > cr->thresh_dist_sb &&
       (mv.row > cr->motion_thresh || mv.row < -cr->motion_thresh ||
        mv.col > cr->motion_thresh || mv.col < -cr->motion_thresh ||
        !is_inter_block(mbmi)))
     return CR_SEGMENT_ID_BASE;
-  else if (bsize >= BLOCK_16X16 && rate < cr->thresh_rate_sb &&
-           is_inter_block(mbmi) && mbmi->mv[0].as_int == 0 &&
-           cr->rate_boost_fac > 10)
+  else if (is_compound || (bsize >= BLOCK_16X16 && rate < cr->thresh_rate_sb &&
+                           is_inter_block(mbmi) && mbmi->mv[0].as_int == 0 &&
+                           cr->rate_boost_fac > 10))
     // More aggressive delta-q for bigger blocks with zero motion.
     return CR_SEGMENT_ID_BOOST2;
   else
@@ -82,7 +74,7 @@
   const RATE_CONTROL *const rc = &cpi->rc;
   int deltaq = av1_compute_qdelta_by_rate(
       rc, cpi->common.current_frame.frame_type, q, rate_factor,
-      cpi->is_screen_content_type, cpi->common.seq_params.bit_depth);
+      cpi->is_screen_content_type, cpi->common.seq_params->bit_depth);
   if ((-deltaq) > cr->max_qdelta_perc * q / 100) {
     deltaq = -cr->max_qdelta_perc * q / 100;
   }
@@ -94,7 +86,7 @@
   const AV1_COMMON *const cm = &cpi->common;
   const FRAME_TYPE frame_type = cm->current_frame.frame_type;
   const int base_qindex = cm->quant_params.base_qindex;
-  const int bit_depth = cm->seq_params.bit_depth;
+  const int bit_depth = cm->seq_params->bit_depth;
   const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
   const int mbs = cm->mi_params.MBs;
   const int num4x4bl = mbs << 4;
@@ -138,15 +130,52 @@
   bits_per_mb =
       (int)((1.0 - weight_segment) *
                 av1_rc_bits_per_mb(cm->current_frame.frame_type, i,
-                                   correction_factor, cm->seq_params.bit_depth,
+                                   correction_factor, cm->seq_params->bit_depth,
                                    cpi->is_screen_content_type) +
             weight_segment * av1_rc_bits_per_mb(cm->current_frame.frame_type,
                                                 i + deltaq, correction_factor,
-                                                cm->seq_params.bit_depth,
+                                                cm->seq_params->bit_depth,
                                                 cpi->is_screen_content_type));
   return bits_per_mb;
 }
 
+void av1_cyclic_reset_segment_skip(const AV1_COMP *cpi, MACROBLOCK *const x,
+                                   int mi_row, int mi_col, BLOCK_SIZE bsize) {
+  int cdf_num;
+  const AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  int sh = cpi->cyclic_refresh->skip_over4x4 ? 2 : 1;
+  const int prev_segment_id = mbmi->segment_id;
+  mbmi->segment_id = av1_get_spatial_seg_pred(cm, xd, &cdf_num);
+  if (prev_segment_id != mbmi->segment_id) {
+    CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+    const int bw = mi_size_wide[bsize];
+    const int bh = mi_size_high[bsize];
+    const int xmis = AOMMIN(cm->mi_params.mi_cols - mi_col, bw);
+    const int ymis = AOMMIN(cm->mi_params.mi_rows - mi_row, bh);
+    const int block_index = mi_row * cm->mi_params.mi_cols + mi_col;
+    for (int mi_y = 0; mi_y < ymis; mi_y += sh) {
+      for (int mi_x = 0; mi_x < xmis; mi_x += sh) {
+        const int map_offset =
+            block_index + mi_y * cm->mi_params.mi_cols + mi_x;
+        cr->map[map_offset] = 0;
+        cpi->enc_seg.map[map_offset] = mbmi->segment_id;
+        cm->cur_frame->seg_map[map_offset] = mbmi->segment_id;
+      }
+    }
+    if (cyclic_refresh_segment_id(prev_segment_id) == CR_SEGMENT_ID_BOOST1)
+      x->actual_num_seg1_blocks -= xmis * ymis;
+    else if (cyclic_refresh_segment_id(prev_segment_id) == CR_SEGMENT_ID_BOOST2)
+      x->actual_num_seg2_blocks -= xmis * ymis;
+    if (cyclic_refresh_segment_id(mbmi->segment_id) == CR_SEGMENT_ID_BOOST1)
+      x->actual_num_seg1_blocks += xmis * ymis;
+    else if (cyclic_refresh_segment_id(mbmi->segment_id) ==
+             CR_SEGMENT_ID_BOOST2)
+      x->actual_num_seg2_blocks += xmis * ymis;
+  }
+}
+
 void av1_cyclic_refresh_update_segment(const AV1_COMP *cpi, MACROBLOCK *const x,
                                        int mi_row, int mi_col, BLOCK_SIZE bsize,
                                        int64_t rate, int64_t dist, int skip,
@@ -162,6 +191,7 @@
   const int block_index = mi_row * cm->mi_params.mi_cols + mi_col;
   const int refresh_this_block =
       candidate_refresh_aq(cr, mbmi, rate, dist, bsize);
+  int sh = cpi->cyclic_refresh->skip_over4x4 ? 2 : 1;
   // Default is to not update the refresh map.
   int new_map_value = cr->map[block_index];
 
@@ -191,22 +221,21 @@
 
   // Update entries in the cyclic refresh map with new_map_value, and
   // copy mbmi->segment_id into global segmentation map.
-  // 8x8 is smallest coding block size for non-key frames.
-  const int sh = bw << 1;
-  for (int mi_y = 0; mi_y < ymis; mi_y += 2) {
-    for (int mi_x = 0; mi_x < xmis; mi_x += 2) {
-      int map_offset = block_index + mi_y * cm->mi_params.mi_cols + mi_x;
+  for (int mi_y = 0; mi_y < ymis; mi_y += sh) {
+    for (int mi_x = 0; mi_x < xmis; mi_x += sh) {
+      const int map_offset = block_index + mi_y * cm->mi_params.mi_cols + mi_x;
       cr->map[map_offset] = new_map_value;
       cpi->enc_seg.map[map_offset] = mbmi->segment_id;
+      cm->cur_frame->seg_map[map_offset] = mbmi->segment_id;
     }
-    // Accumulate cyclic refresh update counters.
-    if (!dry_run && !frame_is_intra_only(cm)) {
-      if (cyclic_refresh_segment_id(mbmi->segment_id) == CR_SEGMENT_ID_BOOST1)
-        x->actual_num_seg1_blocks += sh;
-      else if (cyclic_refresh_segment_id(mbmi->segment_id) ==
-               CR_SEGMENT_ID_BOOST2)
-        x->actual_num_seg2_blocks += sh;
-    }
+  }
+  // Accumulate cyclic refresh update counters.
+  if (!dry_run) {
+    if (cyclic_refresh_segment_id(mbmi->segment_id) == CR_SEGMENT_ID_BOOST1)
+      x->actual_num_seg1_blocks += xmis * ymis;
+    else if (cyclic_refresh_segment_id(mbmi->segment_id) ==
+             CR_SEGMENT_ID_BOOST2)
+      x->actual_num_seg2_blocks += xmis * ymis;
   }
 }
 
@@ -234,15 +263,15 @@
   const int avg_cnt_zeromv =
       100 * cr->cnt_zeromv / (mi_params->mi_rows * mi_params->mi_cols);
 
-  if (!cpi->use_svc ||
-      (cpi->use_svc &&
+  if (!cpi->ppi->use_svc ||
+      (cpi->ppi->use_svc &&
        !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame &&
        cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)) {
     rc->avg_frame_low_motion =
         (3 * rc->avg_frame_low_motion + avg_cnt_zeromv) / 4;
     // For SVC: set avg_frame_low_motion (only computed on top spatial layer)
     // to all lower spatial layers.
-    if (cpi->use_svc &&
+    if (cpi->ppi->use_svc &&
         svc->spatial_layer_id == svc->number_spatial_layers - 1) {
       for (int i = 0; i < svc->number_spatial_layers - 1; ++i) {
         const int layer = LAYER_IDS_TO_IDX(i, svc->temporal_layer_id,
@@ -257,15 +286,16 @@
 
 void av1_cyclic_refresh_set_golden_update(AV1_COMP *const cpi) {
   RATE_CONTROL *const rc = &cpi->rc;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
   CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
   // Set minimum gf_interval for GF update to a multiple of the refresh period,
   // with some max limit. Depending on past encoding stats, GF flag may be
   // reset and update may not occur until next baseline_gf_interval.
   if (cr->percent_refresh > 0)
-    rc->baseline_gf_interval = AOMMIN(2 * (100 / cr->percent_refresh), 40);
+    p_rc->baseline_gf_interval = AOMMIN(2 * (100 / cr->percent_refresh), 40);
   else
-    rc->baseline_gf_interval = 20;
-  if (rc->avg_frame_low_motion < 40) rc->baseline_gf_interval = 8;
+    p_rc->baseline_gf_interval = 20;
+  if (rc->avg_frame_low_motion < 40) p_rc->baseline_gf_interval = 8;
 }
 
 // Update the segmentation map, and related quantities: cyclic refresh map,
@@ -282,10 +312,10 @@
   int i, block_count, bl_index, sb_rows, sb_cols, sbs_in_frame;
   int xmis, ymis, x, y;
   memset(seg_map, CR_SEGMENT_ID_BASE, mi_params->mi_rows * mi_params->mi_cols);
-  sb_cols = (mi_params->mi_cols + cm->seq_params.mib_size - 1) /
-            cm->seq_params.mib_size;
-  sb_rows = (mi_params->mi_rows + cm->seq_params.mib_size - 1) /
-            cm->seq_params.mib_size;
+  sb_cols = (mi_params->mi_cols + cm->seq_params->mib_size - 1) /
+            cm->seq_params->mib_size;
+  sb_rows = (mi_params->mi_rows + cm->seq_params->mib_size - 1) /
+            cm->seq_params->mib_size;
   sbs_in_frame = sb_cols * sb_rows;
   // Number of target blocks to get the q delta (segment 1).
   block_count =
@@ -302,21 +332,14 @@
     // Get the mi_row/mi_col corresponding to superblock index i.
     int sb_row_index = (i / sb_cols);
     int sb_col_index = i - sb_row_index * sb_cols;
-    int mi_row = sb_row_index * cm->seq_params.mib_size;
-    int mi_col = sb_col_index * cm->seq_params.mib_size;
-    // TODO(any): Ensure the population of
-    // cpi->common.features.allow_screen_content_tools and use the same instead
-    // of cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN
-    int qindex_thresh = cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN
-                            ? av1_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST2,
-                                             cm->quant_params.base_qindex)
-                            : 0;
+    int mi_row = sb_row_index * cm->seq_params->mib_size;
+    int mi_col = sb_col_index * cm->seq_params->mib_size;
     assert(mi_row >= 0 && mi_row < mi_params->mi_rows);
     assert(mi_col >= 0 && mi_col < mi_params->mi_cols);
     bl_index = mi_row * mi_params->mi_cols + mi_col;
     // Loop through all MI blocks in superblock and update map.
-    xmis = AOMMIN(mi_params->mi_cols - mi_col, cm->seq_params.mib_size);
-    ymis = AOMMIN(mi_params->mi_rows - mi_row, cm->seq_params.mib_size);
+    xmis = AOMMIN(mi_params->mi_cols - mi_col, cm->seq_params->mib_size);
+    ymis = AOMMIN(mi_params->mi_rows - mi_row, cm->seq_params->mib_size);
     // cr_map only needed at 8x8 blocks.
     for (y = 0; y < ymis; y += 2) {
       for (x = 0; x < xmis; x += 2) {
@@ -325,7 +348,7 @@
         // for possible boost/refresh (segment 1). The segment id may get
         // reset to 0 later if block gets coded anything other than GLOBALMV.
         if (cr->map[bl_index2] == 0) {
-          if (cr->last_coded_q_map[bl_index2] > qindex_thresh) sum_map += 4;
+          sum_map += 4;
         } else if (cr->map[bl_index2] < 0) {
           cr->map[bl_index2]++;
         }
@@ -352,6 +375,7 @@
 void av1_cyclic_refresh_update_parameters(AV1_COMP *const cpi) {
   // TODO(marpan): Parameters need to be tuned.
   const RATE_CONTROL *const rc = &cpi->rc;
+  const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
   const AV1_COMMON *const cm = &cpi->common;
   CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
   int num4x4bl = cm->mi_params.MBs << 4;
@@ -360,17 +384,29 @@
   double weight_segment = 0;
   int qp_thresh = AOMMIN(20, rc->best_quality << 1);
   int qp_max_thresh = 118 * MAXQ >> 7;
+  // Although this segment feature for RTC is only used for
+  // blocks >= 8X8, for more efficient coding of the seg map
+  // cur_frame->seg_map needs to set at 4x4 along with the
+  // function av1_cyclic_reset_segment_skip(). Skipping over
+  // 4x4 will therefore have small bdrate loss (~0.2%), so
+  // we use it only for speed > 9 for now.
+  // Also if loop-filter deltas is applied via segment, then
+  // we need to set cr->skip_over4x4 = 1.
+  cr->skip_over4x4 = (cpi->oxcf.speed > 9) ? 1 : 0;
   cr->apply_cyclic_refresh = 1;
   if (frame_is_intra_only(cm) || is_lossless_requested(&cpi->oxcf.rc_cfg) ||
       cpi->svc.temporal_layer_id > 0 ||
-      rc->avg_frame_qindex[INTER_FRAME] < qp_thresh ||
+      p_rc->avg_frame_qindex[INTER_FRAME] < qp_thresh ||
+      (cpi->svc.number_spatial_layers > 1 &&
+       cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame) ||
       (rc->frames_since_key > 20 &&
-       rc->avg_frame_qindex[INTER_FRAME] > qp_max_thresh) ||
+       p_rc->avg_frame_qindex[INTER_FRAME] > qp_max_thresh) ||
       (rc->avg_frame_low_motion < 45 && rc->frames_since_key > 40)) {
     cr->apply_cyclic_refresh = 0;
     return;
   }
   cr->percent_refresh = 10;
+  if (cpi->svc.number_temporal_layers > 2) cr->percent_refresh = 15;
   cr->max_qdelta_perc = 60;
   cr->time_for_refresh = 0;
   cr->motion_thresh = 32;
@@ -379,7 +415,8 @@
   // periods of the refresh cycle, after a key frame.
   // Account for larger interval on base layer for temporal layers.
   if (cr->percent_refresh > 0 &&
-      rc->frames_since_key < 400 / cr->percent_refresh) {
+      rc->frames_since_key <
+          (4 * cpi->svc.number_temporal_layers) * (100 / cr->percent_refresh)) {
     cr->rate_ratio_qdelta = 3.0;
   } else {
     cr->rate_ratio_qdelta = 2.0;
@@ -438,16 +475,12 @@
     memset(seg_map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols);
     av1_disable_segmentation(&cm->seg);
     if (cm->current_frame.frame_type == KEY_FRAME) {
-      memset(cr->last_coded_q_map, MAXQ,
-             cm->mi_params.mi_rows * cm->mi_params.mi_cols *
-                 sizeof(*cr->last_coded_q_map));
       cr->sb_index = 0;
     }
     return;
   } else {
     const double q = av1_convert_qindex_to_q(cm->quant_params.base_qindex,
-                                             cm->seq_params.bit_depth);
-    aom_clear_system_state();
+                                             cm->seq_params->bit_depth);
     // Set rate threshold to some multiple (set to 2 for now) of the target
     // rate (target is given by sb64_target_rate and scaled by 256).
     cr->thresh_rate_sb = ((int64_t)(rc->sb64_target_rate) << 8) << 2;
diff --git a/av1/encoder/aq_cyclicrefresh.h b/av1/encoder/aq_cyclicrefresh.h
index 97bd6f2..4e4e1f2 100644
--- a/av1/encoder/aq_cyclicrefresh.h
+++ b/av1/encoder/aq_cyclicrefresh.h
@@ -80,10 +80,6 @@
    */
   int8_t *map;
   /*!
-   * Map of the last q a block was coded at.
-   */
-  uint8_t *last_coded_q_map;
-  /*!
    * Threshold applied to the projected rate of the coding block,
    * when deciding whether block should be refreshed.
    */
@@ -111,6 +107,7 @@
   int qindex_delta[3];
   double weight_segment;
   int apply_cyclic_refresh;
+  int skip_over4x4;
   /*!\endcond */
 };
 
@@ -161,6 +158,30 @@
 int av1_cyclic_refresh_rc_bits_per_mb(const struct AV1_COMP *cpi, int i,
                                       double correction_factor);
 
+/*!\brief Update segment_id for blocks that are skipped.
+ *
+ * After encoding a given prediction block, of size bsize at (mi_row, mi_col),
+ * check if we should reset the segment_id based on skip_txfm,
+ * and update the cyclic_refresh map and segmentation counters.
+ *
+ * \ingroup cyclic_refresh
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]   cpi       Top level encoder structure
+ * \param[in]   x         Pointer to MACROBLOCK structure
+ * \param[in]   mi_row    Row coordinate of the block in a step size of MI_SIZE
+ * \param[in]   mi_col    Col coordinate of the block in a step size of MI_SIZE
+ * \param[in]   bsize     Block size
+ *
+ * \return Update the \c mbmi->segment_id, the \c cpi->cyclic_refresh and
+ * the \c cpi->enc_seg.map.
+ */
+
+void av1_cyclic_reset_segment_skip(const struct AV1_COMP *cpi,
+                                   MACROBLOCK *const x, int mi_row, int mi_col,
+                                   BLOCK_SIZE bsize);
+
 /*!\brief Update segment_id for block based on mode selected.
  *
  * Prior to coding a given prediction block, of size bsize at (mi_row, mi_col),
diff --git a/av1/encoder/aq_variance.c b/av1/encoder/aq_variance.c
index 92d7ad1..05dd33a 100644
--- a/av1/encoder/aq_variance.c
+++ b/av1/encoder/aq_variance.c
@@ -20,7 +20,6 @@
 #include "av1/encoder/rd.h"
 #include "av1/encoder/segmentation.h"
 #include "av1/encoder/dwt.h"
-#include "aom_ports/system_state.h"
 
 static const double rate_ratio[MAX_SEGMENTS] = { 2.2, 1.7, 1.3, 1.0,
                                                  0.9, .8,  .7,  .6 };
@@ -44,7 +43,7 @@
 
 void av1_vaq_frame_setup(AV1_COMP *cpi) {
   AV1_COMMON *cm = &cpi->common;
-  const RefreshFrameFlagsInfo *const refresh_frame_flags = &cpi->refresh_frame;
+  const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
   const int base_qindex = cm->quant_params.base_qindex;
   struct segmentation *seg = &cm->seg;
   int i;
@@ -52,7 +51,7 @@
   int resolution_change =
       cm->prev_frame && (cm->width != cm->prev_frame->width ||
                          cm->height != cm->prev_frame->height);
-  int avg_energy = (int)(cpi->twopass.mb_av_energy - 2);
+  int avg_energy = (int)(cpi->twopass_frame.mb_av_energy - 2);
   double avg_ratio;
   if (avg_energy > 7) avg_energy = 7;
   if (avg_energy < 0) avg_energy = 0;
@@ -61,27 +60,24 @@
   if (resolution_change) {
     memset(cpi->enc_seg.map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols);
     av1_clearall_segfeatures(seg);
-    aom_clear_system_state();
     av1_disable_segmentation(seg);
     return;
   }
   if (frame_is_intra_only(cm) || cm->features.error_resilient_mode ||
-      refresh_frame_flags->alt_ref_frame ||
-      (refresh_frame_flags->golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
+      refresh_frame->alt_ref_frame ||
+      (refresh_frame->golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
     cpi->vaq_refresh = 1;
 
     av1_enable_segmentation(seg);
     av1_clearall_segfeatures(seg);
 
-    aom_clear_system_state();
-
     for (i = 0; i < MAX_SEGMENTS; ++i) {
       // Set up avg segment id to be 1.0 and adjust the other segments around
       // it.
       int qindex_delta = av1_compute_qdelta_by_rate(
           &cpi->rc, cm->current_frame.frame_type, base_qindex,
           rate_ratio[i] / avg_ratio, cpi->is_screen_content_type,
-          cm->seq_params.bit_depth);
+          cm->seq_params->bit_depth);
 
       // We don't allow qindex 0 in a segment if the base value is not 0.
       // Q index 0 (lossless) implies 4x4 encoding only and in AQ mode a segment
@@ -120,20 +116,18 @@
   const int bw = MI_SIZE * mi_size_wide[bs] - right_overflow;
   const int bh = MI_SIZE * mi_size_high[bs] - bottom_overflow;
 
-  aom_clear_system_state();
-
   for (i = 0; i < bh; i += 4) {
     for (j = 0; j < bw; j += 4) {
       if (is_cur_buf_hbd(xd)) {
         var +=
-            log(1.0 + cpi->fn_ptr[BLOCK_4X4].vf(
+            log(1.0 + cpi->ppi->fn_ptr[BLOCK_4X4].vf(
                           x->plane[0].src.buf + i * x->plane[0].src.stride + j,
                           x->plane[0].src.stride,
                           CONVERT_TO_BYTEPTR(av1_highbd_all_zeros), 0, &sse) /
                           16);
       } else {
         var +=
-            log(1.0 + cpi->fn_ptr[BLOCK_4X4].vf(
+            log(1.0 + cpi->ppi->fn_ptr[BLOCK_4X4].vf(
                           x->plane[0].src.buf + i * x->plane[0].src.stride + j,
                           x->plane[0].src.stride, av1_all_zeros, 0, &sse) /
                           16);
@@ -144,41 +138,63 @@
   var /= (bw / 4 * bh / 4);
   if (var > 7) var = 7;
 
-  aom_clear_system_state();
   return (int)(var);
 }
 
+int av1_log_block_avg(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs,
+                      int mi_row, int mi_col) {
+  // This function returns the average of the luma block.
+  unsigned int sum, avg, num_pix;
+  int r, c;
+  const int pic_w = cpi->common.width;
+  const int pic_h = cpi->common.height;
+  const int bw = MI_SIZE * mi_size_wide[bs];
+  const int bh = MI_SIZE * mi_size_high[bs];
+  const uint16_t *x16 = CONVERT_TO_SHORTPTR(x->plane[0].src.buf);
+
+  sum = 0;
+  num_pix = 0;
+  avg = 0;
+  int row = mi_row << MI_SIZE_LOG2;
+  int col = mi_col << MI_SIZE_LOG2;
+  for (r = row; (r < (row + bh)) && (r < pic_h); r++) {
+    for (c = col; (c < (col + bw)) && (c < pic_w); c++) {
+      sum += *(x16 + r * x->plane[0].src.stride + c);
+      num_pix++;
+    }
+  }
+  if (num_pix != 0) {
+    avg = sum / num_pix;
+  }
+  return avg;
+}
+
 #define DEFAULT_E_MIDPOINT 10.0
 
 static unsigned int haar_ac_energy(MACROBLOCK *x, BLOCK_SIZE bs) {
   MACROBLOCKD *xd = &x->e_mbd;
   int stride = x->plane[0].src.stride;
   uint8_t *buf = x->plane[0].src.buf;
-  const int bw = MI_SIZE * mi_size_wide[bs];
-  const int bh = MI_SIZE * mi_size_high[bs];
+  const int num_8x8_cols = block_size_wide[bs] / 8;
+  const int num_8x8_rows = block_size_high[bs] / 8;
   const int hbd = is_cur_buf_hbd(xd);
 
-  int var = 0;
-  for (int r = 0; r < bh; r += 8)
-    for (int c = 0; c < bw; c += 8) {
-      var += av1_haar_ac_sad_8x8_uint8_input(buf + c + r * stride, stride, hbd);
-    }
+  int64_t var = av1_haar_ac_sad_mxn_uint8_input(buf, stride, hbd, num_8x8_rows,
+                                                num_8x8_cols);
 
   return (unsigned int)((uint64_t)var * 256) >> num_pels_log2_lookup[bs];
 }
 
 double av1_log_block_wavelet_energy(MACROBLOCK *x, BLOCK_SIZE bs) {
   unsigned int haar_sad = haar_ac_energy(x, bs);
-  aom_clear_system_state();
   return log(haar_sad + 1.0);
 }
 
 int av1_block_wavelet_energy_level(const AV1_COMP *cpi, MACROBLOCK *x,
                                    BLOCK_SIZE bs) {
   double energy, energy_midpoint;
-  aom_clear_system_state();
   energy_midpoint = (is_stat_consumption_stage_twopass(cpi))
-                        ? cpi->twopass.frame_avg_haar_energy
+                        ? cpi->twopass_frame.frame_avg_haar_energy
                         : DEFAULT_E_MIDPOINT;
   energy = av1_log_block_wavelet_energy(x, bs) - energy_midpoint;
   return clamp((int)round(energy), ENERGY_MIN, ENERGY_MAX);
@@ -199,7 +215,7 @@
   int qindex_delta = av1_compute_qdelta_by_rate(
       &cpi->rc, cm->current_frame.frame_type, base_qindex,
       deltaq_rate_ratio[rate_level], cpi->is_screen_content_type,
-      cm->seq_params.bit_depth);
+      cm->seq_params->bit_depth);
 
   if ((base_qindex != 0) && ((base_qindex + qindex_delta) == 0)) {
     qindex_delta = -base_qindex + 1;
diff --git a/av1/encoder/aq_variance.h b/av1/encoder/aq_variance.h
index 543eb0b..aa0535a 100644
--- a/av1/encoder/aq_variance.h
+++ b/av1/encoder/aq_variance.h
@@ -21,6 +21,8 @@
 void av1_vaq_frame_setup(AV1_COMP *cpi);
 
 int av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs);
+int av1_log_block_avg(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs,
+                      int mi_row, int mi_col);
 int av1_compute_q_from_energy_level_deltaq_mode(const AV1_COMP *const cpi,
                                                 int block_var_level);
 int av1_block_wavelet_energy_level(const AV1_COMP *cpi, MACROBLOCK *x,
diff --git a/av1/encoder/arm/neon/av1_error_neon.c b/av1/encoder/arm/neon/av1_error_neon.c
index 22da1a8..124c1fd 100644
--- a/av1/encoder/arm/neon/av1_error_neon.c
+++ b/av1/encoder/arm/neon/av1_error_neon.c
@@ -11,8 +11,8 @@
 #include <arm_neon.h>
 #include <assert.h>
 
-#include "av1/common/arm/mem_neon.h"
 #include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/arm/mem_neon.h"
 
 int64_t av1_block_error_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff,
                              intptr_t block_size, int64_t *ssz) {
diff --git a/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c b/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c
index b9a314d..a90cfa8 100644
--- a/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c
+++ b/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c
@@ -13,10 +13,10 @@
 #include <assert.h>
 
 #include "aom_dsp/txfm_common.h"
+#include "aom_dsp/arm/mem_neon.h"
 #include "aom_ports/mem.h"
 #include "av1/common/av1_txfm.h"
 #include "av1/encoder/av1_fwd_txfm1d_cfg.h"
-#include "av1/common/arm/mem_neon.h"
 #include "config/aom_config.h"
 #include "config/av1_rtcd.h"
 
diff --git a/av1/encoder/arm/neon/av1_temporal_denoiser_neon.c b/av1/encoder/arm/neon/av1_temporal_denoiser_neon.c
index f5d1f16..ad81f40 100644
--- a/av1/encoder/arm/neon/av1_temporal_denoiser_neon.c
+++ b/av1/encoder/arm/neon/av1_temporal_denoiser_neon.c
@@ -13,8 +13,8 @@
 #include <assert.h>
 
 #include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
 #include "aom_ports/mem.h"
-#include "av1/common/arm/mem_neon.h"
 #include "config/aom_config.h"
 #include "config/av1_rtcd.h"
 
diff --git a/av1/encoder/arm/neon/encodetxb_neon.c b/av1/encoder/arm/neon/encodetxb_neon.c
index 4ededd8..9bb822a 100644
--- a/av1/encoder/arm/neon/encodetxb_neon.c
+++ b/av1/encoder/arm/neon/encodetxb_neon.c
@@ -13,9 +13,9 @@
 #include <assert.h>
 #include <math.h>
 
+#include "aom_dsp/arm/mem_neon.h"
 #include "av1/common/txb_common.h"
 #include "av1/encoder/encodetxb.h"
-#include "av1/common/arm/mem_neon.h"
 
 void av1_txb_init_levels_neon(const tran_low_t *const coeff, const int width,
                               const int height, uint8_t *const levels) {
diff --git a/av1/encoder/arm/neon/quantize_neon.c b/av1/encoder/arm/neon/quantize_neon.c
index 4eadbbc..b0e8950 100644
--- a/av1/encoder/arm/neon/quantize_neon.c
+++ b/av1/encoder/arm/neon/quantize_neon.c
@@ -13,11 +13,11 @@
 
 #include <math.h>
 
+#include "aom_dsp/arm/mem_neon.h"
 #include "aom_mem/aom_mem.h"
 
 #include "av1/common/quant_common.h"
 #include "av1/common/seg_common.h"
-#include "av1/common/arm/mem_neon.h"
 
 #include "av1/encoder/av1_quantize.h"
 #include "av1/encoder/encoder.h"
diff --git a/av1/encoder/arm/neon/rdopt_neon.c b/av1/encoder/arm/neon/rdopt_neon.c
index 1786b27..25df6b4 100644
--- a/av1/encoder/arm/neon/rdopt_neon.c
+++ b/av1/encoder/arm/neon/rdopt_neon.c
@@ -13,8 +13,6 @@
 
 #include <arm_neon.h>
 
-#include "aom_ports/system_state.h"
-
 #include "av1/encoder/rdopt.h"
 #include "config/av1_rtcd.h"
 
@@ -433,8 +431,6 @@
   int64_t y2_sum = x2_sum - x2_firstcol;
   int64_t z2_sum = x2_sum - x2_firstrow;
 
-  aom_clear_system_state();
-
   const float num_hor = (float)(height * (width - 1));
   const float num_ver = (float)((height - 1) * width);
 
diff --git a/av1/encoder/av1_noise_estimate.c b/av1/encoder/av1_noise_estimate.c
index dbc86c5..4419085 100644
--- a/av1/encoder/av1_noise_estimate.c
+++ b/av1/encoder/av1_noise_estimate.c
@@ -27,8 +27,8 @@
 #if CONFIG_AV1_TEMPORAL_DENOISING
 // For SVC: only do noise estimation on top spatial layer.
 static INLINE int noise_est_svc(const struct AV1_COMP *const cpi) {
-  return (!cpi->use_svc ||
-          (cpi->use_svc &&
+  return (!cpi->ppi->use_svc ||
+          (cpi->ppi->use_svc &&
            cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1));
 }
 #endif
@@ -53,15 +53,10 @@
 }
 
 static int enable_noise_estimation(AV1_COMP *const cpi) {
-  ResizePendingParams *const resize_pending_params =
-      &cpi->resize_pending_params;
-  const int resize_pending =
-      (resize_pending_params->width && resize_pending_params->height &&
-       (cpi->common.width != resize_pending_params->width ||
-        cpi->common.height != resize_pending_params->height));
+  const int resize_pending = is_frame_resize_pending(cpi);
 
 #if CONFIG_AV1_HIGHBITDEPTH
-  if (cpi->common.seq_params.use_highbitdepth) return 0;
+  if (cpi->common.seq_params->use_highbitdepth) return 0;
 #endif
 // Enable noise estimation if denoising is on.
 #if CONFIG_AV1_TEMPORAL_DENOISING
@@ -73,9 +68,9 @@
   // Enabled for 1 pass CBR, speed >=5, and if resolution is same as original.
   // Not enabled for SVC mode and screen_content_mode.
   // Not enabled for low resolutions.
-  if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_cfg.mode == AOM_CBR &&
+  if (cpi->oxcf.pass == AOM_RC_ONE_PASS && cpi->oxcf.rc_cfg.mode == AOM_CBR &&
       cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && cpi->oxcf.speed >= 5 &&
-      resize_pending == 0 && !cpi->use_svc &&
+      resize_pending == 0 && !cpi->ppi->use_svc &&
       cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN &&
       cpi->common.width * cpi->common.height >= 640 * 360)
     return 1;
@@ -184,9 +179,6 @@
     const int src_ystride = cpi->source->y_stride;
     const uint8_t *last_src_y = last_source->y_buffer;
     const int last_src_ystride = last_source->y_stride;
-    const uint8_t *src_u = cpi->source->u_buffer;
-    const uint8_t *src_v = cpi->source->v_buffer;
-    const int src_uvstride = cpi->source->uv_stride;
     int mi_row, mi_col;
     int num_low_motion = 0;
     int frame_low_motion = 1;
@@ -227,7 +219,7 @@
             unsigned int sse;
             // Compute variance between co-located blocks from current and
             // last input frames.
-            unsigned int variance = cpi->fn_ptr[bsize].vf(
+            unsigned int variance = cpi->ppi->fn_ptr[bsize].vf(
                 src_y, src_ystride, last_src_y, last_src_ystride, &sse);
             unsigned int hist_index = variance / bin_size;
             if (hist_index < MAX_VAR_HIST_BINS)
@@ -238,13 +230,9 @@
         }
         src_y += 4;
         last_src_y += 4;
-        src_u += 2;
-        src_v += 2;
       }
       src_y += (src_ystride << 2) - (mi_params->mi_cols << 2);
       last_src_y += (last_src_ystride << 2) - (mi_params->mi_cols << 2);
-      src_u += (src_uvstride << 1) - (mi_params->mi_cols << 1);
-      src_v += (src_uvstride << 1) - (mi_params->mi_cols << 1);
     }
     ne->last_w = cm->width;
     ne->last_h = cm->height;
diff --git a/av1/encoder/av1_quantize.c b/av1/encoder/av1_quantize.c
index 9d38e2d..105897e 100644
--- a/av1/encoder/av1_quantize.c
+++ b/av1/encoder/av1_quantize.c
@@ -33,6 +33,40 @@
   *eob_ptr = 0;
 }
 
+int av1_quantize_fp_no_qmatrix(const int16_t quant_ptr[2],
+                               const int16_t dequant_ptr[2],
+                               const int16_t round_ptr[2], int log_scale,
+                               const int16_t *scan, int coeff_count,
+                               const tran_low_t *coeff_ptr,
+                               tran_low_t *qcoeff_ptr,
+                               tran_low_t *dqcoeff_ptr) {
+  memset(qcoeff_ptr, 0, coeff_count * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, coeff_count * sizeof(*dqcoeff_ptr));
+  const int rounding[2] = { ROUND_POWER_OF_TWO(round_ptr[0], log_scale),
+                            ROUND_POWER_OF_TWO(round_ptr[1], log_scale) };
+  int eob = 0;
+  for (int i = 0; i < coeff_count; i++) {
+    const int rc = scan[i];
+    const int32_t thresh = (int32_t)(dequant_ptr[rc != 0]);
+    const int coeff = coeff_ptr[rc];
+    const int coeff_sign = AOMSIGN(coeff);
+    int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+    int tmp32 = 0;
+    if ((abs_coeff << (1 + log_scale)) >= thresh) {
+      abs_coeff = clamp64(abs_coeff + rounding[rc != 0], INT16_MIN, INT16_MAX);
+      tmp32 = (int)((abs_coeff * quant_ptr[rc != 0]) >> (16 - log_scale));
+      if (tmp32) {
+        qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
+        const tran_low_t abs_dqcoeff =
+            (tmp32 * dequant_ptr[rc != 0]) >> log_scale;
+        dqcoeff_ptr[rc] = (abs_dqcoeff ^ coeff_sign) - coeff_sign;
+      }
+    }
+    if (tmp32) eob = i + 1;
+  }
+  return eob;
+}
+
 static void quantize_fp_helper_c(
     const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
     const int16_t *round_ptr, const int16_t *quant_ptr,
@@ -53,26 +87,9 @@
   memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
 
   if (qm_ptr == NULL && iqm_ptr == NULL) {
-    for (i = 0; i < n_coeffs; i++) {
-      const int rc = scan[i];
-      const int32_t thresh = (int32_t)(dequant_ptr[rc != 0]);
-      const int coeff = coeff_ptr[rc];
-      const int coeff_sign = AOMSIGN(coeff);
-      int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-      int tmp32 = 0;
-      if ((abs_coeff << (1 + log_scale)) >= thresh) {
-        abs_coeff =
-            clamp64(abs_coeff + rounding[rc != 0], INT16_MIN, INT16_MAX);
-        tmp32 = (int)((abs_coeff * quant_ptr[rc != 0]) >> (16 - log_scale));
-        if (tmp32) {
-          qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
-          const tran_low_t abs_dqcoeff =
-              (tmp32 * dequant_ptr[rc != 0]) >> log_scale;
-          dqcoeff_ptr[rc] = (abs_dqcoeff ^ coeff_sign) - coeff_sign;
-        }
-      }
-      if (tmp32) eob = i;
-    }
+    *eob_ptr = av1_quantize_fp_no_qmatrix(quant_ptr, dequant_ptr, round_ptr,
+                                          log_scale, scan, (int)n_coeffs,
+                                          coeff_ptr, qcoeff_ptr, dqcoeff_ptr);
   } else {
     // Quantization pass: All coefficients with index >= zero_flag are
     // skippable. Note: zero_flag can be zero.
@@ -100,8 +117,8 @@
 
       if (tmp32) eob = i;
     }
+    *eob_ptr = eob + 1;
   }
-  *eob_ptr = eob + 1;
 }
 
 #if CONFIG_AV1_HIGHBITDEPTH
@@ -196,7 +213,8 @@
                        const int16_t *round_ptr, const int16_t *quant_ptr,
                        int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
                        const int16_t *dequant_ptr, uint16_t *eob_ptr,
-                       const int16_t *scan) {
+                       const int16_t *scan, const int16_t *iscan) {
+  (void)iscan;
   int eob = -1;
 
   memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
@@ -660,14 +678,67 @@
                       quants, dequants);
 }
 
+void av1_set_q_index(const EncQuantDequantParams *enc_quant_dequant_params,
+                     int qindex, MACROBLOCK *x) {
+  const QUANTS *const quants = &enc_quant_dequant_params->quants;
+  const Dequants *const dequants = &enc_quant_dequant_params->dequants;
+  x->qindex = qindex;
+  x->seg_skip_block =
+      0;  // TODO(angiebird): Find a proper place to init this variable.
+
+  // Y
+  x->plane[0].quant_QTX = quants->y_quant[qindex];
+  x->plane[0].quant_fp_QTX = quants->y_quant_fp[qindex];
+  x->plane[0].round_fp_QTX = quants->y_round_fp[qindex];
+  x->plane[0].quant_shift_QTX = quants->y_quant_shift[qindex];
+  x->plane[0].zbin_QTX = quants->y_zbin[qindex];
+  x->plane[0].round_QTX = quants->y_round[qindex];
+  x->plane[0].dequant_QTX = dequants->y_dequant_QTX[qindex];
+
+  // U
+  x->plane[1].quant_QTX = quants->u_quant[qindex];
+  x->plane[1].quant_fp_QTX = quants->u_quant_fp[qindex];
+  x->plane[1].round_fp_QTX = quants->u_round_fp[qindex];
+  x->plane[1].quant_shift_QTX = quants->u_quant_shift[qindex];
+  x->plane[1].zbin_QTX = quants->u_zbin[qindex];
+  x->plane[1].round_QTX = quants->u_round[qindex];
+  x->plane[1].dequant_QTX = dequants->u_dequant_QTX[qindex];
+
+  // V
+  x->plane[2].quant_QTX = quants->v_quant[qindex];
+  x->plane[2].quant_fp_QTX = quants->v_quant_fp[qindex];
+  x->plane[2].round_fp_QTX = quants->v_round_fp[qindex];
+  x->plane[2].quant_shift_QTX = quants->v_quant_shift[qindex];
+  x->plane[2].zbin_QTX = quants->v_zbin[qindex];
+  x->plane[2].round_QTX = quants->v_round[qindex];
+  x->plane[2].dequant_QTX = dequants->v_dequant_QTX[qindex];
+}
+
+void av1_set_qmatrix(const CommonQuantParams *quant_params, int segment_id,
+                     MACROBLOCKD *xd) {
+  const int use_qmatrix = av1_use_qmatrix(quant_params, xd, segment_id);
+  const int qmlevel_y =
+      use_qmatrix ? quant_params->qmatrix_level_y : NUM_QM_LEVELS - 1;
+  const int qmlevel_u =
+      use_qmatrix ? quant_params->qmatrix_level_u : NUM_QM_LEVELS - 1;
+  const int qmlevel_v =
+      use_qmatrix ? quant_params->qmatrix_level_v : NUM_QM_LEVELS - 1;
+  const int qmlevel_ls[MAX_MB_PLANE] = { qmlevel_y, qmlevel_u, qmlevel_v };
+  for (int i = 0; i < MAX_MB_PLANE; ++i) {
+    const int qmlevel = qmlevel_ls[i];
+    memcpy(&xd->plane[i].seg_qmatrix[segment_id],
+           quant_params->gqmatrix[qmlevel][i],
+           sizeof(quant_params->gqmatrix[qmlevel][i]));
+    memcpy(&xd->plane[i].seg_iqmatrix[segment_id],
+           quant_params->giqmatrix[qmlevel][i],
+           sizeof(quant_params->giqmatrix[qmlevel][i]));
+  }
+}
+
 void av1_init_plane_quantizers(const AV1_COMP *cpi, MACROBLOCK *x,
                                int segment_id) {
   const AV1_COMMON *const cm = &cpi->common;
   const CommonQuantParams *const quant_params = &cm->quant_params;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  const QUANTS *const quants = &cpi->enc_quant_dequant_params.quants;
-  const Dequants *const dequants = &cpi->enc_quant_dequant_params.dequants;
-
   const int current_qindex = AOMMAX(
       0,
       AOMMIN(QINDEX_RANGE - 1, cm->delta_q_info.delta_q_present_flag
@@ -676,59 +747,12 @@
   const int qindex = av1_get_qindex(&cm->seg, segment_id, current_qindex);
   const int rdmult =
       av1_compute_rd_mult(cpi, qindex + quant_params->y_dc_delta_q);
-  const int use_qmatrix = av1_use_qmatrix(quant_params, xd, segment_id);
+  av1_set_q_index(&cpi->enc_quant_dequant_params, qindex, x);
 
-  // Y
-  const int qmlevel_y =
-      use_qmatrix ? quant_params->qmatrix_level_y : NUM_QM_LEVELS - 1;
-  x->plane[0].quant_QTX = quants->y_quant[qindex];
-  x->plane[0].quant_fp_QTX = quants->y_quant_fp[qindex];
-  x->plane[0].round_fp_QTX = quants->y_round_fp[qindex];
-  x->plane[0].quant_shift_QTX = quants->y_quant_shift[qindex];
-  x->plane[0].zbin_QTX = quants->y_zbin[qindex];
-  x->plane[0].round_QTX = quants->y_round[qindex];
-  x->plane[0].dequant_QTX = dequants->y_dequant_QTX[qindex];
-  memcpy(&xd->plane[0].seg_qmatrix[segment_id],
-         quant_params->gqmatrix[qmlevel_y][0],
-         sizeof(quant_params->gqmatrix[qmlevel_y][0]));
-  memcpy(&xd->plane[0].seg_iqmatrix[segment_id],
-         quant_params->giqmatrix[qmlevel_y][0],
-         sizeof(quant_params->giqmatrix[qmlevel_y][0]));
+  MACROBLOCKD *const xd = &x->e_mbd;
+  av1_set_qmatrix(quant_params, segment_id, xd);
 
-  // U
-  const int qmlevel_u =
-      use_qmatrix ? quant_params->qmatrix_level_u : NUM_QM_LEVELS - 1;
-  x->plane[1].quant_QTX = quants->u_quant[qindex];
-  x->plane[1].quant_fp_QTX = quants->u_quant_fp[qindex];
-  x->plane[1].round_fp_QTX = quants->u_round_fp[qindex];
-  x->plane[1].quant_shift_QTX = quants->u_quant_shift[qindex];
-  x->plane[1].zbin_QTX = quants->u_zbin[qindex];
-  x->plane[1].round_QTX = quants->u_round[qindex];
-  x->plane[1].dequant_QTX = dequants->u_dequant_QTX[qindex];
-  memcpy(&xd->plane[1].seg_qmatrix[segment_id],
-         quant_params->gqmatrix[qmlevel_u][1],
-         sizeof(quant_params->gqmatrix[qmlevel_u][1]));
-  memcpy(&xd->plane[1].seg_iqmatrix[segment_id],
-         quant_params->giqmatrix[qmlevel_u][1],
-         sizeof(quant_params->giqmatrix[qmlevel_u][1]));
-  // V
-  const int qmlevel_v =
-      use_qmatrix ? quant_params->qmatrix_level_v : NUM_QM_LEVELS - 1;
-  x->plane[2].quant_QTX = quants->v_quant[qindex];
-  x->plane[2].quant_fp_QTX = quants->v_quant_fp[qindex];
-  x->plane[2].round_fp_QTX = quants->v_round_fp[qindex];
-  x->plane[2].quant_shift_QTX = quants->v_quant_shift[qindex];
-  x->plane[2].zbin_QTX = quants->v_zbin[qindex];
-  x->plane[2].round_QTX = quants->v_round[qindex];
-  x->plane[2].dequant_QTX = dequants->v_dequant_QTX[qindex];
-  memcpy(&xd->plane[2].seg_qmatrix[segment_id],
-         quant_params->gqmatrix[qmlevel_v][2],
-         sizeof(quant_params->gqmatrix[qmlevel_v][2]));
-  memcpy(&xd->plane[2].seg_iqmatrix[segment_id],
-         quant_params->giqmatrix[qmlevel_v][2],
-         sizeof(quant_params->giqmatrix[qmlevel_v][2]));
   x->seg_skip_block = segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP);
-  x->qindex = qindex;
 
   av1_set_error_per_bit(&x->errorperbit, rdmult);
   av1_set_sad_per_bit(cpi, &x->sadperbit, qindex);
@@ -740,14 +764,34 @@
   av1_init_plane_quantizers(cpi, x, xd->mi[0]->segment_id);
 }
 
+static int adjust_hdr_cb_deltaq(int base_qindex) {
+  double baseQp = base_qindex / QP_SCALE_FACTOR;
+  const double chromaQp = CHROMA_QP_SCALE * baseQp + CHROMA_QP_OFFSET;
+  const double dcbQP = CHROMA_CB_QP_SCALE * chromaQp * QP_SCALE_FACTOR;
+  int dqpCb = (int)(dcbQP + (dcbQP < 0 ? -0.5 : 0.5));
+  dqpCb = AOMMIN(0, dqpCb);
+  dqpCb = (int)CLIP(dqpCb, -12 * QP_SCALE_FACTOR, 12 * QP_SCALE_FACTOR);
+  return dqpCb;
+}
+
+static int adjust_hdr_cr_deltaq(int base_qindex) {
+  double baseQp = base_qindex / QP_SCALE_FACTOR;
+  const double chromaQp = CHROMA_QP_SCALE * baseQp + CHROMA_QP_OFFSET;
+  const double dcrQP = CHROMA_CR_QP_SCALE * chromaQp * QP_SCALE_FACTOR;
+  int dqpCr = (int)(dcrQP + (dcrQP < 0 ? -0.5 : 0.5));
+  dqpCr = AOMMIN(0, dqpCr);
+  dqpCr = (int)CLIP(dqpCr, -12 * QP_SCALE_FACTOR, 12 * QP_SCALE_FACTOR);
+  return dqpCr;
+}
+
 void av1_set_quantizer(AV1_COMMON *const cm, int min_qmlevel, int max_qmlevel,
-                       int q, int enable_chroma_deltaq) {
+                       int q, int enable_chroma_deltaq, int enable_hdr_deltaq) {
   // quantizer has to be reinitialized with av1_init_quantizer() if any
   // delta_q changes.
   CommonQuantParams *quant_params = &cm->quant_params;
   quant_params->base_qindex = AOMMAX(cm->delta_q_info.delta_q_present_flag, q);
-
   quant_params->y_dc_delta_q = 0;
+
   if (enable_chroma_deltaq) {
     // TODO(aomedia:2717): need to design better delta
     quant_params->u_dc_delta_q = 2;
@@ -761,13 +805,25 @@
     quant_params->v_ac_delta_q = 0;
   }
 
+  // following section 8.3.2 in T-REC-H.Sup15 document
+  // to apply to AV1 qindex in the range of [0, 255]
+  if (enable_hdr_deltaq) {
+    int dqpCb = adjust_hdr_cb_deltaq(quant_params->base_qindex);
+    int dqpCr = adjust_hdr_cr_deltaq(quant_params->base_qindex);
+    quant_params->u_dc_delta_q = quant_params->u_ac_delta_q = dqpCb;
+    quant_params->v_dc_delta_q = quant_params->v_ac_delta_q = dqpCr;
+    if (dqpCb != dqpCr) {
+      cm->seq_params->separate_uv_delta_q = 1;
+    }
+  }
+
   quant_params->qmatrix_level_y =
       aom_get_qmlevel(quant_params->base_qindex, min_qmlevel, max_qmlevel);
   quant_params->qmatrix_level_u =
       aom_get_qmlevel(quant_params->base_qindex + quant_params->u_ac_delta_q,
                       min_qmlevel, max_qmlevel);
 
-  if (!cm->seq_params.separate_uv_delta_q)
+  if (!cm->seq_params->separate_uv_delta_q)
     quant_params->qmatrix_level_v = quant_params->qmatrix_level_u;
   else
     quant_params->qmatrix_level_v =
diff --git a/av1/encoder/av1_quantize.h b/av1/encoder/av1_quantize.h
index ad96197..085ab17 100644
--- a/av1/encoder/av1_quantize.h
+++ b/av1/encoder/av1_quantize.h
@@ -22,9 +22,6 @@
 extern "C" {
 #endif
 
-#define EOB_FACTOR 325
-#define SKIP_EOB_FACTOR_ADJUST 200
-
 typedef struct QUANT_PARAM {
   int log_scale;
   TX_SIZE tx_size;
@@ -109,7 +106,8 @@
                         aom_bit_depth_t bit_depth);
 
 void av1_set_quantizer(struct AV1Common *const cm, int min_qmlevel,
-                       int max_qmlevel, int q, int enable_chroma_deltaq);
+                       int max_qmlevel, int q, int enable_chroma_deltaq,
+                       int enable_hdr_deltaq);
 
 int av1_quantizer_to_qindex(int quantizer);
 
@@ -118,6 +116,32 @@
 void av1_quantize_skip(intptr_t n_coeffs, tran_low_t *qcoeff_ptr,
                        tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr);
 
+/*!\brief Quantize transform coefficients without using qmatrix
+ *
+ * quant_ptr, dequant_ptr and round_ptr are size 2 arrays,
+ * where index 0 corresponds to dc coeff and index 1 corresponds to ac coeffs.
+ *
+ * \param[in]  quant_ptr    16-bit fixed point representation of inverse
+ *                          quantize step size, i.e. 2^16/dequant
+ * \param[in]  dequant_ptr  quantize step size
+ * \param[in]  round_ptr    rounding
+ * \param[in]  log_scale    the relative log scale of the transform
+ *                          coefficients
+ * \param[in]  scan         scan[i] indicates the position of ith to-be-coded
+ *                          coefficient
+ * \param[in]  coeff_count  number of coefficients
+ * \param[out] qcoeff_ptr   quantized coefficients
+ * \param[out] dqcoeff_ptr  dequantized coefficients
+ *
+ * \return The last non-zero coefficient's scan index plus 1
+ */
+int av1_quantize_fp_no_qmatrix(const int16_t quant_ptr[2],
+                               const int16_t dequant_ptr[2],
+                               const int16_t round_ptr[2], int log_scale,
+                               const int16_t *scan, int coeff_count,
+                               const tran_low_t *coeff_ptr,
+                               tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr);
+
 void av1_quantize_fp_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                             const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
                             tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
@@ -133,6 +157,29 @@
                             tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
                             const SCAN_ORDER *sc, const QUANT_PARAM *qparam);
 
+/*!\brief Update quantize parameters in MACROBLOCK
+ *
+ * \param[in]  enc_quant_dequant_params This parameter cached the quantize and
+ *                                      dequantize parameters for all q
+ *                                      indices.
+ * \param[in]  qindex                   Quantize index used for the current
+ *                                      superblock.
+ * \param[out] x                        A superblock data structure for
+ *                                      encoder.
+ */
+void av1_set_q_index(const EncQuantDequantParams *enc_quant_dequant_params,
+                     int qindex, MACROBLOCK *x);
+
+/*!\brief Update quantize matrix in MACROBLOCKD based on segment id
+ *
+ * \param[in]  quant_params  Quantize parameters used by encoder and decoder
+ * \param[in]  segment_id    Segment id.
+ * \param[out] xd            A superblock data structure used by encoder and
+ * decoder.
+ */
+void av1_set_qmatrix(const CommonQuantParams *quant_params, int segment_id,
+                     MACROBLOCKD *xd);
+
 #if CONFIG_AV1_HIGHBITDEPTH
 void av1_highbd_quantize_fp_facade(const tran_low_t *coeff_ptr,
                                    intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
@@ -154,6 +201,7 @@
                                    tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
                                    const SCAN_ORDER *sc,
                                    const QUANT_PARAM *qparam);
+
 #endif
 
 #ifdef __cplusplus
diff --git a/av1/encoder/av1_temporal_denoiser.c b/av1/encoder/av1_temporal_denoiser.c
index 730711e..26e0eda7 100644
--- a/av1/encoder/av1_temporal_denoiser.c
+++ b/av1/encoder/av1_temporal_denoiser.c
@@ -349,7 +349,7 @@
         &cpi->common, denoiser, mb, bs, increase_denoising, mi_row, mi_col, ctx,
         motion_magnitude, &zeromv_filter, cpi->svc.number_spatial_layers,
         cpi->source->y_width, cpi->svc.ref_idx[0], cpi->svc.ref_idx[3],
-        cpi->use_svc, cpi->svc.spatial_layer_id, use_gf_temporal_ref);
+        cpi->ppi->use_svc, cpi->svc.spatial_layer_id, use_gf_temporal_ref);
 
   if (decision == FILTER_BLOCK) {
     decision = av1_denoiser_filter(src.buf, src.stride, mc_avg_start,
@@ -485,8 +485,8 @@
   if (denoiser->running_avg_y[fb_idx].buffer_alloc == NULL) {
     fail = aom_alloc_frame_buffer(
         &denoiser->running_avg_y[fb_idx], cm->width, cm->height,
-        cm->seq_params.subsampling_x, cm->seq_params.subsampling_y,
-        cm->seq_params.use_highbitdepth, AOM_BORDER_IN_PIXELS,
+        cm->seq_params->subsampling_x, cm->seq_params->subsampling_y,
+        cm->seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS,
         cm->features.byte_alignment);
     if (fail) {
       av1_denoiser_free(denoiser);
@@ -718,13 +718,9 @@
                                 ? KEY_FRAME
                                 : cm->current_frame.frame_type;
     cpi->denoiser.current_denoiser_frame++;
-    const int resize_pending =
-        (cpi->resize_pending_params.width &&
-         cpi->resize_pending_params.height &&
-         (cpi->common.width != cpi->resize_pending_params.width ||
-          cpi->common.height != cpi->resize_pending_params.height));
+    const int resize_pending = is_frame_resize_pending(cpi);
 
-    if (cpi->use_svc) {
+    if (cpi->ppi->use_svc) {
 // TODO(kyslov) Enable when SVC temporal denosing is implemented
 #if 0
       const int svc_buf_shift =
@@ -746,7 +742,7 @@
                                    cpi->refresh_golden_frame,
                                    cpi->refresh_last_frame, cpi->alt_fb_idx,
                                    cpi->gld_fb_idx, cpi->lst_fb_idx))
-        aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+        aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
                            "Failed to re-allocate denoiser for SVC");
 #endif
     }
diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c
index 84b9633..2dca98f 100644
--- a/av1/encoder/bitstream.c
+++ b/av1/encoder/bitstream.c
@@ -20,7 +20,6 @@
 #include "aom_mem/aom_mem.h"
 #include "aom_ports/bitops.h"
 #include "aom_ports/mem_ops.h"
-#include "aom_ports/system_state.h"
 #if CONFIG_BITSTREAM_DEBUG
 #include "aom_util/debug_util.h"
 #endif  // CONFIG_BITSTREAM_DEBUG
@@ -41,12 +40,15 @@
 #include "av1/encoder/cost.h"
 #include "av1/encoder/encodemv.h"
 #include "av1/encoder/encodetxb.h"
+#include "av1/encoder/ethread.h"
 #include "av1/encoder/mcomp.h"
 #include "av1/encoder/palette.h"
 #include "av1/encoder/segmentation.h"
 #include "av1/encoder/tokenize.h"
 
 #define ENC_MISMATCH_DEBUG 0
+#define SETUP_TIME_OH_CONST 5     // Setup time overhead constant per worker
+#define JOB_DISP_TIME_OH_CONST 1  // Job dispatch time overhead per tile
 
 static INLINE void write_uniform(aom_writer *w, int n, int v) {
   const int l = get_unsigned_bits(n);
@@ -314,14 +316,16 @@
 
 static AOM_INLINE void write_delta_lflevel(const AV1_COMMON *cm,
                                            const MACROBLOCKD *xd, int lf_id,
-                                           int delta_lflevel, aom_writer *w) {
+                                           int delta_lflevel,
+                                           int delta_lf_multi, aom_writer *w) {
   int sign = delta_lflevel < 0;
   int abs = sign ? -delta_lflevel : delta_lflevel;
   int rem_bits, thr;
   int smallval = abs < DELTA_LF_SMALL ? 1 : 0;
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+  (void)cm;
 
-  if (cm->delta_q_info.delta_lf_multi) {
+  if (delta_lf_multi) {
     assert(lf_id >= 0 && lf_id < (av1_num_planes(cm) > 1 ? FRAME_LF_COUNT
                                                          : FRAME_LF_COUNT - 2));
     aom_write_symbol(w, AOMMIN(abs, DELTA_LF_SMALL),
@@ -448,7 +452,7 @@
   }
 }
 
-static AOM_INLINE void write_segment_id(AV1_COMP *cpi,
+static AOM_INLINE void write_segment_id(AV1_COMP *cpi, MACROBLOCKD *const xd,
                                         const MB_MODE_INFO *const mbmi,
                                         aom_writer *w,
                                         const struct segmentation *seg,
@@ -457,7 +461,6 @@
   if (!seg->enabled || !seg->update_map) return;
 
   AV1_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
   int cdf_num;
   const int pred = av1_get_spatial_seg_pred(cm, xd, &cdf_num);
   const int mi_row = xd->mi_row;
@@ -616,8 +619,8 @@
 }
 
 static AOM_INLINE void write_mb_interp_filter(AV1_COMMON *const cm,
-                                              const MACROBLOCKD *xd,
-                                              aom_writer *w) {
+                                              ThreadData *td, aom_writer *w) {
+  const MACROBLOCKD *xd = &td->mb.e_mbd;
   const MB_MODE_INFO *const mbmi = xd->mi[0];
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
 
@@ -636,8 +639,8 @@
           av1_extract_interp_filter(mbmi->interp_filters, dir);
       aom_write_symbol(w, filter, ec_ctx->switchable_interp_cdf[ctx],
                        SWITCHABLE_FILTERS);
-      ++cm->cur_frame->interp_filter_selected[filter];
-      if (cm->seq_params.enable_dual_filter == 0) return;
+      ++td->interp_filter_selected[filter];
+      if (cm->seq_params->enable_dual_filter == 0) return;
     }
   }
 }
@@ -780,7 +783,7 @@
       aom_write_symbol(w, n - PALETTE_MIN_SIZE,
                        xd->tile_ctx->palette_y_size_cdf[bsize_ctx],
                        PALETTE_SIZES);
-      write_palette_colors_y(xd, pmi, cm->seq_params.bit_depth, w);
+      write_palette_colors_y(xd, pmi, cm->seq_params->bit_depth, w);
     }
   }
 
@@ -795,7 +798,7 @@
       aom_write_symbol(w, n - PALETTE_MIN_SIZE,
                        xd->tile_ctx->palette_uv_size_cdf[bsize_ctx],
                        PALETTE_SIZES);
-      write_palette_colors_uv(xd, pmi, cm->seq_params.bit_depth, w);
+      write_palette_colors_uv(xd, pmi, cm->seq_params->bit_depth, w);
     }
   }
 }
@@ -877,7 +880,7 @@
 
   // At the start of a superblock, mark that we haven't yet written CDEF
   // strengths for any of the CDEF units contained in this superblock.
-  const int sb_mask = (cm->seq_params.mib_size - 1);
+  const int sb_mask = (cm->seq_params->mib_size - 1);
   const int mi_row_in_sb = (xd->mi_row & sb_mask);
   const int mi_col_in_sb = (xd->mi_col & sb_mask);
   if (mi_row_in_sb == 0 && mi_col_in_sb == 0) {
@@ -892,7 +895,7 @@
   const int index_mask = cdef_size;
   const int cdef_unit_row_in_sb = ((xd->mi_row & index_mask) != 0);
   const int cdef_unit_col_in_sb = ((xd->mi_col & index_mask) != 0);
-  const int index = (cm->seq_params.sb_size == BLOCK_128X128)
+  const int index = (cm->seq_params->sb_size == BLOCK_128X128)
                         ? cdef_unit_col_in_sb + 2 * cdef_unit_row_in_sb
                         : 0;
 
@@ -912,9 +915,9 @@
 }
 
 static AOM_INLINE void write_inter_segment_id(
-    AV1_COMP *cpi, aom_writer *w, const struct segmentation *const seg,
-    struct segmentation_probs *const segp, int skip, int preskip) {
-  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+    AV1_COMP *cpi, MACROBLOCKD *const xd, aom_writer *w,
+    const struct segmentation *const seg, struct segmentation_probs *const segp,
+    int skip, int preskip) {
   MB_MODE_INFO *const mbmi = xd->mi[0];
   AV1_COMMON *const cm = &cpi->common;
   const int mi_row = xd->mi_row;
@@ -926,7 +929,7 @@
     } else {
       if (seg->segid_preskip) return;
       if (skip) {
-        write_segment_id(cpi, mbmi, w, seg, segp, 1);
+        write_segment_id(cpi, xd, mbmi, w, seg, segp, 1);
         if (seg->temporal_update) mbmi->seg_id_predicted = 0;
         return;
       }
@@ -936,35 +939,33 @@
       aom_cdf_prob *pred_cdf = av1_get_pred_cdf_seg_id(segp, xd);
       aom_write_symbol(w, pred_flag, pred_cdf, 2);
       if (!pred_flag) {
-        write_segment_id(cpi, mbmi, w, seg, segp, 0);
+        write_segment_id(cpi, xd, mbmi, w, seg, segp, 0);
       }
       if (pred_flag) {
         set_spatial_segment_id(&cm->mi_params, cm->cur_frame->seg_map,
                                mbmi->bsize, mi_row, mi_col, mbmi->segment_id);
       }
     } else {
-      write_segment_id(cpi, mbmi, w, seg, segp, 0);
+      write_segment_id(cpi, xd, mbmi, w, seg, segp, 0);
     }
   }
 }
 
 // If delta q is present, writes delta_q index.
 // Also writes delta_q loop filter levels, if present.
-static AOM_INLINE void write_delta_q_params(AV1_COMP *cpi, int skip,
+static AOM_INLINE void write_delta_q_params(AV1_COMMON *const cm,
+                                            MACROBLOCKD *const xd, int skip,
                                             aom_writer *w) {
-  AV1_COMMON *const cm = &cpi->common;
   const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
 
   if (delta_q_info->delta_q_present_flag) {
-    MACROBLOCK *const x = &cpi->td.mb;
-    MACROBLOCKD *const xd = &x->e_mbd;
     const MB_MODE_INFO *const mbmi = xd->mi[0];
     const BLOCK_SIZE bsize = mbmi->bsize;
     const int super_block_upper_left =
-        ((xd->mi_row & (cm->seq_params.mib_size - 1)) == 0) &&
-        ((xd->mi_col & (cm->seq_params.mib_size - 1)) == 0);
+        ((xd->mi_row & (cm->seq_params->mib_size - 1)) == 0) &&
+        ((xd->mi_col & (cm->seq_params->mib_size - 1)) == 0);
 
-    if ((bsize != cm->seq_params.sb_size || skip == 0) &&
+    if ((bsize != cm->seq_params->sb_size || skip == 0) &&
         super_block_upper_left) {
       assert(mbmi->current_qindex > 0);
       const int reduced_delta_qindex =
@@ -980,14 +981,14 @@
             int reduced_delta_lflevel =
                 (mbmi->delta_lf[lf_id] - xd->delta_lf[lf_id]) /
                 delta_q_info->delta_lf_res;
-            write_delta_lflevel(cm, xd, lf_id, reduced_delta_lflevel, w);
+            write_delta_lflevel(cm, xd, lf_id, reduced_delta_lflevel, 1, w);
             xd->delta_lf[lf_id] = mbmi->delta_lf[lf_id];
           }
         } else {
           int reduced_delta_lflevel =
               (mbmi->delta_lf_from_base - xd->delta_lf_from_base) /
               delta_q_info->delta_lf_res;
-          write_delta_lflevel(cm, xd, -1, reduced_delta_lflevel, w);
+          write_delta_lflevel(cm, xd, -1, reduced_delta_lflevel, 0, w);
           xd->delta_lf_from_base = mbmi->delta_lf_from_base;
         }
       }
@@ -995,12 +996,10 @@
   }
 }
 
-static AOM_INLINE void write_intra_prediction_modes(AV1_COMP *cpi,
+static AOM_INLINE void write_intra_prediction_modes(const AV1_COMMON *cm,
+                                                    MACROBLOCKD *const xd,
                                                     int is_keyframe,
                                                     aom_writer *w) {
-  const AV1_COMMON *const cm = &cpi->common;
-  MACROBLOCK *const x = &cpi->td.mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
   const MB_MODE_INFO *const mbmi = xd->mi[0];
   const PREDICTION_MODE mode = mbmi->mode;
@@ -1023,7 +1022,7 @@
   }
 
   // UV mode and UV angle delta.
-  if (!cm->seq_params.monochrome && xd->is_chroma_ref) {
+  if (!cm->seq_params->monochrome && xd->is_chroma_ref) {
     const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode;
     write_intra_uv_mode(ec_ctx, uv_mode, mode, is_cfl_allowed(xd), w);
     if (uv_mode == UV_CFL_PRED)
@@ -1085,9 +1084,10 @@
                                x->mbmi_ext_frame);
 }
 
-static AOM_INLINE void pack_inter_mode_mvs(AV1_COMP *cpi, aom_writer *w) {
+static AOM_INLINE void pack_inter_mode_mvs(AV1_COMP *cpi, ThreadData *const td,
+                                           aom_writer *w) {
   AV1_COMMON *const cm = &cpi->common;
-  MACROBLOCK *const x = &cpi->td.mb;
+  MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
   const struct segmentation *const seg = &cm->seg;
@@ -1102,7 +1102,7 @@
   const int is_compound = has_second_ref(mbmi);
   int ref;
 
-  write_inter_segment_id(cpi, w, seg, segp, 0, 1);
+  write_inter_segment_id(cpi, xd, w, seg, segp, 0, 1);
 
   write_skip_mode(cm, xd, segment_id, mbmi, w);
 
@@ -1110,18 +1110,18 @@
   const int skip =
       mbmi->skip_mode ? 1 : write_skip(cm, xd, segment_id, mbmi, w);
 
-  write_inter_segment_id(cpi, w, seg, segp, skip, 0);
+  write_inter_segment_id(cpi, xd, w, seg, segp, skip, 0);
 
   write_cdef(cm, xd, w, skip);
 
-  write_delta_q_params(cpi, skip, w);
+  write_delta_q_params(cm, xd, skip, w);
 
   if (!mbmi->skip_mode) write_is_inter(cm, xd, mbmi->segment_id, w, is_inter);
 
   if (mbmi->skip_mode) return;
 
   if (!is_inter) {
-    write_intra_prediction_modes(cpi, 0, w);
+    write_intra_prediction_modes(cm, xd, 0, w);
   } else {
     int16_t mode_ctx;
 
@@ -1149,21 +1149,23 @@
       for (ref = 0; ref < 1 + is_compound; ++ref) {
         nmv_context *nmvc = &ec_ctx->nmvc;
         const int_mv ref_mv = get_ref_mv(x, ref);
-        av1_encode_mv(cpi, w, &mbmi->mv[ref].as_mv, &ref_mv.as_mv, nmvc,
+        av1_encode_mv(cpi, w, td, &mbmi->mv[ref].as_mv, &ref_mv.as_mv, nmvc,
                       allow_hp);
       }
     } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV) {
       nmv_context *nmvc = &ec_ctx->nmvc;
       const int_mv ref_mv = get_ref_mv(x, 1);
-      av1_encode_mv(cpi, w, &mbmi->mv[1].as_mv, &ref_mv.as_mv, nmvc, allow_hp);
+      av1_encode_mv(cpi, w, td, &mbmi->mv[1].as_mv, &ref_mv.as_mv, nmvc,
+                    allow_hp);
     } else if (mode == NEW_NEARESTMV || mode == NEW_NEARMV) {
       nmv_context *nmvc = &ec_ctx->nmvc;
       const int_mv ref_mv = get_ref_mv(x, 0);
-      av1_encode_mv(cpi, w, &mbmi->mv[0].as_mv, &ref_mv.as_mv, nmvc, allow_hp);
+      av1_encode_mv(cpi, w, td, &mbmi->mv[0].as_mv, &ref_mv.as_mv, nmvc,
+                    allow_hp);
     }
 
     if (cpi->common.current_frame.reference_mode != COMPOUND_REFERENCE &&
-        cpi->common.seq_params.enable_interintra_compound &&
+        cpi->common.seq_params->enable_interintra_compound &&
         is_interintra_allowed(mbmi)) {
       const int interintra = mbmi->ref_frame[1] == INTRA_FRAME;
       const int bsize_group = size_group_lookup[bsize];
@@ -1190,7 +1192,7 @@
     // Group B (1): interintra, compound_diffwtd, wedge
     if (has_second_ref(mbmi)) {
       const int masked_compound_used = is_any_masked_compound_used(bsize) &&
-                                       cm->seq_params.enable_masked_compound;
+                                       cm->seq_params->enable_masked_compound;
 
       if (masked_compound_used) {
         const int ctx_comp_group_idx = get_comp_group_idx_context(xd);
@@ -1204,7 +1206,7 @@
         if (mbmi->compound_idx)
           assert(mbmi->interinter_comp.type == COMPOUND_AVERAGE);
 
-        if (cm->seq_params.order_hint_info.enable_dist_wtd_comp) {
+        if (cm->seq_params->order_hint_info.enable_dist_wtd_comp) {
           const int comp_index_ctx = get_comp_index_context(cm, xd);
           aom_write_symbol(w, mbmi->compound_idx,
                            ec_ctx->compound_index_cdf[comp_index_ctx], 2);
@@ -1237,7 +1239,7 @@
         }
       }
     }
-    write_mb_interp_filter(cm, xd, w);
+    write_mb_interp_filter(cm, td, w);
   }
 }
 
@@ -1267,23 +1269,23 @@
   const MB_MODE_INFO *const mbmi = xd->mi[0];
 
   if (seg->segid_preskip && seg->update_map)
-    write_segment_id(cpi, mbmi, w, seg, segp, 0);
+    write_segment_id(cpi, xd, mbmi, w, seg, segp, 0);
 
   const int skip = write_skip(cm, xd, mbmi->segment_id, mbmi, w);
 
   if (!seg->segid_preskip && seg->update_map)
-    write_segment_id(cpi, mbmi, w, seg, segp, skip);
+    write_segment_id(cpi, xd, mbmi, w, seg, segp, skip);
 
   write_cdef(cm, xd, w, skip);
 
-  write_delta_q_params(cpi, skip, w);
+  write_delta_q_params(cm, xd, skip, w);
 
   if (av1_allow_intrabc(cm)) {
     write_intrabc_info(xd, mbmi_ext_frame, w);
     if (is_intrabc_block(mbmi)) return;
   }
 
-  write_intra_prediction_modes(cpi, 1, w);
+  write_intra_prediction_modes(cm, xd, 1, w);
 }
 
 #if CONFIG_RD_DEBUG
@@ -1363,13 +1365,14 @@
 }
 #endif  // ENC_MISMATCH_DEBUG
 
-static AOM_INLINE void write_mbmi_b(AV1_COMP *cpi, aom_writer *w) {
+static AOM_INLINE void write_mbmi_b(AV1_COMP *cpi, ThreadData *const td,
+                                    aom_writer *w) {
   AV1_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+  MACROBLOCKD *const xd = &td->mb.e_mbd;
   MB_MODE_INFO *m = xd->mi[0];
 
   if (frame_is_intra_only(cm)) {
-    write_mb_modes_kf(cpi, xd, cpi->td.mb.mbmi_ext_frame, w);
+    write_mb_modes_kf(cpi, xd, td->mb.mbmi_ext_frame, w);
   } else {
     // has_subpel_mv_component needs the ref frame buffers set up to look
     // up if they are scaled. has_subpel_mv_component is in turn needed by
@@ -1380,7 +1383,7 @@
     enc_dump_logs(cm, &cpi->mbmi_ext_info, xd->mi_row, xd->mi_col);
 #endif  // ENC_MISMATCH_DEBUG
 
-    pack_inter_mode_mvs(cpi, w);
+    pack_inter_mode_mvs(cpi, td, w);
   }
 }
 
@@ -1413,18 +1416,17 @@
   for (int blk_row = row >> ss_y; blk_row < unit_height; blk_row += bkh) {
     for (int blk_col = col >> ss_x; blk_col < unit_width; blk_col += bkw) {
       pack_txb_tokens(w, cm, x, tok, tok_end, xd, mbmi, plane, plane_bsize,
-                      cm->seq_params.bit_depth, *block, blk_row, blk_col,
+                      cm->seq_params->bit_depth, *block, blk_row, blk_col,
                       max_tx_size, token_stats);
       *block += step;
     }
   }
 }
 
-static AOM_INLINE void write_tokens_b(AV1_COMP *cpi, aom_writer *w,
-                                      const TokenExtra **tok,
+static AOM_INLINE void write_tokens_b(AV1_COMP *cpi, MACROBLOCK *const x,
+                                      aom_writer *w, const TokenExtra **tok,
                                       const TokenExtra *const tok_end) {
   AV1_COMMON *const cm = &cpi->common;
-  MACROBLOCK *const x = &cpi->td.mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
   const BLOCK_SIZE bsize = mbmi->bsize;
@@ -1474,17 +1476,18 @@
   }
 }
 
-static AOM_INLINE void write_modes_b(AV1_COMP *cpi, const TileInfo *const tile,
-                                     aom_writer *w, const TokenExtra **tok,
+static AOM_INLINE void write_modes_b(AV1_COMP *cpi, ThreadData *const td,
+                                     const TileInfo *const tile, aom_writer *w,
+                                     const TokenExtra **tok,
                                      const TokenExtra *const tok_end,
                                      int mi_row, int mi_col) {
   const AV1_COMMON *cm = &cpi->common;
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
-  MACROBLOCKD *xd = &cpi->td.mb.e_mbd;
+  MACROBLOCKD *xd = &td->mb.e_mbd;
   FRAME_CONTEXT *tile_ctx = xd->tile_ctx;
   const int grid_idx = mi_row * mi_params->mi_stride + mi_col;
   xd->mi = mi_params->mi_grid_base + grid_idx;
-  cpi->td.mb.mbmi_ext_frame =
+  td->mb.mbmi_ext_frame =
       cpi->mbmi_ext_info.frame_base +
       get_mi_ext_idx(mi_row, mi_col, cm->mi_params.mi_alloc_bsize,
                      cpi->mbmi_ext_info.stride);
@@ -1493,7 +1496,7 @@
 
   const MB_MODE_INFO *mbmi = xd->mi[0];
   const BLOCK_SIZE bsize = mbmi->bsize;
-  assert(bsize <= cm->seq_params.sb_size ||
+  assert(bsize <= cm->seq_params->sb_size ||
          (bsize >= BLOCK_SIZES && bsize < BLOCK_SIZES_ALL));
 
   const int bh = mi_size_high[bsize];
@@ -1505,7 +1508,7 @@
   xd->left_txfm_context =
       xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
 
-  write_mbmi_b(cpi, w);
+  write_mbmi_b(cpi, td, w);
 
   for (int plane = 0; plane < AOMMIN(2, av1_num_planes(cm)); ++plane) {
     const uint8_t palette_size_plane =
@@ -1554,10 +1557,10 @@
   if (!mbmi->skip_txfm) {
     int start = aom_tell_size(w);
 
-    write_tokens_b(cpi, w, tok, tok_end);
+    write_tokens_b(cpi, &td->mb, w, tok, tok_end);
 
     const int end = aom_tell_size(w);
-    cpi->rc.coefficient_size += end - start;
+    td->coefficient_size += end - start;
   }
 }
 
@@ -1599,12 +1602,12 @@
 }
 
 static AOM_INLINE void write_modes_sb(
-    AV1_COMP *const cpi, const TileInfo *const tile, aom_writer *const w,
-    const TokenExtra **tok, const TokenExtra *const tok_end, int mi_row,
-    int mi_col, BLOCK_SIZE bsize) {
+    AV1_COMP *const cpi, ThreadData *const td, const TileInfo *const tile,
+    aom_writer *const w, const TokenExtra **tok,
+    const TokenExtra *const tok_end, int mi_row, int mi_col, BLOCK_SIZE bsize) {
   const AV1_COMMON *const cm = &cpi->common;
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
-  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+  MACROBLOCKD *const xd = &td->mb.e_mbd;
   assert(bsize < BLOCK_SIZES_ALL);
   const int hbs = mi_size_wide[bsize] / 2;
   const int quarter_step = mi_size_wide[bsize] / 4;
@@ -1626,8 +1629,7 @@
           const int runit_idx = rcol + rrow * rstride;
           const RestorationUnitInfo *rui =
               &cm->rst_info[plane].unit_info[runit_idx];
-          loop_restoration_write_sb_coeffs(cm, xd, rui, w, plane,
-                                           cpi->td.counts);
+          loop_restoration_write_sb_coeffs(cm, xd, rui, w, plane, td->counts);
         }
       }
     }
@@ -1637,51 +1639,53 @@
   write_partition(cm, xd, hbs, mi_row, mi_col, partition, bsize, w);
   switch (partition) {
     case PARTITION_NONE:
-      write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+      write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col);
       break;
     case PARTITION_HORZ:
-      write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+      write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col);
       if (mi_row + hbs < mi_params->mi_rows)
-        write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col);
+        write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col);
       break;
     case PARTITION_VERT:
-      write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+      write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col);
       if (mi_col + hbs < mi_params->mi_cols)
-        write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs);
+        write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col + hbs);
       break;
     case PARTITION_SPLIT:
-      write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, subsize);
-      write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs, subsize);
-      write_modes_sb(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col, subsize);
-      write_modes_sb(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs,
+      write_modes_sb(cpi, td, tile, w, tok, tok_end, mi_row, mi_col, subsize);
+      write_modes_sb(cpi, td, tile, w, tok, tok_end, mi_row, mi_col + hbs,
+                     subsize);
+      write_modes_sb(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col,
+                     subsize);
+      write_modes_sb(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs,
                      subsize);
       break;
     case PARTITION_HORZ_A:
-      write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
-      write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs);
-      write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col);
+      write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col);
+      write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col + hbs);
+      write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col);
       break;
     case PARTITION_HORZ_B:
-      write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
-      write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col);
-      write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs);
+      write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col);
+      write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col);
+      write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs);
       break;
     case PARTITION_VERT_A:
-      write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
-      write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col);
-      write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs);
+      write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col);
+      write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col);
+      write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col + hbs);
       break;
     case PARTITION_VERT_B:
-      write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
-      write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs);
-      write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs);
+      write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col);
+      write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, mi_col + hbs);
+      write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs);
       break;
     case PARTITION_HORZ_4:
       for (i = 0; i < 4; ++i) {
         int this_mi_row = mi_row + i * quarter_step;
         if (i > 0 && this_mi_row >= mi_params->mi_rows) break;
 
-        write_modes_b(cpi, tile, w, tok, tok_end, this_mi_row, mi_col);
+        write_modes_b(cpi, td, tile, w, tok, tok_end, this_mi_row, mi_col);
       }
       break;
     case PARTITION_VERT_4:
@@ -1689,7 +1693,7 @@
         int this_mi_col = mi_col + i * quarter_step;
         if (i > 0 && this_mi_col >= mi_params->mi_cols) break;
 
-        write_modes_b(cpi, tile, w, tok, tok_end, mi_row, this_mi_col);
+        write_modes_b(cpi, td, tile, w, tok, tok_end, mi_row, this_mi_col);
       }
       break;
     default: assert(0);
@@ -1699,12 +1703,12 @@
   update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition);
 }
 
-static AOM_INLINE void write_modes(AV1_COMP *const cpi,
+static AOM_INLINE void write_modes(AV1_COMP *const cpi, ThreadData *const td,
                                    const TileInfo *const tile,
                                    aom_writer *const w, int tile_row,
                                    int tile_col) {
   AV1_COMMON *const cm = &cpi->common;
-  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+  MACROBLOCKD *const xd = &td->mb.e_mbd;
   const int mi_row_start = tile->mi_row_start;
   const int mi_row_end = tile->mi_row_end;
   const int mi_col_start = tile->mi_col_start;
@@ -1722,9 +1726,9 @@
   }
 
   for (int mi_row = mi_row_start; mi_row < mi_row_end;
-       mi_row += cm->seq_params.mib_size) {
+       mi_row += cm->seq_params->mib_size) {
     const int sb_row_in_tile =
-        (mi_row - tile->mi_row_start) >> cm->seq_params.mib_size_log2;
+        (mi_row - tile->mi_row_start) >> cm->seq_params->mib_size_log2;
     const TokenExtra *tok =
         cpi->token_info.tplist[tile_row][tile_col][sb_row_in_tile].start;
     const TokenExtra *tok_end =
@@ -1733,10 +1737,10 @@
     av1_zero_left_context(xd);
 
     for (int mi_col = mi_col_start; mi_col < mi_col_end;
-         mi_col += cm->seq_params.mib_size) {
-      cpi->td.mb.cb_coef_buff = av1_get_cb_coeff_buffer(cpi, mi_row, mi_col);
-      write_modes_sb(cpi, tile, w, &tok, tok_end, mi_row, mi_col,
-                     cm->seq_params.sb_size);
+         mi_col += cm->seq_params->mib_size) {
+      td->mb.cb_coef_buff = av1_get_cb_coeff_buffer(cpi, mi_row, mi_col);
+      write_modes_sb(cpi, td, tile, w, &tok, tok_end, mi_row, mi_col,
+                     cm->seq_params->sb_size);
     }
     assert(tok == tok_end);
   }
@@ -1745,7 +1749,7 @@
 static AOM_INLINE void encode_restoration_mode(
     AV1_COMMON *cm, struct aom_write_bit_buffer *wb) {
   assert(!cm->features.all_lossless);
-  if (!cm->seq_params.enable_restoration) return;
+  if (!cm->seq_params->enable_restoration) return;
   if (cm->features.allow_intrabc) return;
   const int num_planes = av1_num_planes(cm);
   int all_none = 1, chroma_none = 1;
@@ -1776,9 +1780,9 @@
     }
   }
   if (!all_none) {
-    assert(cm->seq_params.sb_size == BLOCK_64X64 ||
-           cm->seq_params.sb_size == BLOCK_128X128);
-    const int sb_size = cm->seq_params.sb_size == BLOCK_128X128 ? 128 : 64;
+    assert(cm->seq_params->sb_size == BLOCK_64X64 ||
+           cm->seq_params->sb_size == BLOCK_128X128);
+    const int sb_size = cm->seq_params->sb_size == BLOCK_128X128 ? 128 : 64;
 
     RestorationInfo *rsi = &cm->rst_info[0];
 
@@ -1794,7 +1798,8 @@
   }
 
   if (num_planes > 1) {
-    int s = AOMMIN(cm->seq_params.subsampling_x, cm->seq_params.subsampling_y);
+    int s =
+        AOMMIN(cm->seq_params->subsampling_x, cm->seq_params->subsampling_y);
     if (s && !chroma_none) {
       aom_wb_write_bit(wb, cm->rst_info[1].restoration_unit_size !=
                                cm->rst_info[0].restoration_unit_size);
@@ -2027,7 +2032,7 @@
 static AOM_INLINE void encode_cdef(const AV1_COMMON *cm,
                                    struct aom_write_bit_buffer *wb) {
   assert(!cm->features.coded_lossless);
-  if (!cm->seq_params.enable_cdef) return;
+  if (!cm->seq_params->enable_cdef) return;
   if (cm->features.allow_intrabc) return;
   const int num_planes = av1_num_planes(cm);
   int i;
@@ -2080,7 +2085,7 @@
   }
 }
 
-static AOM_INLINE void encode_segmentation(AV1_COMMON *cm, MACROBLOCKD *xd,
+static AOM_INLINE void encode_segmentation(AV1_COMMON *cm,
                                            struct aom_write_bit_buffer *wb) {
   int i, j;
   struct segmentation *seg = &cm->seg;
@@ -2089,17 +2094,9 @@
   if (!seg->enabled) return;
 
   // Write update flags
-  if (cm->features.primary_ref_frame == PRIMARY_REF_NONE) {
-    assert(seg->update_map == 1);
-    seg->temporal_update = 0;
-    assert(seg->update_data == 1);
-  } else {
+  if (cm->features.primary_ref_frame != PRIMARY_REF_NONE) {
     aom_wb_write_bit(wb, seg->update_map);
-    if (seg->update_map) {
-      // Select the coding strategy (temporal or spatial)
-      av1_choose_segmap_coding_method(cm, xd);
-      aom_wb_write_bit(wb, seg->temporal_update);
-    }
+    if (seg->update_map) aom_wb_write_bit(wb, seg->temporal_update);
     aom_wb_write_bit(wb, seg->update_data);
   }
 
@@ -2150,11 +2147,11 @@
 static AOM_INLINE void write_tile_info_max_tile(
     const AV1_COMMON *const cm, struct aom_write_bit_buffer *wb) {
   int width_mi =
-      ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols, cm->seq_params.mib_size_log2);
+      ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols, cm->seq_params->mib_size_log2);
   int height_mi =
-      ALIGN_POWER_OF_TWO(cm->mi_params.mi_rows, cm->seq_params.mib_size_log2);
-  int width_sb = width_mi >> cm->seq_params.mib_size_log2;
-  int height_sb = height_mi >> cm->seq_params.mib_size_log2;
+      ALIGN_POWER_OF_TWO(cm->mi_params.mi_rows, cm->seq_params->mib_size_log2);
+  int width_sb = width_mi >> cm->seq_params->mib_size_log2;
+  int height_sb = height_mi >> cm->seq_params->mib_size_log2;
   int size_sb, i;
   const CommonTileParams *const tiles = &cm->tiles;
 
@@ -2231,13 +2228,6 @@
   }
 }
 
-// Stores the location and size of a tile's data in the bitstream.  Used for
-// later identifying identical tiles
-typedef struct TileBufferEnc {
-  uint8_t *data;
-  size_t size;
-} TileBufferEnc;
-
 static INLINE int find_identical_tile(
     const int tile_row, const int tile_col,
     TileBufferEnc (*const tile_buffers)[MAX_TILE_COLS]) {
@@ -2301,7 +2291,7 @@
 
 static AOM_INLINE void write_superres_scale(const AV1_COMMON *const cm,
                                             struct aom_write_bit_buffer *wb) {
-  const SequenceHeader *const seq_params = &cm->seq_params;
+  const SequenceHeader *const seq_params = cm->seq_params;
   if (!seq_params->enable_superres) {
     assert(cm->superres_scale_denominator == SCALE_NUMERATOR);
     return;
@@ -2328,7 +2318,7 @@
   const int coded_height = cm->superres_upscaled_height - 1;
 
   if (frame_size_override) {
-    const SequenceHeader *seq_params = &cm->seq_params;
+    const SequenceHeader *seq_params = cm->seq_params;
     int num_bits_width = seq_params->num_bits_width;
     int num_bits_height = seq_params->num_bits_height;
     aom_wb_write_literal(wb, coded_width, num_bits_width);
@@ -2486,7 +2476,7 @@
                                          struct aom_write_bit_buffer *wb) {
   aom_wb_write_unsigned_literal(
       wb, cm->frame_presentation_time,
-      cm->seq_params.decoder_model_info.frame_presentation_time_length);
+      cm->seq_params->decoder_model_info.frame_presentation_time_length);
 }
 
 static AOM_INLINE void write_film_grain_params(
@@ -2508,7 +2498,7 @@
       assert(ref_idx != INVALID_IDX);
       const RefCntBuffer *const buf = cm->ref_frame_map[ref_idx];
       if (buf->film_grain_params_present &&
-          av1_check_grain_params_equiv(pars, &buf->film_grain_params)) {
+          aom_check_grain_params_equiv(pars, &buf->film_grain_params)) {
         break;
       }
     }
@@ -2524,15 +2514,15 @@
     aom_wb_write_literal(wb, pars->scaling_points_y[i][1], 8);
   }
 
-  if (!cm->seq_params.monochrome) {
+  if (!cm->seq_params->monochrome) {
     aom_wb_write_bit(wb, pars->chroma_scaling_from_luma);
   } else {
     assert(!pars->chroma_scaling_from_luma);
   }
 
-  if (cm->seq_params.monochrome || pars->chroma_scaling_from_luma ||
-      ((cm->seq_params.subsampling_x == 1) &&
-       (cm->seq_params.subsampling_y == 1) && (pars->num_y_points == 0))) {
+  if (cm->seq_params->monochrome || pars->chroma_scaling_from_luma ||
+      ((cm->seq_params->subsampling_x == 1) &&
+       (cm->seq_params->subsampling_y == 1) && (pars->num_y_points == 0))) {
     assert(pars->num_cb_points == 0 && pars->num_cr_points == 0);
   } else {
     aom_wb_write_literal(wb, pars->num_cb_points, 4);  // max 10
@@ -2828,12 +2818,11 @@
 
 // New function based on HLS R18
 static AOM_INLINE void write_uncompressed_header_obu(
-    AV1_COMP *cpi, struct aom_write_bit_buffer *saved_wb,
+    AV1_COMP *cpi, MACROBLOCKD *const xd, struct aom_write_bit_buffer *saved_wb,
     struct aom_write_bit_buffer *wb) {
   AV1_COMMON *const cm = &cpi->common;
-  const SequenceHeader *const seq_params = &cm->seq_params;
+  const SequenceHeader *const seq_params = cm->seq_params;
   const CommonQuantParams *quant_params = &cm->quant_params;
-  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
   CurrentFrame *const current_frame = &cm->current_frame;
   FeatureFlags *const features = &cm->features;
 
@@ -2912,7 +2901,7 @@
 
     if (cm->superres_upscaled_width > seq_params->max_frame_width ||
         cm->superres_upscaled_height > seq_params->max_frame_height) {
-      aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+      aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
                          "Frame dimensions are larger than the maximum values");
     }
 
@@ -2934,24 +2923,24 @@
   }
 
   if (seq_params->decoder_model_info_present_flag) {
-    aom_wb_write_bit(wb, cm->buffer_removal_time_present);
-    if (cm->buffer_removal_time_present) {
+    aom_wb_write_bit(wb, cpi->ppi->buffer_removal_time_present);
+    if (cpi->ppi->buffer_removal_time_present) {
       for (int op_num = 0;
            op_num < seq_params->operating_points_cnt_minus_1 + 1; op_num++) {
         if (seq_params->op_params[op_num].decoder_model_param_present_flag) {
-          if (((seq_params->operating_point_idc[op_num] >>
+          if (seq_params->operating_point_idc[op_num] == 0 ||
+              ((seq_params->operating_point_idc[op_num] >>
                 cm->temporal_layer_id) &
                    0x1 &&
                (seq_params->operating_point_idc[op_num] >>
                 (cm->spatial_layer_id + 8)) &
-                   0x1) ||
-              seq_params->operating_point_idc[op_num] == 0) {
+                   0x1)) {
             aom_wb_write_unsigned_literal(
                 wb, cm->buffer_removal_times[op_num],
                 seq_params->decoder_model_info.buffer_removal_time_length);
             cm->buffer_removal_times[op_num]++;
             if (cm->buffer_removal_times[op_num] == 0) {
-              aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+              aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
                                  "buffer_removal_time overflowed");
             }
           }
@@ -3038,7 +3027,7 @@
               1;
           if (delta_frame_id_minus_1 < 0 ||
               delta_frame_id_minus_1 >= (1 << diff_len)) {
-            aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR,
+            aom_internal_error(cm->error, AOM_CODEC_ERROR,
                                "Invalid delta_frame_id_minus_1");
           }
           aom_wb_write_literal(wb, delta_frame_id_minus_1, diff_len);
@@ -3075,8 +3064,8 @@
 
   write_tile_info(cm, saved_wb, wb);
   encode_quantization(quant_params, av1_num_planes(cm),
-                      cm->seq_params.separate_uv_delta_q, wb);
-  encode_segmentation(cm, xd, wb);
+                      cm->seq_params->separate_uv_delta_q, wb);
+  encode_segmentation(cm, wb);
 
   const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
   if (delta_q_info->delta_q_present_flag) assert(quant_params->base_qindex > 0);
@@ -3275,11 +3264,11 @@
 }
 
 uint32_t av1_write_obu_header(AV1LevelParams *const level_params,
-                              OBU_TYPE obu_type, int obu_extension,
-                              uint8_t *const dst) {
+                              int *frame_header_count, OBU_TYPE obu_type,
+                              int obu_extension, uint8_t *const dst) {
   if (level_params->keep_level_stats &&
       (obu_type == OBU_FRAME || obu_type == OBU_FRAME_HEADER))
-    ++level_params->frame_header_count;
+    ++(*frame_header_count);
 
   struct aom_write_bit_buffer wb = { dst, 0 };
   uint32_t size = 0;
@@ -3313,8 +3302,8 @@
   return AOM_CODEC_OK;
 }
 
-static size_t obu_memmove(size_t obu_header_size, size_t obu_payload_size,
-                          uint8_t *data) {
+size_t av1_obu_memmove(size_t obu_header_size, size_t obu_payload_size,
+                       uint8_t *data) {
   const size_t length_field_size = aom_uleb_size_in_bytes(obu_payload_size);
   const size_t move_dst_offset = length_field_size + obu_header_size;
   const size_t move_src_offset = obu_header_size;
@@ -3413,12 +3402,12 @@
   return size;
 }
 
-static uint32_t write_frame_header_obu(AV1_COMP *cpi,
+static uint32_t write_frame_header_obu(AV1_COMP *cpi, MACROBLOCKD *const xd,
                                        struct aom_write_bit_buffer *saved_wb,
                                        uint8_t *const dst,
                                        int append_trailing_bits) {
   struct aom_write_bit_buffer wb = { dst, 0 };
-  write_uncompressed_header_obu(cpi, saved_wb, &wb);
+  write_uncompressed_header_obu(cpi, xd, saved_wb, &wb);
   if (append_trailing_bits) add_trailing_bits(&wb);
   return aom_wb_bytes_written(&wb);
 }
@@ -3442,12 +3431,6 @@
   return size;
 }
 
-typedef struct {
-  uint8_t *frame_header;
-  size_t obu_header_byte_offset;
-  size_t total_length;
-} FrameHeaderInfo;
-
 extern void av1_print_uncompressed_frame_header(const uint8_t *data, int size,
                                                 const char *filename);
 
@@ -3456,35 +3439,21 @@
   uint32_t frame_header_size;
 } LargeTileFrameOBU;
 
-typedef struct {
-  struct aom_write_bit_buffer *saved_wb;
-  TileBufferEnc buf;
-  uint32_t *obu_header_size;
-  uint32_t *total_size;
-  uint8_t *dst;
-  uint8_t *tile_data_curr;
-  uint8_t obu_extn_header;
-  int curr_tg_hdr_size;
-  int tile_row;
-  int tile_col;
-  int is_last_tile_in_tg;
-  int new_tg;
-} PackBSParams;
-
 // Initialize OBU header for large scale tile case.
 static uint32_t init_large_scale_tile_obu_header(
     AV1_COMP *const cpi, uint8_t **data, struct aom_write_bit_buffer *saved_wb,
     LargeTileFrameOBU *lst_obu) {
-  AV1LevelParams *const level_params = &cpi->level_params;
+  AV1LevelParams *const level_params = &cpi->ppi->level_params;
   CurrentFrame *const current_frame = &cpi->common.current_frame;
   // For large_scale_tile case, we always have only one tile group, so it can
   // be written as an OBU_FRAME.
   const OBU_TYPE obu_type = OBU_FRAME;
-  lst_obu->tg_hdr_size = av1_write_obu_header(level_params, obu_type, 0, *data);
+  lst_obu->tg_hdr_size = av1_write_obu_header(
+      level_params, &cpi->frame_header_count, obu_type, 0, *data);
   *data += lst_obu->tg_hdr_size;
 
   const uint32_t frame_header_size =
-      write_frame_header_obu(cpi, saved_wb, *data, 0);
+      write_frame_header_obu(cpi, &cpi->td.mb.e_mbd, saved_wb, *data, 0);
   *data += frame_header_size;
   lst_obu->frame_header_size = frame_header_size;
   // (yunqing) This test ensures the correctness of large scale tile coding.
@@ -3522,7 +3491,7 @@
   *total_size += lst_obu->tg_hdr_size;
   const uint32_t obu_payload_size = *total_size - lst_obu->tg_hdr_size;
   const size_t length_field_size =
-      obu_memmove(lst_obu->tg_hdr_size, obu_payload_size, dst);
+      av1_obu_memmove(lst_obu->tg_hdr_size, obu_payload_size, dst);
   if (av1_write_uleb_obu_size(lst_obu->tg_hdr_size, obu_payload_size, dst) !=
       AOM_CODEC_OK)
     assert(0);
@@ -3553,6 +3522,7 @@
   const int tile_rows = tiles->rows;
   unsigned int tile_size = 0;
 
+  av1_reset_pack_bs_thread_data(&cpi->td);
   for (int tile_col = 0; tile_col < tile_cols; tile_col++) {
     TileInfo tile_info;
     const int is_last_col = (tile_col == tile_cols - 1);
@@ -3581,7 +3551,7 @@
       mode_bc.allow_update_cdf =
           mode_bc.allow_update_cdf && !cm->features.disable_cdf_update;
       aom_start_encode(&mode_bc, buf->data + data_offset);
-      write_modes(cpi, &tile_info, &mode_bc, tile_row, tile_col);
+      write_modes(cpi, &cpi->td, &tile_info, &mode_bc, tile_row, tile_col);
       aom_stop_encode(&mode_bc);
       tile_size = mode_bc.pos;
       buf->size = tile_size;
@@ -3629,6 +3599,7 @@
       *max_tile_col_size = AOMMAX(*max_tile_col_size, col_size);
     }
   }
+  av1_accumulate_pack_bs_thread_data(cpi, &cpi->td);
 }
 
 // Packs information in the obu header for large scale tiles.
@@ -3658,18 +3629,45 @@
   return total_size;
 }
 
-// Pack tile data in the bitstream with tile_group, frame
-// and OBU header.
-static void pack_tile_info(AV1_COMP *const cpi,
-                           PackBSParams *const pack_bs_params) {
-  aom_writer mode_bc;
+// Writes obu, tile group and uncompressed headers to bitstream.
+void av1_write_obu_tg_tile_headers(AV1_COMP *const cpi, MACROBLOCKD *const xd,
+                                   PackBSParams *const pack_bs_params,
+                                   const int tile_idx) {
   AV1_COMMON *const cm = &cpi->common;
   const CommonTileParams *const tiles = &cm->tiles;
+  int *const curr_tg_hdr_size = &pack_bs_params->curr_tg_hdr_size;
+  const int tg_size =
+      (tiles->rows * tiles->cols + cpi->num_tg - 1) / cpi->num_tg;
+
+  // Write Tile group, frame and OBU header
+  // A new tile group begins at this tile.  Write the obu header and
+  // tile group header
+  const OBU_TYPE obu_type = (cpi->num_tg == 1) ? OBU_FRAME : OBU_TILE_GROUP;
+  *curr_tg_hdr_size = av1_write_obu_header(
+      &cpi->ppi->level_params, &cpi->frame_header_count, obu_type,
+      pack_bs_params->obu_extn_header, pack_bs_params->tile_data_curr);
+  pack_bs_params->obu_header_size = *curr_tg_hdr_size;
+
+  if (cpi->num_tg == 1)
+    *curr_tg_hdr_size += write_frame_header_obu(
+        cpi, xd, pack_bs_params->saved_wb,
+        pack_bs_params->tile_data_curr + *curr_tg_hdr_size, 0);
+  *curr_tg_hdr_size += write_tile_group_header(
+      pack_bs_params->tile_data_curr + *curr_tg_hdr_size, tile_idx,
+      AOMMIN(tile_idx + tg_size - 1, tiles->cols * tiles->rows - 1),
+      (tiles->log2_rows + tiles->log2_cols), cpi->num_tg > 1);
+  *pack_bs_params->total_size += *curr_tg_hdr_size;
+}
+
+// Pack tile data in the bitstream with tile_group, frame
+// and OBU header.
+void av1_pack_tile_info(AV1_COMP *const cpi, ThreadData *const td,
+                        PackBSParams *const pack_bs_params) {
+  aom_writer mode_bc;
+  AV1_COMMON *const cm = &cpi->common;
   int tile_row = pack_bs_params->tile_row;
   int tile_col = pack_bs_params->tile_col;
-  int *const curr_tg_hdr_size = &pack_bs_params->curr_tg_hdr_size;
   uint32_t *const total_size = pack_bs_params->total_size;
-  uint8_t *tile_data_curr = pack_bs_params->tile_data_curr;
   TileInfo tile_info;
   av1_tile_set_col(&tile_info, cm, tile_col);
   av1_tile_set_row(&tile_info, cm, tile_row);
@@ -3677,30 +3675,10 @@
   mode_bc.allow_update_cdf =
       mode_bc.allow_update_cdf && !cm->features.disable_cdf_update;
 
-  const int tile_idx = tile_row * tiles->cols + tile_col;
-  const int tg_size =
-      (tiles->rows * tiles->cols + cpi->num_tg - 1) / cpi->num_tg;
   unsigned int tile_size;
 
-  // Write Tile group, frame and OBU header
-  if (pack_bs_params->new_tg) {
-    // A new tile group begins at this tile.  Write the obu header and
-    // tile group header
-    const OBU_TYPE obu_type = (cpi->num_tg == 1) ? OBU_FRAME : OBU_TILE_GROUP;
-    *curr_tg_hdr_size =
-        av1_write_obu_header(&cpi->level_params, obu_type,
-                             pack_bs_params->obu_extn_header, tile_data_curr);
-    *pack_bs_params->obu_header_size = *curr_tg_hdr_size;
-
-    if (cpi->num_tg == 1)
-      *curr_tg_hdr_size += write_frame_header_obu(
-          cpi, pack_bs_params->saved_wb, tile_data_curr + *curr_tg_hdr_size, 0);
-    *curr_tg_hdr_size += write_tile_group_header(
-        tile_data_curr + *curr_tg_hdr_size, tile_idx,
-        AOMMIN(tile_idx + tg_size - 1, tiles->cols * tiles->rows - 1),
-        (tiles->log2_rows + tiles->log2_cols), cpi->num_tg > 1);
-    *total_size += *curr_tg_hdr_size;
-  }
+  const int num_planes = av1_num_planes(cm);
+  av1_reset_loop_restoration(&td->mb.e_mbd, num_planes);
 
   pack_bs_params->buf.data = pack_bs_params->dst + *total_size;
 
@@ -3709,7 +3687,7 @@
 
   // Pack tile data
   aom_start_encode(&mode_bc, pack_bs_params->dst + *total_size);
-  write_modes(cpi, &tile_info, &mode_bc, tile_row, tile_col);
+  write_modes(cpi, td, &tile_info, &mode_bc, tile_row, tile_col);
   aom_stop_encode(&mode_bc);
   tile_size = mode_bc.pos;
   assert(tile_size >= AV1_MIN_TILE_SIZE_BYTES);
@@ -3723,6 +3701,80 @@
   }
 }
 
+void av1_write_last_tile_info(
+    AV1_COMP *const cpi, const FrameHeaderInfo *fh_info,
+    struct aom_write_bit_buffer *saved_wb, size_t *curr_tg_data_size,
+    uint8_t *curr_tg_start, uint32_t *const total_size,
+    uint8_t **tile_data_start, int *const largest_tile_id,
+    int *const is_first_tg, uint32_t obu_header_size, uint8_t obu_extn_header) {
+  // write current tile group size
+  const uint32_t obu_payload_size =
+      (uint32_t)(*curr_tg_data_size) - obu_header_size;
+  const size_t length_field_size =
+      av1_obu_memmove(obu_header_size, obu_payload_size, curr_tg_start);
+  if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size,
+                              curr_tg_start) != AOM_CODEC_OK) {
+    assert(0);
+  }
+  *curr_tg_data_size += (int)length_field_size;
+  *total_size += (uint32_t)length_field_size;
+  *tile_data_start += length_field_size;
+  if (cpi->num_tg == 1) {
+    // if this tg is combined with the frame header then update saved
+    // frame header base offset according to length field size
+    saved_wb->bit_buffer += length_field_size;
+  }
+
+  if (!(*is_first_tg) && cpi->common.features.error_resilient_mode) {
+    // Make room for a duplicate Frame Header OBU.
+    memmove(curr_tg_start + fh_info->total_length, curr_tg_start,
+            *curr_tg_data_size);
+
+    // Insert a copy of the Frame Header OBU.
+    memcpy(curr_tg_start, fh_info->frame_header, fh_info->total_length);
+
+    // Force context update tile to be the first tile in error
+    // resilient mode as the duplicate frame headers will have
+    // context_update_tile_id set to 0
+    *largest_tile_id = 0;
+
+    // Rewrite the OBU header to change the OBU type to Redundant Frame
+    // Header.
+    av1_write_obu_header(&cpi->ppi->level_params, &cpi->frame_header_count,
+                         OBU_REDUNDANT_FRAME_HEADER, obu_extn_header,
+                         &curr_tg_start[fh_info->obu_header_byte_offset]);
+
+    *curr_tg_data_size += (int)(fh_info->total_length);
+    *total_size += (uint32_t)(fh_info->total_length);
+  }
+  *is_first_tg = 0;
+}
+
+void av1_reset_pack_bs_thread_data(ThreadData *const td) {
+  td->coefficient_size = 0;
+  td->max_mv_magnitude = 0;
+  av1_zero(td->interp_filter_selected);
+}
+
+void av1_accumulate_pack_bs_thread_data(AV1_COMP *const cpi,
+                                        ThreadData const *td) {
+  int do_max_mv_magnitude_update = 1;
+  cpi->rc.coefficient_size += td->coefficient_size;
+
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  // Disable max_mv_magnitude update for parallel frames based on update flag.
+  if (!cpi->do_frame_data_update) do_max_mv_magnitude_update = 0;
+#endif
+
+  if (cpi->sf.mv_sf.auto_mv_step_size && do_max_mv_magnitude_update)
+    cpi->mv_search_params.max_mv_magnitude =
+        AOMMAX(cpi->mv_search_params.max_mv_magnitude, td->max_mv_magnitude);
+
+  for (InterpFilter filter = EIGHTTAP_REGULAR; filter < SWITCHABLE; filter++)
+    cpi->common.cur_frame->interp_filter_selected[filter] +=
+        td->interp_filter_selected[filter];
+}
+
 // Store information related to each default tile in the OBU header.
 static void write_tile_obu(
     AV1_COMP *const cpi, uint8_t *const dst, uint32_t *total_size,
@@ -3731,6 +3783,7 @@
     unsigned int *max_tile_size, uint32_t *const obu_header_size,
     uint8_t **tile_data_start) {
   AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
   const CommonTileParams *const tiles = &cm->tiles;
   const int tile_cols = tiles->cols;
   const int tile_rows = tiles->rows;
@@ -3743,6 +3796,7 @@
   int new_tg = 1;
   int is_first_tg = 1;
 
+  av1_reset_pack_bs_thread_data(&cpi->td);
   for (int tile_row = 0; tile_row < tile_rows; tile_row++) {
     for (int tile_col = 0; tile_col < tile_cols; tile_col++) {
       const int tile_idx = tile_row * tile_cols + tile_col;
@@ -3758,9 +3812,7 @@
       if (tile_count == tg_size || tile_idx == (tile_cols * tile_rows - 1))
         is_last_tile_in_tg = 1;
 
-      cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx;
-      const int num_planes = av1_num_planes(cm);
-      av1_reset_loop_restoration(&cpi->td.mb.e_mbd, num_planes);
+      xd->tile_ctx = &this_tile->tctx;
 
       // PackBSParams stores all parameters required to pack tile and header
       // info.
@@ -3770,18 +3822,22 @@
       pack_bs_params.is_last_tile_in_tg = is_last_tile_in_tg;
       pack_bs_params.new_tg = new_tg;
       pack_bs_params.obu_extn_header = obu_extn_header;
-      pack_bs_params.obu_header_size = obu_header_size;
+      pack_bs_params.obu_header_size = 0;
       pack_bs_params.saved_wb = saved_wb;
       pack_bs_params.tile_col = tile_col;
       pack_bs_params.tile_row = tile_row;
       pack_bs_params.tile_data_curr = tile_data_curr;
       pack_bs_params.total_size = total_size;
 
-      pack_tile_info(cpi, &pack_bs_params);
+      if (new_tg)
+        av1_write_obu_tg_tile_headers(cpi, xd, &pack_bs_params, tile_idx);
+
+      av1_pack_tile_info(cpi, &cpi->td, &pack_bs_params);
 
       if (new_tg) {
         curr_tg_data_size = pack_bs_params.curr_tg_hdr_size;
         *tile_data_start += pack_bs_params.curr_tg_hdr_size;
+        *obu_header_size = pack_bs_params.obu_header_size;
         new_tg = 0;
       }
       if (is_last_tile_in_tg) new_tg = 1;
@@ -3794,53 +3850,15 @@
         *max_tile_size = (unsigned int)pack_bs_params.buf.size;
       }
 
-      if (is_last_tile_in_tg) {
-        // write current tile group size
-        const uint32_t obu_payload_size =
-            (uint32_t)curr_tg_data_size - *obu_header_size;
-        const size_t length_field_size =
-            obu_memmove(*obu_header_size, obu_payload_size, tile_data_curr);
-        if (av1_write_uleb_obu_size(*obu_header_size, obu_payload_size,
-                                    tile_data_curr) != AOM_CODEC_OK) {
-          assert(0);
-        }
-        curr_tg_data_size += (int)length_field_size;
-        *total_size += (uint32_t)length_field_size;
-        *tile_data_start += length_field_size;
-        if (num_tg_hdrs == 1) {
-          // if this tg is combined with the frame header then update saved
-          // frame header base offset accroding to length field size
-          saved_wb->bit_buffer += length_field_size;
-        }
-
-        if (!is_first_tg && cm->features.error_resilient_mode) {
-          // Make room for a duplicate Frame Header OBU.
-          memmove(tile_data_curr + fh_info->total_length, tile_data_curr,
-                  curr_tg_data_size);
-
-          // Insert a copy of the Frame Header OBU.
-          memcpy(tile_data_curr, fh_info->frame_header, fh_info->total_length);
-
-          // Force context update tile to be the first tile in error
-          // resiliant mode as the duplicate frame headers will have
-          // context_update_tile_id set to 0
-          *largest_tile_id = 0;
-
-          // Rewrite the OBU header to change the OBU type to Redundant Frame
-          // Header.
-          av1_write_obu_header(
-              &cpi->level_params, OBU_REDUNDANT_FRAME_HEADER, obu_extn_header,
-              &tile_data_curr[fh_info->obu_header_byte_offset]);
-          tile_data_curr += fh_info->total_length;
-
-          curr_tg_data_size += (int)(fh_info->total_length);
-          *total_size += (uint32_t)(fh_info->total_length);
-        }
-        is_first_tg = 0;
-      }
+      if (is_last_tile_in_tg)
+        av1_write_last_tile_info(cpi, fh_info, saved_wb, &curr_tg_data_size,
+                                 tile_data_curr, total_size, tile_data_start,
+                                 largest_tile_id, &is_first_tg,
+                                 *obu_header_size, obu_extn_header);
       *total_size += (uint32_t)pack_bs_params.buf.size;
     }
   }
+  av1_accumulate_pack_bs_thread_data(cpi, &cpi->td);
 }
 
 // Write total buffer size and related information into the OBU header for
@@ -3898,6 +3916,40 @@
   }
 }
 
+// As per the experiments, single-thread bitstream packing is better for
+// frames with a smaller bitstream size, because the setup overhead of the
+// multithreaded path exceeds the time required to pack the smaller bitstream
+// of such frames. This function computes the required number of workers
+// based on the setup time overhead and the job dispatch time overhead for
+// the given tiles and available workers.
+int calc_pack_bs_mt_workers(const TileDataEnc *tile_data, int num_tiles,
+                            int avail_workers) {
+  if (AOMMIN(avail_workers, num_tiles) <= 1) return 1;
+
+  uint64_t frame_abs_sum_level = 0;
+
+  for (int idx = 0; idx < num_tiles; idx++)
+    frame_abs_sum_level += tile_data[idx].abs_sum_level;
+
+  int ideal_num_workers = 1;
+  const float job_disp_time_const = (float)num_tiles * JOB_DISP_TIME_OH_CONST;
+  float max_sum = 0.0;
+
+  for (int num_workers = avail_workers; num_workers > 1; num_workers--) {
+    const float fas_per_worker_const =
+        ((float)(num_workers - 1) / num_workers) * frame_abs_sum_level;
+    const float setup_time_const = (float)num_workers * SETUP_TIME_OH_CONST;
+    const float this_sum = fas_per_worker_const - setup_time_const -
+                           job_disp_time_const / num_workers;
+
+    if (this_sum > max_sum) {
+      max_sum = this_sum;
+      ideal_num_workers = num_workers;
+    }
+  }
+  return ideal_num_workers;
+}
+
 static INLINE uint32_t pack_tiles_in_tg_obus(
     AV1_COMP *const cpi, uint8_t *const dst,
     struct aom_write_bit_buffer *saved_wb, uint8_t obu_extension_header,
@@ -3907,16 +3959,24 @@
   unsigned int max_tile_size = 0;
   uint32_t obu_header_size = 0;
   uint8_t *tile_data_start = dst;
-
-  write_tile_obu(cpi, dst, &total_size, saved_wb, obu_extension_header, fh_info,
-                 largest_tile_id, &max_tile_size, &obu_header_size,
-                 &tile_data_start);
-
   const int tile_cols = tiles->cols;
   const int tile_rows = tiles->rows;
-  const int have_tiles = tile_cols * tile_rows > 1;
+  const int num_tiles = tile_rows * tile_cols;
 
-  if (have_tiles)
+  const int num_workers = calc_pack_bs_mt_workers(
+      cpi->tile_data, num_tiles, cpi->mt_info.num_mod_workers[MOD_PACK_BS]);
+
+  if (num_workers > 1) {
+    av1_write_tile_obu_mt(cpi, dst, &total_size, saved_wb, obu_extension_header,
+                          fh_info, largest_tile_id, &max_tile_size,
+                          &obu_header_size, &tile_data_start, num_workers);
+  } else {
+    write_tile_obu(cpi, dst, &total_size, saved_wb, obu_extension_header,
+                   fh_info, largest_tile_id, &max_tile_size, &obu_header_size,
+                   &tile_data_start);
+  }
+
+  if (num_tiles > 1)
     write_tile_obu_size(cpi, dst, saved_wb, *largest_tile_id, &total_size,
                         max_tile_size, obu_header_size, tile_data_start);
   return total_size;
@@ -3931,6 +3991,9 @@
   const CommonTileParams *const tiles = &cm->tiles;
   *largest_tile_id = 0;
 
+  // Select the coding strategy (temporal or spatial)
+  if (cm->seg.enabled) av1_choose_segmap_coding_method(cm, &cpi->td.mb.e_mbd);
+
   if (tiles->large_scale)
     return pack_large_scale_tiles_in_tg_obus(cpi, dst, saved_wb,
                                              largest_tile_id);
@@ -3970,18 +4033,20 @@
           (cm->current_frame.frame_type != KEY_FRAME &&
            current_metadata->insert_flag == AOM_MIF_NON_KEY_FRAME) ||
           current_metadata->insert_flag == AOM_MIF_ANY_FRAME) {
-        obu_header_size =
-            av1_write_obu_header(&cpi->level_params, OBU_METADATA, 0, dst);
+        obu_header_size = av1_write_obu_header(&cpi->ppi->level_params,
+                                               &cpi->frame_header_count,
+                                               OBU_METADATA, 0, dst);
         obu_payload_size =
             av1_write_metadata_obu(current_metadata, dst + obu_header_size);
-        length_field_size = obu_memmove(obu_header_size, obu_payload_size, dst);
+        length_field_size =
+            av1_obu_memmove(obu_header_size, obu_payload_size, dst);
         if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size, dst) ==
             AOM_CODEC_OK) {
           const size_t obu_size = obu_header_size + obu_payload_size;
           dst += obu_size + length_field_size;
           total_bytes_written += obu_size + length_field_size;
         } else {
-          aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR,
+          aom_internal_error(cpi->common.error, AOM_CODEC_ERROR,
                              "Error writing metadata OBU size");
         }
       }
@@ -3995,7 +4060,7 @@
   uint8_t *data = dst;
   uint32_t data_size;
   AV1_COMMON *const cm = &cpi->common;
-  AV1LevelParams *const level_params = &cpi->level_params;
+  AV1LevelParams *const level_params = &cpi->ppi->level_params;
   uint32_t obu_header_size = 0;
   uint32_t obu_payload_size = 0;
   FrameHeaderInfo fh_info = { NULL, 0, 0 };
@@ -4011,19 +4076,20 @@
   bitstream_queue_reset_write();
 #endif
 
-  level_params->frame_header_count = 0;
+  cpi->frame_header_count = 0;
 
   // The TD is now written outside the frame encode loop
 
-  // write sequence header obu if KEY_FRAME, preceded by 4-byte size
-  if (cm->current_frame.frame_type == KEY_FRAME && !cpi->no_show_fwd_kf) {
-    obu_header_size =
-        av1_write_obu_header(level_params, OBU_SEQUENCE_HEADER, 0, data);
+  // write sequence header obu at each key frame, preceded by 4-byte size
+  if (cm->current_frame.frame_type == KEY_FRAME &&
+      cpi->ppi->gf_group.refbuf_state[cpi->gf_frame_index] == REFBUF_RESET) {
+    obu_header_size = av1_write_obu_header(
+        level_params, &cpi->frame_header_count, OBU_SEQUENCE_HEADER, 0, data);
 
     obu_payload_size =
-        av1_write_sequence_header_obu(&cm->seq_params, data + obu_header_size);
+        av1_write_sequence_header_obu(cm->seq_params, data + obu_header_size);
     const size_t length_field_size =
-        obu_memmove(obu_header_size, obu_payload_size, data);
+        av1_obu_memmove(obu_header_size, obu_payload_size, data);
     if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size, data) !=
         AOM_CODEC_OK) {
       return AOM_CODEC_ERROR;
@@ -4042,12 +4108,13 @@
   if (write_frame_header) {
     // Write Frame Header OBU.
     fh_info.frame_header = data;
-    obu_header_size = av1_write_obu_header(level_params, OBU_FRAME_HEADER,
-                                           obu_extension_header, data);
-    obu_payload_size =
-        write_frame_header_obu(cpi, &saved_wb, data + obu_header_size, 1);
+    obu_header_size =
+        av1_write_obu_header(level_params, &cpi->frame_header_count,
+                             OBU_FRAME_HEADER, obu_extension_header, data);
+    obu_payload_size = write_frame_header_obu(cpi, &cpi->td.mb.e_mbd, &saved_wb,
+                                              data + obu_header_size, 1);
 
-    length_field = obu_memmove(obu_header_size, obu_payload_size, data);
+    length_field = av1_obu_memmove(obu_header_size, obu_payload_size, data);
     if (av1_write_uleb_obu_size(obu_header_size, obu_payload_size, data) !=
         AOM_CODEC_OK) {
       return AOM_CODEC_ERROR;
diff --git a/av1/encoder/bitstream.h b/av1/encoder/bitstream.h
index df35ecc..e32cd3b 100644
--- a/av1/encoder/bitstream.h
+++ b/av1/encoder/bitstream.h
@@ -16,9 +16,67 @@
 extern "C" {
 #endif
 
-#include "av1/encoder/encoder.h"
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+#include "av1/common/enums.h"
+#include "av1/encoder/level.h"
+#include "aom_dsp/bitwriter.h"
 
 struct aom_write_bit_buffer;
+struct AV1_COMP;
+struct ThreadData;
+
+/*!\cond */
+
+// Stores the location and size of a tile's data in the bitstream.  Used for
+// later identifying identical tiles
+typedef struct {
+  uint8_t *data;
+  size_t size;
+} TileBufferEnc;
+
+typedef struct {
+  uint8_t *frame_header;
+  size_t obu_header_byte_offset;
+  size_t total_length;
+} FrameHeaderInfo;
+
+typedef struct {
+  struct aom_write_bit_buffer *saved_wb;  // Bit stream buffer writer structure
+  TileBufferEnc buf;     // Structure to hold bitstream buffer and size
+  uint32_t *total_size;  // Size of the bitstream buffer for the tile in bytes
+  uint8_t *dst;          // Base address of tile bitstream buffer
+  uint8_t *tile_data_curr;   // Base address of tile-group bitstream buffer
+  size_t tile_buf_size;      // Available bitstream buffer for the tile in bytes
+  uint8_t obu_extn_header;   // Presence of OBU extension header
+  uint32_t obu_header_size;  // Size of the OBU header
+  int curr_tg_hdr_size;      // Size of the obu, tg, frame headers
+  int tile_size_mi;          // Tile size in mi units
+  int tile_row;              // Row index of the tile in the frame
+  int tile_col;              // Column index of the tile in the frame
+  int is_last_tile_in_tg;    // Flag to indicate last tile in a tile-group
+  int new_tg;                // Flag to indicate starting of a new tile-group
+} PackBSParams;
+
+typedef struct {
+  uint64_t abs_sum_level;
+  uint16_t tile_idx;
+} PackBSTileOrder;
+
+// Pack bitstream data for pack bitstream multi-threading.
+typedef struct {
+#if CONFIG_MULTITHREAD
+  // Mutex lock used while dispatching jobs.
+  pthread_mutex_t *mutex_;
+#endif
+  // Tile order structure of pack bitstream multithreading.
+  PackBSTileOrder pack_bs_tile_order[MAX_TILES];
+
+  // Index of next job to be processed.
+  int next_job_idx;
+} AV1EncPackBSSync;
+
+/*!\endcond */
 
 // Writes only the OBU Sequence Header payload, and returns the size of the
 // payload written to 'dst'. This function does not write the OBU header, the
@@ -29,23 +87,44 @@
 // Writes the OBU header byte, and the OBU header extension byte when
 // 'obu_extension' is non-zero. Returns number of bytes written to 'dst'.
 uint32_t av1_write_obu_header(AV1LevelParams *const level_params,
-                              OBU_TYPE obu_type, int obu_extension,
-                              uint8_t *const dst);
+                              int *frame_header_count, OBU_TYPE obu_type,
+                              int obu_extension, uint8_t *const dst);
 
 int av1_write_uleb_obu_size(size_t obu_header_size, size_t obu_payload_size,
                             uint8_t *dest);
 
+// Pack tile data in the bitstream with tile_group, frame
+// and OBU header.
+void av1_pack_tile_info(struct AV1_COMP *const cpi, struct ThreadData *const td,
+                        PackBSParams *const pack_bs_params);
+
+void av1_write_last_tile_info(
+    struct AV1_COMP *const cpi, const FrameHeaderInfo *fh_info,
+    struct aom_write_bit_buffer *saved_wb, size_t *curr_tg_data_size,
+    uint8_t *curr_tg_start, uint32_t *const total_size,
+    uint8_t **tile_data_start, int *const largest_tile_id,
+    int *const is_first_tg, uint32_t obu_header_size, uint8_t obu_extn_header);
+
 /*!\brief Pack the bitstream for one frame
  *
  * \ingroup high_level_algo
  * \callgraph
  */
-int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size,
+int av1_pack_bitstream(struct AV1_COMP *const cpi, uint8_t *dst, size_t *size,
                        int *const largest_tile_id);
 
 void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd,
                        TX_TYPE tx_type, TX_SIZE tx_size, aom_writer *w);
 
+void av1_reset_pack_bs_thread_data(struct ThreadData *const td);
+
+void av1_accumulate_pack_bs_thread_data(struct AV1_COMP *const cpi,
+                                        struct ThreadData const *td);
+
+void av1_write_obu_tg_tile_headers(struct AV1_COMP *const cpi,
+                                   MACROBLOCKD *const xd,
+                                   PackBSParams *const pack_bs_params,
+                                   const int tile_idx);
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/av1/encoder/block.h b/av1/encoder/block.h
index 91df25d..9696859 100644
--- a/av1/encoder/block.h
+++ b/av1/encoder/block.h
@@ -26,7 +26,7 @@
 #include "av1/encoder/partition_cnn_weights.h"
 #endif
 
-#include "av1/encoder/hash.h"
+#include "av1/encoder/hash_motion.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -45,6 +45,8 @@
 //! Number of txfm hash records kept for the txfm block.
 #define TX_SIZE_RD_RECORD_BUFFER_LEN 256
 
+/*! Maximum value taken by transform type probabilities */
+#define MAX_TX_TYPE_PROB 1024
 /*! \brief Superblock level encoder info
  *
  * SuperblockEnc stores superblock level information used by the encoder for
@@ -250,8 +252,8 @@
  */
 typedef struct {
   //! Circular buffer that stores the txfm search results.
-  MB_RD_INFO tx_rd_info[RD_RECORD_BUFFER_LEN];  // Circular buffer.
-  //! Index to insert the newest \ref TXB_RD_INFO.
+  MB_RD_INFO tx_rd_info[RD_RECORD_BUFFER_LEN];
+  //! Index to insert the newest rd record.
   int index_start;
   //! Number of info stored in this record.
   int num;
@@ -259,44 +261,6 @@
   CRC32C crc_calculator;
 } MB_RD_RECORD;
 
-/*! \brief Txfm search results for a tx block.
- */
-typedef struct {
-  //! Distortion after the txfm process
-  int64_t dist;
-  //! SSE of the prediction before the txfm process
-  int64_t sse;
-  //! Rate used to encode the txfm.
-  int rate;
-  //! Location of the end of non-zero entries.
-  uint16_t eob;
-  //! Transform type used on the current block.
-  TX_TYPE tx_type;
-  //! Unknown usage
-  uint16_t entropy_context;
-  //! Context used to code the coefficients.
-  uint8_t txb_entropy_ctx;
-  //! Whether the current info block contains  valid info
-  uint8_t valid;
-  //! Unused
-  uint8_t fast;
-  //! Whether trellis optimization is done.
-  uint8_t perform_block_coeff_opt;
-} TXB_RD_INFO;
-
-/*! \brief Hash records of txfm search result for each tx block.
- */
-typedef struct {
-  //! The hash values.
-  uint32_t hash_vals[TX_SIZE_RD_RECORD_BUFFER_LEN];
-  //! The txfm search results
-  TXB_RD_INFO tx_rd_info[TX_SIZE_RD_RECORD_BUFFER_LEN];
-  //! Index to insert the newest \ref TXB_RD_INFO.
-  int index_start;
-  //! Number of info stored in this record.
-  int num;
-} TXB_RD_RECORD;
-
 //! Number of compound rd stats
 #define MAX_COMP_RD_STATS 64
 /*! \brief Rdcost stats in compound mode.
@@ -429,11 +393,9 @@
    * features.
    */
   int use_default_intra_tx_type;
-  /*! \brief Whether to limit the inter txfm search type to the default txfm.
-   *
-   * \copydetails use_default_intra_tx_type
-   */
-  int use_default_inter_tx_type;
+
+  /*! Probability threshold used for conditionally forcing tx type */
+  int default_inter_tx_type_prob_thresh;
 
   //! Whether to prune 2d transforms based on 1d transform results.
   int prune_2d_txfm_mode;
@@ -499,16 +461,6 @@
   //! Txfm hash record for the whole coding block.
   MB_RD_RECORD mb_rd_record;
 
-  //! Inter mode txfm hash record for TX_8X8 blocks.
-  TXB_RD_RECORD txb_rd_record_8X8[MAX_NUM_8X8_TXBS];
-  //! Inter mode txfm hash record for TX_16X16 blocks.
-  TXB_RD_RECORD txb_rd_record_16X16[MAX_NUM_16X16_TXBS];
-  //! Inter mode txfm hash record for TX_32X32 blocks.
-  TXB_RD_RECORD txb_rd_record_32X32[MAX_NUM_32X32_TXBS];
-  //! Inter mode txfm hash record for TX_64X64 blocks.
-  TXB_RD_RECORD txb_rd_record_64X64[MAX_NUM_64X64_TXBS];
-  //! Intra mode txfm hash record for square tx blocks.
-  TXB_RD_RECORD txb_rd_record_intra;
   /**@}*/
 } TxbRdRecords;
 
@@ -834,6 +786,14 @@
   int lighting_change;
   int low_sumdiff;
 } CONTENT_STATE_SB;
+
+// Structure to hold pixel level gradient info.
+typedef struct {
+  uint16_t abs_dx_abs_dy_sum;
+  int8_t hist_bin_idx;
+  bool is_dx_zero;
+} PixelLevelGradientInfo;
+
 /*!\endcond */
 
 /*! \brief Encoder's parameters related to the current coding block.
@@ -1036,6 +996,10 @@
   int pred_mv_sad[REF_FRAMES];
   //! The minimum of \ref pred_mv_sad.
   int best_pred_mv_sad;
+  //! The sad of the 1st mv ref (nearest).
+  int pred_mv0_sad[REF_FRAMES];
+  //! The sad of the 2nd mv ref (near).
+  int pred_mv1_sad[REF_FRAMES];
 
   /*! \brief Disables certain ref frame pruning based on tpl.
    *
@@ -1173,6 +1137,15 @@
   /*! \brief The mode to reuse during \ref av1_rd_pick_intra_mode_sb and
    *  \ref av1_rd_pick_inter_mode. */
   const MB_MODE_INFO *mb_mode_cache;
+  /*! \brief Pointer to the buffer which caches gradient information.
+   *
+   * Pointer to the array of structures to store gradient information of each
+   * pixel in a superblock. The buffer constitutes of MAX_SB_SQUARE pixel level
+   * structures for each of the plane types (PLANE_TYPE_Y and PLANE_TYPE_UV).
+   */
+  PixelLevelGradientInfo *pixel_gradient_info;
+  /*! \brief Flags indicating the availability of cached gradient info. */
+  bool is_sb_gradient_cached[PLANE_TYPES];
   /**@}*/
 
   /*****************************************************************************
@@ -1217,6 +1190,8 @@
    * Used in REALTIME coding mode to enhance the visual quality at the boundary
    * of moving color objects.
    */
+  uint8_t color_sensitivity_sb[2];
+  //! Color sensitivity flag for the coding block.
   uint8_t color_sensitivity[2];
   /**@}*/
 
@@ -1233,10 +1208,39 @@
   DECLARE_ALIGNED(16, uint8_t, est_pred[128 * 128]);
 #endif
   /**@}*/
+
+  /*! \brief NONE partition evaluated for merge.
+   *
+   * In variance based partitioning scheme, NONE & SPLIT partitions are
+   * evaluated to check the SPLIT can be merged as NONE. This flag signifies the
+   * partition is evaluated in the scheme.
+   */
+  int try_merge_partition;
 } MACROBLOCK;
 #undef SINGLE_REF_MODES
 
 /*!\cond */
+// Zeroes out 'n_stats' elements in the array x->winner_mode_stats.
+// It only zeroes out what is necessary in 'color_index_map' (just the block
+// size, not the whole array).
+static INLINE void zero_winner_mode_stats(BLOCK_SIZE bsize, int n_stats,
+                                          WinnerModeStats *stats) {
+  const int block_height = block_size_high[bsize];
+  const int block_width = block_size_wide[bsize];
+  for (int i = 0; i < n_stats; ++i) {
+    WinnerModeStats *const stat = &stats[i];
+    memset(&stat->mbmi, 0, sizeof(stat->mbmi));
+    memset(&stat->rd_cost, 0, sizeof(stat->rd_cost));
+    memset(&stat->rd, 0, sizeof(stat->rd));
+    memset(&stat->rate_y, 0, sizeof(stat->rate_y));
+    memset(&stat->rate_uv, 0, sizeof(stat->rate_uv));
+    // Do not reset the whole array as it is CPU intensive.
+    memset(&stat->color_index_map, 0,
+           block_width * block_height * sizeof(stat->color_index_map[0]));
+    memset(&stat->mode_index, 0, sizeof(stat->mode_index));
+  }
+}
+
 static INLINE int is_rect_tx_allowed_bsize(BLOCK_SIZE bsize) {
   static const char LUT[BLOCK_SIZES_ALL] = {
     0,  // BLOCK_4X4
diff --git a/av1/encoder/blockiness.c b/av1/encoder/blockiness.c
index f7cff9e..6ad2dda 100644
--- a/av1/encoder/blockiness.c
+++ b/av1/encoder/blockiness.c
@@ -18,7 +18,6 @@
 #include "aom/aom_integer.h"
 #include "aom_dsp/aom_filter.h"
 #include "aom_ports/mem.h"
-#include "aom_ports/system_state.h"
 
 static int horizontal_filter(const uint8_t *s) {
   return (s[1] - s[-2]) * 2 + (s[-1] - s[0]) * 6;
@@ -125,7 +124,6 @@
                           int height) {
   double blockiness = 0;
   int i, j;
-  aom_clear_system_state();
   for (i = 0; i < height;
        i += 4, img1 += img1_pitch * 4, img2 += img2_pitch * 4) {
     for (j = 0; j < width; j += 4) {
diff --git a/av1/encoder/compound_type.c b/av1/encoder/compound_type.c
index 93f7d1f..00fa389 100644
--- a/av1/encoder/compound_type.c
+++ b/av1/encoder/compound_type.c
@@ -166,14 +166,14 @@
   // TODO(nithya): Sign estimation assumes 45 degrees (1st and 4th quadrants)
   // for all codebooks; experiment with other quadrant combinations for
   // 0, 90 and 135 degrees also.
-  cpi->fn_ptr[f_index].vf(src, src_stride, pred0, stride0, &esq[0][0]);
-  cpi->fn_ptr[f_index].vf(src + bh_by2 * src_stride + bw_by2, src_stride,
-                          pred0 + bh_by2 * stride0 + bw_by2, stride0,
-                          &esq[0][1]);
-  cpi->fn_ptr[f_index].vf(src, src_stride, pred1, stride1, &esq[1][0]);
-  cpi->fn_ptr[f_index].vf(src + bh_by2 * src_stride + bw_by2, src_stride,
-                          pred1 + bh_by2 * stride1 + bw_by2, stride0,
-                          &esq[1][1]);
+  cpi->ppi->fn_ptr[f_index].vf(src, src_stride, pred0, stride0, &esq[0][0]);
+  cpi->ppi->fn_ptr[f_index].vf(src + bh_by2 * src_stride + bw_by2, src_stride,
+                               pred0 + bh_by2 * stride0 + bw_by2, stride0,
+                               &esq[0][1]);
+  cpi->ppi->fn_ptr[f_index].vf(src, src_stride, pred1, stride1, &esq[1][0]);
+  cpi->ppi->fn_ptr[f_index].vf(src + bh_by2 * src_stride + bw_by2, src_stride,
+                               pred1 + bh_by2 * stride1 + bw_by2, stride0,
+                               &esq[1][1]);
 
   tl = ((int64_t)esq[0][0]) - ((int64_t)esq[1][0]);
   br = ((int64_t)esq[1][1]) - ((int64_t)esq[0][1]);
@@ -314,7 +314,7 @@
   int8_t wedge_sign = 0;
 
   assert(is_interinter_compound_used(COMPOUND_WEDGE, bsize));
-  assert(cpi->common.seq_params.enable_masked_compound);
+  assert(cpi->common.seq_params->enable_masked_compound);
 
   if (cpi->sf.inter_sf.fast_wedge_sign_estimate) {
     wedge_sign = estimate_wedge_sign(cpi, x, bsize, p0, bw, p1, bw);
@@ -392,7 +392,7 @@
   const MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
   assert(av1_is_wedge_used(bsize));
-  assert(cpi->common.seq_params.enable_interintra_compound);
+  assert(cpi->common.seq_params->enable_interintra_compound);
 
   const struct buf_2d *const src = &x->plane[0].src;
   const int bw = block_size_wide[bsize];
@@ -836,7 +836,7 @@
   const int try_average_comp = (mode_search_mask & (1 << COMPOUND_AVERAGE));
   const int try_distwtd_comp =
       ((mode_search_mask & (1 << COMPOUND_DISTWTD)) &&
-       cm->seq_params.order_hint_info.enable_dist_wtd_comp == 1 &&
+       cm->seq_params->order_hint_info.enable_dist_wtd_comp == 1 &&
        cpi->sf.inter_sf.use_dist_wtd_comp_flag != DIST_WTD_COMP_DISABLED);
 
   // Check if COMPOUND_AVERAGE and COMPOUND_DISTWTD are valid cases
@@ -1058,10 +1058,12 @@
   if (compound_type == COMPOUND_WEDGE) {
     unsigned int sse;
     if (is_cur_buf_hbd(xd))
-      (void)cpi->fn_ptr[bsize].vf(CONVERT_TO_BYTEPTR(*preds0), *strides,
-                                  CONVERT_TO_BYTEPTR(*preds1), *strides, &sse);
+      (void)cpi->ppi->fn_ptr[bsize].vf(CONVERT_TO_BYTEPTR(*preds0), *strides,
+                                       CONVERT_TO_BYTEPTR(*preds1), *strides,
+                                       &sse);
     else
-      (void)cpi->fn_ptr[bsize].vf(*preds0, *strides, *preds1, *strides, &sse);
+      (void)cpi->ppi->fn_ptr[bsize].vf(*preds0, *strides, *preds1, *strides,
+                                       &sse);
     const unsigned int mse =
         ROUND_POWER_OF_TWO(sse, num_pels_log2_lookup[bsize]);
     // If two predictors are very similar, skip wedge compound mode search
@@ -1339,7 +1341,7 @@
         if (have_newmv_in_inter_mode(this_mode)) {
           InterPredParams inter_pred_params;
           av1_dist_wtd_comp_weight_assign(
-              &cpi->common, mbmi, 0, &inter_pred_params.conv_params.fwd_offset,
+              &cpi->common, mbmi, &inter_pred_params.conv_params.fwd_offset,
               &inter_pred_params.conv_params.bck_offset,
               &inter_pred_params.conv_params.use_dist_wtd_comp_avg, 1);
           int mask_value = inter_pred_params.conv_params.fwd_offset * 4;
@@ -1371,7 +1373,7 @@
       int_mv tmp_mv[2] = { mbmi->mv[0], mbmi->mv[1] };
       int best_rs2 = 0;
       int best_rate_mv = *rate_mv;
-      const int wedge_mask_size = get_wedge_types_lookup(bsize);
+      int wedge_mask_size = get_wedge_types_lookup(bsize);
       int need_mask_search = args->wedge_index == -1;
 
       if (need_mask_search && !have_newmv_in_inter_mode(this_mode)) {
@@ -1428,6 +1430,33 @@
             best_rs2 = rs2;
           }
         }
+        // Consider the asymmetric partitions for oblique angle only if the
+        // corresponding symmetric partition is the best so far.
+        // Note: For horizontal and vertical types, both symmetric and
+        // asymmetric partitions are always considered.
+        if (cpi->sf.inter_sf.enable_fast_wedge_mask_search) {
+          // The first 4 entries in wedge_codebook_16_heqw/hltw/hgtw[16]
+          // correspond to symmetric partitions of the 4 oblique angles, the
+          // next 4 entries correspond to the vertical/horizontal
+          // symmetric/asymmetric partitions and the last 8 entries correspond
+          // to the asymmetric partitions of oblique types.
+          const int idx_before_asym_oblique = 7;
+          const int last_oblique_sym_idx = 3;
+          if (wedge_mask == idx_before_asym_oblique) {
+            if (best_mask_index > last_oblique_sym_idx) {
+              break;
+            } else {
+              // Asymmetric (Index-1) map for the corresponding oblique masks.
+              // WEDGE_OBLIQUE27: sym - 0, asym - 8, 9
+              // WEDGE_OBLIQUE63: sym - 1, asym - 12, 13
+              // WEDGE_OBLIQUE117: sym - 2, asym - 14, 15
+              // WEDGE_OBLIQUE153: sym - 3, asym - 10, 11
+              const int asym_mask_idx[4] = { 7, 11, 13, 9 };
+              wedge_mask = asym_mask_idx[best_mask_index];
+              wedge_mask_size = wedge_mask + 3;
+            }
+          }
+        }
       }
 
       if (need_mask_search) {
diff --git a/av1/encoder/context_tree.c b/av1/encoder/context_tree.c
index 566576e..1407130 100644
--- a/av1/encoder/context_tree.c
+++ b/av1/encoder/context_tree.c
@@ -13,10 +13,6 @@
 #include "av1/encoder/encoder.h"
 #include "av1/encoder/rd.h"
 
-static const BLOCK_SIZE square[MAX_SB_SIZE_LOG2 - 1] = {
-  BLOCK_4X4, BLOCK_8X8, BLOCK_16X16, BLOCK_32X32, BLOCK_64X64, BLOCK_128X128,
-};
-
 void av1_copy_tree_context(PICK_MODE_CONTEXT *dst_ctx,
                            PICK_MODE_CONTEXT *src_ctx) {
   dst_ctx->mic = src_ctx->mic;
@@ -41,16 +37,22 @@
   dst_ctx->rd_mode_is_ready = src_ctx->rd_mode_is_ready;
 }
 
-void av1_setup_shared_coeff_buffer(AV1_COMMON *cm,
-                                   PC_TREE_SHARED_BUFFERS *shared_bufs) {
-  for (int i = 0; i < 3; i++) {
-    const int max_num_pix = MAX_SB_SIZE * MAX_SB_SIZE;
-    CHECK_MEM_ERROR(cm, shared_bufs->coeff_buf[i],
-                    aom_memalign(32, max_num_pix * sizeof(tran_low_t)));
-    CHECK_MEM_ERROR(cm, shared_bufs->qcoeff_buf[i],
-                    aom_memalign(32, max_num_pix * sizeof(tran_low_t)));
-    CHECK_MEM_ERROR(cm, shared_bufs->dqcoeff_buf[i],
-                    aom_memalign(32, max_num_pix * sizeof(tran_low_t)));
+void av1_setup_shared_coeff_buffer(const SequenceHeader *const seq_params,
+                                   PC_TREE_SHARED_BUFFERS *shared_bufs,
+                                   struct aom_internal_error_info *error) {
+  const int num_planes = seq_params->monochrome ? 1 : MAX_MB_PLANE;
+  const int max_sb_square_y = 1 << num_pels_log2_lookup[seq_params->sb_size];
+  const int max_sb_square_uv = max_sb_square_y >> (seq_params->subsampling_x +
+                                                   seq_params->subsampling_y);
+  for (int i = 0; i < num_planes; i++) {
+    const int max_num_pix =
+        (i == AOM_PLANE_Y) ? max_sb_square_y : max_sb_square_uv;
+    AOM_CHECK_MEM_ERROR(error, shared_bufs->coeff_buf[i],
+                        aom_memalign(32, max_num_pix * sizeof(tran_low_t)));
+    AOM_CHECK_MEM_ERROR(error, shared_bufs->qcoeff_buf[i],
+                        aom_memalign(32, max_num_pix * sizeof(tran_low_t)));
+    AOM_CHECK_MEM_ERROR(error, shared_bufs->dqcoeff_buf[i],
+                        aom_memalign(32, max_num_pix * sizeof(tran_low_t)));
   }
 }
 
@@ -219,20 +221,12 @@
   if (!keep_best && !keep_none) aom_free(pc_tree);
 }
 
-static AOM_INLINE int get_pc_tree_nodes(const int is_sb_size_128,
-                                        int stat_generation_stage) {
-  const int tree_nodes_inc = is_sb_size_128 ? 1024 : 0;
-  const int tree_nodes =
-      stat_generation_stage ? 1 : (tree_nodes_inc + 256 + 64 + 16 + 4 + 1);
-  return tree_nodes;
-}
-
 void av1_setup_sms_tree(AV1_COMP *const cpi, ThreadData *td) {
   AV1_COMMON *const cm = &cpi->common;
   const int stat_generation_stage = is_stat_generation_stage(cpi);
-  const int is_sb_size_128 = cm->seq_params.sb_size == BLOCK_128X128;
+  const int is_sb_size_128 = cm->seq_params->sb_size == BLOCK_128X128;
   const int tree_nodes =
-      get_pc_tree_nodes(is_sb_size_128, stat_generation_stage);
+      av1_get_pc_tree_nodes(is_sb_size_128, stat_generation_stage);
   int sms_tree_index = 0;
   SIMPLE_MOTION_DATA_TREE *this_sms;
   int square_index = 1;
diff --git a/av1/encoder/context_tree.h b/av1/encoder/context_tree.h
index 484a1b3..ef74e56 100644
--- a/av1/encoder/context_tree.h
+++ b/av1/encoder/context_tree.h
@@ -21,6 +21,7 @@
 extern "C" {
 #endif
 
+struct AV1_PRIMARY;
 struct AV1_COMP;
 struct AV1Common;
 struct ThreadData;
@@ -101,8 +102,9 @@
   int sms_rect_valid;
 } SIMPLE_MOTION_DATA_TREE;
 
-void av1_setup_shared_coeff_buffer(AV1_COMMON *cm,
-                                   PC_TREE_SHARED_BUFFERS *shared_bufs);
+void av1_setup_shared_coeff_buffer(const SequenceHeader *const seq_params,
+                                   PC_TREE_SHARED_BUFFERS *shared_bufs,
+                                   struct aom_internal_error_info *error);
 void av1_free_shared_coeff_buffer(PC_TREE_SHARED_BUFFERS *shared_bufs);
 
 PC_TREE *av1_alloc_pc_tree_node(BLOCK_SIZE bsize);
@@ -116,6 +118,18 @@
 void av1_copy_tree_context(PICK_MODE_CONTEXT *dst_ctx,
                            PICK_MODE_CONTEXT *src_ctx);
 
+static const BLOCK_SIZE square[MAX_SB_SIZE_LOG2 - 1] = {
+  BLOCK_4X4, BLOCK_8X8, BLOCK_16X16, BLOCK_32X32, BLOCK_64X64, BLOCK_128X128,
+};
+
+static AOM_INLINE int av1_get_pc_tree_nodes(const int is_sb_size_128,
+                                            int stat_generation_stage) {
+  const int tree_nodes_inc = is_sb_size_128 ? 1024 : 0;
+  const int tree_nodes =
+      stat_generation_stage ? 1 : (tree_nodes_inc + 256 + 64 + 16 + 4 + 1);
+  return tree_nodes;
+}
+
 void av1_setup_sms_tree(struct AV1_COMP *const cpi, struct ThreadData *td);
 void av1_free_sms_tree(struct ThreadData *td);
 
diff --git a/av1/encoder/corner_match.c b/av1/encoder/corner_match.c
index c2f1b2e..3631be9 100644
--- a/av1/encoder/corner_match.c
+++ b/av1/encoder/corner_match.c
@@ -15,7 +15,6 @@
 
 #include "config/av1_rtcd.h"
 
-#include "aom_ports/system_state.h"
 #include "av1/encoder/corner_match.h"
 
 #define SEARCH_SZ 9
@@ -66,7 +65,6 @@
     }
   var2 = sumsq2 * MATCH_SZ_SQ - sum2 * sum2;
   cov = cross * MATCH_SZ_SQ - sum1 * sum2;
-  aom_clear_system_state();
   return cov / sqrt((double)var2);
 }
 
diff --git a/av1/encoder/dwt.c b/av1/encoder/dwt.c
index b5ed4a3..5dfbcb6 100644
--- a/av1/encoder/dwt.c
+++ b/av1/encoder/dwt.c
@@ -147,9 +147,23 @@
   return sse - (uint32_t)(((int64_t)sum * sum) / (bw * bh));
 }
 
-int av1_haar_ac_sad_8x8_uint8_input(const uint8_t *input, int stride, int hbd) {
+static int haar_ac_sad_8x8_uint8_input(const uint8_t *input, int stride,
+                                       int hbd) {
   tran_low_t output[64];
 
   av1_fdwt8x8_uint8_input_c(input, output, stride, hbd);
   return av1_haar_ac_sad(output, 8, 8, 8);
 }
+
+int64_t av1_haar_ac_sad_mxn_uint8_input(const uint8_t *input, int stride,
+                                        int hbd, int num_8x8_rows,
+                                        int num_8x8_cols) {
+  int64_t wavelet_energy = 0;
+  for (int r8 = 0; r8 < num_8x8_rows; ++r8) {
+    for (int c8 = 0; c8 < num_8x8_cols; ++c8) {
+      wavelet_energy += haar_ac_sad_8x8_uint8_input(
+          input + c8 * 8 + r8 * 8 * stride, stride, hbd);
+    }
+  }
+  return wavelet_energy;
+}
diff --git a/av1/encoder/dwt.h b/av1/encoder/dwt.h
index 1bd32ed..443b6bc 100644
--- a/av1/encoder/dwt.h
+++ b/av1/encoder/dwt.h
@@ -19,6 +19,9 @@
 
 void av1_fdwt8x8_uint8_input_c(const uint8_t *input, tran_low_t *output,
                                int stride, int hbd);
-int av1_haar_ac_sad_8x8_uint8_input(const uint8_t *input, int stride, int hbd);
+
+int64_t av1_haar_ac_sad_mxn_uint8_input(const uint8_t *input, int stride,
+                                        int hbd, int num_8x8_rows,
+                                        int num_8x8_cols);
 
 #endif  // AOM_AV1_ENCODER_DWT_H_
diff --git a/av1/encoder/enc_enums.h b/av1/encoder/enc_enums.h
index 319e5d0..20cefa1 100644
--- a/av1/encoder/enc_enums.h
+++ b/av1/encoder/enc_enums.h
@@ -216,6 +216,8 @@
   NUM_SINGLE_REF_MODES = SINGLE_REF_MODE_END - SINGLE_REF_MODE_START,
   THR_MODE_START = THR_NEARESTMV,
   THR_MODE_END = MAX_MODES,
+  THR_INTER_MODE_START = THR_MODE_START,
+  THR_INTER_MODE_END = THR_DC,
   THR_INVALID = 255
 } UENUM1BYTE(THR_MODES);
 
diff --git a/av1/encoder/encode_strategy.c b/av1/encoder/encode_strategy.c
index 9f68195..fe2b9ae 100644
--- a/av1/encoder/encode_strategy.c
+++ b/av1/encoder/encode_strategy.c
@@ -18,8 +18,6 @@
 #include "aom/aom_codec.h"
 #include "aom/aom_encoder.h"
 
-#include "aom_ports/system_state.h"
-
 #if CONFIG_MISMATCH_DEBUG
 #include "aom_util/debug_util.h"
 #endif  // CONFIG_MISMATCH_DEBUG
@@ -42,78 +40,84 @@
 #define TEMPORAL_FILTER_KEY_FRAME (CONFIG_REALTIME_ONLY ? 0 : 1)
 
 static INLINE void set_refresh_frame_flags(
-    RefreshFrameFlagsInfo *const refresh_frame_flags, bool refresh_gf,
-    bool refresh_bwdref, bool refresh_arf) {
-  refresh_frame_flags->golden_frame = refresh_gf;
-  refresh_frame_flags->bwd_ref_frame = refresh_bwdref;
-  refresh_frame_flags->alt_ref_frame = refresh_arf;
+    RefreshFrameInfo *const refresh_frame, bool refresh_gf, bool refresh_bwdref,
+    bool refresh_arf) {
+  refresh_frame->golden_frame = refresh_gf;
+  refresh_frame->bwd_ref_frame = refresh_bwdref;
+  refresh_frame->alt_ref_frame = refresh_arf;
 }
 
-void av1_configure_buffer_updates(
-    AV1_COMP *const cpi, RefreshFrameFlagsInfo *const refresh_frame_flags,
-    const FRAME_UPDATE_TYPE type, const FRAME_TYPE frame_type,
-    int force_refresh_all) {
+void av1_configure_buffer_updates(AV1_COMP *const cpi,
+                                  RefreshFrameInfo *const refresh_frame,
+                                  const FRAME_UPDATE_TYPE type,
+                                  const REFBUF_STATE refbuf_state,
+                                  int force_refresh_all) {
   // NOTE(weitinglin): Should we define another function to take care of
   // cpi->rc.is_$Source_Type to make this function as it is in the comment?
-
   const ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags =
       &cpi->ext_flags.refresh_frame;
   cpi->rc.is_src_frame_alt_ref = 0;
 
   switch (type) {
     case KF_UPDATE:
-      set_refresh_frame_flags(refresh_frame_flags, true, true, true);
+      set_refresh_frame_flags(refresh_frame, true, true, true);
       break;
 
     case LF_UPDATE:
-      set_refresh_frame_flags(refresh_frame_flags, false, false, false);
+      set_refresh_frame_flags(refresh_frame, false, false, false);
       break;
 
     case GF_UPDATE:
-      set_refresh_frame_flags(refresh_frame_flags, true, false, false);
+      set_refresh_frame_flags(refresh_frame, true, false, false);
       break;
 
     case OVERLAY_UPDATE:
-      if (frame_type == KEY_FRAME && cpi->rc.frames_to_key == 0) {
-        set_refresh_frame_flags(refresh_frame_flags, true, true, true);
-      } else {
-        set_refresh_frame_flags(refresh_frame_flags, true, false, false);
-      }
+      if (refbuf_state == REFBUF_RESET)
+        set_refresh_frame_flags(refresh_frame, true, true, true);
+      else
+        set_refresh_frame_flags(refresh_frame, true, false, false);
+
       cpi->rc.is_src_frame_alt_ref = 1;
       break;
 
     case ARF_UPDATE:
       // NOTE: BWDREF does not get updated along with ALTREF_FRAME.
-      if (frame_type == KEY_FRAME && !cpi->no_show_fwd_kf) {
-        // TODO(bohanli): consider moving this to force_refresh_all?
-        // This is Keyframe as arf
-        set_refresh_frame_flags(refresh_frame_flags, true, true, true);
-      } else {
-        set_refresh_frame_flags(refresh_frame_flags, false, false, true);
-      }
+      if (refbuf_state == REFBUF_RESET)
+        set_refresh_frame_flags(refresh_frame, true, true, true);
+      else
+        set_refresh_frame_flags(refresh_frame, false, false, true);
+
       break;
 
     case INTNL_OVERLAY_UPDATE:
-      set_refresh_frame_flags(refresh_frame_flags, false, false, false);
+      set_refresh_frame_flags(refresh_frame, false, false, false);
       cpi->rc.is_src_frame_alt_ref = 1;
       break;
 
     case INTNL_ARF_UPDATE:
-      set_refresh_frame_flags(refresh_frame_flags, false, true, false);
+      set_refresh_frame_flags(refresh_frame, false, true, false);
       break;
 
     default: assert(0); break;
   }
 
   if (ext_refresh_frame_flags->update_pending &&
-      (!is_stat_generation_stage(cpi)))
-    set_refresh_frame_flags(refresh_frame_flags,
+      (!is_stat_generation_stage(cpi))) {
+    set_refresh_frame_flags(refresh_frame,
                             ext_refresh_frame_flags->golden_frame,
                             ext_refresh_frame_flags->bwd_ref_frame,
                             ext_refresh_frame_flags->alt_ref_frame);
+    GF_GROUP *gf_group = &cpi->ppi->gf_group;
+    if (ext_refresh_frame_flags->golden_frame)
+      gf_group->update_type[cpi->gf_frame_index] = GF_UPDATE;
+    if (ext_refresh_frame_flags->alt_ref_frame)
+      gf_group->update_type[cpi->gf_frame_index] = ARF_UPDATE;
+    if (ext_refresh_frame_flags->bwd_ref_frame)
+      gf_group->update_type[cpi->gf_frame_index] = INTNL_ARF_UPDATE;
+  }
 
   if (force_refresh_all)
-    set_refresh_frame_flags(refresh_frame_flags, true, true, true);
+    set_refresh_frame_flags(refresh_frame, true, true, true);
 }
 
 static void set_additional_frame_flags(const AV1_COMMON *const cm,
@@ -129,54 +133,6 @@
   }
 }
 
-static INLINE void update_keyframe_counters(AV1_COMP *cpi) {
-  if (cpi->common.show_frame && cpi->rc.frames_to_key) {
-    cpi->rc.frames_since_key++;
-    cpi->rc.frames_to_key--;
-  }
-}
-
-static INLINE int is_frame_droppable(
-    const SVC *const svc,
-    const ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags) {
-  // Droppable frame is only used by external refresh flags. VoD setting won't
-  // trigger its use case.
-  if (svc->set_ref_frame_config)
-    return svc->non_reference_frame;
-  else if (ext_refresh_frame_flags->update_pending)
-    return !(ext_refresh_frame_flags->alt_ref_frame ||
-             ext_refresh_frame_flags->alt2_ref_frame ||
-             ext_refresh_frame_flags->bwd_ref_frame ||
-             ext_refresh_frame_flags->golden_frame ||
-             ext_refresh_frame_flags->last_frame);
-  else
-    return 0;
-}
-
-static INLINE void update_frames_till_gf_update(AV1_COMP *cpi) {
-  // TODO(weitinglin): Updating this counter for is_frame_droppable
-  // is a work-around to handle the condition when a frame is drop.
-  // We should fix the cpi->common.show_frame flag
-  // instead of checking the other condition to update the counter properly.
-  if (cpi->common.show_frame ||
-      is_frame_droppable(&cpi->svc, &cpi->ext_flags.refresh_frame)) {
-    // Decrement count down till next gf
-    if (cpi->rc.frames_till_gf_update_due > 0)
-      cpi->rc.frames_till_gf_update_due--;
-  }
-}
-
-static INLINE void update_gf_group_index(AV1_COMP *cpi) {
-  // Increment the gf group index ready for the next frame.
-  ++cpi->gf_frame_index;
-}
-
-static void update_rc_counts(AV1_COMP *cpi) {
-  update_keyframe_counters(cpi);
-  update_frames_till_gf_update(cpi);
-  update_gf_group_index(cpi);
-}
-
 static void set_ext_overrides(AV1_COMMON *const cm,
                               EncodeFrameParams *const frame_params,
                               ExternalFlags *const ext_flags) {
@@ -204,27 +160,6 @@
   frame_params->error_resilient_mode |= frame_params->frame_type == S_FRAME;
 }
 
-static int get_current_frame_ref_type(
-    const AV1_COMP *const cpi, const EncodeFrameParams *const frame_params) {
-  // We choose the reference "type" of this frame from the flags which indicate
-  // which reference frames will be refreshed by it.  More than one  of these
-  // flags may be set, so the order here implies an order of precedence. This is
-  // just used to choose the primary_ref_frame (as the most recent reference
-  // buffer of the same reference-type as the current frame)
-
-  (void)frame_params;
-  // TODO(jingning): This table should be a lot simpler with the new
-  // ARF system in place. Keep frame_params for the time being as we are
-  // still evaluating a few design options.
-  switch (cpi->gf_group.layer_depth[cpi->gf_frame_index]) {
-    case 0: return 0;
-    case 1: return 1;
-    case MAX_ARF_LAYERS:
-    case MAX_ARF_LAYERS + 1: return 4;
-    default: return 7;
-  }
-}
-
 static int choose_primary_ref_frame(
     const AV1_COMP *const cpi, const EncodeFrameParams *const frame_params) {
   const AV1_COMMON *const cm = &cpi->common;
@@ -238,16 +173,16 @@
 
   // In large scale case, always use Last frame's frame contexts.
   // Note(yunqing): In other cases, primary_ref_frame is chosen based on
-  // cpi->gf_group.layer_depth[cpi->gf_frame_index], which also controls
+  // cpi->ppi->gf_group.layer_depth[cpi->gf_frame_index], which also controls
   // frame bit allocation.
   if (cm->tiles.large_scale) return (LAST_FRAME - LAST_FRAME);
 
-  if (cpi->use_svc) return av1_svc_primary_ref_frame(cpi);
+  if (cpi->ppi->use_svc) return av1_svc_primary_ref_frame(cpi);
 
   // Find the most recent reference frame with the same reference type as the
   // current frame
-  const int current_ref_type = get_current_frame_ref_type(cpi, frame_params);
-  int wanted_fb = cpi->fb_of_context_type[current_ref_type];
+  const int current_ref_type = get_current_frame_ref_type(cpi);
+  int wanted_fb = cpi->ppi->fb_of_context_type[current_ref_type];
 
   int primary_ref_frame = PRIMARY_REF_NONE;
   for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
@@ -259,51 +194,14 @@
   return primary_ref_frame;
 }
 
-static void update_fb_of_context_type(
-    const AV1_COMP *const cpi, const EncodeFrameParams *const frame_params,
-    int *const fb_of_context_type) {
-  const AV1_COMMON *const cm = &cpi->common;
-  const int current_frame_ref_type =
-      get_current_frame_ref_type(cpi, frame_params);
-
-  if (frame_is_intra_only(cm) || cm->features.error_resilient_mode ||
-      cpi->ext_flags.use_primary_ref_none) {
-    for (int i = 0; i < REF_FRAMES; i++) {
-      fb_of_context_type[i] = -1;
-    }
-    fb_of_context_type[current_frame_ref_type] =
-        cm->show_frame ? get_ref_frame_map_idx(cm, GOLDEN_FRAME)
-                       : get_ref_frame_map_idx(cm, ALTREF_FRAME);
-  }
-
-  if (!encode_show_existing_frame(cm)) {
-    // Refresh fb_of_context_type[]: see encoder.h for explanation
-    if (cm->current_frame.frame_type == KEY_FRAME) {
-      // All ref frames are refreshed, pick one that will live long enough
-      fb_of_context_type[current_frame_ref_type] = 0;
-    } else {
-      // If more than one frame is refreshed, it doesn't matter which one we
-      // pick so pick the first.  LST sometimes doesn't refresh any: this is ok
-
-      for (int i = 0; i < REF_FRAMES; i++) {
-        if (cm->current_frame.refresh_frame_flags & (1 << i)) {
-          fb_of_context_type[current_frame_ref_type] = i;
-          break;
-        }
-      }
-    }
-  }
-}
-
 static void adjust_frame_rate(AV1_COMP *cpi, int64_t ts_start, int64_t ts_end) {
   TimeStamps *time_stamps = &cpi->time_stamps;
   int64_t this_duration;
   int step = 0;
 
   // Clear down mmx registers
-  aom_clear_system_state();
 
-  if (cpi->use_svc && cpi->svc.spatial_layer_id > 0) {
+  if (cpi->ppi->use_svc && cpi->svc.spatial_layer_id > 0) {
     cpi->framerate = cpi->svc.base_framerate;
     av1_rc_update_framerate(cpi, cpi->common.width, cpi->common.height);
     return;
@@ -325,8 +223,12 @@
 
   if (this_duration) {
     if (step) {
+#if CONFIG_FRAME_PARALLEL_ENCODE
+      cpi->new_framerate = 10000000.0 / this_duration;
+#endif
       av1_new_framerate(cpi, 10000000.0 / this_duration);
     } else {
+      double framerate;
       // Average this frame's rate into the last second's average
       // frame rate. If we haven't seen 1 second yet, then average
       // over the whole interval seen.
@@ -335,10 +237,21 @@
       double avg_duration = 10000000.0 / cpi->framerate;
       avg_duration *= (interval - avg_duration + this_duration);
       avg_duration /= interval;
-
-      av1_new_framerate(cpi, 10000000.0 / avg_duration);
+#if CONFIG_FRAME_PARALLEL_ENCODE
+      cpi->new_framerate = (10000000.0 / avg_duration);
+      // For parallel frames update cpi->framerate with new_framerate
+      // during av1_post_encode_updates()
+      framerate =
+          (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0)
+              ? cpi->framerate
+              : cpi->new_framerate;
+#else
+      framerate = (10000000.0 / avg_duration);
+#endif
+      av1_new_framerate(cpi, framerate);
     }
   }
+
   time_stamps->prev_ts_start = ts_start;
   time_stamps->prev_ts_end = ts_end;
 }
@@ -372,7 +285,7 @@
     struct lookahead_entry **last_source,
     EncodeFrameParams *const frame_params) {
   AV1_COMMON *const cm = &cpi->common;
-  const GF_GROUP *const gf_group = &cpi->gf_group;
+  const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
   struct lookahead_entry *source = NULL;
 
   // Source index in lookahead buffer.
@@ -382,7 +295,7 @@
   if (src_index &&
       (is_forced_keyframe_pending(cpi->ppi->lookahead, src_index,
                                   cpi->compressor_stage) != -1) &&
-      cpi->oxcf.rc_cfg.mode != AOM_Q) {
+      cpi->oxcf.rc_cfg.mode != AOM_Q && !is_stat_generation_stage(cpi)) {
     src_index = 0;
     *flush = 1;
   }
@@ -404,24 +317,37 @@
       *pop_lookahead = 0;
     }
   }
+
+  // LAP stage does not have ARFs or forward key-frames,
+  // hence, always pop_lookahead here.
+  if (is_stat_generation_stage(cpi)) {
+    *pop_lookahead = 1;
+    src_index = 0;
+  }
+
   frame_params->show_frame = *pop_lookahead;
-  if (*pop_lookahead) {
+
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  // Future frame in parallel encode set
+  if (gf_group->src_offset[cpi->gf_frame_index] != 0 &&
+      !is_stat_generation_stage(cpi)) {
+    src_index = gf_group->src_offset[cpi->gf_frame_index];
+  }
+#endif
+  if (frame_params->show_frame) {
     // show frame, pop from buffer
     // Get last frame source.
     if (cm->current_frame.frame_number > 0) {
-      *last_source =
-          av1_lookahead_peek(cpi->ppi->lookahead, -1, cpi->compressor_stage);
+      *last_source = av1_lookahead_peek(cpi->ppi->lookahead, src_index - 1,
+                                        cpi->compressor_stage);
     }
     // Read in the source frame.
-    source = av1_lookahead_peek(cpi->ppi->lookahead, 0, cpi->compressor_stage);
+    source = av1_lookahead_peek(cpi->ppi->lookahead, src_index,
+                                cpi->compressor_stage);
   } else {
     // no show frames are arf frames
     source = av1_lookahead_peek(cpi->ppi->lookahead, src_index,
                                 cpi->compressor_stage);
-    // When src_index == rc->frames_to_key, it indicates a fwd_kf
-    if (src_index == cpi->rc.frames_to_key && src_index != 0) {
-      cpi->no_show_fwd_kf = 1;
-    }
     if (source != NULL) {
       cm->showable_frame = 1;
     }
@@ -452,10 +378,9 @@
 
 // Update frame_flags to tell the encoder's caller what sort of frame was
 // encoded.
-static void update_frame_flags(
-    const AV1_COMMON *const cm,
-    const RefreshFrameFlagsInfo *const refresh_frame_flags,
-    unsigned int *frame_flags) {
+static void update_frame_flags(const AV1_COMMON *const cm,
+                               const RefreshFrameInfo *const refresh_frame,
+                               unsigned int *frame_flags) {
   if (encode_show_existing_frame(cm)) {
     *frame_flags &= ~FRAMEFLAGS_GOLDEN;
     *frame_flags &= ~FRAMEFLAGS_BWDREF;
@@ -464,19 +389,19 @@
     return;
   }
 
-  if (refresh_frame_flags->golden_frame) {
+  if (refresh_frame->golden_frame) {
     *frame_flags |= FRAMEFLAGS_GOLDEN;
   } else {
     *frame_flags &= ~FRAMEFLAGS_GOLDEN;
   }
 
-  if (refresh_frame_flags->alt_ref_frame) {
+  if (refresh_frame->alt_ref_frame) {
     *frame_flags |= FRAMEFLAGS_ALTREF;
   } else {
     *frame_flags &= ~FRAMEFLAGS_ALTREF;
   }
 
-  if (refresh_frame_flags->bwd_ref_frame) {
+  if (refresh_frame->bwd_ref_frame) {
     *frame_flags |= FRAMEFLAGS_BWDREF;
   } else {
     *frame_flags &= ~FRAMEFLAGS_BWDREF;
@@ -586,24 +511,20 @@
 }
 
 // Update reference frame stack info.
-void av1_update_ref_frame_map(AV1_COMP *cpi,
+void av1_update_ref_frame_map(const AV1_COMP *cpi,
                               FRAME_UPDATE_TYPE frame_update_type,
-                              FRAME_TYPE frame_type, int show_existing_frame,
-                              int ref_map_index,
+                              REFBUF_STATE refbuf_state, int ref_map_index,
                               RefBufferStack *ref_buffer_stack) {
-  AV1_COMMON *const cm = &cpi->common;
+  const AV1_COMMON *const cm = &cpi->common;
+
   // TODO(jingning): Consider the S-frame same as key frame for the
   // reference frame tracking purpose. The logic might be better
   // expressed than converting the frame update type.
-  if (frame_is_sframe(cm)) frame_update_type = KEY_FRAME;
-
+  if (frame_is_sframe(cm)) frame_update_type = KF_UPDATE;
   if (is_frame_droppable(&cpi->svc, &cpi->ext_flags.refresh_frame)) return;
 
   switch (frame_update_type) {
-    case KEY_FRAME:
-      if (show_existing_frame)
-        ref_map_index = stack_pop(ref_buffer_stack->arf_stack,
-                                  &ref_buffer_stack->arf_stack_size);
+    case KF_UPDATE:
       stack_reset(ref_buffer_stack->lst_stack,
                   &ref_buffer_stack->lst_stack_size);
       stack_reset(ref_buffer_stack->gld_stack,
@@ -618,6 +539,8 @@
       stack_push(ref_buffer_stack->gld_stack, &ref_buffer_stack->gld_stack_size,
                  ref_map_index);
       // For nonrd_mode: update LAST as well on GF_UPDATE frame.
+      // TODO(jingning, marpan): Why replacing both reference frames with the
+      // same decoded frame?
       if (cpi->sf.rt_sf.use_nonrd_pick_mode)
         stack_push(ref_buffer_stack->lst_stack,
                    &ref_buffer_stack->lst_stack_size, ref_map_index);
@@ -629,7 +552,7 @@
       break;
     case ARF_UPDATE:
     case INTNL_ARF_UPDATE:
-      if (frame_type == KEY_FRAME && !cpi->no_show_fwd_kf) {
+      if (refbuf_state == REFBUF_RESET) {
         stack_reset(ref_buffer_stack->lst_stack,
                     &ref_buffer_stack->lst_stack_size);
         stack_reset(ref_buffer_stack->gld_stack,
@@ -643,7 +566,7 @@
                  ref_map_index);
       break;
     case OVERLAY_UPDATE:
-      if (frame_type == KEY_FRAME) {
+      if (refbuf_state == REFBUF_RESET) {
         ref_map_index = stack_pop(ref_buffer_stack->arf_stack,
                                   &ref_buffer_stack->arf_stack_size);
         stack_reset(ref_buffer_stack->lst_stack,
@@ -677,7 +600,17 @@
   return;
 }
 
-static int get_free_ref_map_index(const RefBufferStack *ref_buffer_stack) {
+static int get_free_ref_map_index(
+#if CONFIG_FRAME_PARALLEL_ENCODE
+    RefFrameMapPair ref_map_pairs[REF_FRAMES],
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+    const RefBufferStack *ref_buffer_stack) {
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  (void)ref_buffer_stack;
+  for (int idx = 0; idx < REF_FRAMES; ++idx)
+    if (ref_map_pairs[idx].disp_order == -1) return idx;
+  return INVALID_IDX;
+#else
   for (int idx = 0; idx < REF_FRAMES; ++idx) {
     int is_free = 1;
     for (int i = 0; i < ref_buffer_stack->arf_stack_size; ++i) {
@@ -704,30 +637,130 @@
     if (is_free) return idx;
   }
   return INVALID_IDX;
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
 }
 
+#if CONFIG_FRAME_PARALLEL_ENCODE
+static int get_refresh_idx(RefFrameMapPair ref_frame_map_pairs[REF_FRAMES],
+                           int update_arf,
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+                           GF_GROUP *gf_group, int gf_index,
+                           int enable_refresh_skip,
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+                           int cur_frame_disp) {
+  int arf_count = 0;
+  int oldest_arf_order = INT32_MAX;
+  int oldest_arf_idx = -1;
+
+  int oldest_frame_order = INT32_MAX;
+  int oldest_idx = -1;
+
+  for (int map_idx = 0; map_idx < REF_FRAMES; map_idx++) {
+    RefFrameMapPair ref_pair = ref_frame_map_pairs[map_idx];
+    if (ref_pair.disp_order == -1) continue;
+    const int frame_order = ref_pair.disp_order;
+    const int reference_frame_level = ref_pair.pyr_level;
+    // Do not refresh a future frame.
+    if (frame_order > cur_frame_disp) continue;
+
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+    if (enable_refresh_skip) {
+      int skip_frame = 0;
+      // Prevent refreshing a frame in gf_group->skip_frame_refresh.
+      for (int i = 0; i < REF_FRAMES; i++) {
+        int frame_to_skip = gf_group->skip_frame_refresh[gf_index][i];
+        if (frame_to_skip == INVALID_IDX) break;
+        if (frame_order == frame_to_skip) {
+          skip_frame = 1;
+          break;
+        }
+      }
+      if (skip_frame) continue;
+    }
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+
+    // Keep track of the oldest level 1 frame if the current frame is also level
+    // 1.
+    if (reference_frame_level == 1) {
+      // If there are more than 2 level 1 frames in the reference list,
+      // discard the oldest.
+      if (frame_order < oldest_arf_order) {
+        oldest_arf_order = frame_order;
+        oldest_arf_idx = map_idx;
+      }
+      arf_count++;
+      continue;
+    }
+
+    // Update the overall oldest reference frame.
+    if (frame_order < oldest_frame_order) {
+      oldest_frame_order = frame_order;
+      oldest_idx = map_idx;
+    }
+  }
+  if (update_arf && arf_count > 2) return oldest_arf_idx;
+  if (oldest_idx >= 0) return oldest_idx;
+  if (oldest_arf_idx >= 0) return oldest_arf_idx;
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+  if (oldest_idx == -1) {
+    assert(arf_count > 2 && enable_refresh_skip);
+    return oldest_arf_idx;
+  }
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+  assert(0 && "No valid refresh index found");
+  return -1;
+}
+
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+// Computes the reference refresh index for INTNL_ARF_UPDATE frame.
+int av1_calc_refresh_idx_for_intnl_arf(
+    AV1_COMP *cpi, RefFrameMapPair ref_frame_map_pairs[REF_FRAMES],
+    int gf_index) {
+  GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+
+  // Search for the open slot to store the current frame.
+  int free_fb_index = get_free_ref_map_index(ref_frame_map_pairs, NULL);
+
+  // Use a free slot if available.
+  if (free_fb_index != INVALID_IDX) {
+    return free_fb_index;
+  } else {
+    int enable_refresh_skip = !is_one_pass_rt_params(cpi);
+    int refresh_idx =
+        get_refresh_idx(ref_frame_map_pairs, 0, gf_group, gf_index,
+                        enable_refresh_skip, gf_group->display_idx[gf_index]);
+    return refresh_idx;
+  }
+}
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+
 int av1_get_refresh_frame_flags(const AV1_COMP *const cpi,
                                 const EncodeFrameParams *const frame_params,
                                 FRAME_UPDATE_TYPE frame_update_type,
+                                int gf_index,
+#if CONFIG_FRAME_PARALLEL_ENCODE
+                                int cur_disp_order,
+                                RefFrameMapPair ref_frame_map_pairs[REF_FRAMES],
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
                                 const RefBufferStack *const ref_buffer_stack) {
   const AV1_COMMON *const cm = &cpi->common;
   const ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags =
       &cpi->ext_flags.refresh_frame;
 
-  const SVC *const svc = &cpi->svc;
+  GF_GROUP *gf_group = &cpi->ppi->gf_group;
+  if (gf_group->refbuf_state[gf_index] == REFBUF_RESET)
+    return SELECT_ALL_BUF_SLOTS;
+
+  // TODO(jingning): Deprecate the following operations.
   // Switch frames and shown key-frames overwrite all reference slots
-  if ((frame_params->frame_type == KEY_FRAME && !cpi->no_show_fwd_kf) ||
-      frame_params->frame_type == S_FRAME)
-    return 0xFF;
+  if (frame_params->frame_type == S_FRAME) return SELECT_ALL_BUF_SLOTS;
 
   // show_existing_frames don't actually send refresh_frame_flags so set the
   // flags to 0 to keep things consistent.
-  if (frame_params->show_existing_frame &&
-      (!frame_params->error_resilient_mode ||
-       frame_params->frame_type == KEY_FRAME)) {
-    return 0;
-  }
+  if (frame_params->show_existing_frame) return 0;
 
+  const SVC *const svc = &cpi->svc;
   if (is_frame_droppable(svc, ext_refresh_frame_flags)) return 0;
 
   int refresh_mask = 0;
@@ -777,7 +810,36 @@
   }
 
   // Search for the open slot to store the current frame.
-  int free_fb_index = get_free_ref_map_index(ref_buffer_stack);
+  int free_fb_index = get_free_ref_map_index(
+#if CONFIG_FRAME_PARALLEL_ENCODE
+      ref_frame_map_pairs,
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+      ref_buffer_stack);
+
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  // No refresh necessary for these frame types.
+  if (frame_update_type == OVERLAY_UPDATE ||
+      frame_update_type == INTNL_OVERLAY_UPDATE)
+    return refresh_mask;
+
+  // If there is an open slot, refresh that one instead of replacing a
+  // reference.
+  if (free_fb_index != INVALID_IDX) {
+    refresh_mask = 1 << free_fb_index;
+    return refresh_mask;
+  }
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+  const int enable_refresh_skip = !is_one_pass_rt_params(cpi);
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+  const int update_arf = frame_update_type == ARF_UPDATE;
+  const int refresh_idx =
+      get_refresh_idx(ref_frame_map_pairs, update_arf,
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+                      &cpi->ppi->gf_group, gf_index, enable_refresh_skip,
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+                      cur_disp_order);
+  return 1 << refresh_idx;
+#else
   switch (frame_update_type) {
     case KF_UPDATE:
     case GF_UPDATE:
@@ -843,6 +905,7 @@
   }
 
   return refresh_mask;
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
 }
 
 #if !CONFIG_REALTIME_ONLY
@@ -852,10 +915,10 @@
   MACROBLOCK *const x = &cpi->td.mb;
   MACROBLOCKD *const xd = &x->e_mbd;
 
-  av1_setup_src_planes(x, src, 0, 0, num_planes, cm->seq_params.sb_size);
+  av1_setup_src_planes(x, src, 0, 0, num_planes, cm->seq_params->sb_size);
 
-  av1_setup_block_planes(xd, cm->seq_params.subsampling_x,
-                         cm->seq_params.subsampling_y, num_planes);
+  av1_setup_block_planes(xd, cm->seq_params->subsampling_x,
+                         cm->seq_params->subsampling_y, num_planes);
 
   set_mi_offsets(&cm->mi_params, xd, 0, 0);
 }
@@ -872,9 +935,9 @@
 #endif
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
   AV1_COMMON *const cm = &cpi->common;
-  const GF_GROUP *const gf_group = &cpi->gf_group;
+  GF_GROUP *const gf_group = &cpi->ppi->gf_group;
   FRAME_UPDATE_TYPE update_type =
-      get_frame_update_type(&cpi->gf_group, cpi->gf_frame_index);
+      get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
 
   // Decide whether to apply temporal filtering to the source frame.
   int apply_filtering = 0;
@@ -888,7 +951,7 @@
         oxcf->algo_cfg.arnr_max_frames > 0 && oxcf->gf_cfg.lag_in_frames > 1;
     if (allow_kf_filtering) {
       const double y_noise_level = av1_estimate_noise_from_single_plane(
-          frame_input->source, 0, cm->seq_params.bit_depth);
+          frame_input->source, 0, cm->seq_params->bit_depth);
       apply_filtering = y_noise_level > 0;
     } else {
       apply_filtering = 0;
@@ -901,6 +964,9 @@
     // ARF
     apply_filtering = oxcf->algo_cfg.arnr_max_frames > 0;
   }
+  if (is_stat_generation_stage(cpi)) {
+    apply_filtering = 0;
+  }
 
 #if CONFIG_COLLECT_COMPONENT_TIMING
   if (cpi->oxcf.pass == 2) start_timing(cpi, apply_filtering_time);
@@ -914,23 +980,23 @@
     cm->current_frame.frame_type = frame_params->frame_type;
     int arf_src_index = gf_group->arf_src_offset[cpi->gf_frame_index];
     int is_forward_keyframe = 0;
-    if (!frame_params->show_frame && cpi->no_show_fwd_kf) {
-      // TODO(angiebird): Figure out why this condition yields forward keyframe.
-      // fwd kf
+    if (gf_group->frame_type[cpi->gf_frame_index] == KEY_FRAME &&
+        gf_group->refbuf_state[cpi->gf_frame_index] == REFBUF_UPDATE)
       is_forward_keyframe = 1;
-    }
-    const int code_arf =
-        av1_temporal_filter(cpi, arf_src_index, update_type,
-                            is_forward_keyframe, &show_existing_alt_ref);
+
+    const int code_arf = av1_temporal_filter(
+        cpi, arf_src_index, update_type, is_forward_keyframe,
+        &show_existing_alt_ref, &cpi->ppi->alt_ref_buffer);
     if (code_arf) {
-      aom_extend_frame_borders(&cpi->alt_ref_buffer, av1_num_planes(cm));
-      frame_input->source = &cpi->alt_ref_buffer;
+      aom_extend_frame_borders(&cpi->ppi->alt_ref_buffer, av1_num_planes(cm));
+      frame_input->source = &cpi->ppi->alt_ref_buffer;
       aom_copy_metadata_to_frame_buffer(frame_input->source,
                                         source_buffer->metadata);
     }
     // Currently INTNL_ARF_UPDATE only do show_existing.
-    if (update_type == ARF_UPDATE && !cpi->no_show_fwd_kf) {
-      cpi->show_existing_alt_ref = show_existing_alt_ref;
+    if (update_type == ARF_UPDATE &&
+        gf_group->frame_type[cpi->gf_frame_index] != KEY_FRAME) {
+      cpi->ppi->show_existing_alt_ref = show_existing_alt_ref;
     }
   }
 #if CONFIG_COLLECT_COMPONENT_TIMING
@@ -941,10 +1007,13 @@
   int allow_tpl = oxcf->gf_cfg.lag_in_frames > 1 &&
                   !is_stat_generation_stage(cpi) &&
                   oxcf->algo_cfg.enable_tpl_model;
+
+  if (gf_group->size > MAX_LENGTH_TPL_FRAME_STATS) {
+    allow_tpl = 0;
+  }
   if (frame_params->frame_type == KEY_FRAME) {
     // Don't do tpl for fwd key frames or fwd key frame overlays
     allow_tpl = allow_tpl && !cpi->sf.tpl_sf.disable_filtered_key_tpl &&
-                !cpi->no_show_fwd_kf &&
                 gf_group->update_type[cpi->gf_frame_index] != OVERLAY_UPDATE;
   } else {
     // Do tpl after ARF is filtered, or if no ARF, at the second frame of GF
@@ -961,12 +1030,26 @@
     }
   }
 
+#if CONFIG_RD_COMMAND
+  if (frame_params->frame_type == KEY_FRAME) {
+    char filepath[] = "rd_command.txt";
+    av1_read_rd_command(filepath, &cpi->rd_command);
+  }
+#endif  // CONFIG_RD_COMMAND
   if (allow_tpl == 0) {
     // Avoid the use of unintended TPL stats from previous GOP's results.
-    if (cpi->gf_frame_index == 0) av1_init_tpl_stats(&cpi->tpl_data);
+    if (cpi->gf_frame_index == 0 && !is_stat_generation_stage(cpi))
+      av1_init_tpl_stats(&cpi->ppi->tpl_data);
   } else {
-    if (!cpi->tpl_data.skip_tpl_setup_stats)
+    if (!cpi->skip_tpl_setup_stats) {
+      av1_tpl_preload_rc_estimate(cpi, frame_params);
       av1_tpl_setup_stats(cpi, 0, frame_params, frame_input);
+#if CONFIG_BITRATE_ACCURACY
+      av1_vbr_rc_update_q_index_list(&cpi->vbr_rc_info, &cpi->ppi->tpl_data,
+                                     gf_group, cpi->gf_frame_index,
+                                     cm->seq_params->bit_depth);
+#endif
+    }
   }
 
   if (av1_encode(cpi, dest, frame_input, frame_params, frame_results) !=
@@ -988,6 +1071,7 @@
 }
 #endif  // !CONFIG_REALTIME_ONLY
 
+#if !CONFIG_FRAME_PARALLEL_ENCODE
 static INLINE int find_unused_ref_frame(const int *used_ref_frames,
                                         const int *stack, int stack_size) {
   for (int i = 0; i < stack_size; ++i) {
@@ -1003,13 +1087,303 @@
 
   return INVALID_IDX;
 }
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
 
-void av1_get_ref_frames(AV1_COMP *const cpi, RefBufferStack *ref_buffer_stack) {
-  AV1_COMMON *cm = &cpi->common;
-  int *const remapped_ref_idx = cm->remapped_ref_idx;
-  int *const arf_stack = ref_buffer_stack->arf_stack;
-  int *const lst_stack = ref_buffer_stack->lst_stack;
-  int *const gld_stack = ref_buffer_stack->gld_stack;
+#if CONFIG_FRAME_PARALLEL_ENCODE
+/*!\cond */
+// Struct to keep track of relevant reference frame data.
+typedef struct {
+  int map_idx;
+  int disp_order;
+  int pyr_level;
+  int used;
+} RefBufMapData;
+/*!\endcond */
+
+// Comparison function to sort reference frames in ascending display order.
+static int compare_map_idx_pair_asc(const void *a, const void *b) {
+  if (((RefBufMapData *)a)->disp_order == ((RefBufMapData *)b)->disp_order) {
+    return 0;
+  } else if (((const RefBufMapData *)a)->disp_order >
+             ((const RefBufMapData *)b)->disp_order) {
+    return 1;
+  } else {
+    return -1;
+  }
+}
+
+// Checks to see if a particular reference frame is already in the reference
+// frame map.
+static int is_in_ref_map(RefBufMapData *map, int disp_order, int n_frames) {
+  for (int i = 0; i < n_frames; i++) {
+    if (disp_order == map[i].disp_order) return 1;
+  }
+  return 0;
+}
+
+// Add a reference buffer index to a named reference slot.
+static void add_ref_to_slot(RefBufMapData *ref, int *const remapped_ref_idx,
+                            int frame) {
+  remapped_ref_idx[frame - LAST_FRAME] = ref->map_idx;
+  ref->used = 1;
+}
+
+// Threshold dictating when we are allowed to start considering
+// leaving lowest level frames unmapped.
+#define LOW_LEVEL_FRAMES_TR 5
+
+// Find which reference buffer should be left out of the named mapping.
+// This is because there are 8 reference buffers and only 7 named slots.
+static void set_unmapped_ref(RefBufMapData *buffer_map, int n_bufs,
+                             int n_min_level_refs, int min_level,
+                             int cur_frame_disp) {
+  int max_dist = 0;
+  int unmapped_idx = -1;
+  if (n_bufs <= ALTREF_FRAME) return;
+  for (int i = 0; i < n_bufs; i++) {
+    if (buffer_map[i].used) continue;
+    if (buffer_map[i].pyr_level != min_level ||
+        n_min_level_refs >= LOW_LEVEL_FRAMES_TR) {
+      int dist = abs(cur_frame_disp - buffer_map[i].disp_order);
+      if (dist > max_dist) {
+        max_dist = dist;
+        unmapped_idx = i;
+      }
+    }
+  }
+  assert(unmapped_idx >= 0 && "Unmapped reference not found");
+  buffer_map[unmapped_idx].used = 1;
+}
+
+static void get_ref_frames(RefFrameMapPair ref_frame_map_pairs[REF_FRAMES],
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+                           const AV1_COMP *const cpi, int gf_index,
+                           int is_parallel_encode,
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+                           int cur_frame_disp,
+                           int remapped_ref_idx[REF_FRAMES]) {
+  int buf_map_idx = 0;
+
+  // Initialize reference frame mappings.
+  for (int i = 0; i < REF_FRAMES; ++i) remapped_ref_idx[i] = INVALID_IDX;
+
+  RefBufMapData buffer_map[REF_FRAMES];
+  int n_bufs = 0;
+  memset(buffer_map, 0, REF_FRAMES * sizeof(buffer_map[0]));
+  int min_level = MAX_ARF_LAYERS;
+  int max_level = 0;
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+  GF_GROUP *gf_group = &cpi->ppi->gf_group;
+  int skip_ref_unmapping = 0;
+  int is_one_pass_rt = is_one_pass_rt_params(cpi);
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+
+  // Go through current reference buffers and store display order, pyr level,
+  // and map index.
+  for (int map_idx = 0; map_idx < REF_FRAMES; map_idx++) {
+    // Get reference frame buffer.
+    RefFrameMapPair ref_pair = ref_frame_map_pairs[map_idx];
+    if (ref_pair.disp_order == -1) continue;
+    const int frame_order = ref_pair.disp_order;
+    // Avoid duplicates.
+    if (is_in_ref_map(buffer_map, frame_order, n_bufs)) continue;
+    const int reference_frame_level = ref_pair.pyr_level;
+
+    // Keep track of the lowest and highest levels that currently exist.
+    if (reference_frame_level < min_level) min_level = reference_frame_level;
+    if (reference_frame_level > max_level) max_level = reference_frame_level;
+
+    buffer_map[n_bufs].map_idx = map_idx;
+    buffer_map[n_bufs].disp_order = frame_order;
+    buffer_map[n_bufs].pyr_level = reference_frame_level;
+    buffer_map[n_bufs].used = 0;
+    n_bufs++;
+  }
+
+  // Sort frames in ascending display order.
+  qsort(buffer_map, n_bufs, sizeof(buffer_map[0]), compare_map_idx_pair_asc);
+
+  int n_min_level_refs = 0;
+  int n_past_high_level = 0;
+  int closest_past_ref = -1;
+  int golden_idx = -1;
+  int altref_idx = -1;
+
+  // Find the GOLDEN_FRAME and BWDREF_FRAME.
+  // Also collect various stats about the reference frames for the remaining
+  // mappings.
+  for (int i = n_bufs - 1; i >= 0; i--) {
+    if (buffer_map[i].pyr_level == min_level) {
+      // Keep track of the number of lowest level frames.
+      n_min_level_refs++;
+      if (buffer_map[i].disp_order < cur_frame_disp && golden_idx == -1 &&
+          remapped_ref_idx[GOLDEN_FRAME - LAST_FRAME] == INVALID_IDX) {
+        // Save index for GOLDEN.
+        golden_idx = i;
+      } else if (buffer_map[i].disp_order > cur_frame_disp &&
+                 altref_idx == -1 &&
+                 remapped_ref_idx[ALTREF_FRAME - LAST_FRAME] == INVALID_IDX) {
+        // Save index for ALTREF.
+        altref_idx = i;
+      }
+    } else if (buffer_map[i].disp_order == cur_frame_disp) {
+      // Map the BWDREF_FRAME if this is the show_existing_frame.
+      add_ref_to_slot(&buffer_map[i], remapped_ref_idx, BWDREF_FRAME);
+    }
+
+    // Keep track of the number of past frames that are not at the lowest level.
+    if (buffer_map[i].disp_order < cur_frame_disp &&
+        buffer_map[i].pyr_level != min_level)
+      n_past_high_level++;
+
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+    // During parallel encodes of lower layer frames, exclude the first frame
+    // (frame_parallel_level 1) from being used for the reference assignment of
+    // the second frame (frame_parallel_level 2).
+    if (!is_one_pass_rt && gf_group->frame_parallel_level[gf_index] == 2 &&
+        gf_group->frame_parallel_level[gf_index - 1] == 1 &&
+        gf_group->update_type[gf_index - 1] == INTNL_ARF_UPDATE) {
+      assert(gf_group->update_type[gf_index] == INTNL_ARF_UPDATE);
+
+      // If parallel cpis are active, use ref_idx_to_skip, else, use display
+      // index.
+      assert(IMPLIES(is_parallel_encode, cpi->ref_idx_to_skip != INVALID_IDX));
+      assert(IMPLIES(!is_parallel_encode,
+                     gf_group->skip_frame_as_ref[gf_index] != INVALID_IDX));
+      buffer_map[i].used = is_parallel_encode
+                               ? (buffer_map[i].map_idx == cpi->ref_idx_to_skip)
+                               : (buffer_map[i].disp_order ==
+                                  gf_group->skip_frame_as_ref[gf_index]);
+      // In case a ref frame is excluded from being used during assignment,
+      // skip the call to set_unmapped_ref(). Applicable in steady state.
+      if (buffer_map[i].used) skip_ref_unmapping = 1;
+    }
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+
+    // Keep track of where the frames change from being past frames to future
+    // frames.
+    if (buffer_map[i].disp_order < cur_frame_disp && closest_past_ref < 0)
+      closest_past_ref = i;
+  }
+
+  // Do not map GOLDEN and ALTREF based on their pyramid level if all reference
+  // frames have the same level.
+  if (n_min_level_refs <= n_bufs) {
+    // Map the GOLDEN_FRAME.
+    if (golden_idx > -1)
+      add_ref_to_slot(&buffer_map[golden_idx], remapped_ref_idx, GOLDEN_FRAME);
+    // Map the ALTREF_FRAME.
+    if (altref_idx > -1)
+      add_ref_to_slot(&buffer_map[altref_idx], remapped_ref_idx, ALTREF_FRAME);
+  }
+
+  // Find the buffer to be excluded from the mapping.
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+  if (!skip_ref_unmapping)
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+    set_unmapped_ref(buffer_map, n_bufs, n_min_level_refs, min_level,
+                     cur_frame_disp);
+
+  // Place past frames in LAST_FRAME, LAST2_FRAME, and LAST3_FRAME.
+  for (int frame = LAST_FRAME; frame < GOLDEN_FRAME; frame++) {
+    // Continue if the current ref slot is already full.
+    if (remapped_ref_idx[frame - LAST_FRAME] != INVALID_IDX) continue;
+    // Find the next unmapped reference buffer
+    // in decreasing output order relative to current picture.
+    int next_buf_max = 0;
+    int next_disp_order = INT_MIN;
+    for (buf_map_idx = n_bufs - 1; buf_map_idx >= 0; buf_map_idx--) {
+      if (!buffer_map[buf_map_idx].used &&
+          buffer_map[buf_map_idx].disp_order < cur_frame_disp &&
+          buffer_map[buf_map_idx].disp_order > next_disp_order) {
+        next_disp_order = buffer_map[buf_map_idx].disp_order;
+        next_buf_max = buf_map_idx;
+      }
+    }
+    buf_map_idx = next_buf_max;
+    if (buf_map_idx < 0) break;
+    if (buffer_map[buf_map_idx].used) break;
+    add_ref_to_slot(&buffer_map[buf_map_idx], remapped_ref_idx, frame);
+  }
+
+  // Place future frames (if there are any) in BWDREF_FRAME and ALTREF2_FRAME.
+  for (int frame = BWDREF_FRAME; frame < REF_FRAMES; frame++) {
+    // Continue if the current ref slot is already full.
+    if (remapped_ref_idx[frame - LAST_FRAME] != INVALID_IDX) continue;
+    // Find the next unmapped reference buffer
+    // in increasing output order relative to current picture.
+    int next_buf_max = 0;
+    int next_disp_order = INT_MAX;
+    for (buf_map_idx = n_bufs - 1; buf_map_idx >= 0; buf_map_idx--) {
+      if (!buffer_map[buf_map_idx].used &&
+          buffer_map[buf_map_idx].disp_order > cur_frame_disp &&
+          buffer_map[buf_map_idx].disp_order < next_disp_order) {
+        next_disp_order = buffer_map[buf_map_idx].disp_order;
+        next_buf_max = buf_map_idx;
+      }
+    }
+    buf_map_idx = next_buf_max;
+    if (buf_map_idx < 0) break;
+    if (buffer_map[buf_map_idx].used) break;
+    add_ref_to_slot(&buffer_map[buf_map_idx], remapped_ref_idx, frame);
+  }
+
+  // Place remaining past frames.
+  buf_map_idx = closest_past_ref;
+  for (int frame = LAST_FRAME; frame < REF_FRAMES; frame++) {
+    // Continue if the current ref slot is already full.
+    if (remapped_ref_idx[frame - LAST_FRAME] != INVALID_IDX) continue;
+    // Find the next unmapped reference buffer.
+    for (; buf_map_idx >= 0; buf_map_idx--) {
+      if (!buffer_map[buf_map_idx].used) break;
+    }
+    if (buf_map_idx < 0) break;
+    if (buffer_map[buf_map_idx].used) break;
+    add_ref_to_slot(&buffer_map[buf_map_idx], remapped_ref_idx, frame);
+  }
+
+  // Place remaining future frames.
+  buf_map_idx = n_bufs - 1;
+  for (int frame = ALTREF_FRAME; frame >= LAST_FRAME; frame--) {
+    // Continue if the current ref slot is already full.
+    if (remapped_ref_idx[frame - LAST_FRAME] != INVALID_IDX) continue;
+    // Find the next unmapped reference buffer.
+    for (; buf_map_idx > closest_past_ref; buf_map_idx--) {
+      if (!buffer_map[buf_map_idx].used) break;
+    }
+    if (buf_map_idx < 0) break;
+    if (buffer_map[buf_map_idx].used) break;
+    add_ref_to_slot(&buffer_map[buf_map_idx], remapped_ref_idx, frame);
+  }
+
+  // Fill any slots that are empty (should only happen for the first 7 frames).
+  for (int i = 0; i < REF_FRAMES; ++i)
+    if (remapped_ref_idx[i] == INVALID_IDX) remapped_ref_idx[i] = 0;
+}
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+
+void av1_get_ref_frames(const RefBufferStack *ref_buffer_stack,
+#if CONFIG_FRAME_PARALLEL_ENCODE
+                        RefFrameMapPair ref_frame_map_pairs[REF_FRAMES],
+                        int cur_frame_disp,
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+                        const AV1_COMP *cpi, int gf_index,
+                        int is_parallel_encode,
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+                        int remapped_ref_idx[REF_FRAMES]) {
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  (void)ref_buffer_stack;
+  get_ref_frames(ref_frame_map_pairs,
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+                 cpi, gf_index, is_parallel_encode,
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+                 cur_frame_disp, remapped_ref_idx);
+  return;
+#else
+  const int *const arf_stack = ref_buffer_stack->arf_stack;
+  const int *const lst_stack = ref_buffer_stack->lst_stack;
+  const int *const gld_stack = ref_buffer_stack->gld_stack;
   const int arf_stack_size = ref_buffer_stack->arf_stack_size;
   const int lst_stack_size = ref_buffer_stack->lst_stack_size;
   const int gld_stack_size = ref_buffer_stack->gld_stack_size;
@@ -1080,16 +1454,17 @@
       remapped_ref_idx[idx] = ref_buffer_stack->gld_stack[0];
     }
   }
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
 }
 
 int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size,
                         uint8_t *const dest, unsigned int *frame_flags,
                         int64_t *const time_stamp, int64_t *const time_end,
                         const aom_rational64_t *const timestamp_ratio,
-                        int flush) {
+                        int *const pop_lookahead, int flush) {
   AV1EncoderConfig *const oxcf = &cpi->oxcf;
   AV1_COMMON *const cm = &cpi->common;
-  GF_GROUP *gf_group = &cpi->gf_group;
+  GF_GROUP *gf_group = &cpi->ppi->gf_group;
   ExternalFlags *const ext_flags = &cpi->ext_flags;
   GFConfig *const gf_cfg = &oxcf->gf_cfg;
 
@@ -1113,9 +1488,10 @@
 
   if (!av1_lookahead_peek(cpi->ppi->lookahead, 0, cpi->compressor_stage)) {
 #if !CONFIG_REALTIME_ONLY
-    if (flush && oxcf->pass == 1 && !cpi->twopass.first_pass_done) {
+    if (flush && oxcf->pass == AOM_RC_FIRST_PASS &&
+        !cpi->ppi->twopass.first_pass_done) {
       av1_end_first_pass(cpi); /* get last stats packet */
-      cpi->twopass.first_pass_done = 1;
+      cpi->ppi->twopass.first_pass_done = 1;
     }
 #endif
     return -1;
@@ -1129,15 +1505,26 @@
         AOMMIN(gf_cfg->gf_min_pyr_height, gf_cfg->gf_max_pyr_height);
   }
 
-  cpi->tpl_data.skip_tpl_setup_stats = 0;
+  cpi->skip_tpl_setup_stats = 0;
 #if !CONFIG_REALTIME_ONLY
-  const int use_one_pass_rt_params = has_no_stats_stage(cpi) &&
-                                     oxcf->mode == REALTIME &&
-                                     gf_cfg->lag_in_frames == 0;
+  cpi->twopass_frame.this_frame = NULL;
+  const int use_one_pass_rt_params = is_one_pass_rt_params(cpi);
   if (!use_one_pass_rt_params && !is_stat_generation_stage(cpi)) {
 #if CONFIG_COLLECT_COMPONENT_TIMING
     start_timing(cpi, av1_get_second_pass_params_time);
 #endif
+
+#if CONFIG_FRAME_PARALLEL_ENCODE
+    // Initialise frame_level_rate_correction_factors with value previous
+    // to the parallel frames.
+    if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) {
+      for (int i = 0; i < RATE_FACTOR_LEVELS; i++)
+        cpi->rc.frame_level_rate_correction_factors[i] =
+            cpi->ppi->p_rc.rate_correction_factors[i];
+    }
+    // copy mv_stats from ppi to frame_level cpi.
+    cpi->mv_stats = cpi->ppi->mv_stats;
+#endif
     av1_get_second_pass_params(cpi, &frame_params, &frame_input, *frame_flags);
 #if CONFIG_COLLECT_COMPONENT_TIMING
     end_timing(cpi, av1_get_second_pass_params_time);
@@ -1146,15 +1533,13 @@
 #endif
 
   if (!is_stat_generation_stage(cpi)) {
-    // If this is a forward keyframe, mark as a show_existing_frame
-    // TODO(bohanli): find a consistent condition for fwd keyframes
-    if (oxcf->kf_cfg.fwd_kf_enabled &&
-        gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE &&
-        cpi->rc.frames_to_key == 0) {
+    // TODO(jingning): fwd key frame always uses show existing frame?
+    if (gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE &&
+        gf_group->refbuf_state[cpi->gf_frame_index] == REFBUF_RESET) {
       frame_params.show_existing_frame = 1;
     } else {
       frame_params.show_existing_frame =
-          (cpi->show_existing_alt_ref &&
+          (cpi->ppi->show_existing_alt_ref &&
            gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE) ||
           gf_group->update_type[cpi->gf_frame_index] == INTNL_OVERLAY_UPDATE;
     }
@@ -1162,7 +1547,7 @@
 
     // Reset show_existing_alt_ref decision to 0 after it is used.
     if (gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE) {
-      cpi->show_existing_alt_ref = 0;
+      cpi->ppi->show_existing_alt_ref = 0;
     }
   } else {
     frame_params.show_existing_frame = 0;
@@ -1170,25 +1555,32 @@
 
   struct lookahead_entry *source = NULL;
   struct lookahead_entry *last_source = NULL;
-  int pop_lookahead = 0;
   if (frame_params.show_existing_frame) {
     source = av1_lookahead_peek(cpi->ppi->lookahead, 0, cpi->compressor_stage);
-    pop_lookahead = 1;
+    *pop_lookahead = 1;
     frame_params.show_frame = 1;
   } else {
-    source = choose_frame_source(cpi, &flush, &pop_lookahead, &last_source,
+    source = choose_frame_source(cpi, &flush, pop_lookahead, &last_source,
                                  &frame_params);
   }
 
   if (source == NULL) {  // If no source was found, we can't encode a frame.
 #if !CONFIG_REALTIME_ONLY
-    if (flush && oxcf->pass == 1 && !cpi->twopass.first_pass_done) {
+    if (flush && oxcf->pass == AOM_RC_FIRST_PASS &&
+        !cpi->ppi->twopass.first_pass_done) {
       av1_end_first_pass(cpi); /* get last stats packet */
-      cpi->twopass.first_pass_done = 1;
+      cpi->ppi->twopass.first_pass_done = 1;
     }
 #endif
     return -1;
   }
+
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  // reset src_offset to allow actual encode call for this frame to get its
+  // source.
+  gf_group->src_offset[cpi->gf_frame_index] = 0;
+#endif
+
   // Source may be changed if temporal filtered later.
   frame_input.source = &source->img;
   frame_input.last_source = last_source != NULL ? &last_source->img : NULL;
@@ -1217,7 +1609,7 @@
           &cm->film_grain_params);
     } else {
       cm->cur_frame->film_grain_params_present =
-          cm->seq_params.film_grain_params_present;
+          cm->seq_params->film_grain_params_present;
     }
     // only one operating point supported now
     const int64_t pts64 = ticks_to_timebase_units(timestamp_ratio, *time_stamp);
@@ -1225,19 +1617,25 @@
     cm->frame_presentation_time = (uint32_t)pts64;
   }
 
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, av1_get_one_pass_rt_params_time);
+#endif
 #if CONFIG_REALTIME_ONLY
   av1_get_one_pass_rt_params(cpi, &frame_params, *frame_flags);
-  if (cpi->oxcf.speed >= 5 && cm->number_spatial_layers == 1 &&
-      cm->number_temporal_layers == 1)
+  if (cpi->oxcf.speed >= 5 && cpi->ppi->number_spatial_layers == 1 &&
+      cpi->ppi->number_temporal_layers == 1)
     av1_set_reference_structure_one_pass_rt(cpi, cpi->gf_frame_index == 0);
 #else
   if (use_one_pass_rt_params) {
     av1_get_one_pass_rt_params(cpi, &frame_params, *frame_flags);
-    if (cpi->oxcf.speed >= 5 && cm->number_spatial_layers == 1 &&
-        cm->number_temporal_layers == 1)
+    if (cpi->oxcf.speed >= 5 && cpi->ppi->number_spatial_layers == 1 &&
+        cpi->ppi->number_temporal_layers == 1)
       av1_set_reference_structure_one_pass_rt(cpi, cpi->gf_frame_index == 0);
   }
 #endif
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, av1_get_one_pass_rt_params_time);
+#endif
 
   FRAME_UPDATE_TYPE frame_update_type =
       get_frame_update_type(gf_group, cpi->gf_frame_index);
@@ -1296,19 +1694,40 @@
        frame_params.frame_type == S_FRAME) &&
       !frame_params.show_existing_frame;
 
-  av1_configure_buffer_updates(cpi, &frame_params.refresh_frame,
-                               frame_update_type, frame_params.frame_type,
-                               force_refresh_all);
+  av1_configure_buffer_updates(
+      cpi, &frame_params.refresh_frame, frame_update_type,
+      gf_group->refbuf_state[cpi->gf_frame_index], force_refresh_all);
 
   if (!is_stat_generation_stage(cpi)) {
     const RefCntBuffer *ref_frames[INTER_REFS_PER_FRAME];
     const YV12_BUFFER_CONFIG *ref_frame_buf[INTER_REFS_PER_FRAME];
 
-    if (!ext_flags->refresh_frame.update_pending) {
-      av1_get_ref_frames(cpi, &cpi->ref_buffer_stack);
-    } else if (cpi->svc.set_ref_frame_config) {
-      for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++)
-        cm->remapped_ref_idx[i] = cpi->svc.ref_idx[i];
+#if CONFIG_FRAME_PARALLEL_ENCODE
+    RefFrameMapPair ref_frame_map_pairs[REF_FRAMES];
+    init_ref_map_pair(cpi, ref_frame_map_pairs);
+    const int order_offset = gf_group->arf_src_offset[cpi->gf_frame_index];
+    const int cur_frame_disp =
+        cpi->common.current_frame.frame_number + order_offset;
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+
+#if CONFIG_FRAME_PARALLEL_ENCODE
+    if (gf_group->frame_parallel_level[cpi->gf_frame_index] == 0) {
+#else
+    {
+#endif
+      if (!ext_flags->refresh_frame.update_pending) {
+        av1_get_ref_frames(&cpi->ref_buffer_stack,
+#if CONFIG_FRAME_PARALLEL_ENCODE
+                           ref_frame_map_pairs, cur_frame_disp,
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+                           cpi, cpi->gf_frame_index, 1,
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+                           cm->remapped_ref_idx);
+      } else if (cpi->svc.set_ref_frame_config) {
+        for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++)
+          cm->remapped_ref_idx[i] = cpi->svc.ref_idx[i];
+      }
     }
 
     // Get the reference frames
@@ -1318,22 +1737,72 @@
     }
 
     // Work out which reference frame slots may be used.
-    frame_params.ref_frame_flags = get_ref_frame_flags(
-        &cpi->sf, ref_frame_buf, ext_flags->ref_frame_flags);
+    frame_params.ref_frame_flags =
+        get_ref_frame_flags(&cpi->sf, is_one_pass_rt_params(cpi), ref_frame_buf,
+                            ext_flags->ref_frame_flags);
 
+#if CONFIG_FRAME_PARALLEL_ENCODE
+    // Set primary_ref_frame of non-reference frames as PRIMARY_REF_NONE.
+    if (cpi->ppi->gf_group.is_frame_non_ref[cpi->gf_frame_index]) {
+      frame_params.primary_ref_frame = PRIMARY_REF_NONE;
+    } else {
+      frame_params.primary_ref_frame =
+          choose_primary_ref_frame(cpi, &frame_params);
+    }
+#else
     frame_params.primary_ref_frame =
         choose_primary_ref_frame(cpi, &frame_params);
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+
     frame_params.order_offset = gf_group->arf_src_offset[cpi->gf_frame_index];
 
-    frame_params.refresh_frame_flags = av1_get_refresh_frame_flags(
-        cpi, &frame_params, frame_update_type, &cpi->ref_buffer_stack);
+#if CONFIG_FRAME_PARALLEL_ENCODE
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+    // Call av1_get_refresh_frame_flags() if refresh index not available.
+    if (!cpi->refresh_idx_available) {
+#endif
+#endif
+      frame_params.refresh_frame_flags = av1_get_refresh_frame_flags(
+          cpi, &frame_params, frame_update_type, cpi->gf_frame_index,
+#if CONFIG_FRAME_PARALLEL_ENCODE
+          cur_frame_disp, ref_frame_map_pairs,
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+          &cpi->ref_buffer_stack);
+#if CONFIG_FRAME_PARALLEL_ENCODE
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+    } else {
+      assert(cpi->ref_refresh_index != INVALID_IDX);
+      frame_params.refresh_frame_flags = (1 << cpi->ref_refresh_index);
+    }
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
 
+#if CONFIG_FRAME_PARALLEL_ENCODE
+    // Make the frames marked as is_frame_non_ref to non-reference frames.
+    if (gf_group->is_frame_non_ref[cpi->gf_frame_index])
+      frame_params.refresh_frame_flags = 0;
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+
+#if CONFIG_FRAME_PARALLEL_ENCODE
+    frame_params.existing_fb_idx_to_show = INVALID_IDX;
+    // Find the frame buffer to show based on display order.
+    if (frame_params.show_existing_frame) {
+      for (int frame = 0; frame < REF_FRAMES; frame++) {
+        const RefCntBuffer *const buf = cm->ref_frame_map[frame];
+        if (buf == NULL) continue;
+        const int frame_order = (int)buf->display_order_hint;
+        if (frame_order == cur_frame_disp)
+          frame_params.existing_fb_idx_to_show = frame;
+      }
+    }
+#else
     frame_params.existing_fb_idx_to_show =
         frame_params.show_existing_frame
             ? (frame_update_type == INTNL_OVERLAY_UPDATE
                    ? get_ref_frame_map_idx(cm, BWDREF_FRAME)
                    : get_ref_frame_map_idx(cm, ALTREF_FRAME))
             : INVALID_IDX;
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
   }
 
   // The way frame_params->remapped_ref_idx is setup is a placeholder.
@@ -1371,31 +1840,36 @@
   }
 #endif  // CONFIG_REALTIME_ONLY
 
+  // As the frame_update_type can get modified as part of
+  // av1_adjust_gf_refresh_qp_one_pass_rt
+  frame_update_type = get_frame_update_type(gf_group, cpi->gf_frame_index);
   if (!is_stat_generation_stage(cpi)) {
     // First pass doesn't modify reference buffer assignment or produce frame
     // flags
     update_frame_flags(&cpi->common, &cpi->refresh_frame, frame_flags);
+    set_additional_frame_flags(cm, frame_flags);
+#if !CONFIG_FRAME_PARALLEL_ENCODE
     if (!ext_flags->refresh_frame.update_pending) {
       int ref_map_index =
           av1_get_refresh_ref_frame_map(cm->current_frame.refresh_frame_flags);
-      av1_update_ref_frame_map(cpi, frame_update_type, frame_params.frame_type,
-                               cm->show_existing_frame, ref_map_index,
-                               &cpi->ref_buffer_stack);
+      av1_update_ref_frame_map(cpi, frame_update_type,
+                               gf_group->refbuf_state[cpi->gf_frame_index],
+                               ref_map_index, &cpi->ref_buffer_stack);
     }
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
   }
 
 #if !CONFIG_REALTIME_ONLY
-  if (!is_stat_generation_stage(cpi)) {
 #if TXCOEFF_COST_TIMER
+  if (!is_stat_generation_stage(cpi)) {
     cm->cum_txcoeff_cost_timer += cm->txcoeff_cost_timer;
     fprintf(stderr,
             "\ntxb coeff cost block number: %ld, frame time: %ld, cum time %ld "
             "in us\n",
             cm->txcoeff_cost_count, cm->txcoeff_cost_timer,
             cm->cum_txcoeff_cost_timer);
-#endif
-    if (!has_no_stats_stage(cpi)) av1_twopass_postencode_update(cpi);
   }
+#endif
 #endif  // !CONFIG_REALTIME_ONLY
 
 #if CONFIG_TUNE_VMAF
@@ -1405,15 +1879,6 @@
     av1_update_vmaf_curve(cpi);
   }
 #endif
-  if (pop_lookahead == 1) {
-    av1_lookahead_pop(cpi->ppi->lookahead, flush, cpi->compressor_stage);
-  }
-
-  if (!is_stat_generation_stage(cpi)) {
-    update_fb_of_context_type(cpi, &frame_params, cpi->fb_of_context_type);
-    set_additional_frame_flags(cm, frame_flags);
-    update_rc_counts(cpi);
-  }
 
   // Unpack frame_results:
   *size = frame_results.size;
@@ -1423,7 +1888,5 @@
     cpi->droppable = is_frame_droppable(&cpi->svc, &ext_flags->refresh_frame);
   }
 
-  if (cpi->use_svc) av1_save_layer_context(cpi);
-
   return AOM_CODEC_OK;
 }
diff --git a/av1/encoder/encode_strategy.h b/av1/encoder/encode_strategy.h
index 351e8a1..15681c3 100644
--- a/av1/encoder/encode_strategy.h
+++ b/av1/encoder/encode_strategy.h
@@ -44,6 +44,7 @@
  * \param[out]   time_stamp  Time stamp of the frame
  * \param[out]   time_end    Time end
  * \param[in]    timestamp_ratio Time base
+ * \param[in]    pop_lookahead Decide to pop the source frame from queue
  * \param[in]    flush       Decide to encode one frame or the rest of frames
  *
  * \return Returns a value to indicate if the encoding is done successfully.
@@ -55,35 +56,102 @@
                         uint8_t *const dest, unsigned int *frame_flags,
                         int64_t *const time_stamp, int64_t *const time_end,
                         const aom_rational64_t *const timestamp_ratio,
-                        int flush);
+                        int *const pop_lookahead, int flush);
 
 /*!\cond */
 // Set individual buffer update flags based on frame reference type.
 // force_refresh_all is used when we have a KEY_FRAME or S_FRAME.  It forces all
 // refresh_*_frame flags to be set, because we refresh all buffers in this case.
-void av1_configure_buffer_updates(
-    AV1_COMP *const cpi, RefreshFrameFlagsInfo *const refresh_frame_flags,
-    const FRAME_UPDATE_TYPE type, const FRAME_TYPE frame_type,
-    int force_refresh_all);
+void av1_configure_buffer_updates(AV1_COMP *const cpi,
+                                  RefreshFrameInfo *const refresh_frame,
+                                  const FRAME_UPDATE_TYPE type,
+                                  const REFBUF_STATE refbuf_state,
+                                  int force_refresh_all);
 
 int av1_get_refresh_frame_flags(const AV1_COMP *const cpi,
                                 const EncodeFrameParams *const frame_params,
                                 FRAME_UPDATE_TYPE frame_update_type,
+                                int gf_index,
+#if CONFIG_FRAME_PARALLEL_ENCODE
+                                int cur_disp_order,
+                                RefFrameMapPair ref_frame_map_pairs[REF_FRAMES],
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
                                 const RefBufferStack *const ref_buffer_stack);
 
 int av1_get_refresh_ref_frame_map(int refresh_frame_flags);
 
-void av1_update_ref_frame_map(AV1_COMP *cpi,
+void av1_update_ref_frame_map(const AV1_COMP *cpi,
                               FRAME_UPDATE_TYPE frame_update_type,
-                              FRAME_TYPE frame_type, int show_existing_frame,
-                              int ref_map_index,
+                              REFBUF_STATE refbuf_state, int ref_map_index,
                               RefBufferStack *ref_buffer_stack);
 
-void av1_get_ref_frames(AV1_COMP *const cpi, RefBufferStack *ref_buffer_stack);
+/*!\brief Obtain indices of reference frames from reference frame buffer stacks
+ *
+ * \callgraph
+ * \callergraph
+ *
+ * \param[in]    ref_buffer_stack  Data structure for reference frame buffer
+ *                                 stacks.
+ * \param[out]   remapped_ref_idx  An array for storing indices of reference
+ *                                 frames. The index is used to retrieve a
+ *                                 reference frame buffer from ref_frame_map
+ *                                 in AV1Common.
+ */
+void av1_get_ref_frames(const RefBufferStack *ref_buffer_stack,
+#if CONFIG_FRAME_PARALLEL_ENCODE
+                        RefFrameMapPair ref_frame_map_pairs[REF_FRAMES],
+                        int cur_frame_disp,
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+                        const AV1_COMP *cpi, int gf_index,
+                        int is_parallel_encode,
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+                        int remapped_ref_idx[REF_FRAMES]);
 
 int is_forced_keyframe_pending(struct lookahead_ctx *lookahead,
                                const int up_to_index,
                                const COMPRESSOR_STAGE compressor_stage);
+
+static AOM_INLINE int is_frame_droppable(
+    const SVC *const svc,
+    const ExtRefreshFrameFlagsInfo *const ext_refresh_frame_flags) {
+  // Droppable frame is only used by external refresh flags. VoD setting won't
+  // trigger its use case.
+  if (svc->set_ref_frame_config)
+    return svc->non_reference_frame;
+  else if (ext_refresh_frame_flags->update_pending)
+    return !(ext_refresh_frame_flags->alt_ref_frame ||
+             ext_refresh_frame_flags->alt2_ref_frame ||
+             ext_refresh_frame_flags->bwd_ref_frame ||
+             ext_refresh_frame_flags->golden_frame ||
+             ext_refresh_frame_flags->last_frame);
+  else
+    return 0;
+}
+
+static AOM_INLINE int get_current_frame_ref_type(const AV1_COMP *const cpi) {
+  // We choose the reference "type" of this frame from the flags which indicate
+  // which reference frames will be refreshed by it. More than one of these
+  // flags may be set, so the order here implies an order of precedence. This is
+  // just used to choose the primary_ref_frame (as the most recent reference
+  // buffer of the same reference-type as the current frame).
+
+  switch (cpi->ppi->gf_group.layer_depth[cpi->gf_frame_index]) {
+    case 0: return 0;
+    case 1: return 1;
+    case MAX_ARF_LAYERS:
+    case MAX_ARF_LAYERS + 1: return 4;
+    default: return 7;
+  }
+}
+
+#if CONFIG_FRAME_PARALLEL_ENCODE
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+int av1_calc_refresh_idx_for_intnl_arf(
+    AV1_COMP *cpi, RefFrameMapPair ref_frame_map_pairs[REF_FRAMES],
+    int gf_index);
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
 /*!\endcond */
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index e431448..ff9b8e0 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -23,7 +23,6 @@
 #include "aom_dsp/binary_codes_writer.h"
 #include "aom_ports/mem.h"
 #include "aom_ports/aom_timer.h"
-#include "aom_ports/system_state.h"
 
 #if CONFIG_MISMATCH_DEBUG
 #include "aom_util/debug_util.h"
@@ -44,6 +43,7 @@
 #include "av1/common/tile_common.h"
 #include "av1/common/warped_motion.h"
 
+#include "av1/encoder/allintra_vis.h"
 #include "av1/encoder/aq_complexity.h"
 #include "av1/encoder/aq_cyclicrefresh.h"
 #include "av1/encoder/aq_variance.h"
@@ -55,6 +55,7 @@
 #include "av1/encoder/encodetxb.h"
 #include "av1/encoder/ethread.h"
 #include "av1/encoder/extend.h"
+#include "av1/encoder/intra_mode_search_utils.h"
 #include "av1/encoder/ml.h"
 #include "av1/encoder/motion_search_facade.h"
 #include "av1/encoder/partition_strategy.h"
@@ -150,7 +151,7 @@
                                            BLOCK_SIZE bs) {
   unsigned int sse;
   const unsigned int var =
-      cpi->fn_ptr[bs].vf(ref->buf, ref->stride, AV1_VAR_OFFS, 0, &sse);
+      cpi->ppi->fn_ptr[bs].vf(ref->buf, ref->stride, AV1_VAR_OFFS, 0, &sse);
   return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]);
 }
 
@@ -163,42 +164,12 @@
   const uint16_t *high_var_offs[3] = { AV1_HIGH_VAR_OFFS_8,
                                        AV1_HIGH_VAR_OFFS_10,
                                        AV1_HIGH_VAR_OFFS_12 };
-  var =
-      cpi->fn_ptr[bs].vf(ref->buf, ref->stride,
-                         CONVERT_TO_BYTEPTR(high_var_offs[off_index]), 0, &sse);
+  var = cpi->ppi->fn_ptr[bs].vf(ref->buf, ref->stride,
+                                CONVERT_TO_BYTEPTR(high_var_offs[off_index]), 0,
+                                &sse);
   return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]);
 }
 
-static unsigned int get_sby_perpixel_diff_variance(const AV1_COMP *const cpi,
-                                                   const struct buf_2d *ref,
-                                                   int mi_row, int mi_col,
-                                                   BLOCK_SIZE bs) {
-  unsigned int sse, var;
-  uint8_t *last_y;
-  const YV12_BUFFER_CONFIG *last =
-      get_ref_frame_yv12_buf(&cpi->common, LAST_FRAME);
-
-  assert(last != NULL);
-  last_y =
-      &last->y_buffer[mi_row * MI_SIZE * last->y_stride + mi_col * MI_SIZE];
-  var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride, last_y, last->y_stride, &sse);
-  return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]);
-}
-
-static BLOCK_SIZE get_rd_var_based_fixed_partition(AV1_COMP *cpi, MACROBLOCK *x,
-                                                   int mi_row, int mi_col) {
-  unsigned int var = get_sby_perpixel_diff_variance(
-      cpi, &x->plane[0].src, mi_row, mi_col, BLOCK_64X64);
-  if (var < 8)
-    return BLOCK_64X64;
-  else if (var < 128)
-    return BLOCK_32X32;
-  else if (var < 2048)
-    return BLOCK_16X16;
-  else
-    return BLOCK_8X8;
-}
-
 void av1_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src,
                           int mi_row, int mi_col, const int num_planes,
                           BLOCK_SIZE bsize) {
@@ -242,10 +213,11 @@
   const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
   assert(delta_q_info->delta_q_present_flag);
 
-  const BLOCK_SIZE sb_size = cm->seq_params.sb_size;
+  const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
   // Delta-q modulation based on variance
   av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, sb_size);
 
+  const int delta_q_res = delta_q_info->delta_q_res;
   int current_qindex = cm->quant_params.base_qindex;
   if (cpi->oxcf.q_cfg.deltaq_mode == DELTA_Q_PERCEPTUAL) {
     if (DELTA_Q_PERCEPTUAL_MODULATION == 1) {
@@ -265,30 +237,17 @@
     // Setup deltaq based on tpl stats
     current_qindex =
         av1_get_q_for_deltaq_objective(cpi, sb_size, mi_row, mi_col);
-  }
-
-  const int delta_q_res = delta_q_info->delta_q_res;
-  // Right now deltaq only works with tpl model. So if tpl is disabled, we set
-  // the current_qindex to base_qindex.
-  if (cpi->oxcf.algo_cfg.enable_tpl_model &&
-      cpi->oxcf.q_cfg.deltaq_mode != NO_DELTA_Q) {
-    current_qindex =
-        clamp(current_qindex, delta_q_res, 256 - delta_q_info->delta_q_res);
-  } else {
-    current_qindex = cm->quant_params.base_qindex;
+  } else if (cpi->oxcf.q_cfg.deltaq_mode == DELTA_Q_PERCEPTUAL_AI) {
+    current_qindex = av1_get_sbq_perceptual_ai(cpi, sb_size, mi_row, mi_col);
+  } else if (cpi->oxcf.q_cfg.deltaq_mode == DELTA_Q_USER_RATING_BASED) {
+    current_qindex = av1_get_sbq_user_rating_based(cpi, mi_row, mi_col);
+  } else if (cpi->oxcf.q_cfg.enable_hdr_deltaq) {
+    current_qindex = av1_get_q_for_hdr(cpi, x, sb_size, mi_row, mi_col);
   }
 
   MACROBLOCKD *const xd = &x->e_mbd;
-  const int sign_deltaq_index =
-      current_qindex - xd->current_base_qindex >= 0 ? 1 : -1;
-  const int deltaq_deadzone = delta_q_res / 4;
-  const int qmask = ~(delta_q_res - 1);
-  int abs_deltaq_index = abs(current_qindex - xd->current_base_qindex);
-  abs_deltaq_index = (abs_deltaq_index + deltaq_deadzone) & qmask;
-  current_qindex =
-      xd->current_base_qindex + sign_deltaq_index * abs_deltaq_index;
-  current_qindex = AOMMAX(current_qindex, MINQ + 1);
-  assert(current_qindex > 0);
+  current_qindex = av1_adjust_q_from_delta_q_res(
+      delta_q_res, xd->current_base_qindex, current_qindex);
 
   x->delta_qindex = current_qindex - cm->quant_params.base_qindex;
   av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
@@ -302,21 +261,21 @@
     const int delta_lf_res = delta_q_info->delta_lf_res;
     const int lfmask = ~(delta_lf_res - 1);
     const int delta_lf_from_base =
-        ((x->delta_qindex / 2 + delta_lf_res / 2) & lfmask);
+        ((x->delta_qindex / 4 + delta_lf_res / 2) & lfmask);
     const int8_t delta_lf =
         (int8_t)clamp(delta_lf_from_base, -MAX_LOOP_FILTER, MAX_LOOP_FILTER);
     const int frame_lf_count =
         av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
-    const int mib_size = cm->seq_params.mib_size;
+    const int mib_size = cm->seq_params->mib_size;
 
     // pre-set the delta lf for loop filter. Note that this value is set
     // before mi is assigned for each block in current superblock
     for (int j = 0; j < AOMMIN(mib_size, mi_params->mi_rows - mi_row); j++) {
       for (int k = 0; k < AOMMIN(mib_size, mi_params->mi_cols - mi_col); k++) {
         const int grid_idx = get_mi_grid_idx(mi_params, mi_row + j, mi_col + k);
-        mi_params->mi_grid_base[grid_idx]->delta_lf_from_base = delta_lf;
+        mi_params->mi_alloc[grid_idx].delta_lf_from_base = delta_lf;
         for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) {
-          mi_params->mi_grid_base[grid_idx]->delta_lf[lf_id] = delta_lf;
+          mi_params->mi_alloc[grid_idx].delta_lf[lf_id] = delta_lf;
         }
       }
     }
@@ -326,32 +285,32 @@
 static void init_ref_frame_space(AV1_COMP *cpi, ThreadData *td, int mi_row,
                                  int mi_col) {
   const AV1_COMMON *cm = &cpi->common;
-  const GF_GROUP *const gf_group = &cpi->gf_group;
+  const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
   MACROBLOCK *x = &td->mb;
   const int frame_idx = cpi->gf_frame_index;
-  TplParams *const tpl_data = &cpi->tpl_data;
+  TplParams *const tpl_data = &cpi->ppi->tpl_data;
   const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2;
 
   av1_zero(x->tpl_keep_ref_frame);
 
-  if (frame_idx >= MAX_TPL_FRAME_IDX) return;
-  TplDepFrame *tpl_frame = &tpl_data->tpl_frame[frame_idx];
-  if (tpl_frame->is_valid == 0) return;
+  if (!av1_tpl_stats_ready(tpl_data, frame_idx)) return;
   if (!is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) return;
   if (cpi->oxcf.q_cfg.aq_mode != NO_AQ) return;
 
-  const int is_overlay = cpi->gf_group.update_type[frame_idx] == OVERLAY_UPDATE;
+  const int is_overlay =
+      cpi->ppi->gf_group.update_type[frame_idx] == OVERLAY_UPDATE;
   if (is_overlay) {
     memset(x->tpl_keep_ref_frame, 1, sizeof(x->tpl_keep_ref_frame));
     return;
   }
 
+  TplDepFrame *tpl_frame = &tpl_data->tpl_frame[frame_idx];
   TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
   const int tpl_stride = tpl_frame->stride;
   int64_t inter_cost[INTER_REFS_PER_FRAME] = { 0 };
   const int step = 1 << block_mis_log2;
-  const BLOCK_SIZE sb_size = cm->seq_params.sb_size;
+  const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
 
   const int mi_row_end =
       AOMMIN(mi_size_high[sb_size] + mi_row, mi_params->mi_rows);
@@ -426,15 +385,15 @@
 
 static AOM_INLINE void adjust_rdmult_tpl_model(AV1_COMP *cpi, MACROBLOCK *x,
                                                int mi_row, int mi_col) {
-  const BLOCK_SIZE sb_size = cpi->common.seq_params.sb_size;
+  const BLOCK_SIZE sb_size = cpi->common.seq_params->sb_size;
   const int orig_rdmult = cpi->rd.RDMULT;
 
-  assert(IMPLIES(cpi->gf_group.size > 0,
-                 cpi->gf_frame_index < cpi->gf_group.size));
+  assert(IMPLIES(cpi->ppi->gf_group.size > 0,
+                 cpi->gf_frame_index < cpi->ppi->gf_group.size));
   const int gf_group_index = cpi->gf_frame_index;
   if (cpi->oxcf.algo_cfg.enable_tpl_model && cpi->oxcf.q_cfg.aq_mode == NO_AQ &&
       cpi->oxcf.q_cfg.deltaq_mode == NO_DELTA_Q && gf_group_index > 0 &&
-      cpi->gf_group.update_type[gf_group_index] == ARF_UPDATE) {
+      cpi->ppi->gf_group.update_type[gf_group_index] == ARF_UPDATE) {
     const int dr =
         av1_get_rdmult_delta(cpi, sb_size, mi_row, mi_col, orig_rdmult);
     x->rdmult = dr;
@@ -451,7 +410,7 @@
   MACROBLOCKD *xd = &x->e_mbd;
 
   // TODO(kyslov) Extend to 128x128
-  assert(cm->seq_params.sb_size == BLOCK_64X64);
+  assert(cm->seq_params->sb_size == BLOCK_64X64);
 
   av1_set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64);
 
@@ -512,7 +471,7 @@
   const TileInfo *const tile_info = &tile_data->tile_info;
   MB_MODE_INFO **mi = cm->mi_params.mi_grid_base +
                       get_mi_grid_idx(&cm->mi_params, mi_row, mi_col);
-  const BLOCK_SIZE sb_size = cm->seq_params.sb_size;
+  const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
 
   // Grade the temporal variation of the sb, the grade will be used to decide
   // fast mode search strategy for coding blocks
@@ -540,27 +499,48 @@
     const BLOCK_SIZE bsize =
         seg_skip ? sb_size : sf->part_sf.fixed_partition_size;
     av1_set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
-  } else if (cpi->partition_search_skippable_frame) {
-    // set a fixed-size partition for which the size is determined by the source
-    // variance
-    av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
-    const BLOCK_SIZE bsize =
-        get_rd_var_based_fixed_partition(cpi, x, mi_row, mi_col);
-    av1_set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
   } else if (sf->part_sf.partition_search_type == VAR_BASED_PARTITION) {
     // set a variance-based partition
     av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
     av1_choose_var_based_partitioning(cpi, tile_info, td, x, mi_row, mi_col);
   }
   assert(sf->part_sf.partition_search_type == FIXED_PARTITION || seg_skip ||
-         cpi->partition_search_skippable_frame ||
          sf->part_sf.partition_search_type == VAR_BASED_PARTITION);
   set_cb_offsets(td->mb.cb_offset, 0, 0);
 
   // Adjust and encode the superblock
   PC_TREE *const pc_root = av1_alloc_pc_tree_node(sb_size);
+
+  // Initialize the flag to skip cdef to 1.
+  if (sf->rt_sf.skip_cdef_sb) {
+    // If 128x128 block is used, we need to set the flag for all 4 64x64 sub
+    // "blocks".
+    const int block64_in_sb = (sb_size == BLOCK_128X128) ? 2 : 1;
+    for (int r = 0; r < block64_in_sb; ++r) {
+      for (int c = 0; c < block64_in_sb; ++c) {
+        const int idx_in_sb =
+            r * MI_SIZE_64X64 * cm->mi_params.mi_stride + c * MI_SIZE_64X64;
+        if (mi[idx_in_sb]) mi[idx_in_sb]->skip_cdef_curr_sb = 1;
+      }
+    }
+  }
+
   av1_nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size,
                           pc_root);
+
+  if (sf->rt_sf.skip_cdef_sb) {
+    // If 128x128 block is used, we need to set the flag for all 4 64x64 sub
+    // "blocks".
+    const int block64_in_sb = (sb_size == BLOCK_128X128) ? 2 : 1;
+    const int skip = mi[0]->skip_cdef_curr_sb;
+    for (int r = 0; r < block64_in_sb; ++r) {
+      for (int c = 0; c < block64_in_sb; ++c) {
+        const int idx_in_sb =
+            r * MI_SIZE_64X64 * cm->mi_params.mi_stride + c * MI_SIZE_64X64;
+        if (mi[idx_in_sb]) mi[idx_in_sb]->skip_cdef_curr_sb = skip;
+      }
+    }
+  }
   av1_free_pc_tree_recursive(pc_root, av1_num_planes(cm), 0, 0);
 }
 
@@ -582,24 +562,20 @@
        sf->part_sf.ml_early_term_after_part_split_level) &&
       !frame_is_intra_only(cm);
   if (use_simple_motion_search) {
-    init_simple_motion_search_mvs(sms_root);
+    av1_init_simple_motion_search_mvs_for_sb(cpi, tile_info, x, sms_root,
+                                             mi_row, mi_col);
   }
 
 #if !CONFIG_REALTIME_ONLY
-  if (has_no_stats_stage(cpi) && cpi->oxcf.mode == REALTIME &&
-      cpi->oxcf.gf_cfg.lag_in_frames == 0) {
-    (void)tile_info;
-    (void)mi_row;
-    (void)mi_col;
-    (void)gather_tpl_data;
-  } else {
+  if (!(has_no_stats_stage(cpi) && cpi->oxcf.mode == REALTIME &&
+        cpi->oxcf.gf_cfg.lag_in_frames == 0)) {
     init_ref_frame_space(cpi, td, mi_row, mi_col);
     x->sb_energy_level = 0;
     x->part_search_info.cnn_output_valid = 0;
     if (gather_tpl_data) {
       if (cm->delta_q_info.delta_q_present_flag) {
         const int num_planes = av1_num_planes(cm);
-        const BLOCK_SIZE sb_size = cm->seq_params.sb_size;
+        const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
         setup_delta_q(cpi, td, x, tile_info, mi_row, mi_col, num_planes);
         av1_tpl_rdmult_setup_sb(cpi, x, sb_size, mi_row, mi_col);
       }
@@ -615,8 +591,7 @@
   (void)gather_tpl_data;
 #endif
 
-  // Reset hash state for transform/mode rd hash information
-  reset_hash_records(&x->txfm_search_info, cpi->sf.tx_sf.use_inter_txb_hash);
+  reset_hash_records(&x->txfm_search_info);
   av1_zero(x->picked_ref_frames_mask);
   av1_invalid_rd_stats(rd_cost);
 }
@@ -637,7 +612,7 @@
   const TileInfo *const tile_info = &tile_data->tile_info;
   MB_MODE_INFO **mi = cm->mi_params.mi_grid_base +
                       get_mi_grid_idx(&cm->mi_params, mi_row, mi_col);
-  const BLOCK_SIZE sb_size = cm->seq_params.sb_size;
+  const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
   const int num_planes = av1_num_planes(cm);
   int dummy_rate;
   int64_t dummy_dist;
@@ -653,6 +628,9 @@
 
   // Encode the superblock
   if (sf->part_sf.partition_search_type == VAR_BASED_PARTITION) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    start_timing(cpi, rd_use_partition_time);
+#endif
     // partition search starting from a variance-based partition
     av1_set_offsets_without_segment_id(cpi, tile_info, x, mi_row, mi_col,
                                        sb_size);
@@ -661,6 +639,9 @@
     av1_rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size,
                          &dummy_rate, &dummy_dist, 1, pc_root);
     av1_free_pc_tree_recursive(pc_root, num_planes, 0, 0);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    end_timing(cpi, rd_use_partition_time);
+#endif
   }
 #if !CONFIG_REALTIME_ONLY
   else if (sf->part_sf.partition_search_type == FIXED_PARTITION || seg_skip) {
@@ -673,17 +654,6 @@
     av1_rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size,
                          &dummy_rate, &dummy_dist, 1, pc_root);
     av1_free_pc_tree_recursive(pc_root, num_planes, 0, 0);
-  } else if (cpi->partition_search_skippable_frame) {
-    // partition search by adjusting a fixed-size partition for which the size
-    // is determined by the source variance
-    av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
-    const BLOCK_SIZE bsize =
-        get_rd_var_based_fixed_partition(cpi, x, mi_row, mi_col);
-    av1_set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
-    PC_TREE *const pc_root = av1_alloc_pc_tree_node(sb_size);
-    av1_rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size,
-                         &dummy_rate, &dummy_dist, 1, pc_root);
-    av1_free_pc_tree_recursive(pc_root, num_planes, 0, 0);
   } else {
     // The most exhaustive recursive partition search
     SuperBlockEnc *sb_enc = &x->sb_enc;
@@ -709,10 +679,17 @@
 
     if (num_passes == 1) {
 #if CONFIG_PARTITION_SEARCH_ORDER
-      av1_reset_part_sf(&cpi->sf.part_sf);
-      RD_STATS this_rdc;
-      av1_rd_partition_search(cpi, td, tile_data, tp, sms_root, mi_row, mi_col,
-                              sb_size, &this_rdc);
+      if (cpi->ext_part_controller.ready && !frame_is_intra_only(cm)) {
+        av1_reset_part_sf(&cpi->sf.part_sf);
+        RD_STATS this_rdc;
+        av1_rd_partition_search(cpi, td, tile_data, tp, sms_root, mi_row,
+                                mi_col, sb_size, &this_rdc);
+      } else {
+        PC_TREE *const pc_root = av1_alloc_pc_tree_node(sb_size);
+        av1_rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size,
+                              &dummy_rdc, dummy_rdc, pc_root, sms_root, NULL,
+                              SB_SINGLE_PASS, NULL);
+      }
 #else
       PC_TREE *const pc_root = av1_alloc_pc_tree_node(sb_size);
       av1_rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size,
@@ -758,8 +735,8 @@
 }
 
 static AOM_INLINE int is_rtc_mode(const CostUpdateFreq *cost_upd_freq,
-                                  int use_non_rd_mode) {
-  return (use_non_rd_mode && cost_upd_freq->coeff >= 2 &&
+                                  MODE mode) {
+  return ((mode == REALTIME) && cost_upd_freq->coeff >= 2 &&
           cost_upd_freq->mode >= 2 && cost_upd_freq->mv >= 2 &&
           cost_upd_freq->dv >= 2);
 }
@@ -784,14 +761,13 @@
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   const int sb_cols_in_tile = av1_get_sb_cols_in_tile(cm, tile_data->tile_info);
-  const BLOCK_SIZE sb_size = cm->seq_params.sb_size;
-  const int mib_size = cm->seq_params.mib_size;
-  const int mib_size_log2 = cm->seq_params.mib_size_log2;
+  const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+  const int mib_size = cm->seq_params->mib_size;
+  const int mib_size_log2 = cm->seq_params->mib_size_log2;
   const int sb_row = (mi_row - tile_info->mi_row_start) >> mib_size_log2;
   const int use_nonrd_mode = cpi->sf.rt_sf.use_nonrd_pick_mode;
   const CostUpdateFreq *const cost_upd_freq = &cpi->oxcf.cost_upd_freq;
-  const int rtc_mode = is_rtc_mode(cost_upd_freq, use_nonrd_mode);
-  const int update_cdf = tile_data->allow_update_cdf && row_mt_enabled;
+  const int rtc_mode = is_rtc_mode(cost_upd_freq, cpi->oxcf.mode);
 
 #if CONFIG_COLLECT_COMPONENT_TIMING
   start_timing(cpi, encode_sb_row_time);
@@ -814,12 +790,12 @@
   // Code each SB in the row
   for (int mi_col = tile_info->mi_col_start, sb_col_in_tile = 0;
        mi_col < tile_info->mi_col_end; mi_col += mib_size, sb_col_in_tile++) {
-    // In non-rd mode and when frequency of cost updates is off/tile, wait for
+    // In realtime mode and when frequency of cost updates is off/tile, wait for
     // the top superblock to finish encoding. Otherwise, wait for the top-right
     // superblock to finish encoding.
     (*(enc_row_mt->sync_read_ptr))(row_mt_sync, sb_row,
                                    sb_col_in_tile - rtc_mode);
-
+    const int update_cdf = tile_data->allow_update_cdf && row_mt_enabled;
     if (update_cdf && (tile_info->mi_row_start != mi_row)) {
       if ((tile_info->mi_col_start == mi_col)) {
         // restore frame context at the 1st column sb
@@ -841,6 +817,8 @@
     av1_set_cost_upd_freq(cpi, td, tile_info, mi_row, mi_col);
 
     // Reset color coding related parameters
+    x->color_sensitivity_sb[0] = 0;
+    x->color_sensitivity_sb[1] = 0;
     x->color_sensitivity[0] = 0;
     x->color_sensitivity[1] = 0;
     x->content_state_sb.source_sad = kMedSad;
@@ -863,6 +841,12 @@
       seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP);
     }
 
+    // Produce the gradient data at superblock level, when intra mode pruning
+    // based on hog is enabled.
+    if (cpi->sf.intra_sf.intra_pruning_with_hog ||
+        cpi->sf.intra_sf.chroma_intra_pruning_with_hog)
+      produce_gradients_for_sb(cpi, x, sb_size, mi_row, mi_col);
+
     // encode the superblock
     if (use_nonrd_mode) {
       encode_nonrd_sb(cpi, td, tile_data, tp, mi_row, mi_col, seg_skip);
@@ -894,10 +878,10 @@
 
   // Copy data over into macro block data structures.
   av1_setup_src_planes(x, cpi->source, 0, 0, num_planes,
-                       cm->seq_params.sb_size);
+                       cm->seq_params->sb_size);
 
-  av1_setup_block_planes(xd, cm->seq_params.subsampling_x,
-                         cm->seq_params.subsampling_y, num_planes);
+  av1_setup_block_planes(xd, cm->seq_params->subsampling_x,
+                         cm->seq_params->subsampling_y, num_planes);
 }
 
 void av1_alloc_tile_data(AV1_COMP *cpi) {
@@ -924,9 +908,8 @@
   TokenList *tplist = token_info->tplist[0][0];
   unsigned int tile_tok = 0;
   int tplist_count = 0;
-  const int use_nonrd_mode = cpi->sf.rt_sf.use_nonrd_pick_mode;
   const CostUpdateFreq *const cost_upd_freq = &cpi->oxcf.cost_upd_freq;
-  const int rtc_mode = is_rtc_mode(cost_upd_freq, use_nonrd_mode);
+  const int rtc_mode = is_rtc_mode(cost_upd_freq, cpi->oxcf.mode);
 
   for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
     for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
@@ -935,13 +918,14 @@
       TileInfo *const tile_info = &tile_data->tile_info;
       av1_tile_init(tile_info, cm, tile_row, tile_col);
       tile_data->firstpass_top_mv = kZeroMv;
+      tile_data->abs_sum_level = 0;
 
       if (pre_tok != NULL && tplist != NULL) {
         token_info->tile_tok[tile_row][tile_col] = pre_tok + tile_tok;
         pre_tok = token_info->tile_tok[tile_row][tile_col];
-        tile_tok = allocated_tokens(*tile_info,
-                                    cm->seq_params.mib_size_log2 + MI_SIZE_LOG2,
-                                    num_planes);
+        tile_tok = allocated_tokens(
+            *tile_info, cm->seq_params->mib_size_log2 + MI_SIZE_LOG2,
+            num_planes);
         token_info->tplist[tile_row][tile_col] = tplist + tplist_count;
         tplist = token_info->tplist[tile_row][tile_col];
         tplist_count = av1_get_sb_rows_in_tile(cm, tile_data->tile_info);
@@ -969,14 +953,14 @@
   TokenExtra *tok = NULL;
   TokenList *const tplist = cpi->token_info.tplist[tile_row][tile_col];
   const int sb_row_in_tile =
-      (mi_row - tile_info->mi_row_start) >> cm->seq_params.mib_size_log2;
+      (mi_row - tile_info->mi_row_start) >> cm->seq_params->mib_size_log2;
   const int tile_mb_cols =
       (tile_info->mi_col_end - tile_info->mi_col_start + 2) >> 2;
   const int num_mb_rows_in_sb =
-      ((1 << (cm->seq_params.mib_size_log2 + MI_SIZE_LOG2)) + 8) >> 4;
+      ((1 << (cm->seq_params->mib_size_log2 + MI_SIZE_LOG2)) + 8) >> 4;
 
   get_start_tok(cpi, tile_row, tile_col, mi_row, &tok,
-                cm->seq_params.mib_size_log2 + MI_SIZE_LOG2, num_planes);
+                cm->seq_params->mib_size_log2 + MI_SIZE_LOG2, num_planes);
   assert(tplist != NULL);
   tplist[sb_row_in_tile].start = tok;
 
@@ -987,7 +971,7 @@
 
   assert((unsigned int)(tok - tplist[sb_row_in_tile].start) <=
          get_token_alloc(num_mb_rows_in_sb, tile_mb_cols,
-                         cm->seq_params.mib_size_log2 + MI_SIZE_LOG2,
+                         cm->seq_params->mib_size_log2 + MI_SIZE_LOG2,
                          num_planes));
 
   (void)tile_mb_cols;
@@ -1013,7 +997,7 @@
                          &td->mb.e_mbd);
 
   if (cpi->oxcf.intra_mode_cfg.enable_cfl_intra)
-    cfl_init(&td->mb.e_mbd.cfl, &cm->seq_params);
+    cfl_init(&td->mb.e_mbd.cfl, cm->seq_params);
 
   if (td->mb.txfm_search_info.txb_rd_records != NULL) {
     av1_crc32c_calculator_init(
@@ -1021,9 +1005,10 @@
   }
 
   for (int mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end;
-       mi_row += cm->seq_params.mib_size) {
+       mi_row += cm->seq_params->mib_size) {
     av1_encode_sb_row(cpi, td, tile_row, tile_col, mi_row);
   }
+  this_tile->abs_sum_level = td->abs_sum_level;
 }
 
 /*!\brief Break one frame into tiles and encode the tiles
@@ -1052,6 +1037,7 @@
           &cpi->tile_data[tile_row * cm->tiles.cols + tile_col];
       cpi->td.intrabc_used = 0;
       cpi->td.deltaq_used = 0;
+      cpi->td.abs_sum_level = 0;
       cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx;
       cpi->td.mb.tile_pb_ctx = &this_tile->tctx;
       // Reset cyclic refresh counters.
@@ -1144,10 +1130,10 @@
   const int cur_offset = (int)cm->current_frame.order_hint;
   int ref_offset[2];
   get_skip_mode_ref_offsets(cm, ref_offset);
-  const int cur_to_ref0 = get_relative_dist(&cm->seq_params.order_hint_info,
+  const int cur_to_ref0 = get_relative_dist(&cm->seq_params->order_hint_info,
                                             cur_offset, ref_offset[0]);
-  const int cur_to_ref1 = abs(get_relative_dist(&cm->seq_params.order_hint_info,
-                                                cur_offset, ref_offset[1]));
+  const int cur_to_ref1 = abs(get_relative_dist(
+      &cm->seq_params->order_hint_info, cur_offset, ref_offset[1]));
   if (abs(cur_to_ref0 - cur_to_ref1) > 1) return 0;
 
   // High Latency: Turn off skip mode if all refs are fwd.
@@ -1250,7 +1236,7 @@
   FeatureFlags *const features = &cm->features;
   MACROBLOCKD *const xd = &x->e_mbd;
   RD_COUNTS *const rdc = &cpi->td.rd_counts;
-  FrameProbInfo *const frame_probs = &cpi->frame_probs;
+  FrameProbInfo *const frame_probs = &cpi->ppi->frame_probs;
   IntraBCHashInfo *const intrabc_hash_info = &x->intrabc_hash_info;
   MultiThreadInfo *const mt_info = &cpi->mt_info;
   AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt;
@@ -1282,7 +1268,7 @@
   if (features->allow_warped_motion &&
       cpi->sf.inter_sf.prune_warped_prob_thresh > 0) {
     const FRAME_UPDATE_TYPE update_type =
-        get_frame_update_type(&cpi->gf_group, cpi->gf_frame_index);
+        get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
     if (frame_probs->warped_probs[update_type] <
         cpi->sf.inter_sf.prune_warped_prob_thresh)
       features->allow_warped_motion = 0;
@@ -1320,7 +1306,7 @@
     // Hash data generated for screen contents is used for intraBC ME
     const int min_alloc_size = block_size_wide[mi_params->mi_alloc_bsize];
     const int max_sb_size =
-        (1 << (cm->seq_params.mib_size_log2 + MI_SIZE_LOG2));
+        (1 << (cm->seq_params->mib_size_log2 + MI_SIZE_LOG2));
     int src_idx = 0;
     for (int size = 4; size <= max_sb_size; size *= 2, src_idx = !src_idx) {
       const int dst_idx = !src_idx;
@@ -1373,6 +1359,12 @@
       cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_OBJECTIVE;
     else if (deltaq_mode == DELTA_Q_PERCEPTUAL)
       cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_PERCEPTUAL;
+    else if (deltaq_mode == DELTA_Q_PERCEPTUAL_AI)
+      cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_PERCEPTUAL;
+    else if (deltaq_mode == DELTA_Q_USER_RATING_BASED)
+      cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_PERCEPTUAL;
+    else if (deltaq_mode == DELTA_Q_HDR)
+      cm->delta_q_info.delta_q_res = DEFAULT_DELTA_Q_RES_PERCEPTUAL;
     // Set delta_q_present_flag before it is used for the first time
     cm->delta_q_info.delta_lf_res = DEFAULT_DELTA_LF_RES;
     cm->delta_q_info.delta_q_present_flag = deltaq_mode != NO_DELTA_Q;
@@ -1381,7 +1373,7 @@
     // is used for ineligible frames. That effectively will turn off row_mt
     // usage. Note objective delta_q and tpl eligible frames are only altref
     // frames currently.
-    const GF_GROUP *gf_group = &cpi->gf_group;
+    const GF_GROUP *gf_group = &cpi->ppi->gf_group;
     if (cm->delta_q_info.delta_q_present_flag) {
       if (deltaq_mode == DELTA_Q_OBJECTIVE &&
           !is_frame_tpl_eligible(gf_group, cpi->gf_frame_index))
@@ -1449,6 +1441,7 @@
 #if CONFIG_COLLECT_COMPONENT_TIMING
   start_timing(cpi, av1_setup_motion_field_time);
 #endif
+  av1_calculate_ref_frame_side(cm);
   if (features->allow_ref_frame_mvs) av1_setup_motion_field(cm);
 #if CONFIG_COLLECT_COMPONENT_TIMING
   end_timing(cpi, av1_setup_motion_field_time);
@@ -1503,27 +1496,61 @@
   assert(oxcf->txfm_cfg.enable_tx64 || tx_search_type != USE_LARGESTALL);
   features->tx_mode = select_tx_mode(cm, tx_search_type);
 
-  if (cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats) {
-    const FRAME_UPDATE_TYPE update_type =
-        get_frame_update_type(&cpi->gf_group, cpi->gf_frame_index);
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  // Retain the frame level probability update conditions for parallel frames.
+  // These conditions will be consumed during postencode stage to update the
+  // probability.
+  if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) {
+    cpi->do_update_frame_probs_txtype[cpi->num_frame_recode] =
+        cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats;
+    cpi->do_update_frame_probs_obmc[cpi->num_frame_recode] =
+        (cpi->sf.inter_sf.prune_obmc_prob_thresh > 0 &&
+         cpi->sf.inter_sf.prune_obmc_prob_thresh < INT_MAX);
+    cpi->do_update_frame_probs_warp[cpi->num_frame_recode] =
+        (features->allow_warped_motion &&
+         cpi->sf.inter_sf.prune_warped_prob_thresh > 0);
+    cpi->do_update_frame_probs_interpfilter[cpi->num_frame_recode] =
+        (cm->current_frame.frame_type != KEY_FRAME &&
+         cpi->sf.interp_sf.adaptive_interp_filter_search == 2 &&
+         features->interp_filter == SWITCHABLE);
+  }
+#endif
 
+  if (cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats ||
+      ((cpi->sf.tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh !=
+        INT_MAX) &&
+       (cpi->sf.tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh != 0))) {
+    const FRAME_UPDATE_TYPE update_type =
+        get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
     for (i = 0; i < TX_SIZES_ALL; i++) {
       int sum = 0;
       int j;
-      int left = 1024;
+      int left = MAX_TX_TYPE_PROB;
 
       for (j = 0; j < TX_TYPES; j++)
         sum += cpi->td.rd_counts.tx_type_used[i][j];
 
       for (j = TX_TYPES - 1; j >= 0; j--) {
+        int update_txtype_frameprobs = 1;
         const int new_prob =
-            sum ? 1024 * cpi->td.rd_counts.tx_type_used[i][j] / sum
-                : (j ? 0 : 1024);
-        int prob =
-            (frame_probs->tx_type_probs[update_type][i][j] + new_prob) >> 1;
-        left -= prob;
-        if (j == 0) prob += left;
-        frame_probs->tx_type_probs[update_type][i][j] = prob;
+            sum ? MAX_TX_TYPE_PROB * cpi->td.rd_counts.tx_type_used[i][j] / sum
+                : (j ? 0 : MAX_TX_TYPE_PROB);
+#if CONFIG_FRAME_PARALLEL_ENCODE
+        // Track the frame probabilities of parallel encode frames to update
+        // during postencode stage.
+        if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) {
+          update_txtype_frameprobs = 0;
+          cpi->frame_new_probs[cpi->num_frame_recode]
+              .tx_type_probs[update_type][i][j] = new_prob;
+        }
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+        if (update_txtype_frameprobs) {
+          int prob =
+              (frame_probs->tx_type_probs[update_type][i][j] + new_prob) >> 1;
+          left -= prob;
+          if (j == 0) prob += left;
+          frame_probs->tx_type_probs[update_type][i][j] = prob;
+        }
       }
     }
   }
@@ -1531,35 +1558,59 @@
   if (cpi->sf.inter_sf.prune_obmc_prob_thresh > 0 &&
       cpi->sf.inter_sf.prune_obmc_prob_thresh < INT_MAX) {
     const FRAME_UPDATE_TYPE update_type =
-        get_frame_update_type(&cpi->gf_group, cpi->gf_frame_index);
+        get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
 
     for (i = 0; i < BLOCK_SIZES_ALL; i++) {
       int sum = 0;
+      int update_obmc_frameprobs = 1;
       for (int j = 0; j < 2; j++) sum += cpi->td.rd_counts.obmc_used[i][j];
 
       const int new_prob =
           sum ? 128 * cpi->td.rd_counts.obmc_used[i][1] / sum : 0;
-      frame_probs->obmc_probs[update_type][i] =
-          (frame_probs->obmc_probs[update_type][i] + new_prob) >> 1;
+#if CONFIG_FRAME_PARALLEL_ENCODE
+      // Track the frame probabilities of parallel encode frames to update
+      // during postencode stage.
+      if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) {
+        update_obmc_frameprobs = 0;
+        cpi->frame_new_probs[cpi->num_frame_recode].obmc_probs[update_type][i] =
+            new_prob;
+      }
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+      if (update_obmc_frameprobs) {
+        frame_probs->obmc_probs[update_type][i] =
+            (frame_probs->obmc_probs[update_type][i] + new_prob) >> 1;
+      }
     }
   }
 
   if (features->allow_warped_motion &&
       cpi->sf.inter_sf.prune_warped_prob_thresh > 0) {
     const FRAME_UPDATE_TYPE update_type =
-        get_frame_update_type(&cpi->gf_group, cpi->gf_frame_index);
+        get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+    int update_warp_frameprobs = 1;
     int sum = 0;
     for (i = 0; i < 2; i++) sum += cpi->td.rd_counts.warped_used[i];
     const int new_prob = sum ? 128 * cpi->td.rd_counts.warped_used[1] / sum : 0;
-    frame_probs->warped_probs[update_type] =
-        (frame_probs->warped_probs[update_type] + new_prob) >> 1;
+#if CONFIG_FRAME_PARALLEL_ENCODE
+    // Track the frame probabilities of parallel encode frames to update
+    // during postencode stage.
+    if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) {
+      update_warp_frameprobs = 0;
+      cpi->frame_new_probs[cpi->num_frame_recode].warped_probs[update_type] =
+          new_prob;
+    }
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+    if (update_warp_frameprobs) {
+      frame_probs->warped_probs[update_type] =
+          (frame_probs->warped_probs[update_type] + new_prob) >> 1;
+    }
   }
 
   if (cm->current_frame.frame_type != KEY_FRAME &&
       cpi->sf.interp_sf.adaptive_interp_filter_search == 2 &&
       features->interp_filter == SWITCHABLE) {
     const FRAME_UPDATE_TYPE update_type =
-        get_frame_update_type(&cpi->gf_group, cpi->gf_frame_index);
+        get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
 
     for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
       int sum = 0;
@@ -1571,15 +1622,27 @@
       }
 
       for (j = SWITCHABLE_FILTERS - 1; j >= 0; j--) {
+        int update_interpfilter_frameprobs = 1;
         const int new_prob =
             sum ? 1536 * cpi->td.counts->switchable_interp[i][j] / sum
                 : (j ? 0 : 1536);
-        int prob = (frame_probs->switchable_interp_probs[update_type][i][j] +
-                    new_prob) >>
-                   1;
-        left -= prob;
-        if (j == 0) prob += left;
-        frame_probs->switchable_interp_probs[update_type][i][j] = prob;
+#if CONFIG_FRAME_PARALLEL_ENCODE
+        // Track the frame probabilities of parallel encode frames to update
+        // during postencode stage.
+        if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) {
+          update_interpfilter_frameprobs = 0;
+          cpi->frame_new_probs[cpi->num_frame_recode]
+              .switchable_interp_probs[update_type][i][j] = new_prob;
+        }
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+        if (update_interpfilter_frameprobs) {
+          int prob = (frame_probs->switchable_interp_probs[update_type][i][j] +
+                      new_prob) >>
+                     1;
+          left -= prob;
+          if (j == 0) prob += left;
+          frame_probs->switchable_interp_probs[update_type][i][j] = prob;
+        }
       }
     }
   }
@@ -1633,7 +1696,8 @@
   (void)num_planes;
 #endif
 
-  if (cpi->sf.hl_sf.frame_parameter_update) {
+  if (cpi->sf.hl_sf.frame_parameter_update ||
+      cpi->sf.rt_sf.use_comp_ref_nonrd) {
     RD_COUNTS *const rdc = &cpi->td.rd_counts;
 
     if (frame_is_intra_only(cm))
@@ -1677,6 +1741,10 @@
         features->tx_mode = TX_MODE_LARGEST;
     }
   } else {
+    // This is needed if real-time speed setting is changed on the fly
+    // from one using compound prediction to one using single reference.
+    if (current_frame->reference_mode == REFERENCE_MODE_SELECT)
+      current_frame->reference_mode = SINGLE_REFERENCE;
     encode_frame_internal(cpi);
   }
 }
diff --git a/av1/encoder/encodeframe_utils.c b/av1/encoder/encodeframe_utils.c
index c9b1afb..9fbd68c 100644
--- a/av1/encoder/encodeframe_utils.c
+++ b/av1/encoder/encodeframe_utils.c
@@ -9,22 +9,13 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "aom_ports/system_state.h"
-
 #include "av1/common/reconintra.h"
 
 #include "av1/encoder/encoder.h"
 #include "av1/encoder/encodeframe_utils.h"
 #include "av1/encoder/partition_strategy.h"
 #include "av1/encoder/rdopt.h"
-
-static AOM_INLINE int set_deltaq_rdmult(const AV1_COMP *const cpi,
-                                        const MACROBLOCK *const x) {
-  const AV1_COMMON *const cm = &cpi->common;
-  const CommonQuantParams *quant_params = &cm->quant_params;
-  return av1_compute_rd_mult(cpi, quant_params->base_qindex + x->delta_qindex +
-                                      quant_params->y_dc_delta_q);
-}
+#include "av1/encoder/aq_variance.h"
 
 void av1_set_ssim_rdmult(const AV1_COMP *const cpi, int *errorperbit,
                          const BLOCK_SIZE bsize, const int mi_row,
@@ -44,7 +35,6 @@
 
   assert(cpi->oxcf.tune_cfg.tuning == AOM_TUNE_SSIM);
 
-  aom_clear_system_state();
   for (row = mi_row / num_mi_w;
        row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) {
     for (col = mi_col / num_mi_h;
@@ -59,20 +49,29 @@
   *rdmult = (int)((double)(*rdmult) * geom_mean_of_scale + 0.5);
   *rdmult = AOMMAX(*rdmult, 0);
   av1_set_error_per_bit(errorperbit, *rdmult);
-  aom_clear_system_state();
+}
+
+// TODO(angiebird): Move these function to tpl_model.c
+#if !CONFIG_REALTIME_ONLY
+static AOM_INLINE int set_deltaq_rdmult(const AV1_COMP *const cpi,
+                                        const MACROBLOCK *const x) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const CommonQuantParams *quant_params = &cm->quant_params;
+  return av1_compute_rd_mult(cpi, quant_params->base_qindex + x->delta_qindex +
+                                      quant_params->y_dc_delta_q);
 }
 
 // Return the end column for the current superblock, in unit of TPL blocks.
 static int get_superblock_tpl_column_end(const AV1_COMMON *const cm, int mi_col,
                                          int num_mi_w) {
   // Find the start column of this superblock.
-  const int sb_mi_col_start = (mi_col >> cm->seq_params.mib_size_log2)
-                              << cm->seq_params.mib_size_log2;
+  const int sb_mi_col_start = (mi_col >> cm->seq_params->mib_size_log2)
+                              << cm->seq_params->mib_size_log2;
   // Same but in superres upscaled dimension.
   const int sb_mi_col_start_sr =
       coded_to_superres_mi(sb_mi_col_start, cm->superres_scale_denominator);
   // Width of this superblock in mi units.
-  const int sb_mi_width = mi_size_wide[cm->seq_params.sb_size];
+  const int sb_mi_width = mi_size_wide[cm->seq_params->sb_size];
   // Same but in superres upscaled dimension.
   const int sb_mi_width_sr =
       coded_to_superres_mi(sb_mi_width, cm->superres_scale_denominator);
@@ -86,14 +85,12 @@
                             const BLOCK_SIZE bsize, const int mi_row,
                             const int mi_col, int orig_rdmult) {
   const AV1_COMMON *const cm = &cpi->common;
-  const GF_GROUP *const gf_group = &cpi->gf_group;
-  assert(IMPLIES(cpi->gf_group.size > 0,
-                 cpi->gf_frame_index < cpi->gf_group.size));
+  const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+  assert(IMPLIES(cpi->ppi->gf_group.size > 0,
+                 cpi->gf_frame_index < cpi->ppi->gf_group.size));
   const int tpl_idx = cpi->gf_frame_index;
   const int deltaq_rdmult = set_deltaq_rdmult(cpi, x);
-  if (tpl_idx >= MAX_TPL_FRAME_IDX) return deltaq_rdmult;
-  const TplDepFrame *tpl_frame = &cpi->tpl_data.tpl_frame[tpl_idx];
-  if (tpl_frame->is_valid == 0) return deltaq_rdmult;
+  if (!av1_tpl_stats_ready(&cpi->ppi->tpl_data, tpl_idx)) return deltaq_rdmult;
   if (!is_frame_tpl_eligible(gf_group, cpi->gf_frame_index))
     return deltaq_rdmult;
   if (cpi->oxcf.q_cfg.aq_mode != NO_AQ) return deltaq_rdmult;
@@ -117,7 +114,6 @@
   int row, col;
   double base_block_count = 0.0;
   double geom_mean_of_scale = 0.0;
-  aom_clear_system_state();
   for (row = mi_row / num_mi_w;
        row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) {
     for (col = mi_col_sr / num_mi_h;
@@ -125,7 +121,7 @@
          col < sb_bcol_end;
          ++col) {
       const int index = row * num_cols + col;
-      geom_mean_of_scale += log(cpi->tpl_sb_rdmult_scaling_factors[index]);
+      geom_mean_of_scale += log(cpi->ppi->tpl_sb_rdmult_scaling_factors[index]);
       base_block_count += 1.0;
     }
   }
@@ -133,14 +129,16 @@
   int rdmult = (int)((double)orig_rdmult * geom_mean_of_scale + 0.5);
   rdmult = AOMMAX(rdmult, 0);
   av1_set_error_per_bit(&x->errorperbit, rdmult);
-  aom_clear_system_state();
-  if (bsize == cm->seq_params.sb_size) {
+#if !CONFIG_RD_COMMAND
+  if (bsize == cm->seq_params->sb_size) {
     const int rdmult_sb = set_deltaq_rdmult(cpi, x);
     assert(rdmult_sb == rdmult);
     (void)rdmult_sb;
   }
+#endif  // !CONFIG_RD_COMMAND
   return rdmult;
 }
+#endif  // !CONFIG_REALTIME_ONLY
 
 static AOM_INLINE void update_filter_type_count(FRAME_COUNTS *counts,
                                                 const MACROBLOCKD *xd,
@@ -162,9 +160,16 @@
   } else if (tx_mode != TX_MODE_SELECT) {
     mbmi->tx_size = tx_size_from_tx_mode(mbmi->bsize, tx_mode);
   } else {
-    BLOCK_SIZE bsize = mbmi->bsize;
-    TX_SIZE min_tx_size = depth_to_tx_size(MAX_TX_DEPTH, bsize);
-    mbmi->tx_size = (TX_SIZE)TXSIZEMAX(mbmi->tx_size, min_tx_size);
+    const BLOCK_SIZE bsize = mbmi->bsize;
+    const TX_SIZE min_tx_size = depth_to_tx_size(MAX_TX_DEPTH, bsize);
+    if (tx_size_wide[min_tx_size] > tx_size_wide[mbmi->tx_size] ||
+        tx_size_high[min_tx_size] > tx_size_high[mbmi->tx_size])
+      mbmi->tx_size = min_tx_size;
+
+    const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, bsize, 0);
+    if (tx_size_wide[max_tx_size] < tx_size_wide[mbmi->tx_size] ||
+        tx_size_high[max_tx_size] < tx_size_high[mbmi->tx_size])
+      mbmi->tx_size = max_tx_size;
   }
   if (is_inter_block(mbmi)) {
     memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size));
@@ -342,7 +347,7 @@
 
   const int x_mis = AOMMIN(bw, mi_params->mi_cols - mi_col);
   const int y_mis = AOMMIN(bh, mi_params->mi_rows - mi_row);
-  if (cm->seq_params.order_hint_info.enable_ref_frame_mvs)
+  if (cm->seq_params->order_hint_info.enable_ref_frame_mvs)
     av1_copy_frame_mvs(cm, mi, mi_row, mi_col, x_mis, y_mis);
 }
 
@@ -605,9 +610,9 @@
                                      MB_MODE_INFO **mib) {
   int bh = bh_in;
   int r, c;
-  for (r = 0; r < cm->seq_params.mib_size; r += bh) {
+  for (r = 0; r < cm->seq_params->mib_size; r += bh) {
     int bw = bw_in;
-    for (c = 0; c < cm->seq_params.mib_size; c += bw) {
+    for (c = 0; c < cm->seq_params->mib_size; c += bw) {
       const int grid_index = get_mi_grid_idx(&cm->mi_params, r, c);
       const int mi_index = get_alloc_mi_idx(&cm->mi_params, r, c);
       mib[grid_index] = mi + mi_index;
@@ -639,11 +644,11 @@
   assert((mi_rows_remaining > 0) && (mi_cols_remaining > 0));
 
   // Apply the requested partition size to the SB if it is all "in image"
-  if ((mi_cols_remaining >= cm->seq_params.mib_size) &&
-      (mi_rows_remaining >= cm->seq_params.mib_size)) {
-    for (int block_row = 0; block_row < cm->seq_params.mib_size;
+  if ((mi_cols_remaining >= cm->seq_params->mib_size) &&
+      (mi_rows_remaining >= cm->seq_params->mib_size)) {
+    for (int block_row = 0; block_row < cm->seq_params->mib_size;
          block_row += bh) {
-      for (int block_col = 0; block_col < cm->seq_params.mib_size;
+      for (int block_col = 0; block_col < cm->seq_params->mib_size;
            block_col += bw) {
         const int grid_index = get_mi_grid_idx(mi_params, block_row, block_col);
         const int mi_index = get_alloc_mi_idx(mi_params, block_row, block_col);
@@ -683,25 +688,27 @@
 int av1_get_rdmult_delta(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row,
                          int mi_col, int orig_rdmult) {
   AV1_COMMON *const cm = &cpi->common;
-  const GF_GROUP *const gf_group = &cpi->gf_group;
-  assert(IMPLIES(cpi->gf_group.size > 0,
-                 cpi->gf_frame_index < cpi->gf_group.size));
+  const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+  assert(IMPLIES(cpi->ppi->gf_group.size > 0,
+                 cpi->gf_frame_index < cpi->ppi->gf_group.size));
   const int tpl_idx = cpi->gf_frame_index;
-  TplParams *const tpl_data = &cpi->tpl_data;
+  TplParams *const tpl_data = &cpi->ppi->tpl_data;
   const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2;
   int64_t intra_cost = 0;
   int64_t mc_dep_cost = 0;
   const int mi_wide = mi_size_wide[bsize];
   const int mi_high = mi_size_high[bsize];
 
-  if (tpl_idx >= MAX_TPL_FRAME_IDX) return orig_rdmult;
-
   TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx];
   TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
   int tpl_stride = tpl_frame->stride;
-  if (tpl_frame->is_valid == 0) return orig_rdmult;
 
-  if (!is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) return orig_rdmult;
+  if (!av1_tpl_stats_ready(&cpi->ppi->tpl_data, cpi->gf_frame_index)) {
+    return orig_rdmult;
+  }
+  if (!is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) {
+    return orig_rdmult;
+  }
 
   int mi_count = 0;
   const int mi_col_sr =
@@ -728,8 +735,6 @@
   }
   assert(mi_count <= MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB);
 
-  aom_clear_system_state();
-
   double beta = 1.0;
   if (mc_dep_cost > 0 && intra_cost > 0) {
     const double r0 = cpi->rd.r0;
@@ -739,8 +744,6 @@
 
   int rdmult = av1_get_adaptive_rdmult(cpi, beta);
 
-  aom_clear_system_state();
-
   rdmult = AOMMIN(rdmult, orig_rdmult * 3 / 2);
   rdmult = AOMMAX(rdmult, orig_rdmult * 1 / 2);
 
@@ -761,7 +764,7 @@
   if (is_stat_consumption_stage_twopass(cpi)) {
     const AV1_COMMON *const cm = &cpi->common;
     const FIRSTPASS_STATS *const this_frame_stats = read_one_frame_stats(
-        &cpi->twopass, cm->current_frame.display_order_hint);
+        &cpi->ppi->twopass, cm->current_frame.display_order_hint);
     if (this_frame_stats == NULL) return AOM_CODEC_ERROR;
 
     // The inactive region is specified in MBs not mi units.
@@ -791,7 +794,7 @@
   if (is_stat_consumption_stage_twopass(cpi)) {
     const AV1_COMMON *const cm = &cpi->common;
     const FIRSTPASS_STATS *const this_frame_stats = read_one_frame_stats(
-        &cpi->twopass, cm->current_frame.display_order_hint);
+        &cpi->ppi->twopass, cm->current_frame.display_order_hint);
     if (this_frame_stats == NULL) return AOM_CODEC_ERROR;
 
     // The inactive region is specified in MBs not mi units.
@@ -816,24 +819,22 @@
   if (!cpi->oxcf.algo_cfg.enable_tpl_model) return;
   if (cpi->common.current_frame.frame_type == KEY_FRAME) return;
   const FRAME_UPDATE_TYPE update_type =
-      get_frame_update_type(&cpi->gf_group, cpi->gf_frame_index);
+      get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
   if (update_type == INTNL_OVERLAY_UPDATE || update_type == OVERLAY_UPDATE)
     return;
-  assert(IMPLIES(cpi->gf_group.size > 0,
-                 cpi->gf_frame_index < cpi->gf_group.size));
+  assert(IMPLIES(cpi->ppi->gf_group.size > 0,
+                 cpi->gf_frame_index < cpi->ppi->gf_group.size));
 
   AV1_COMMON *const cm = &cpi->common;
   const int gf_group_index = cpi->gf_frame_index;
-  TplParams *const tpl_data = &cpi->tpl_data;
+  TplParams *const tpl_data = &cpi->ppi->tpl_data;
+  if (!av1_tpl_stats_ready(tpl_data, gf_group_index)) return;
   const int mi_wide = mi_size_wide[bsize];
   const int mi_high = mi_size_high[bsize];
 
-  if (gf_group_index >= MAX_TPL_FRAME_IDX) return;
-
   TplDepFrame *tpl_frame = &tpl_data->tpl_frame[gf_group_index];
   TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
   int tpl_stride = tpl_frame->stride;
-  if (tpl_frame->is_valid == 0) return;
 
   int mi_count = 0;
   int count = 0;
@@ -892,11 +893,11 @@
 int av1_get_q_for_deltaq_objective(AV1_COMP *const cpi, BLOCK_SIZE bsize,
                                    int mi_row, int mi_col) {
   AV1_COMMON *const cm = &cpi->common;
-  const GF_GROUP *const gf_group = &cpi->gf_group;
-  assert(IMPLIES(cpi->gf_group.size > 0,
-                 cpi->gf_frame_index < cpi->gf_group.size));
+  const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+  assert(IMPLIES(cpi->ppi->gf_group.size > 0,
+                 cpi->gf_frame_index < cpi->ppi->gf_group.size));
   const int tpl_idx = cpi->gf_frame_index;
-  TplParams *const tpl_data = &cpi->tpl_data;
+  TplParams *const tpl_data = &cpi->ppi->tpl_data;
   const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2;
   int64_t intra_cost = 0;
   int64_t mc_dep_cost = 0;
@@ -909,7 +910,7 @@
   TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx];
   TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
   int tpl_stride = tpl_frame->stride;
-  if (tpl_frame->is_valid == 0) return base_qindex;
+  if (!tpl_frame->is_valid) return base_qindex;
 
   if (!is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) return base_qindex;
 
@@ -938,8 +939,6 @@
   }
   assert(mi_count <= MAX_TPL_BLK_IN_SB * MAX_TPL_BLK_IN_SB);
 
-  aom_clear_system_state();
-
   int offset = 0;
   double beta = 1.0;
   if (mc_dep_cost > 0 && intra_cost > 0) {
@@ -948,8 +947,7 @@
     beta = (r0 / rk);
     assert(beta > 0.0);
   }
-  offset = av1_get_deltaq_offset(cpi, base_qindex, beta);
-  aom_clear_system_state();
+  offset = av1_get_deltaq_offset(cm->seq_params->bit_depth, base_qindex, beta);
 
   const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
   offset = AOMMIN(offset, delta_q_info->delta_q_res * 9 - 1);
@@ -960,6 +958,49 @@
 
   return qindex;
 }
+
+#if !DISABLE_HDR_LUMA_DELTAQ
+// offset table defined in Table 3 of the T-REC-H.Sup15 document.
+static const int hdr_thres[HDR_QP_LEVELS + 1] = { 0,   301, 367, 434, 501, 567,
+                                                  634, 701, 767, 834, 1024 };
+
+static const int hdr10_qp_offset[HDR_QP_LEVELS] = { 3,  2,  1,  0,  -1,
+                                                    -2, -3, -4, -5, -6 };
+#endif
+
+int av1_get_q_for_hdr(AV1_COMP *const cpi, MACROBLOCK *const x,
+                      BLOCK_SIZE bsize, int mi_row, int mi_col) {
+  AV1_COMMON *const cm = &cpi->common;
+  assert(cm->seq_params->bit_depth == AOM_BITS_10);
+
+#if DISABLE_HDR_LUMA_DELTAQ
+  (void)x;
+  (void)bsize;
+  (void)mi_row;
+  (void)mi_col;
+  return cm->quant_params.base_qindex;
+#else
+  // calculate pixel average
+  const int block_luma_avg = av1_log_block_avg(cpi, x, bsize, mi_row, mi_col);
+  // adjust offset based on average of the pixel block
+  int offset = 0;
+  for (int i = 0; i < HDR_QP_LEVELS; i++) {
+    if (block_luma_avg >= hdr_thres[i] && block_luma_avg < hdr_thres[i + 1]) {
+      offset = (int)(hdr10_qp_offset[i] * QP_SCALE_FACTOR);
+      break;
+    }
+  }
+
+  const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
+  offset = AOMMIN(offset, delta_q_info->delta_q_res * 9 - 1);
+  offset = AOMMAX(offset, -delta_q_info->delta_q_res * 9 + 1);
+  int qindex = cm->quant_params.base_qindex + offset;
+  qindex = AOMMIN(qindex, MAXQ);
+  qindex = AOMMAX(qindex, MINQ);
+
+  return qindex;
+#endif
+}
 #endif  // !CONFIG_REALTIME_ONLY
 
 void av1_reset_simple_motion_tree_partition(SIMPLE_MOTION_DATA_TREE *sms_tree,
@@ -1167,7 +1208,7 @@
 void av1_source_content_sb(AV1_COMP *cpi, MACROBLOCK *x, int offset) {
   unsigned int tmp_sse;
   unsigned int tmp_variance;
-  const BLOCK_SIZE bsize = cpi->common.seq_params.sb_size;
+  const BLOCK_SIZE bsize = cpi->common.seq_params->sb_size;
   uint8_t *src_y = cpi->source->y_buffer;
   int src_ystride = cpi->source->y_stride;
   uint8_t *last_src_y = cpi->last_source->y_buffer;
@@ -1181,8 +1222,8 @@
 #endif
   src_y += offset;
   last_src_y += offset;
-  tmp_variance = cpi->fn_ptr[bsize].vf(src_y, src_ystride, last_src_y,
-                                       last_src_ystride, &tmp_sse);
+  tmp_variance = cpi->ppi->fn_ptr[bsize].vf(src_y, src_ystride, last_src_y,
+                                            last_src_ystride, &tmp_sse);
   if (tmp_sse < avg_source_sse_threshold)
     x->content_state_sb.source_sad = kLowSad;
   else if (tmp_sse > avg_source_sse_threshold_high)
@@ -1236,7 +1277,7 @@
 
   const AV1_COMMON *cm = &cpi->common;
   const int num_planes = av1_num_planes(cm);
-  const BLOCK_SIZE sb_size = cm->seq_params.sb_size;
+  const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
 
   xd->above_txfm_context =
       cm->above_contexts.txfm[tile_info->tile_row] + mi_col;
@@ -1272,7 +1313,7 @@
 
   const AV1_COMMON *cm = &cpi->common;
   const int num_planes = av1_num_planes(cm);
-  const BLOCK_SIZE sb_size = cm->seq_params.sb_size;
+  const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
 
   av1_restore_context(x, &sb_fp_stats->x_ctx, mi_row, mi_col, sb_size,
                       num_planes);
@@ -1347,7 +1388,7 @@
   // cost calculation is skipped in this case.
   if (frame_is_intra_only(cm)) return 1;
 
-  return skip_cost_update(&cm->seq_params, tile_info, mi_row, mi_col,
+  return skip_cost_update(cm->seq_params, tile_info, mi_row, mi_col,
                           cpi->sf.inter_sf.mv_cost_upd_level);
 }
 
@@ -1361,7 +1402,7 @@
     return 1;
   }
 
-  return skip_cost_update(&cm->seq_params, tile_info, mi_row, mi_col,
+  return skip_cost_update(cm->seq_params, tile_info, mi_row, mi_col,
                           cpi->sf.intra_sf.dv_cost_upd_level);
 }
 
@@ -1383,6 +1424,9 @@
       if (mi_col != tile_info->mi_col_start) break;
       AOM_FALLTHROUGH_INTENDED;
     case COST_UPD_SB:  // SB level
+      if (cpi->sf.inter_sf.coeff_cost_upd_level == INTERNAL_COST_UPD_SBROW &&
+          mi_col != tile_info->mi_col_start)
+        break;
       av1_fill_coeff_costs(&x->coeff_costs, xd->tile_ctx, num_planes);
       break;
     default: assert(0);
@@ -1396,6 +1440,9 @@
       if (mi_col != tile_info->mi_col_start) break;
       AOM_FALLTHROUGH_INTENDED;
     case COST_UPD_SB:  // SB level
+      if (cpi->sf.inter_sf.mode_cost_upd_level == INTERNAL_COST_UPD_SBROW &&
+          mi_col != tile_info->mi_col_start)
+        break;
       av1_fill_mode_rates(cm, &x->mode_costs, xd->tile_ctx);
       break;
     default: assert(0);
diff --git a/av1/encoder/encodeframe_utils.h b/av1/encoder/encodeframe_utils.h
index cd024f2..3604616 100644
--- a/av1/encoder/encodeframe_utils.h
+++ b/av1/encoder/encodeframe_utils.h
@@ -13,18 +13,67 @@
 #define AOM_AV1_ENCODER_ENCODEFRAME_UTILS_H_
 
 #include "aom_ports/aom_timer.h"
-#include "aom_ports/system_state.h"
 
 #include "av1/common/reconinter.h"
 
 #include "av1/encoder/encoder.h"
-#include "av1/encoder/partition_strategy.h"
 #include "av1/encoder/rdopt.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+#define WRITE_FEATURE_TO_FILE 0
+
+#define FEATURE_SIZE_SMS_SPLIT_FAST 6
+#define FEATURE_SIZE_SMS_SPLIT 17
+#define FEATURE_SIZE_SMS_PRUNE_PART 25
+#define FEATURE_SIZE_SMS_TERM_NONE 28
+#define FEATURE_SIZE_FP_SMS_TERM_NONE 20
+#define FEATURE_SIZE_MAX_MIN_PART_PRED 13
+#define MAX_NUM_CLASSES_MAX_MIN_PART_PRED 4
+
+#define FEATURE_SMS_NONE_FLAG 1
+#define FEATURE_SMS_SPLIT_FLAG (1 << 1)
+#define FEATURE_SMS_RECT_FLAG (1 << 2)
+
+#define FEATURE_SMS_PRUNE_PART_FLAG \
+  (FEATURE_SMS_NONE_FLAG | FEATURE_SMS_SPLIT_FLAG | FEATURE_SMS_RECT_FLAG)
+#define FEATURE_SMS_SPLIT_MODEL_FLAG \
+  (FEATURE_SMS_NONE_FLAG | FEATURE_SMS_SPLIT_FLAG)
+
+// Number of sub-partitions in rectangular partition types.
+#define SUB_PARTITIONS_RECT 2
+
+// Number of sub-partitions in split partition type.
+#define SUB_PARTITIONS_SPLIT 4
+
+// Number of sub-partitions in AB partition types.
+#define SUB_PARTITIONS_AB 3
+
+// Number of sub-partitions in 4-way partition types.
+#define SUB_PARTITIONS_PART4 4
+
+// 4part partition types.
+enum { HORZ4 = 0, VERT4, NUM_PART4_TYPES } UENUM1BYTE(PART4_TYPES);
+
+// AB partition types.
+enum {
+  HORZ_A = 0,
+  HORZ_B,
+  VERT_A,
+  VERT_B,
+  NUM_AB_PARTS
+} UENUM1BYTE(AB_PART_TYPE);
+
+// Rectangular partition types.
+enum { HORZ = 0, VERT, NUM_RECT_PARTS } UENUM1BYTE(RECT_PART_TYPE);
+
+// Structure to keep win flags for HORZ and VERT partition evaluations.
+typedef struct {
+  int rect_part_win[NUM_RECT_PARTS];
+} RD_RECT_PART_WIN_INFO;
+
 enum { PICK_MODE_RD = 0, PICK_MODE_NONRD };
 
 enum {
@@ -156,12 +205,27 @@
   int is_split_ctx_is_ready[2];
   int is_rect_ctx_is_ready[NUM_RECT_PARTS];
 
-  // Flags to prune/skip particular partition size evaluation.
+  // If true, skips the rest of partition evaluation at the current bsize level.
   int terminate_partition_search;
+
+  // If false, skips rdopt on PARTITION_NONE.
   int partition_none_allowed;
+
+  // If partition_rect_allowed[HORZ] is false, skips searching PARTITION_HORZ,
+  // PARTITION_HORZ_A, PARTITION_HORZ_B, PARTITION_HORZ_4. Same holds for VERT.
   int partition_rect_allowed[NUM_RECT_PARTS];
+
+  // If false, skips searching rectangular partition unless some logic related
+  // to edge detection holds.
   int do_rectangular_split;
+
+  // If false, skips searching PARTITION_SPLIT.
   int do_square_split;
+
+  // If true, prunes the corresponding PARTITION_HORZ/PARTITION_VERT. Note that
+  // this does not directly affect the extended partitions, so this can be used
+  // to prune out PARTITION_HORZ/PARTITION_VERT while still allowing rdopt of
+  // PARTITION_HORZ_AB4, etc.
   int prune_rect_part[NUM_RECT_PARTS];
 
   // Chroma subsampling in x and y directions.
@@ -179,6 +243,48 @@
 #endif  // CONFIG_COLLECT_PARTITION_STATS
 } PartitionSearchState;
 
+static AOM_INLINE void av1_disable_square_split_partition(
+    PartitionSearchState *part_state) {
+  part_state->do_square_split = 0;
+}
+
+// Disables all possible rectangular splits. This includes PARTITION_AB4 as they
+// depend on the corresponding partition_rect_allowed.
+static AOM_INLINE void av1_disable_rect_partitions(
+    PartitionSearchState *part_state) {
+  part_state->do_rectangular_split = 0;
+  part_state->partition_rect_allowed[HORZ] = 0;
+  part_state->partition_rect_allowed[VERT] = 0;
+}
+
+// Disables all possible splits so that only PARTITION_NONE *might* be allowed.
+static AOM_INLINE void av1_disable_all_splits(
+    PartitionSearchState *part_state) {
+  av1_disable_square_split_partition(part_state);
+  av1_disable_rect_partitions(part_state);
+}
+
+static AOM_INLINE void av1_set_square_split_only(
+    PartitionSearchState *part_state) {
+  part_state->partition_none_allowed = 0;
+  part_state->do_square_split = 1;
+  av1_disable_rect_partitions(part_state);
+}
+
+static AOM_INLINE bool av1_blk_has_rows_and_cols(
+    const PartitionBlkParams *blk_params) {
+  return blk_params->has_rows && blk_params->has_cols;
+}
+
+static AOM_INLINE bool av1_is_whole_blk_in_frame(
+    const PartitionBlkParams *blk_params,
+    const CommonModeInfoParams *mi_params) {
+  const int mi_row = blk_params->mi_row, mi_col = blk_params->mi_col;
+  const BLOCK_SIZE bsize = blk_params->bsize;
+  return mi_row + mi_size_high[bsize] <= mi_params->mi_rows &&
+         mi_col + mi_size_wide[bsize] <= mi_params->mi_cols;
+}
+
 static AOM_INLINE void update_filter_type_cdf(const MACROBLOCKD *xd,
                                               const MB_MODE_INFO *mbmi,
                                               int dual_filter) {
@@ -196,14 +302,13 @@
                                          int8_t segment_id) {
   const AV1_COMMON *const cm = &cpi->common;
   av1_init_plane_quantizers(cpi, x, segment_id);
-  aom_clear_system_state();
   const int segment_qindex =
       av1_get_qindex(&cm->seg, segment_id, cm->quant_params.base_qindex);
   return av1_compute_rd_mult(cpi,
                              segment_qindex + cm->quant_params.y_dc_delta_q);
 }
 
-static AOM_INLINE int do_slipt_check(BLOCK_SIZE bsize) {
+static AOM_INLINE int do_split_check(BLOCK_SIZE bsize) {
   return (bsize == BLOCK_16X16 || bsize == BLOCK_32X32);
 }
 
@@ -219,47 +324,6 @@
   return &p->stats_buf_ctx->stats_in_start[frm];
 }
 
-static BLOCK_SIZE dim_to_size(int dim) {
-  switch (dim) {
-    case 4: return BLOCK_4X4;
-    case 8: return BLOCK_8X8;
-    case 16: return BLOCK_16X16;
-    case 32: return BLOCK_32X32;
-    case 64: return BLOCK_64X64;
-    case 128: return BLOCK_128X128;
-    default: assert(0); return 0;
-  }
-}
-
-static AOM_INLINE void set_max_min_partition_size(SuperBlockEnc *sb_enc,
-                                                  AV1_COMP *cpi, MACROBLOCK *x,
-                                                  const SPEED_FEATURES *sf,
-                                                  BLOCK_SIZE sb_size,
-                                                  int mi_row, int mi_col) {
-  const AV1_COMMON *cm = &cpi->common;
-
-  sb_enc->max_partition_size =
-      AOMMIN(sf->part_sf.default_max_partition_size,
-             dim_to_size(cpi->oxcf.part_cfg.max_partition_size));
-  sb_enc->min_partition_size =
-      AOMMAX(sf->part_sf.default_min_partition_size,
-             dim_to_size(cpi->oxcf.part_cfg.min_partition_size));
-  sb_enc->max_partition_size =
-      AOMMIN(sb_enc->max_partition_size, cm->seq_params.sb_size);
-  sb_enc->min_partition_size =
-      AOMMIN(sb_enc->min_partition_size, cm->seq_params.sb_size);
-
-  if (use_auto_max_partition(cpi, sb_size, mi_row, mi_col)) {
-    float features[FEATURE_SIZE_MAX_MIN_PART_PRED] = { 0.0f };
-
-    av1_get_max_min_partition_features(cpi, x, mi_row, mi_col, features);
-    sb_enc->max_partition_size =
-        AOMMAX(AOMMIN(av1_predict_max_partition(cpi, x, features),
-                      sb_enc->max_partition_size),
-               sb_enc->min_partition_size);
-  }
-}
-
 int av1_get_rdmult_delta(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row,
                          int mi_col, int orig_rdmult);
 
@@ -272,16 +336,19 @@
 
 int av1_get_q_for_deltaq_objective(AV1_COMP *const cpi, BLOCK_SIZE bsize,
                                    int mi_row, int mi_col);
+
+int av1_get_q_for_hdr(AV1_COMP *const cpi, MACROBLOCK *const x,
+                      BLOCK_SIZE bsize, int mi_row, int mi_col);
+
+int av1_get_hier_tpl_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x,
+                            const BLOCK_SIZE bsize, const int mi_row,
+                            const int mi_col, int orig_rdmult);
 #endif  // !CONFIG_REALTIME_ONLY
 
 void av1_set_ssim_rdmult(const AV1_COMP *const cpi, int *errorperbit,
                          const BLOCK_SIZE bsize, const int mi_row,
                          const int mi_col, int *const rdmult);
 
-int av1_get_hier_tpl_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x,
-                            const BLOCK_SIZE bsize, const int mi_row,
-                            const int mi_col, int orig_rdmult);
-
 void av1_update_state(const AV1_COMP *const cpi, ThreadData *td,
                       const PICK_MODE_CONTEXT *const ctx, int mi_row,
                       int mi_col, BLOCK_SIZE bsize, RUN_TYPE dry_run);
@@ -338,25 +405,23 @@
 
 static AOM_INLINE void av1_dealloc_mb_data(struct AV1Common *cm,
                                            struct macroblock *mb) {
-  if (mb->txfm_search_info.txb_rd_records) {
-    aom_free(mb->txfm_search_info.txb_rd_records);
-    mb->txfm_search_info.txb_rd_records = NULL;
-  }
+  aom_free(mb->txfm_search_info.txb_rd_records);
+  mb->txfm_search_info.txb_rd_records = NULL;
+
+  aom_free(mb->inter_modes_info);
+  mb->inter_modes_info = NULL;
+
   const int num_planes = av1_num_planes(cm);
   for (int plane = 0; plane < num_planes; plane++) {
-    if (mb->plane[plane].src_diff) {
-      aom_free(mb->plane[plane].src_diff);
-      mb->plane[plane].src_diff = NULL;
-    }
+    aom_free(mb->plane[plane].src_diff);
+    mb->plane[plane].src_diff = NULL;
   }
-  if (mb->e_mbd.seg_mask) {
-    aom_free(mb->e_mbd.seg_mask);
-    mb->e_mbd.seg_mask = NULL;
-  }
-  if (mb->winner_mode_stats) {
-    aom_free(mb->winner_mode_stats);
-    mb->winner_mode_stats = NULL;
-  }
+
+  aom_free(mb->e_mbd.seg_mask);
+  mb->e_mbd.seg_mask = NULL;
+
+  aom_free(mb->winner_mode_stats);
+  mb->winner_mode_stats = NULL;
 }
 
 static AOM_INLINE void av1_alloc_mb_data(struct AV1Common *cm,
@@ -365,11 +430,16 @@
   if (!use_nonrd_pick_mode) {
     mb->txfm_search_info.txb_rd_records =
         (TxbRdRecords *)aom_malloc(sizeof(TxbRdRecords));
+    if (!frame_is_intra_only(cm))
+      CHECK_MEM_ERROR(
+          cm, mb->inter_modes_info,
+          (InterModesInfo *)aom_malloc(sizeof(*mb->inter_modes_info)));
   }
   const int num_planes = av1_num_planes(cm);
   for (int plane = 0; plane < num_planes; plane++) {
     const int subsampling_xy =
-        plane ? cm->seq_params.subsampling_x + cm->seq_params.subsampling_y : 0;
+        plane ? cm->seq_params->subsampling_x + cm->seq_params->subsampling_y
+              : 0;
     const int sb_size = MAX_SB_SQUARE >> subsampling_xy;
     CHECK_MEM_ERROR(cm, mb->plane[plane].src_diff,
                     (int16_t *)aom_memalign(
@@ -410,10 +480,8 @@
 #if !CONFIG_REALTIME_ONLY
       else if (is_stat_consumption_stage_twopass(cpi)) {
         const FIRSTPASS_STATS *const this_frame_stats =
-            read_one_frame_stats(&cpi->twopass, cur_frame_display_index);
-        aom_clear_system_state();
-        const double coded_error_per_mb =
-            this_frame_stats->coded_error / cpi->frame_info.num_mbs;
+            read_one_frame_stats(&cpi->ppi->twopass, cur_frame_display_index);
+        const double coded_error_per_mb = this_frame_stats->coded_error;
         // Disable LAST2_FRAME if the coded error of the current frame based on
         // first pass stats is very low.
         if (coded_error_per_mb < 100.0) num_refs_to_disable++;
diff --git a/av1/encoder/encodemb.c b/av1/encoder/encodemb.c
index 91b5974..2a875e1 100644
--- a/av1/encoder/encodemb.c
+++ b/av1/encoder/encodemb.c
@@ -35,19 +35,19 @@
 #include "av1/encoder/rd.h"
 #include "av1/encoder/rdopt.h"
 
-void av1_subtract_block(const MACROBLOCKD *xd, int rows, int cols,
-                        int16_t *diff, ptrdiff_t diff_stride,
-                        const uint8_t *src8, ptrdiff_t src_stride,
-                        const uint8_t *pred8, ptrdiff_t pred_stride) {
+void av1_subtract_block(BitDepthInfo bd_info, int rows, int cols, int16_t *diff,
+                        ptrdiff_t diff_stride, const uint8_t *src8,
+                        ptrdiff_t src_stride, const uint8_t *pred8,
+                        ptrdiff_t pred_stride) {
   assert(rows >= 4 && cols >= 4);
 #if CONFIG_AV1_HIGHBITDEPTH
-  if (is_cur_buf_hbd(xd)) {
+  if (bd_info.use_highbitdepth_buf) {
     aom_highbd_subtract_block(rows, cols, diff, diff_stride, src8, src_stride,
-                              pred8, pred_stride, xd->bd);
+                              pred8, pred_stride, bd_info.bit_depth);
     return;
   }
 #endif
-  (void)xd;
+  (void)bd_info;
   aom_subtract_block(rows, cols, diff, diff_stride, src8, src_stride, pred8,
                      pred_stride);
 }
@@ -55,6 +55,7 @@
 void av1_subtract_txb(MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize,
                       int blk_col, int blk_row, TX_SIZE tx_size) {
   MACROBLOCKD *const xd = &x->e_mbd;
+  const BitDepthInfo bd_info = get_bit_depth_info(xd);
   struct macroblock_plane *const p = &x->plane[plane];
   const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane];
   const int diff_stride = block_size_wide[plane_bsize];
@@ -66,8 +67,8 @@
   uint8_t *src = &p->src.buf[(blk_row * src_stride + blk_col) << MI_SIZE_LOG2];
   int16_t *src_diff =
       &p->src_diff[(blk_row * diff_stride + blk_col) << MI_SIZE_LOG2];
-  av1_subtract_block(xd, tx1d_height, tx1d_width, src_diff, diff_stride, src,
-                     src_stride, dst, dst_stride);
+  av1_subtract_block(bd_info, tx1d_height, tx1d_width, src_diff, diff_stride,
+                     src, src_stride, dst, dst_stride);
 }
 
 void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE plane_bsize, int plane) {
@@ -77,9 +78,10 @@
   const int bw = block_size_wide[plane_bsize];
   const int bh = block_size_high[plane_bsize];
   const MACROBLOCKD *xd = &x->e_mbd;
+  const BitDepthInfo bd_info = get_bit_depth_info(xd);
 
-  av1_subtract_block(xd, bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
-                     pd->dst.buf, pd->dst.stride);
+  av1_subtract_block(bd_info, bh, bw, p->src_diff, bw, p->src.buf,
+                     p->src.stride, pd->dst.buf, pd->dst.stride);
 }
 
 int av1_optimize_b(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
@@ -132,13 +134,8 @@
 
 void av1_dropout_qcoeff(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
                         TX_TYPE tx_type, int qindex) {
-  const struct macroblock_plane *const p = &mb->plane[plane];
-  tran_low_t *const qcoeff = p->qcoeff + BLOCK_OFFSET(block);
-  tran_low_t *const dqcoeff = p->dqcoeff + BLOCK_OFFSET(block);
   const int tx_width = tx_size_wide[tx_size];
   const int tx_height = tx_size_high[tx_size];
-  const int max_eob = av1_get_max_eob(tx_size);
-  const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
 
   // Early return if `qindex` is out of range.
   if (qindex > DROPOUT_Q_MAX || qindex < DROPOUT_Q_MIN) {
@@ -156,6 +153,19 @@
       multiplier *
       CLIP(base_size, DROPOUT_AFTER_BASE_MIN, DROPOUT_AFTER_BASE_MAX);
 
+  av1_dropout_qcoeff_num(mb, plane, block, tx_size, tx_type, dropout_num_before,
+                         dropout_num_after);
+}
+
+void av1_dropout_qcoeff_num(MACROBLOCK *mb, int plane, int block,
+                            TX_SIZE tx_size, TX_TYPE tx_type,
+                            int dropout_num_before, int dropout_num_after) {
+  const struct macroblock_plane *const p = &mb->plane[plane];
+  tran_low_t *const qcoeff = p->qcoeff + BLOCK_OFFSET(block);
+  tran_low_t *const dqcoeff = p->dqcoeff + BLOCK_OFFSET(block);
+  const int max_eob = av1_get_max_eob(tx_size);
+  const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
+
   // Early return if there are not enough non-zero coefficients.
   if (p->eobs[block] == 0 || p->eobs[block] <= dropout_num_before) {
     return;
@@ -172,7 +182,8 @@
 
   for (int i = 0; i < p->eobs[block]; ++i) {
     const int scan_idx = scan_order->scan[i];
-    if (qcoeff[scan_idx] > DROPOUT_COEFF_MAX) {  // Keep large coefficients.
+    if (abs(qcoeff[scan_idx]) > DROPOUT_COEFF_MAX) {
+      // Keep large coefficients.
       count_zeros_before = 0;
       count_zeros_after = 0;
       idx = -1;
@@ -197,6 +208,7 @@
     if (count_nonzeros > DROPOUT_CONTINUITY_MAX) {
       count_zeros_before = 0;
       count_zeros_after = 0;
+      count_nonzeros = 0;
       idx = -1;
       eob = i + 1;
     }
diff --git a/av1/encoder/encodemb.h b/av1/encoder/encodemb.h
index fcd34a3..f2dc956 100644
--- a/av1/encoder/encodemb.h
+++ b/av1/encoder/encodemb.h
@@ -123,11 +123,16 @@
 //   `txb_entropy_ctx`, which `mb` points to, may be modified by this function.
 void av1_dropout_qcoeff(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
                         TX_TYPE tx_type, int qindex);
+// Same as above, with the number of zeroes needed before/after a coeff to drop
+// it explicitly passed in, instead of being derived from qindex.
+void av1_dropout_qcoeff_num(MACROBLOCK *mb, int plane, int block,
+                            TX_SIZE tx_size, TX_TYPE tx_type,
+                            int dropout_num_before, int dropout_num_after);
 
-void av1_subtract_block(const MACROBLOCKD *xd, int rows, int cols,
-                        int16_t *diff, ptrdiff_t diff_stride,
-                        const uint8_t *src8, ptrdiff_t src_stride,
-                        const uint8_t *pred8, ptrdiff_t pred_stride);
+void av1_subtract_block(BitDepthInfo bd_info, int rows, int cols, int16_t *diff,
+                        ptrdiff_t diff_stride, const uint8_t *src8,
+                        ptrdiff_t src_stride, const uint8_t *pred8,
+                        ptrdiff_t pred_stride);
 
 void av1_subtract_txb(MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize,
                       int blk_col, int blk_row, TX_SIZE tx_size);
diff --git a/av1/encoder/encodemv.c b/av1/encoder/encodemv.c
index 86c6156..4a7d874 100644
--- a/av1/encoder/encodemv.c
+++ b/av1/encoder/encodemv.c
@@ -173,8 +173,8 @@
   }
 }
 
-void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, const MV *mv, const MV *ref,
-                   nmv_context *mvctx, int usehp) {
+void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, ThreadData *td, const MV *mv,
+                   const MV *ref, nmv_context *mvctx, int usehp) {
   const MV diff = { mv->row - ref->row, mv->col - ref->col };
   const MV_JOINT_TYPE j = av1_get_mv_joint(&diff);
   // If the mv_diff is zero, then we should have used near or nearest instead.
@@ -193,8 +193,7 @@
   // motion vector component used.
   if (cpi->sf.mv_sf.auto_mv_step_size) {
     int maxv = AOMMAX(abs(mv->row), abs(mv->col)) >> 3;
-    cpi->mv_search_params.max_mv_magnitude =
-        AOMMAX(maxv, cpi->mv_search_params.max_mv_magnitude);
+    td->max_mv_magnitude = AOMMAX(maxv, td->max_mv_magnitude);
   }
 }
 
diff --git a/av1/encoder/encodemv.h b/av1/encoder/encodemv.h
index 9f0d607..962844b 100644
--- a/av1/encoder/encodemv.h
+++ b/av1/encoder/encodemv.h
@@ -18,8 +18,8 @@
 extern "C" {
 #endif
 
-void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, const MV *mv, const MV *ref,
-                   nmv_context *mvctx, int usehp);
+void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, ThreadData *td, const MV *mv,
+                   const MV *ref, nmv_context *mvctx, int usehp);
 
 void av1_update_mv_stats(const MV *mv, const MV *ref, nmv_context *mvctx,
                          MvSubpelPrecision precision);
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index ed1ab8e..7eb4db5 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -32,7 +32,6 @@
 #endif
 #include "aom_ports/aom_timer.h"
 #include "aom_ports/mem.h"
-#include "aom_ports/system_state.h"
 #include "aom_scale/aom_scale.h"
 #if CONFIG_BITSTREAM_DEBUG
 #include "aom_util/debug_util.h"
@@ -46,11 +45,13 @@
 #include "av1/common/resize.h"
 #include "av1/common/tile_common.h"
 
+#include "av1/encoder/allintra_vis.h"
 #include "av1/encoder/aq_complexity.h"
 #include "av1/encoder/aq_cyclicrefresh.h"
 #include "av1/encoder/aq_variance.h"
 #include "av1/encoder/bitstream.h"
 #include "av1/encoder/context_tree.h"
+#include "av1/encoder/dwt.h"
 #include "av1/encoder/encodeframe.h"
 #include "av1/encoder/encodemv.h"
 #include "av1/encoder/encode_strategy.h"
@@ -61,6 +62,7 @@
 #include "av1/encoder/ethread.h"
 #include "av1/encoder/firstpass.h"
 #include "av1/encoder/hash_motion.h"
+#include "av1/encoder/hybrid_fwd_txfm.h"
 #include "av1/encoder/intra_mode_search.h"
 #include "av1/encoder/mv_prec.h"
 #include "av1/encoder/pass2_strategy.h"
@@ -75,16 +77,13 @@
 #include "av1/encoder/segmentation.h"
 #include "av1/encoder/speed_features.h"
 #include "av1/encoder/superres_scale.h"
+#include "av1/encoder/thirdpass.h"
 #include "av1/encoder/tpl_model.h"
 #include "av1/encoder/reconinter_enc.h"
 #include "av1/encoder/var_based_part.h"
 
 #define DEFAULT_EXPLICIT_ORDER_HINT_BITS 7
 
-#if CONFIG_ENTROPY_STATS
-FRAME_COUNTS aggregate_fc;
-#endif  // CONFIG_ENTROPY_STATS
-
 // #define OUTPUT_YUV_REC
 #ifdef OUTPUT_YUV_REC
 FILE *yuv_rec_file;
@@ -228,7 +227,7 @@
   const int upscaled_width = cm->superres_upscaled_width;
   const int height = cm->height;
   const int luma_pic_size = upscaled_width * height;
-  const SequenceHeader *const seq_params = &cm->seq_params;
+  const SequenceHeader *const seq_params = cm->seq_params;
   const BITSTREAM_PROFILE profile = seq_params->profile;
   const int pic_size_profile_factor =
       profile == PROFILE_0 ? 15 : (profile == PROFILE_1 ? 30 : 36);
@@ -242,7 +241,7 @@
 static void set_tile_info(AV1_COMMON *const cm,
                           const TileConfig *const tile_cfg) {
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
-  const SequenceHeader *const seq_params = &cm->seq_params;
+  const SequenceHeader *const seq_params = cm->seq_params;
   CommonTileParams *const tiles = &cm->tiles;
   int i, start_sb;
 
@@ -298,7 +297,7 @@
 
   // We need to reallocate the context buffers here in case we need more mis.
   if (av1_alloc_context_buffers(cm, cm->width, cm->height)) {
-    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate context buffers");
   }
   av1_init_mi_buffers(&cm->mi_params);
@@ -309,7 +308,9 @@
     alloc_context_buffers_ext(cm, &cpi->mbmi_ext_info);
 
   if (!cpi->ppi->seq_params_locked)
-    set_sb_size(&cm->seq_params, av1_select_sb_size(cpi));
+    set_sb_size(cm->seq_params,
+                av1_select_sb_size(&cpi->oxcf, cm->width, cm->height,
+                                   cpi->svc.number_spatial_layers));
 
   set_tile_info(cm, &cpi->oxcf.tile_cfg);
 }
@@ -327,9 +328,9 @@
          height <= lvl_height * lvl_dim_mult;
 }
 
-static void set_bitstream_level_tier(SequenceHeader *seq, AV1_COMMON *cm,
-                                     int width, int height,
-                                     double init_framerate) {
+static void set_bitstream_level_tier(AV1_PRIMARY *const ppi, int width,
+                                     int height, double init_framerate) {
+  SequenceHeader *const seq_params = &ppi->seq_params;
   // TODO(any): This is a placeholder function that only addresses dimensions
   // and max display sample rates.
   // Need to add checks for max bit rate, max decoded luma sample rate, header
@@ -372,26 +373,26 @@
     level = SEQ_LEVEL_6_2;
   }
 
-  SequenceHeader *const seq_params = &cm->seq_params;
   for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) {
-    seq->seq_level_idx[i] = level;
+    seq_params->seq_level_idx[i] = level;
     // Set the maximum parameters for bitrate and buffer size for this profile,
     // level, and tier
     seq_params->op_params[i].bitrate = av1_max_level_bitrate(
-        cm->seq_params.profile, seq->seq_level_idx[i], seq->tier[i]);
+        seq_params->profile, seq_params->seq_level_idx[i], seq_params->tier[i]);
     // Level with seq_level_idx = 31 returns a high "dummy" bitrate to pass the
     // check
     if (seq_params->op_params[i].bitrate == 0)
       aom_internal_error(
-          &cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+          &ppi->error, AOM_CODEC_UNSUP_BITSTREAM,
           "AV1 does not support this combination of profile, level, and tier.");
     // Buffer size in bits/s is bitrate in bits/s * 1 s
     seq_params->op_params[i].buffer_size = seq_params->op_params[i].bitrate;
   }
 }
 
-void av1_init_seq_coding_tools(SequenceHeader *seq, AV1_COMMON *cm,
+void av1_init_seq_coding_tools(AV1_PRIMARY *const ppi,
                                const AV1EncoderConfig *oxcf, int use_svc) {
+  SequenceHeader *const seq = &ppi->seq_params;
   const FrameDimensionCfg *const frm_dim_cfg = &oxcf->frm_dim_cfg;
   const ToolCfg *const tool_cfg = &oxcf->tool_cfg;
 
@@ -449,7 +450,7 @@
   seq->enable_intra_edge_filter = oxcf->intra_mode_cfg.enable_intra_edge_filter;
   seq->enable_filter_intra = oxcf->intra_mode_cfg.enable_filter_intra;
 
-  set_bitstream_level_tier(seq, cm, frm_dim_cfg->width, frm_dim_cfg->height,
+  set_bitstream_level_tier(ppi, frm_dim_cfg->width, frm_dim_cfg->height,
                            oxcf->input_cfg.init_framerate);
 
   if (seq->operating_points_cnt_minus_1 == 0) {
@@ -461,26 +462,27 @@
     // skip decoding enhancement  layers (temporal first).
     int i = 0;
     assert(seq->operating_points_cnt_minus_1 ==
-           (int)(cm->number_spatial_layers * cm->number_temporal_layers - 1));
-    for (unsigned int sl = 0; sl < cm->number_spatial_layers; sl++) {
-      for (unsigned int tl = 0; tl < cm->number_temporal_layers; tl++) {
+           (int)(ppi->number_spatial_layers * ppi->number_temporal_layers - 1));
+    for (unsigned int sl = 0; sl < ppi->number_spatial_layers; sl++) {
+      for (unsigned int tl = 0; tl < ppi->number_temporal_layers; tl++) {
         seq->operating_point_idc[i] =
-            (~(~0u << (cm->number_spatial_layers - sl)) << 8) |
-            ~(~0u << (cm->number_temporal_layers - tl));
+            (~(~0u << (ppi->number_spatial_layers - sl)) << 8) |
+            ~(~0u << (ppi->number_temporal_layers - tl));
         i++;
       }
     }
   }
 }
 
-static void init_config(struct AV1_COMP *cpi, AV1EncoderConfig *oxcf) {
-  AV1_COMMON *const cm = &cpi->common;
-  SequenceHeader *const seq_params = &cm->seq_params;
-  ResizePendingParams *resize_pending_params = &cpi->resize_pending_params;
+static void init_config_sequence(struct AV1_PRIMARY *ppi,
+                                 AV1EncoderConfig *oxcf) {
+  SequenceHeader *const seq_params = &ppi->seq_params;
   const DecoderModelCfg *const dec_model_cfg = &oxcf->dec_model_cfg;
   const ColorCfg *const color_cfg = &oxcf->color_cfg;
-  cpi->oxcf = *oxcf;
-  cpi->framerate = oxcf->input_cfg.init_framerate;
+
+  ppi->use_svc = 0;
+  ppi->number_spatial_layers = 1;
+  ppi->number_temporal_layers = 1;
 
   seq_params->profile = oxcf->profile;
   seq_params->bit_depth = oxcf->tool_cfg.bit_depth;
@@ -508,7 +510,7 @@
     // set the decoder model parameters in schedule mode
     seq_params->decoder_model_info.num_units_in_decoding_tick =
         dec_model_cfg->num_units_in_decoding_tick;
-    cm->buffer_removal_time_present = 1;
+    ppi->buffer_removal_time_present = 1;
     av1_set_aom_dec_model_info(&seq_params->decoder_model_info);
     av1_set_dec_model_op_parameters(&seq_params->op_params[0]);
   } else if (seq_params->timing_info_present &&
@@ -546,11 +548,20 @@
       }
     }
   }
+  av1_change_config_seq(ppi, oxcf, NULL);
+}
+
+static void init_config(struct AV1_COMP *cpi, AV1EncoderConfig *oxcf) {
+  AV1_COMMON *const cm = &cpi->common;
+  ResizePendingParams *resize_pending_params = &cpi->resize_pending_params;
+
+  cpi->oxcf = *oxcf;
+  cpi->framerate = oxcf->input_cfg.init_framerate;
 
   cm->width = oxcf->frm_dim_cfg.width;
   cm->height = oxcf->frm_dim_cfg.height;
-  set_sb_size(seq_params,
-              av1_select_sb_size(cpi));  // set sb size before allocations
+  cpi->is_dropped_frame = false;
+
   alloc_compressor_data(cpi);
 
   av1_update_film_grain_parameters(cpi, oxcf);
@@ -559,18 +570,15 @@
   cpi->td.counts = &cpi->counts;
 
   // Set init SVC parameters.
-  cpi->use_svc = 0;
   cpi->svc.set_ref_frame_config = 0;
   cpi->svc.non_reference_frame = 0;
   cpi->svc.number_spatial_layers = 1;
   cpi->svc.number_temporal_layers = 1;
-  cm->number_spatial_layers = 1;
-  cm->number_temporal_layers = 1;
   cm->spatial_layer_id = 0;
   cm->temporal_layer_id = 0;
 
   // change includes all joint functionality
-  av1_change_config(cpi, oxcf);
+  av1_change_config(cpi, oxcf, true);
 
   cpi->ref_frame_flags = 0;
 
@@ -583,25 +591,13 @@
   av1_noise_estimate_init(&cpi->noise_estimate, cm->width, cm->height);
 }
 
-void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) {
-  AV1_COMMON *const cm = &cpi->common;
-  SequenceHeader *const seq_params = &cm->seq_params;
-  RATE_CONTROL *const rc = &cpi->rc;
-  MACROBLOCK *const x = &cpi->td.mb;
-  AV1LevelParams *const level_params = &cpi->level_params;
-  InitialDimensions *const initial_dimensions = &cpi->initial_dimensions;
-  RefreshFrameFlagsInfo *const refresh_frame_flags = &cpi->refresh_frame;
-  const FrameDimensionCfg *const frm_dim_cfg = &cpi->oxcf.frm_dim_cfg;
+void av1_change_config_seq(struct AV1_PRIMARY *ppi,
+                           const AV1EncoderConfig *oxcf,
+                           bool *is_sb_size_changed) {
+  SequenceHeader *const seq_params = &ppi->seq_params;
+  const FrameDimensionCfg *const frm_dim_cfg = &oxcf->frm_dim_cfg;
   const DecoderModelCfg *const dec_model_cfg = &oxcf->dec_model_cfg;
   const ColorCfg *const color_cfg = &oxcf->color_cfg;
-  const RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
-  // in case of LAP, lag in frames is set according to number of lap buffers
-  // calculated at init time. This stores and restores LAP's lag in frames to
-  // prevent override by new cfg.
-  int lap_lag_in_frames = -1;
-  if (cpi->lap_enabled && cpi->compressor_stage == LAP_STAGE) {
-    lap_lag_in_frames = cpi->oxcf.gf_cfg.lag_in_frames;
-  }
 
   if (seq_params->profile != oxcf->profile) seq_params->profile = oxcf->profile;
   seq_params->bit_depth = oxcf->tool_cfg.bit_depth;
@@ -632,7 +628,7 @@
     // set the decoder model parameters in schedule mode
     seq_params->decoder_model_info.num_units_in_decoding_tick =
         dec_model_cfg->num_units_in_decoding_tick;
-    cm->buffer_removal_time_present = 1;
+    ppi->buffer_removal_time_present = 1;
     av1_set_aom_dec_model_info(&seq_params->decoder_model_info);
     av1_set_dec_model_op_parameters(&seq_params->op_params[0]);
   } else if (seq_params->timing_info_present &&
@@ -645,6 +641,59 @@
         10;  // Default value (not signaled)
   }
 
+  av1_update_film_grain_parameters_seq(ppi, oxcf);
+
+  int sb_size = seq_params->sb_size;
+  // Superblock size should not be updated after the first key frame.
+  if (!ppi->seq_params_locked) {
+    set_sb_size(seq_params, av1_select_sb_size(oxcf, frm_dim_cfg->width,
+                                               frm_dim_cfg->height,
+                                               ppi->number_spatial_layers));
+    for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i)
+      seq_params->tier[i] = (oxcf->tier_mask >> i) & 1;
+  }
+  if (is_sb_size_changed != NULL && sb_size != seq_params->sb_size)
+    *is_sb_size_changed = true;
+
+  // Init sequence level coding tools
+  // This should not be called after the first key frame.
+  if (!ppi->seq_params_locked) {
+    seq_params->operating_points_cnt_minus_1 =
+        (ppi->number_spatial_layers > 1 || ppi->number_temporal_layers > 1)
+            ? ppi->number_spatial_layers * ppi->number_temporal_layers - 1
+            : 0;
+    av1_init_seq_coding_tools(ppi, oxcf, ppi->use_svc);
+  }
+  seq_params->timing_info_present &= !seq_params->reduced_still_picture_hdr;
+
+#if CONFIG_AV1_HIGHBITDEPTH
+  highbd_set_var_fns(ppi);
+#endif
+
+  set_primary_rc_buffer_sizes(oxcf, ppi);
+}
+
+void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf,
+                       bool is_sb_size_changed) {
+  AV1_COMMON *const cm = &cpi->common;
+  SequenceHeader *const seq_params = cm->seq_params;
+  RATE_CONTROL *const rc = &cpi->rc;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  MACROBLOCK *const x = &cpi->td.mb;
+  AV1LevelParams *const level_params = &cpi->ppi->level_params;
+  InitialDimensions *const initial_dimensions = &cpi->initial_dimensions;
+  RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+  const FrameDimensionCfg *const frm_dim_cfg = &cpi->oxcf.frm_dim_cfg;
+  const RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
+
+  // in case of LAP, lag in frames is set according to number of lap buffers
+  // calculated at init time. This stores and restores LAP's lag in frames to
+  // prevent override by new cfg.
+  int lap_lag_in_frames = -1;
+  if (cpi->ppi->lap_enabled && cpi->compressor_stage == LAP_STAGE) {
+    lap_lag_in_frames = cpi->oxcf.gf_cfg.lag_in_frames;
+  }
+
   av1_update_film_grain_parameters(cpi, oxcf);
 
   cpi->oxcf = *oxcf;
@@ -680,14 +729,14 @@
                         seq_params->tier[0]);
   }
 
-  if ((has_no_stats_stage(cpi)) && (rc_cfg->mode == AOM_Q)) {
-    rc->baseline_gf_interval = FIXED_GF_INTERVAL;
+  if (has_no_stats_stage(cpi) && (rc_cfg->mode == AOM_Q)) {
+    p_rc->baseline_gf_interval = FIXED_GF_INTERVAL;
   } else {
-    rc->baseline_gf_interval = (MIN_GF_INTERVAL + MAX_GF_INTERVAL) / 2;
+    p_rc->baseline_gf_interval = (MIN_GF_INTERVAL + MAX_GF_INTERVAL) / 2;
   }
 
-  refresh_frame_flags->golden_frame = false;
-  refresh_frame_flags->bwd_ref_frame = false;
+  refresh_frame->golden_frame = false;
+  refresh_frame->bwd_ref_frame = false;
 
   cm->features.refresh_frame_context =
       (oxcf->tool_cfg.frame_parallel_decoding_mode)
@@ -702,7 +751,7 @@
   }
 
   if (x->comp_rd_buffer.pred0 == NULL) {
-    alloc_compound_type_rd_buffers(cm, &x->comp_rd_buffer);
+    alloc_compound_type_rd_buffers(cm->error, &x->comp_rd_buffer);
   }
 
   if (x->tmp_conv_dst == NULL) {
@@ -724,12 +773,11 @@
 
   av1_set_high_precision_mv(cpi, 1, 0);
 
-  set_rc_buffer_sizes(rc, rc_cfg);
-
   // Under a configuration change, where maximum_buffer_size may change,
   // keep buffer level clipped to the maximum allowed buffer size.
-  rc->bits_off_target = AOMMIN(rc->bits_off_target, rc->maximum_buffer_size);
-  rc->buffer_level = AOMMIN(rc->buffer_level, rc->maximum_buffer_size);
+  p_rc->bits_off_target =
+      AOMMIN(p_rc->bits_off_target, p_rc->maximum_buffer_size);
+  p_rc->buffer_level = AOMMIN(p_rc->buffer_level, p_rc->maximum_buffer_size);
 
   // Set up frame rate and related parameters rate control values.
   av1_new_framerate(cpi, cpi->framerate);
@@ -752,18 +800,9 @@
   cm->width = frm_dim_cfg->width;
   cm->height = frm_dim_cfg->height;
 
-  int sb_size = seq_params->sb_size;
-  // Superblock size should not be updated after the first key frame.
-  if (!cpi->ppi->seq_params_locked) {
-    set_sb_size(&cm->seq_params, av1_select_sb_size(cpi));
-    for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i)
-      seq_params->tier[i] = (oxcf->tier_mask >> i) & 1;
-  }
-
-  if (initial_dimensions->width || sb_size != seq_params->sb_size) {
+  if (initial_dimensions->width || is_sb_size_changed) {
     if (cm->width > initial_dimensions->width ||
-        cm->height > initial_dimensions->height ||
-        seq_params->sb_size != sb_size) {
+        cm->height > initial_dimensions->height || is_sb_size_changed) {
       av1_free_context_buffers(cm);
       av1_free_shared_coeff_buffer(&cpi->td.shared_coeff_buf);
       av1_free_sms_tree(&cpi->td);
@@ -784,23 +823,11 @@
     cpi->ext_flags.refresh_frame.update_pending = 0;
   cpi->ext_flags.refresh_frame_context_pending = 0;
 
-#if CONFIG_AV1_HIGHBITDEPTH
-  highbd_set_var_fns(cpi);
-#endif
-
-  // Init sequence level coding tools
-  // This should not be called after the first key frame.
-  if (!cpi->ppi->seq_params_locked) {
-    seq_params->operating_points_cnt_minus_1 =
-        (cm->number_spatial_layers > 1 || cm->number_temporal_layers > 1)
-            ? cm->number_spatial_layers * cm->number_temporal_layers - 1
-            : 0;
-    av1_init_seq_coding_tools(&cm->seq_params, cm, oxcf, cpi->use_svc);
-  }
-
-  if (cpi->use_svc)
+  if (cpi->ppi->use_svc)
     av1_update_layer_context_change_config(cpi, rc_cfg->target_bandwidth);
 
+  check_reset_rc_flag(cpi);
+
   // restore the value of lag_in_frame for LAP stage.
   if (lap_lag_in_frames != -1) {
     cpi->oxcf.gf_cfg.lag_in_frames = lap_lag_in_frames;
@@ -810,7 +837,7 @@
 static INLINE void init_frame_info(FRAME_INFO *frame_info,
                                    const AV1_COMMON *const cm) {
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
-  const SequenceHeader *const seq_params = &cm->seq_params;
+  const SequenceHeader *const seq_params = cm->seq_params;
   frame_info->frame_width = cm->width;
   frame_info->frame_height = cm->height;
   frame_info->mi_cols = mi_params->mi_cols;
@@ -835,75 +862,42 @@
 }
 
 AV1_PRIMARY *av1_create_primary_compressor(
-    struct aom_codec_pkt_list *pkt_list_head) {
+    struct aom_codec_pkt_list *pkt_list_head, int num_lap_buffers,
+    AV1EncoderConfig *oxcf) {
   AV1_PRIMARY *volatile const ppi = aom_memalign(32, sizeof(AV1_PRIMARY));
   if (!ppi) return NULL;
   av1_zero(*ppi);
 
-  ppi->seq_params_locked = 0;
-  ppi->output_pkt_list = pkt_list_head;
-  return ppi;
-}
-
-AV1_COMP *av1_create_compressor(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf,
-                                BufferPool *const pool,
-                                FIRSTPASS_STATS *frame_stats_buf,
-                                COMPRESSOR_STAGE stage, int num_lap_buffers,
-                                int lap_lag_in_frames,
-                                STATS_BUFFER_CTX *stats_buf_context) {
-  AV1_COMP *volatile const cpi = aom_memalign(32, sizeof(AV1_COMP));
-  AV1_COMMON *volatile const cm = cpi != NULL ? &cpi->common : NULL;
-
-  if (!cm) return NULL;
-
-  av1_zero(*cpi);
-
-  cpi->ppi = ppi;
-
   // The jmp_buf is valid only for the duration of the function that calls
   // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
   // before it returns.
-  if (setjmp(cm->error.jmp)) {
-    cm->error.setjmp = 0;
-    av1_remove_compressor(cpi);
+  if (setjmp(ppi->error.jmp)) {
+    ppi->error.setjmp = 0;
+    av1_remove_primary_compressor(ppi);
     return 0;
   }
+  ppi->error.setjmp = 1;
 
-  cm->error.setjmp = 1;
-  cpi->lap_enabled = num_lap_buffers > 0;
-  cpi->compressor_stage = stage;
+  ppi->seq_params_locked = 0;
+  ppi->lap_enabled = num_lap_buffers > 0;
+  ppi->output_pkt_list = pkt_list_head;
+  ppi->b_calculate_psnr = CONFIG_INTERNAL_STATS;
+  ppi->frames_left = oxcf->input_cfg.limit;
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  ppi->num_fp_contexts = 1;
+#endif
 
-  CommonModeInfoParams *const mi_params = &cm->mi_params;
-  mi_params->free_mi = enc_free_mi;
-  mi_params->setup_mi = enc_setup_mi;
-  mi_params->set_mb_mi = (oxcf->pass == 1 || cpi->compressor_stage == LAP_STAGE)
-                             ? stat_stage_set_mb_mi
-                             : enc_set_mb_mi;
+  init_config_sequence(ppi, oxcf);
 
-  mi_params->mi_alloc_bsize = BLOCK_4X4;
+#if CONFIG_ENTROPY_STATS
+  av1_zero(ppi->aggregate_fc);
+#endif  // CONFIG_ENTROPY_STATS
 
-  CHECK_MEM_ERROR(cm, cm->fc,
-                  (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->fc)));
-  CHECK_MEM_ERROR(
-      cm, cm->default_frame_context,
-      (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->default_frame_context)));
-  memset(cm->fc, 0, sizeof(*cm->fc));
-  memset(cm->default_frame_context, 0, sizeof(*cm->default_frame_context));
-
-  cpi->common.buffer_pool = pool;
-
-  init_config(cpi, oxcf);
-  if (cpi->compressor_stage == LAP_STAGE) {
-    cpi->oxcf.gf_cfg.lag_in_frames = lap_lag_in_frames;
-  }
-
-  cpi->frames_left = cpi->oxcf.input_cfg.limit;
-
-  av1_rc_init(&cpi->oxcf, oxcf->pass, &cpi->rc);
+  av1_primary_rc_init(oxcf, &ppi->p_rc);
 
   // For two pass and lag_in_frames > 33 in LAP.
-  cpi->rc.enable_scenecut_detection = ENABLE_SCENECUT_MODE_2;
-  if (cpi->lap_enabled) {
+  ppi->p_rc.enable_scenecut_detection = ENABLE_SCENECUT_MODE_2;
+  if (ppi->lap_enabled) {
     if ((num_lap_buffers <
          (MAX_GF_LENGTH_LAP + SCENE_CUT_KEY_TEST_INTERVAL + 1)) &&
         num_lap_buffers >= (MAX_GF_LENGTH_LAP + 3)) {
@@ -911,215 +905,22 @@
        * For lag in frames >= 19 and <33, enable scenecut
        * with limited future frame prediction.
        */
-      cpi->rc.enable_scenecut_detection = ENABLE_SCENECUT_MODE_1;
+      ppi->p_rc.enable_scenecut_detection = ENABLE_SCENECUT_MODE_1;
     } else if (num_lap_buffers < (MAX_GF_LENGTH_LAP + 3)) {
       // Disable scenecut when lag_in_frames < 19.
-      cpi->rc.enable_scenecut_detection = DISABLE_SCENECUT;
+      ppi->p_rc.enable_scenecut_detection = DISABLE_SCENECUT;
     }
   }
-  init_frame_info(&cpi->frame_info, cm);
-  init_frame_index_set(&cpi->frame_index_set);
-
-  cm->current_frame.frame_number = 0;
-  cm->current_frame_id = -1;
-  cpi->partition_search_skippable_frame = 0;
-  cpi->tile_data = NULL;
-  cpi->last_show_frame_buf = NULL;
-  realloc_segmentation_maps(cpi);
-
-  cpi->refresh_frame.alt_ref_frame = false;
-
-  cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS;
-#if CONFIG_INTERNAL_STATS
-  cpi->b_calculate_blockiness = 1;
-  cpi->b_calculate_consistency = 1;
-  cpi->total_inconsistency = 0;
-  cpi->psnr[0].worst = 100.0;
-  cpi->psnr[1].worst = 100.0;
-  cpi->worst_ssim = 100.0;
-  cpi->worst_ssim_hbd = 100.0;
-
-  cpi->count[0] = 0;
-  cpi->count[1] = 0;
-  cpi->bytes = 0;
-#if CONFIG_SPEED_STATS
-  cpi->tx_search_count = 0;
-#endif  // CONFIG_SPEED_STATS
-
-  if (cpi->b_calculate_psnr) {
-    cpi->total_sq_error[0] = 0;
-    cpi->total_samples[0] = 0;
-    cpi->total_sq_error[1] = 0;
-    cpi->total_samples[1] = 0;
-    cpi->tot_recode_hits = 0;
-    cpi->summed_quality = 0;
-    cpi->summed_weights = 0;
-    cpi->summed_quality_hbd = 0;
-    cpi->summed_weights_hbd = 0;
-  }
-
-  cpi->fastssim.worst = 100.0;
-  cpi->psnrhvs.worst = 100.0;
-
-  if (cpi->b_calculate_blockiness) {
-    cpi->total_blockiness = 0;
-    cpi->worst_blockiness = 0.0;
-  }
-
-  if (cpi->b_calculate_consistency) {
-    CHECK_MEM_ERROR(
-        cm, cpi->ssim_vars,
-        aom_malloc(sizeof(*cpi->ssim_vars) * 4 * cpi->common.mi_params.mi_rows *
-                   cpi->common.mi_params.mi_cols));
-    cpi->worst_consistency = 100.0;
-  }
-#endif
-#if CONFIG_ENTROPY_STATS
-  av1_zero(aggregate_fc);
-#endif  // CONFIG_ENTROPY_STATS
-
-  cpi->time_stamps.first_ts_start = INT64_MAX;
-
-#ifdef OUTPUT_YUV_REC
-  yuv_rec_file = fopen("rec.yuv", "wb");
-#endif
-#ifdef OUTPUT_YUV_DENOISED
-  yuv_denoised_file = fopen("denoised.yuv", "wb");
-#endif
-
-  assert(MAX_LAP_BUFFERS >= MAX_LAG_BUFFERS);
-  int size = get_stats_buf_size(num_lap_buffers, MAX_LAG_BUFFERS);
-  for (int i = 0; i < size; i++)
-    cpi->twopass.frame_stats_arr[i] = &frame_stats_buf[i];
-
-  cpi->twopass.stats_buf_ctx = stats_buf_context;
-  cpi->twopass.stats_in = cpi->twopass.stats_buf_ctx->stats_in_start;
-
-#if !CONFIG_REALTIME_ONLY
-  if (is_stat_consumption_stage(cpi)) {
-    const size_t packet_sz = sizeof(FIRSTPASS_STATS);
-    const int packets = (int)(oxcf->twopass_stats_in.sz / packet_sz);
-
-    if (!cpi->lap_enabled) {
-      /*Re-initialize to stats buffer, populated by application in the case of
-       * two pass*/
-      cpi->twopass.stats_buf_ctx->stats_in_start = oxcf->twopass_stats_in.buf;
-      cpi->twopass.stats_in = cpi->twopass.stats_buf_ctx->stats_in_start;
-      cpi->twopass.stats_buf_ctx->stats_in_end =
-          &cpi->twopass.stats_buf_ctx->stats_in_start[packets - 1];
-
-      av1_init_second_pass(cpi);
-    } else {
-      av1_init_single_pass_lap(cpi);
-    }
-  }
-#endif
-
-  alloc_obmc_buffers(&cpi->td.mb.obmc_buffer, cm);
-
-  CHECK_MEM_ERROR(
-      cm, cpi->td.mb.inter_modes_info,
-      (InterModesInfo *)aom_malloc(sizeof(*cpi->td.mb.inter_modes_info)));
-
-  for (int x = 0; x < 2; x++)
-    for (int y = 0; y < 2; y++)
-      CHECK_MEM_ERROR(
-          cm, cpi->td.mb.intrabc_hash_info.hash_value_buffer[x][y],
-          (uint32_t *)aom_malloc(
-              AOM_BUFFER_SIZE_FOR_BLOCK_HASH *
-              sizeof(*cpi->td.mb.intrabc_hash_info.hash_value_buffer[0][0])));
-
-  cpi->td.mb.intrabc_hash_info.g_crc_initialized = 0;
-
-  av1_set_speed_features_framesize_independent(cpi, oxcf->speed);
-  av1_set_speed_features_framesize_dependent(cpi, oxcf->speed);
-
-  CHECK_MEM_ERROR(cm, cpi->consec_zero_mv,
-                  aom_calloc((mi_params->mi_rows * mi_params->mi_cols) >> 2,
-                             sizeof(*cpi->consec_zero_mv)));
-
-  {
-    const int bsize = BLOCK_16X16;
-    const int w = mi_size_wide[bsize];
-    const int h = mi_size_high[bsize];
-    const int num_cols = (mi_params->mi_cols + w - 1) / w;
-    const int num_rows = (mi_params->mi_rows + h - 1) / h;
-    CHECK_MEM_ERROR(cm, cpi->tpl_rdmult_scaling_factors,
-                    aom_calloc(num_rows * num_cols,
-                               sizeof(*cpi->tpl_rdmult_scaling_factors)));
-    CHECK_MEM_ERROR(cm, cpi->tpl_sb_rdmult_scaling_factors,
-                    aom_calloc(num_rows * num_cols,
-                               sizeof(*cpi->tpl_sb_rdmult_scaling_factors)));
-  }
-
-  {
-    const int bsize = BLOCK_16X16;
-    const int w = mi_size_wide[bsize];
-    const int h = mi_size_high[bsize];
-    const int num_cols = (mi_params->mi_cols + w - 1) / w;
-    const int num_rows = (mi_params->mi_rows + h - 1) / h;
-    CHECK_MEM_ERROR(cm, cpi->ssim_rdmult_scaling_factors,
-                    aom_calloc(num_rows * num_cols,
-                               sizeof(*cpi->ssim_rdmult_scaling_factors)));
-  }
-
-#if CONFIG_TUNE_VMAF
-  {
-    const int bsize = BLOCK_64X64;
-    const int w = mi_size_wide[bsize];
-    const int h = mi_size_high[bsize];
-    const int num_cols = (mi_params->mi_cols + w - 1) / w;
-    const int num_rows = (mi_params->mi_rows + h - 1) / h;
-    CHECK_MEM_ERROR(cm, cpi->vmaf_info.rdmult_scaling_factors,
-                    aom_calloc(num_rows * num_cols,
-                               sizeof(*cpi->vmaf_info.rdmult_scaling_factors)));
-    for (int i = 0; i < MAX_ARF_LAYERS; i++) {
-      cpi->vmaf_info.last_frame_unsharp_amount[i] = -1.0;
-      cpi->vmaf_info.last_frame_ysse[i] = -1.0;
-      cpi->vmaf_info.last_frame_vmaf[i] = -1.0;
-    }
-    cpi->vmaf_info.original_qindex = -1;
-    cpi->vmaf_info.vmaf_model = NULL;
-  }
-#endif
-
-#if CONFIG_TUNE_BUTTERAUGLI
-  {
-    const int w = mi_size_wide[butteraugli_rdo_bsize];
-    const int h = mi_size_high[butteraugli_rdo_bsize];
-    const int num_cols = (mi_params->mi_cols + w - 1) / w;
-    const int num_rows = (mi_params->mi_rows + h - 1) / h;
-    CHECK_MEM_ERROR(
-        cm, cpi->butteraugli_info.rdmult_scaling_factors,
-        aom_malloc(num_rows * num_cols *
-                   sizeof(*cpi->butteraugli_info.rdmult_scaling_factors)));
-    memset(&cpi->butteraugli_info.source, 0,
-           sizeof(cpi->butteraugli_info.source));
-    memset(&cpi->butteraugli_info.resized_source, 0,
-           sizeof(cpi->butteraugli_info.resized_source));
-    cpi->butteraugli_info.recon_set = false;
-  }
-#endif
-
-#if !CONFIG_REALTIME_ONLY
-  if (!is_stat_generation_stage(cpi)) {
-    av1_setup_tpl_buffers(cm, &cpi->tpl_data, cpi->oxcf.gf_cfg.lag_in_frames);
-  }
-#endif
-
-#if CONFIG_COLLECT_PARTITION_STATS
-  av1_zero(cpi->partition_stats);
-#endif  // CONFIG_COLLECT_PARTITION_STATS
 
 #define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, JSDAF, JSVAF) \
-  cpi->fn_ptr[BT].sdf = SDF;                                    \
-  cpi->fn_ptr[BT].sdaf = SDAF;                                  \
-  cpi->fn_ptr[BT].vf = VF;                                      \
-  cpi->fn_ptr[BT].svf = SVF;                                    \
-  cpi->fn_ptr[BT].svaf = SVAF;                                  \
-  cpi->fn_ptr[BT].sdx4df = SDX4DF;                              \
-  cpi->fn_ptr[BT].jsdaf = JSDAF;                                \
-  cpi->fn_ptr[BT].jsvaf = JSVAF;
+  ppi->fn_ptr[BT].sdf = SDF;                                    \
+  ppi->fn_ptr[BT].sdaf = SDAF;                                  \
+  ppi->fn_ptr[BT].vf = VF;                                      \
+  ppi->fn_ptr[BT].svf = SVF;                                    \
+  ppi->fn_ptr[BT].svaf = SVAF;                                  \
+  ppi->fn_ptr[BT].sdx4df = SDX4DF;                              \
+  ppi->fn_ptr[BT].jsdaf = JSDAF;                                \
+  ppi->fn_ptr[BT].jsvaf = JSVAF;
 
 // Realtime mode doesn't use 4x rectangular blocks.
 #if !CONFIG_REALTIME_ONLY
@@ -1232,9 +1033,9 @@
 
 #if !CONFIG_REALTIME_ONLY
 #define OBFP(BT, OSDF, OVF, OSVF) \
-  cpi->fn_ptr[BT].osdf = OSDF;    \
-  cpi->fn_ptr[BT].ovf = OVF;      \
-  cpi->fn_ptr[BT].osvf = OSVF;
+  ppi->fn_ptr[BT].osdf = OSDF;    \
+  ppi->fn_ptr[BT].ovf = OVF;      \
+  ppi->fn_ptr[BT].osvf = OSVF;
 
   OBFP(BLOCK_128X128, aom_obmc_sad128x128, aom_obmc_variance128x128,
        aom_obmc_sub_pixel_variance128x128)
@@ -1283,8 +1084,8 @@
 #endif  // !CONFIG_REALTIME_ONLY
 
 #define MBFP(BT, MCSDF, MCSVF)  \
-  cpi->fn_ptr[BT].msdf = MCSDF; \
-  cpi->fn_ptr[BT].msvf = MCSVF;
+  ppi->fn_ptr[BT].msdf = MCSDF; \
+  ppi->fn_ptr[BT].msvf = MCSVF;
 
   MBFP(BLOCK_128X128, aom_masked_sad128x128,
        aom_masked_sub_pixel_variance128x128)
@@ -1314,8 +1115,8 @@
 #endif
 
 #define SDSFP(BT, SDSF, SDSX4DF) \
-  cpi->fn_ptr[BT].sdsf = SDSF;   \
-  cpi->fn_ptr[BT].sdsx4df = SDSX4DF;
+  ppi->fn_ptr[BT].sdsf = SDSF;   \
+  ppi->fn_ptr[BT].sdsx4df = SDSX4DF;
 
   SDSFP(BLOCK_128X128, aom_sad_skip_128x128, aom_sad_skip_128x128x4d);
   SDSFP(BLOCK_128X64, aom_sad_skip_128x64, aom_sad_skip_128x64x4d);
@@ -1345,16 +1146,289 @@
 #undef SDSFP
 
 #if CONFIG_AV1_HIGHBITDEPTH
-  highbd_set_var_fns(cpi);
+  highbd_set_var_fns(ppi);
 #endif
 
+  {
+    // As cm->mi_params is a part of the frame level context (cpi), it is
+    // unavailable at this point. mi_params is created as a local temporary
+    // variable, to be passed into the functions used for allocating tpl
+    // buffers. The values in this variable are populated according to initial
+    // width and height of the frame.
+    CommonModeInfoParams mi_params;
+    enc_set_mb_mi(&mi_params, oxcf->frm_dim_cfg.width,
+                  oxcf->frm_dim_cfg.height);
+
+    const int bsize = BLOCK_16X16;
+    const int w = mi_size_wide[bsize];
+    const int h = mi_size_high[bsize];
+    const int num_cols = (mi_params.mi_cols + w - 1) / w;
+    const int num_rows = (mi_params.mi_rows + h - 1) / h;
+    AOM_CHECK_MEM_ERROR(&ppi->error, ppi->tpl_rdmult_scaling_factors,
+                        aom_calloc(num_rows * num_cols,
+                                   sizeof(*ppi->tpl_rdmult_scaling_factors)));
+    AOM_CHECK_MEM_ERROR(
+        &ppi->error, ppi->tpl_sb_rdmult_scaling_factors,
+        aom_calloc(num_rows * num_cols,
+                   sizeof(*ppi->tpl_sb_rdmult_scaling_factors)));
+
+#if !CONFIG_REALTIME_ONLY
+    if (oxcf->pass != AOM_RC_FIRST_PASS) {
+      av1_setup_tpl_buffers(ppi, &mi_params, oxcf->frm_dim_cfg.width,
+                            oxcf->frm_dim_cfg.height, 0,
+                            oxcf->gf_cfg.lag_in_frames);
+    }
+#endif
+
+#if CONFIG_INTERNAL_STATS
+    ppi->b_calculate_blockiness = 1;
+    ppi->b_calculate_consistency = 1;
+
+    for (int i = 0; i <= STAT_ALL; i++) {
+      ppi->psnr[0].stat[i] = 0;
+      ppi->psnr[1].stat[i] = 0;
+
+      ppi->fastssim.stat[i] = 0;
+      ppi->psnrhvs.stat[i] = 0;
+    }
+
+    ppi->psnr[0].worst = 100.0;
+    ppi->psnr[1].worst = 100.0;
+    ppi->worst_ssim = 100.0;
+    ppi->worst_ssim_hbd = 100.0;
+
+    ppi->count[0] = 0;
+    ppi->count[1] = 0;
+    ppi->total_bytes = 0;
+
+    if (ppi->b_calculate_psnr) {
+      ppi->total_sq_error[0] = 0;
+      ppi->total_samples[0] = 0;
+      ppi->total_sq_error[1] = 0;
+      ppi->total_samples[1] = 0;
+      ppi->total_recode_hits = 0;
+      ppi->summed_quality = 0;
+      ppi->summed_weights = 0;
+      ppi->summed_quality_hbd = 0;
+      ppi->summed_weights_hbd = 0;
+    }
+
+    ppi->fastssim.worst = 100.0;
+    ppi->psnrhvs.worst = 100.0;
+
+    if (ppi->b_calculate_blockiness) {
+      ppi->total_blockiness = 0;
+      ppi->worst_blockiness = 0.0;
+    }
+
+    ppi->total_inconsistency = 0;
+    ppi->worst_consistency = 100.0;
+    if (ppi->b_calculate_consistency) {
+      AOM_CHECK_MEM_ERROR(&ppi->error, ppi->ssim_vars,
+                          aom_malloc(sizeof(*ppi->ssim_vars) * 4 *
+                                     mi_params.mi_rows * mi_params.mi_cols));
+    }
+#endif
+  }
+
+  ppi->error.setjmp = 0;
+  return ppi;
+}
+
+AV1_COMP *av1_create_compressor(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf,
+                                BufferPool *const pool, COMPRESSOR_STAGE stage,
+                                int lap_lag_in_frames) {
+  AV1_COMP *volatile const cpi = aom_memalign(32, sizeof(AV1_COMP));
+  AV1_COMMON *volatile const cm = cpi != NULL ? &cpi->common : NULL;
+
+  if (!cm) return NULL;
+
+  av1_zero(*cpi);
+
+  cpi->ppi = ppi;
+  cm->seq_params = &ppi->seq_params;
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  cm->error =
+      (struct aom_internal_error_info *)aom_calloc(1, sizeof(*cm->error));
+#else
+  cm->error = &ppi->error;
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+
+  // The jmp_buf is valid only for the duration of the function that calls
+  // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+  // before it returns.
+  if (setjmp(cm->error->jmp)) {
+    cm->error->setjmp = 0;
+    av1_remove_compressor(cpi);
+    return 0;
+  }
+
+  cm->error->setjmp = 1;
+  cpi->compressor_stage = stage;
+
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  cpi->do_frame_data_update = true;
+#endif
+
+  CommonModeInfoParams *const mi_params = &cm->mi_params;
+  mi_params->free_mi = enc_free_mi;
+  mi_params->setup_mi = enc_setup_mi;
+  mi_params->set_mb_mi =
+      (oxcf->pass == AOM_RC_FIRST_PASS || cpi->compressor_stage == LAP_STAGE)
+          ? stat_stage_set_mb_mi
+          : enc_set_mb_mi;
+
+  mi_params->mi_alloc_bsize = BLOCK_4X4;
+
+  CHECK_MEM_ERROR(cm, cm->fc,
+                  (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->fc)));
+  CHECK_MEM_ERROR(
+      cm, cm->default_frame_context,
+      (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->default_frame_context)));
+  memset(cm->fc, 0, sizeof(*cm->fc));
+  memset(cm->default_frame_context, 0, sizeof(*cm->default_frame_context));
+
+  cpi->common.buffer_pool = pool;
+
+  init_config(cpi, oxcf);
+  if (cpi->compressor_stage == LAP_STAGE) {
+    cpi->oxcf.gf_cfg.lag_in_frames = lap_lag_in_frames;
+  }
+
+  av1_rc_init(&cpi->oxcf, &cpi->rc);
+
+  init_frame_info(&cpi->frame_info, cm);
+  init_frame_index_set(&cpi->frame_index_set);
+
+  cm->current_frame.frame_number = 0;
+  cm->current_frame_id = -1;
+  cpi->tile_data = NULL;
+  cpi->last_show_frame_buf = NULL;
+  realloc_segmentation_maps(cpi);
+
+  cpi->refresh_frame.alt_ref_frame = false;
+
+#if CONFIG_SPEED_STATS
+  cpi->tx_search_count = 0;
+#endif  // CONFIG_SPEED_STATS
+
+  cpi->time_stamps.first_ts_start = INT64_MAX;
+
+#ifdef OUTPUT_YUV_REC
+  yuv_rec_file = fopen("rec.yuv", "wb");
+#endif
+#ifdef OUTPUT_YUV_DENOISED
+  yuv_denoised_file = fopen("denoised.yuv", "wb");
+#endif
+
+#if !CONFIG_REALTIME_ONLY
+  if (is_stat_consumption_stage(cpi)) {
+    const size_t packet_sz = sizeof(FIRSTPASS_STATS);
+    const int packets = (int)(oxcf->twopass_stats_in.sz / packet_sz);
+
+    if (!cpi->ppi->lap_enabled) {
+      /*Re-initialize to stats buffer, populated by application in the case of
+       * two pass*/
+      cpi->ppi->twopass.stats_buf_ctx->stats_in_start =
+          oxcf->twopass_stats_in.buf;
+      cpi->twopass_frame.stats_in =
+          cpi->ppi->twopass.stats_buf_ctx->stats_in_start;
+      cpi->ppi->twopass.stats_buf_ctx->stats_in_end =
+          &cpi->ppi->twopass.stats_buf_ctx->stats_in_start[packets - 1];
+
+      // The buffer size is packets - 1 because the last packet is total_stats.
+      av1_firstpass_info_init(&cpi->ppi->twopass.firstpass_info,
+                              oxcf->twopass_stats_in.buf, packets - 1);
+      av1_init_second_pass(cpi);
+    } else {
+      av1_firstpass_info_init(&cpi->ppi->twopass.firstpass_info, NULL, 0);
+      av1_init_single_pass_lap(cpi);
+    }
+  }
+#endif
+
+  alloc_obmc_buffers(&cpi->td.mb.obmc_buffer, cm->error);
+
+  for (int x = 0; x < 2; x++)
+    for (int y = 0; y < 2; y++)
+      CHECK_MEM_ERROR(
+          cm, cpi->td.mb.intrabc_hash_info.hash_value_buffer[x][y],
+          (uint32_t *)aom_malloc(
+              AOM_BUFFER_SIZE_FOR_BLOCK_HASH *
+              sizeof(*cpi->td.mb.intrabc_hash_info.hash_value_buffer[0][0])));
+
+  cpi->td.mb.intrabc_hash_info.g_crc_initialized = 0;
+
+  av1_set_speed_features_framesize_independent(cpi, oxcf->speed);
+  av1_set_speed_features_framesize_dependent(cpi, oxcf->speed);
+
+  CHECK_MEM_ERROR(cm, cpi->consec_zero_mv,
+                  aom_calloc((mi_params->mi_rows * mi_params->mi_cols) >> 2,
+                             sizeof(*cpi->consec_zero_mv)));
+
+  cpi->mb_weber_stats = NULL;
+  cpi->mb_delta_q = NULL;
+
+  {
+    const int bsize = BLOCK_16X16;
+    const int w = mi_size_wide[bsize];
+    const int h = mi_size_high[bsize];
+    const int num_cols = (mi_params->mi_cols + w - 1) / w;
+    const int num_rows = (mi_params->mi_rows + h - 1) / h;
+    CHECK_MEM_ERROR(cm, cpi->ssim_rdmult_scaling_factors,
+                    aom_calloc(num_rows * num_cols,
+                               sizeof(*cpi->ssim_rdmult_scaling_factors)));
+  }
+
+#if CONFIG_TUNE_VMAF
+  {
+    const int bsize = BLOCK_64X64;
+    const int w = mi_size_wide[bsize];
+    const int h = mi_size_high[bsize];
+    const int num_cols = (mi_params->mi_cols + w - 1) / w;
+    const int num_rows = (mi_params->mi_rows + h - 1) / h;
+    CHECK_MEM_ERROR(cm, cpi->vmaf_info.rdmult_scaling_factors,
+                    aom_calloc(num_rows * num_cols,
+                               sizeof(*cpi->vmaf_info.rdmult_scaling_factors)));
+    for (int i = 0; i < MAX_ARF_LAYERS; i++) {
+      cpi->vmaf_info.last_frame_unsharp_amount[i] = -1.0;
+      cpi->vmaf_info.last_frame_ysse[i] = -1.0;
+      cpi->vmaf_info.last_frame_vmaf[i] = -1.0;
+    }
+    cpi->vmaf_info.original_qindex = -1;
+    cpi->vmaf_info.vmaf_model = NULL;
+  }
+#endif
+
+#if CONFIG_TUNE_BUTTERAUGLI
+  {
+    const int w = mi_size_wide[butteraugli_rdo_bsize];
+    const int h = mi_size_high[butteraugli_rdo_bsize];
+    const int num_cols = (mi_params->mi_cols + w - 1) / w;
+    const int num_rows = (mi_params->mi_rows + h - 1) / h;
+    CHECK_MEM_ERROR(
+        cm, cpi->butteraugli_info.rdmult_scaling_factors,
+        aom_malloc(num_rows * num_cols *
+                   sizeof(*cpi->butteraugli_info.rdmult_scaling_factors)));
+    memset(&cpi->butteraugli_info.source, 0,
+           sizeof(cpi->butteraugli_info.source));
+    memset(&cpi->butteraugli_info.resized_source, 0,
+           sizeof(cpi->butteraugli_info.resized_source));
+    cpi->butteraugli_info.recon_set = false;
+  }
+#endif
+
+#if CONFIG_COLLECT_PARTITION_STATS
+  av1_zero(cpi->partition_stats);
+#endif  // CONFIG_COLLECT_PARTITION_STATS
+
   /* av1_init_quantizer() is first called here. Add check in
    * av1_frame_init_quantizer() so that av1_init_quantizer is only
    * called later when needed. This will avoid unnecessary calls of
    * av1_init_quantizer() for every frame.
    */
   av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params,
-                     cm->seq_params.bit_depth);
+                     cm->seq_params->bit_depth);
   av1_qm_init(&cm->quant_params, av1_num_planes(cm));
 
   av1_loop_filter_init(cm);
@@ -1364,8 +1438,13 @@
 #if !CONFIG_REALTIME_ONLY
   av1_loop_restoration_precal();
 #endif
-  cm->error.setjmp = 0;
 
+  cpi->third_pass_ctx = NULL;
+  if (cpi->oxcf.pass == AOM_RC_THIRD_PASS) {
+    av1_init_thirdpass_ctx(cm, &cpi->third_pass_ctx, NULL);
+  }
+
+  cm->error->setjmp = 0;
   return cpi;
 }
 
@@ -1379,32 +1458,33 @@
 // This function will change the state and free the mutex of corresponding
 // workers and terminate the object. The object can not be re-used unless a call
 // to reset() is made.
-static AOM_INLINE void terminate_worker_data(AV1_COMP *cpi) {
-  MultiThreadInfo *const mt_info = &cpi->mt_info;
-  for (int t = mt_info->num_workers - 1; t >= 0; --t) {
-    AVxWorker *const worker = &mt_info->workers[t];
+static AOM_INLINE void terminate_worker_data(AV1_PRIMARY *ppi) {
+  PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info;
+  for (int t = p_mt_info->num_workers - 1; t >= 0; --t) {
+    AVxWorker *const worker = &p_mt_info->workers[t];
     aom_get_worker_interface()->end(worker);
   }
 }
 
 // Deallocate allocated thread_data.
-static AOM_INLINE void free_thread_data(AV1_COMP *cpi) {
-  MultiThreadInfo *const mt_info = &cpi->mt_info;
-  AV1_COMMON *cm = &cpi->common;
-  for (int t = 0; t < mt_info->num_workers; ++t) {
-    EncWorkerData *const thread_data = &mt_info->tile_thr_data[t];
+static AOM_INLINE void free_thread_data(AV1_PRIMARY *ppi) {
+  PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info;
+  for (int t = 1; t < p_mt_info->num_workers; ++t) {
+    EncWorkerData *const thread_data = &p_mt_info->tile_thr_data[t];
+#if CONFIG_FRAME_PARALLEL_ENCODE
+    thread_data->td = thread_data->original_td;
+#endif
     aom_free(thread_data->td->tctx);
-    if (t == 0) continue;
     aom_free(thread_data->td->palette_buffer);
     aom_free(thread_data->td->tmp_conv_dst);
     release_compound_type_rd_buffers(&thread_data->td->comp_rd_buffer);
     for (int j = 0; j < 2; ++j) {
       aom_free(thread_data->td->tmp_pred_bufs[j]);
     }
+    aom_free(thread_data->td->pixel_gradient_info);
     release_obmc_buffers(&thread_data->td->obmc_buffer);
     aom_free(thread_data->td->vt64x64);
 
-    aom_free(thread_data->td->inter_modes_info);
     for (int x = 0; x < 2; x++) {
       for (int y = 0; y < 2; y++) {
         aom_free(thread_data->td->hash_value_buffer[x][y]);
@@ -1412,7 +1492,8 @@
       }
     }
     aom_free(thread_data->td->counts);
-    av1_free_pmc(thread_data->td->firstpass_ctx, av1_num_planes(cm));
+    av1_free_pmc(thread_data->td->firstpass_ctx,
+                 ppi->seq_params.monochrome ? 1 : MAX_MB_PLANE);
     thread_data->td->firstpass_ctx = NULL;
     av1_free_shared_coeff_buffer(&thread_data->td->shared_coeff_buf);
     av1_free_sms_tree(thread_data->td);
@@ -1422,7 +1503,33 @@
 
 void av1_remove_primary_compressor(AV1_PRIMARY *ppi) {
   if (!ppi) return;
+  aom_free_frame_buffer(&ppi->alt_ref_buffer);
+  for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) {
+    aom_free(ppi->level_params.level_info[i]);
+  }
   av1_lookahead_destroy(ppi->lookahead);
+
+  aom_free(ppi->tpl_rdmult_scaling_factors);
+  ppi->tpl_rdmult_scaling_factors = NULL;
+  aom_free(ppi->tpl_sb_rdmult_scaling_factors);
+  ppi->tpl_sb_rdmult_scaling_factors = NULL;
+
+  TplParams *const tpl_data = &ppi->tpl_data;
+  for (int frame = 0; frame < MAX_LAG_BUFFERS; ++frame) {
+    aom_free(tpl_data->tpl_stats_pool[frame]);
+    aom_free_frame_buffer(&tpl_data->tpl_rec_pool[frame]);
+  }
+
+#if !CONFIG_REALTIME_ONLY
+  av1_tpl_dealloc(&tpl_data->tpl_mt_sync);
+#endif
+
+  terminate_worker_data(ppi);
+  free_thread_data(ppi);
+
+  aom_free(ppi->p_mt_info.tile_thr_data);
+  aom_free(ppi->p_mt_info.workers);
+
   aom_free(ppi);
 }
 
@@ -1431,127 +1538,6 @@
 
   AV1_COMMON *cm = &cpi->common;
   if (cm->current_frame.frame_number > 0) {
-#if CONFIG_ENTROPY_STATS
-    if (!is_stat_generation_stage(cpi)) {
-      fprintf(stderr, "Writing counts.stt\n");
-      FILE *f = fopen("counts.stt", "wb");
-      fwrite(&aggregate_fc, sizeof(aggregate_fc), 1, f);
-      fclose(f);
-    }
-#endif  // CONFIG_ENTROPY_STATS
-#if CONFIG_INTERNAL_STATS
-    aom_clear_system_state();
-
-    if (!is_stat_generation_stage(cpi)) {
-      char headings[512] = { 0 };
-      char results[512] = { 0 };
-      FILE *f = fopen("opsnr.stt", "a");
-      double time_encoded =
-          (cpi->time_stamps.prev_ts_end - cpi->time_stamps.first_ts_start) /
-          10000000.000;
-      double total_encode_time =
-          (cpi->time_receive_data + cpi->time_compress_data) / 1000.000;
-      const double dr =
-          (double)cpi->bytes * (double)8 / (double)1000 / time_encoded;
-      const double peak =
-          (double)((1 << cpi->oxcf.input_cfg.input_bit_depth) - 1);
-      const double target_rate =
-          (double)cpi->oxcf.rc_cfg.target_bandwidth / 1000;
-      const double rate_err = ((100.0 * (dr - target_rate)) / target_rate);
-
-      if (cpi->b_calculate_psnr) {
-        const double total_psnr =
-            aom_sse_to_psnr((double)cpi->total_samples[0], peak,
-                            (double)cpi->total_sq_error[0]);
-        const double total_ssim =
-            100 * pow(cpi->summed_quality / cpi->summed_weights, 8.0);
-        snprintf(headings, sizeof(headings),
-                 "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\t"
-                 "AOMSSIM\tVPSSIMP\tFASTSIM\tPSNRHVS\t"
-                 "WstPsnr\tWstSsim\tWstFast\tWstHVS\t"
-                 "AVPsrnY\tAPsnrCb\tAPsnrCr");
-        snprintf(results, sizeof(results),
-                 "%7.2f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
-                 "%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
-                 "%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
-                 "%7.3f\t%7.3f\t%7.3f",
-                 dr, cpi->psnr[0].stat[STAT_ALL] / cpi->count[0], total_psnr,
-                 cpi->psnr[0].stat[STAT_ALL] / cpi->count[0], total_psnr,
-                 total_ssim, total_ssim,
-                 cpi->fastssim.stat[STAT_ALL] / cpi->count[0],
-                 cpi->psnrhvs.stat[STAT_ALL] / cpi->count[0],
-                 cpi->psnr[0].worst, cpi->worst_ssim, cpi->fastssim.worst,
-                 cpi->psnrhvs.worst, cpi->psnr[0].stat[STAT_Y] / cpi->count[0],
-                 cpi->psnr[0].stat[STAT_U] / cpi->count[0],
-                 cpi->psnr[0].stat[STAT_V] / cpi->count[0]);
-
-        if (cpi->b_calculate_blockiness) {
-          SNPRINT(headings, "\t  Block\tWstBlck");
-          SNPRINT2(results, "\t%7.3f", cpi->total_blockiness / cpi->count[0]);
-          SNPRINT2(results, "\t%7.3f", cpi->worst_blockiness);
-        }
-
-        if (cpi->b_calculate_consistency) {
-          double consistency =
-              aom_sse_to_psnr((double)cpi->total_samples[0], peak,
-                              (double)cpi->total_inconsistency);
-
-          SNPRINT(headings, "\tConsist\tWstCons");
-          SNPRINT2(results, "\t%7.3f", consistency);
-          SNPRINT2(results, "\t%7.3f", cpi->worst_consistency);
-        }
-
-        SNPRINT(headings, "\t   Time\tRcErr\tAbsErr");
-        SNPRINT2(results, "\t%8.0f", total_encode_time);
-        SNPRINT2(results, " %7.2f", rate_err);
-        SNPRINT2(results, " %7.2f", fabs(rate_err));
-
-        SNPRINT(headings, "\tAPsnr611");
-        SNPRINT2(results, " %7.3f",
-                 (6 * cpi->psnr[0].stat[STAT_Y] + cpi->psnr[0].stat[STAT_U] +
-                  cpi->psnr[0].stat[STAT_V]) /
-                     (cpi->count[0] * 8));
-
-#if CONFIG_AV1_HIGHBITDEPTH
-        const uint32_t in_bit_depth = cpi->oxcf.input_cfg.input_bit_depth;
-        const uint32_t bit_depth = cpi->td.mb.e_mbd.bd;
-        if ((cpi->source->flags & YV12_FLAG_HIGHBITDEPTH) &&
-            (in_bit_depth < bit_depth)) {
-          const double peak_hbd = (double)((1 << bit_depth) - 1);
-          const double total_psnr_hbd =
-              aom_sse_to_psnr((double)cpi->total_samples[1], peak_hbd,
-                              (double)cpi->total_sq_error[1]);
-          const double total_ssim_hbd =
-              100 * pow(cpi->summed_quality_hbd / cpi->summed_weights_hbd, 8.0);
-          SNPRINT(headings,
-                  "\t AVGPsnrH GLBPsnrH AVPsnrPH GLPsnrPH"
-                  " AVPsnrYH APsnrCbH APsnrCrH WstPsnrH"
-                  " AOMSSIMH VPSSIMPH WstSsimH");
-          SNPRINT2(results, "\t%7.3f",
-                   cpi->psnr[1].stat[STAT_ALL] / cpi->count[1]);
-          SNPRINT2(results, "  %7.3f", total_psnr_hbd);
-          SNPRINT2(results, "  %7.3f",
-                   cpi->psnr[1].stat[STAT_ALL] / cpi->count[1]);
-          SNPRINT2(results, "  %7.3f", total_psnr_hbd);
-          SNPRINT2(results, "  %7.3f",
-                   cpi->psnr[1].stat[STAT_Y] / cpi->count[1]);
-          SNPRINT2(results, "  %7.3f",
-                   cpi->psnr[1].stat[STAT_U] / cpi->count[1]);
-          SNPRINT2(results, "  %7.3f",
-                   cpi->psnr[1].stat[STAT_V] / cpi->count[1]);
-          SNPRINT2(results, "  %7.3f", cpi->psnr[1].worst);
-          SNPRINT2(results, "  %7.3f", total_ssim_hbd);
-          SNPRINT2(results, "  %7.3f", total_ssim_hbd);
-          SNPRINT2(results, "  %7.3f", cpi->worst_ssim_hbd);
-        }
-#endif
-        fprintf(f, "%s\n", headings);
-        fprintf(f, "%s\n", results);
-      }
-
-      fclose(f);
-    }
-#endif  // CONFIG_INTERNAL_STATS
 #if CONFIG_SPEED_STATS
     if (!is_stat_generation_stage(cpi)) {
       fprintf(stdout, "tx_search_count = %d\n", cpi->tx_search_count);
@@ -1570,21 +1556,15 @@
   av1_denoiser_free(&(cpi->denoiser));
 #endif
 
-  TplParams *const tpl_data = &cpi->tpl_data;
-  for (int frame = 0; frame < MAX_LAG_BUFFERS; ++frame) {
-    aom_free(tpl_data->tpl_stats_pool[frame]);
-    aom_free_frame_buffer(&tpl_data->tpl_rec_pool[frame]);
-  }
-
-  if (cpi->compressor_stage != LAP_STAGE) {
-    terminate_worker_data(cpi);
-    free_thread_data(cpi);
-  }
-
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  aom_free(cm->error);
+#endif
+  aom_free(cpi->td.tctx);
   MultiThreadInfo *const mt_info = &cpi->mt_info;
 #if CONFIG_MULTITHREAD
   pthread_mutex_t *const enc_row_mt_mutex_ = mt_info->enc_row_mt.mutex_;
   pthread_mutex_t *const gm_mt_mutex_ = mt_info->gm_sync.mutex_;
+  pthread_mutex_t *const pack_bs_mt_mutex_ = mt_info->pack_bs_sync.mutex_;
   if (enc_row_mt_mutex_ != NULL) {
     pthread_mutex_destroy(enc_row_mt_mutex_);
     aom_free(enc_row_mt_mutex_);
@@ -1593,36 +1573,32 @@
     pthread_mutex_destroy(gm_mt_mutex_);
     aom_free(gm_mt_mutex_);
   }
+  if (pack_bs_mt_mutex_ != NULL) {
+    pthread_mutex_destroy(pack_bs_mt_mutex_);
+    aom_free(pack_bs_mt_mutex_);
+  }
 #endif
   av1_row_mt_mem_dealloc(cpi);
-  if (cpi->compressor_stage != LAP_STAGE) {
-    aom_free(mt_info->tile_thr_data);
-    aom_free(mt_info->workers);
-  }
 
-#if !CONFIG_REALTIME_ONLY
-  av1_tpl_dealloc(&tpl_data->tpl_mt_sync);
-#endif
   if (mt_info->num_workers > 1) {
     av1_loop_filter_dealloc(&mt_info->lf_row_sync);
     av1_cdef_mt_dealloc(&mt_info->cdef_sync);
 #if !CONFIG_REALTIME_ONLY
-    av1_loop_restoration_dealloc(&mt_info->lr_row_sync,
-                                 mt_info->num_mod_workers[MOD_LR]);
+    int num_lr_workers =
+        av1_get_num_mod_workers_for_alloc(&cpi->ppi->p_mt_info, MOD_LR);
+    av1_loop_restoration_dealloc(&mt_info->lr_row_sync, num_lr_workers);
     av1_gm_dealloc(&mt_info->gm_sync);
     av1_tf_mt_dealloc(&mt_info->tf_sync);
 #endif
   }
 
+  av1_free_thirdpass_ctx(cpi->third_pass_ctx);
+
   dealloc_compressor_data(cpi);
 
-#if CONFIG_INTERNAL_STATS
-  aom_free(cpi->ssim_vars);
-  cpi->ssim_vars = NULL;
-#endif  // CONFIG_INTERNAL_STATS
+  av1_ext_part_delete(&cpi->ext_part_controller);
 
   av1_remove_common(cm);
-  av1_free_ref_frame_buffers(cm->buffer_pool);
 
   aom_free(cpi);
 
@@ -1780,7 +1756,12 @@
         mv_search_params->mv_step_param = av1_init_search_range(
             AOMMIN(max_mv_def, 2 * mv_search_params->max_mv_magnitude));
       }
+#if CONFIG_FRAME_PARALLEL_ENCODE
+      // Reset max_mv_magnitude for parallel frames based on update flag.
+      if (cpi->do_frame_data_update) mv_search_params->max_mv_magnitude = -1;
+#else
       mv_search_params->max_mv_magnitude = -1;
+#endif
     }
   }
 }
@@ -1788,14 +1769,14 @@
 void av1_set_screen_content_options(AV1_COMP *cpi, FeatureFlags *features) {
   const AV1_COMMON *const cm = &cpi->common;
 
-  if (cm->seq_params.force_screen_content_tools != 2) {
+  if (cm->seq_params->force_screen_content_tools != 2) {
     features->allow_screen_content_tools = features->allow_intrabc =
-        cm->seq_params.force_screen_content_tools;
+        cm->seq_params->force_screen_content_tools;
     return;
   }
 
   if (cpi->oxcf.mode == REALTIME) {
-    assert(cm->seq_params.reduced_still_picture_hdr);
+    assert(cm->seq_params->reduced_still_picture_hdr);
     features->allow_screen_content_tools = features->allow_intrabc = 0;
     return;
   }
@@ -1813,7 +1794,7 @@
   const int stride = cpi->unfiltered_source->y_stride;
   const int width = cpi->unfiltered_source->y_width;
   const int height = cpi->unfiltered_source->y_height;
-  const int bd = cm->seq_params.bit_depth;
+  const int bd = cm->seq_params->bit_depth;
   const int blk_w = 16;
   const int blk_h = 16;
   // These threshold values are selected experimentally.
@@ -1959,7 +1940,7 @@
 void av1_check_initial_width(AV1_COMP *cpi, int use_highbitdepth,
                              int subsampling_x, int subsampling_y) {
   AV1_COMMON *const cm = &cpi->common;
-  SequenceHeader *const seq_params = &cm->seq_params;
+  SequenceHeader *const seq_params = cm->seq_params;
   InitialDimensions *const initial_dimensions = &cpi->initial_dimensions;
 
   if (!initial_dimensions->width ||
@@ -1993,11 +1974,11 @@
   if (cpi->oxcf.noise_sensitivity > 0 &&
       !cpi->denoiser.frame_buffer_initialized) {
     if (av1_denoiser_alloc(
-            cm, &cpi->svc, &cpi->denoiser, cpi->use_svc,
+            cm, &cpi->svc, &cpi->denoiser, cpi->ppi->use_svc,
             cpi->oxcf.noise_sensitivity, cm->width, cm->height,
-            cm->seq_params.subsampling_x, cm->seq_params.subsampling_y,
-            cm->seq_params.use_highbitdepth, AOM_BORDER_IN_PIXELS))
-      aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+            cm->seq_params->subsampling_x, cm->seq_params->subsampling_y,
+            cm->seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS))
+      aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
                          "Failed to allocate denoiser");
   }
 }
@@ -2007,9 +1988,9 @@
 int av1_set_size_literal(AV1_COMP *cpi, int width, int height) {
   AV1_COMMON *cm = &cpi->common;
   InitialDimensions *const initial_dimensions = &cpi->initial_dimensions;
-  av1_check_initial_width(cpi, cm->seq_params.use_highbitdepth,
-                          cm->seq_params.subsampling_x,
-                          cm->seq_params.subsampling_y);
+  av1_check_initial_width(cpi, cm->seq_params->use_highbitdepth,
+                          cm->seq_params->subsampling_x,
+                          cm->seq_params->subsampling_y);
 
   if (width <= 0 || height <= 0) return 1;
 
@@ -2039,7 +2020,7 @@
 
 void av1_set_frame_size(AV1_COMP *cpi, int width, int height) {
   AV1_COMMON *const cm = &cpi->common;
-  const SequenceHeader *const seq_params = &cm->seq_params;
+  const SequenceHeader *const seq_params = cm->seq_params;
   const int num_planes = av1_num_planes(cm);
   MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
   int ref_frame;
@@ -2077,7 +2058,7 @@
     if (av1_alloc_above_context_buffers(above_contexts, cm->tiles.rows,
                                         cm->mi_params.mi_cols,
                                         av1_num_planes(cm)))
-      aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+      aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
                          "Failed to allocate context buffers");
   }
 
@@ -2087,11 +2068,13 @@
           seq_params->subsampling_y, seq_params->use_highbitdepth,
           cpi->oxcf.border_in_pixels, cm->features.byte_alignment, NULL, NULL,
           NULL, cpi->oxcf.tool_cfg.enable_global_motion))
-    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate frame buffer");
 
+  if (!is_stat_generation_stage(cpi)) av1_init_cdef_worker(cpi);
+
 #if !CONFIG_REALTIME_ONLY
-  const int use_restoration = cm->seq_params.enable_restoration &&
+  const int use_restoration = cm->seq_params->enable_restoration &&
                               !cm->features.all_lossless &&
                               !cm->tiles.large_scale;
   if (use_restoration) {
@@ -2104,8 +2087,13 @@
       cm->rst_info[i].frame_restoration_type = RESTORE_NONE;
 
     av1_alloc_restoration_buffers(cm);
+    // Store the allocated restoration buffers in MT object.
+    if (cpi->ppi->p_mt_info.num_workers > 1) {
+      av1_init_lr_mt_buffers(cpi);
+    }
   }
 #endif
+
   if (!is_stat_generation_stage(cpi)) alloc_util_frame_buffers(cpi);
   init_motion_estimation(cpi);
 
@@ -2144,13 +2132,22 @@
 #if CONFIG_COLLECT_COMPONENT_TIMING
     start_timing(cpi, cdef_time);
 #endif
+    const int num_workers = cpi->mt_info.num_mod_workers[MOD_CDEF];
     // Find CDEF parameters
     av1_cdef_search(&cpi->mt_info, &cm->cur_frame->buf, cpi->source, cm, xd,
-                    cpi->sf.lpf_sf.cdef_pick_method, cpi->td.mb.rdmult);
+                    cpi->sf.lpf_sf.cdef_pick_method, cpi->td.mb.rdmult,
+                    cpi->sf.rt_sf.skip_cdef_sb, cpi->rc.frames_since_key);
 
     // Apply the filter
-    if (!cpi->sf.rt_sf.skip_loopfilter_non_reference)
-      av1_cdef_frame(&cm->cur_frame->buf, cm, xd);
+    if (!cpi->sf.rt_sf.skip_loopfilter_non_reference) {
+      if (num_workers > 1) {
+        av1_cdef_frame_mt(cm, xd, cpi->mt_info.cdef_worker,
+                          cpi->mt_info.workers, &cpi->mt_info.cdef_sync,
+                          num_workers, av1_cdef_init_fb_row_mt);
+      } else {
+        av1_cdef_frame(&cm->cur_frame->buf, cm, xd, av1_cdef_init_fb_row);
+      }
+    }
 #if CONFIG_COLLECT_COMPONENT_TIMING
     end_timing(cpi, cdef_time);
 #endif
@@ -2210,11 +2207,19 @@
 
   const int use_loopfilter =
       !cm->features.coded_lossless && !cm->tiles.large_scale;
-  const int use_cdef = cm->seq_params.enable_cdef &&
+  const int use_cdef = cm->seq_params->enable_cdef &&
                        !cm->features.coded_lossless && !cm->tiles.large_scale;
-  const int use_restoration = cm->seq_params.enable_restoration &&
+  const int use_restoration = cm->seq_params->enable_restoration &&
                               !cm->features.all_lossless &&
                               !cm->tiles.large_scale;
+  const int cur_width = cm->cur_frame->width;
+  const int cur_height = cm->cur_frame->height;
+  const int cur_width_mib = cm->mi_params.mi_cols * MI_SIZE;
+  const int cur_height_mib = cm->mi_params.mi_rows * MI_SIZE;
+  const int is_realtime =
+      cpi->sf.rt_sf.use_nonrd_pick_mode && !(cm->mi_params.mi_cols % 2) &&
+      !(cm->mi_params.mi_rows % 2) && (cur_width_mib - cur_width < MI_SIZE) &&
+      (cur_height_mib - cur_height < MI_SIZE);
 
   struct loopfilter *lf = &cm->lf;
 
@@ -2222,7 +2227,6 @@
   start_timing(cpi, loop_filter_time);
 #endif
   if (use_loopfilter) {
-    aom_clear_system_state();
     av1_pick_filter_level(cpi->source, cpi, cpi->sf.lpf_sf.lpf_pick);
   } else {
     lf->filter_level[0] = 0;
@@ -2231,19 +2235,9 @@
 
   if ((lf->filter_level[0] || lf->filter_level[1]) &&
       !cpi->sf.rt_sf.skip_loopfilter_non_reference) {
-    if (num_workers > 1)
-      av1_loop_filter_frame_mt(&cm->cur_frame->buf, cm, xd, 0, num_planes, 0,
-#if CONFIG_LPF_MASK
-                               0,
-#endif
-                               mt_info->workers, num_workers,
-                               &mt_info->lf_row_sync);
-    else
-      av1_loop_filter_frame(&cm->cur_frame->buf, cm, xd,
-#if CONFIG_LPF_MASK
-                            0,
-#endif
-                            0, num_planes, 0);
+    av1_loop_filter_frame_mt(&cm->cur_frame->buf, cm, xd, 0, num_planes, 0,
+                             mt_info->workers, num_workers,
+                             &mt_info->lf_row_sync, is_realtime);
   }
 #if CONFIG_COLLECT_COMPONENT_TIMING
   end_timing(cpi, loop_filter_time);
@@ -2267,26 +2261,22 @@
   AV1_COMMON *const cm = &cpi->common;
   const QuantizationCfg *const q_cfg = &cpi->oxcf.q_cfg;
   SVC *const svc = &cpi->svc;
-  ResizePendingParams *const resize_pending_params =
-      &cpi->resize_pending_params;
-  const int resize_pending =
-      (resize_pending_params->width && resize_pending_params->height &&
-       (cpi->common.width != resize_pending_params->width ||
-        cpi->common.height != resize_pending_params->height));
+  const int resize_pending = is_frame_resize_pending(cpi);
 
   int top_index = 0, bottom_index = 0, q = 0;
   YV12_BUFFER_CONFIG *unscaled = cpi->unscaled_source;
   InterpFilter filter_scaler =
-      cpi->use_svc ? svc->downsample_filter_type[svc->spatial_layer_id]
-                   : EIGHTTAP_SMOOTH;
-  int phase_scaler =
-      cpi->use_svc ? svc->downsample_filter_phase[svc->spatial_layer_id] : 0;
+      cpi->ppi->use_svc ? svc->downsample_filter_type[svc->spatial_layer_id]
+                        : EIGHTTAP_SMOOTH;
+  int phase_scaler = cpi->ppi->use_svc
+                         ? svc->downsample_filter_phase[svc->spatial_layer_id]
+                         : 0;
 
   set_size_independent_vars(cpi);
   av1_setup_frame_size(cpi);
   av1_set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
 
-  if (!cpi->use_svc) {
+  if (!cpi->ppi->use_svc) {
     phase_scaler = 8;
     // 2:1 scaling.
     if ((cm->width << 1) == unscaled->y_crop_width &&
@@ -2305,10 +2295,19 @@
     }
   }
 
-  if (cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION)
+  const SPEED_FEATURES *sf = &cpi->sf;
+  if (sf->intra_sf.intra_pruning_with_hog ||
+      sf->intra_sf.chroma_intra_pruning_with_hog) {
+    allocate_gradient_info_for_hog(&cpi->td.pixel_gradient_info, cpi);
+  }
+
+  if (sf->part_sf.partition_search_type == VAR_BASED_PARTITION)
     variance_partition_alloc(cpi);
 
-  if (cm->current_frame.frame_type == KEY_FRAME) copy_frame_prob_info(cpi);
+  if (cm->current_frame.frame_type == KEY_FRAME ||
+      ((sf->inter_sf.extra_prune_warped &&
+        cm->current_frame.refresh_frame_flags & (1 << GOLDEN_FRAME))))
+    copy_frame_prob_info(cpi);
 
 #if CONFIG_COLLECT_COMPONENT_TIMING
   printf("\n Encoding a frame:");
@@ -2320,8 +2319,6 @@
   }
 #endif
 
-  aom_clear_system_state();
-
   cpi->source = av1_scale_if_required(cm, unscaled, &cpi->scaled_source,
                                       filter_scaler, phase_scaler, true, false);
   if (frame_is_intra_only(cm) || resize_pending != 0) {
@@ -2341,7 +2338,7 @@
   }
 
 #if CONFIG_AV1_TEMPORAL_DENOISING
-  if (cpi->oxcf.noise_sensitivity > 0 && cpi->use_svc)
+  if (cpi->oxcf.noise_sensitivity > 0 && cpi->ppi->use_svc)
     av1_denoiser_reset_on_first_frame(cpi);
 #endif
 
@@ -2366,19 +2363,25 @@
     }
   }
 
-  // For SVC the inter-layer/spatial prediction is not done for newmv
-  // (zero_mode is forced), and since the scaled references are only
-  // use for newmv search, we can avoid scaling here.
-  if (!frame_is_intra_only(cm) &&
-      !(cpi->use_svc && cpi->svc.force_zero_mode_spatial_ref))
-    av1_scale_references(cpi, filter_scaler, phase_scaler, 1);
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 0) {
+#else
+  {
+#endif
+    // For SVC the inter-layer/spatial prediction is not done for newmv
+    // (zero_mode is forced), and since the scaled references are only
+    // use for newmv search, we can avoid scaling here.
+    if (!frame_is_intra_only(cm) &&
+        !(cpi->ppi->use_svc && cpi->svc.force_zero_mode_spatial_ref))
+      av1_scale_references(cpi, filter_scaler, phase_scaler, 1);
+  }
 
   av1_set_quantizer(cm, q_cfg->qm_minlevel, q_cfg->qm_maxlevel, q,
-                    q_cfg->enable_chroma_deltaq);
+                    q_cfg->enable_chroma_deltaq, q_cfg->enable_hdr_deltaq);
   av1_set_speed_features_qindex_dependent(cpi, cpi->oxcf.speed);
   if ((q_cfg->deltaq_mode != NO_DELTA_Q) || q_cfg->enable_chroma_deltaq)
     av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params,
-                       cm->seq_params.bit_depth);
+                       cm->seq_params->bit_depth);
   av1_set_variance_partition_thresholds(cpi, q, 0);
   av1_setup_frame(cpi);
 
@@ -2389,11 +2392,11 @@
       cpi->rc.high_source_sad) {
     if (av1_encodedframe_overshoot_cbr(cpi, &q)) {
       av1_set_quantizer(cm, q_cfg->qm_minlevel, q_cfg->qm_maxlevel, q,
-                        q_cfg->enable_chroma_deltaq);
+                        q_cfg->enable_chroma_deltaq, q_cfg->enable_hdr_deltaq);
       av1_set_speed_features_qindex_dependent(cpi, cpi->oxcf.speed);
       if (q_cfg->deltaq_mode != NO_DELTA_Q || q_cfg->enable_chroma_deltaq)
         av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params,
-                           cm->seq_params.bit_depth);
+                           cm->seq_params->bit_depth);
       av1_set_variance_partition_thresholds(cpi, q, 0);
       if (frame_is_intra_only(cm) || cm->features.error_resilient_mode)
         av1_setup_frame(cpi);
@@ -2429,6 +2432,13 @@
   // transform / motion compensation build reconstruction frame
   av1_encode_frame(cpi);
 
+  // Adjust the refresh of the golden (longer-term) reference based on QP
+  // selected for this frame. This is for CBR with 1 layer/non-svc RTC mode.
+  if (!frame_is_intra_only(cm) && cpi->oxcf.rc_cfg.mode == AOM_CBR &&
+      cpi->oxcf.mode == REALTIME && svc->number_spatial_layers == 1 &&
+      svc->number_temporal_layers == 1)
+    av1_adjust_gf_refresh_qp_one_pass_rt(cpi);
+
   // Update some stats from cyclic refresh.
   if (q_cfg->aq_mode == CYCLIC_REFRESH_AQ && !frame_is_intra_only(cm))
     av1_cyclic_refresh_postencode(cpi);
@@ -2437,11 +2447,9 @@
   end_timing(cpi, av1_encode_frame_time);
 #endif
 #if CONFIG_INTERNAL_STATS
-  ++cpi->tot_recode_hits;
+  ++cpi->frame_recode_hits;
 #endif
 
-  aom_clear_system_state();
-
   return AOM_CODEC_OK;
 }
 
@@ -2496,7 +2504,13 @@
   q_low = bottom_index;
   q_high = top_index;
 
-  if (cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION)
+  const SPEED_FEATURES *sf = &cpi->sf;
+  if (sf->intra_sf.intra_pruning_with_hog ||
+      sf->intra_sf.chroma_intra_pruning_with_hog) {
+    allocate_gradient_info_for_hog(&cpi->td.pixel_gradient_info, cpi);
+  }
+
+  if (sf->part_sf.partition_search_type == VAR_BASED_PARTITION)
     variance_partition_alloc(cpi);
 
   if (cm->current_frame.frame_type == KEY_FRAME) copy_frame_prob_info(cpi);
@@ -2505,9 +2519,11 @@
   printf("\n Encoding a frame:");
 #endif
 
+#if !CONFIG_RD_COMMAND
   // Determine whether to use screen content tools using two fast encoding.
   if (!cpi->sf.hl_sf.disable_extra_sc_testing)
     av1_determine_sc_tools_with_encoding(cpi, q);
+#endif  // !CONFIG_RD_COMMAND
 
 #if CONFIG_TUNE_VMAF
   if (oxcf->tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN) {
@@ -2520,6 +2536,9 @@
   int original_q = 0;
 #endif
 
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  cpi->num_frame_recode = 0;
+#endif
   // Loop variables
   int loop = 0;
   int loop_count = 0;
@@ -2530,7 +2549,7 @@
 
   do {
     loop = 0;
-    aom_clear_system_state();
+    int do_mv_stats_collection = 1;
 
     // if frame was scaled calculate global_motion_search again if already
     // done
@@ -2564,11 +2583,17 @@
           EIGHTTAP_REGULAR, 0, false, false);
     }
 
-    if (!frame_is_intra_only(cm)) {
-      if (loop_count > 0) {
-        release_scaled_references(cpi);
+#if CONFIG_FRAME_PARALLEL_ENCODE
+    if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 0) {
+#else
+    {
+#endif
+      if (!frame_is_intra_only(cm)) {
+        if (loop_count > 0) {
+          release_scaled_references(cpi);
+        }
+        av1_scale_references(cpi, EIGHTTAP_REGULAR, 0, 0);
       }
-      av1_scale_references(cpi, EIGHTTAP_REGULAR, 0, 0);
     }
 
 #if CONFIG_TUNE_VMAF
@@ -2579,13 +2604,26 @@
     }
 #endif
 
+#if CONFIG_RD_COMMAND
+    RD_COMMAND *rd_command = &cpi->rd_command;
+    RD_OPTION option = rd_command->option_ls[rd_command->frame_index];
+    if (option == RD_OPTION_SET_Q || option == RD_OPTION_SET_Q_RDMULT) {
+      q = rd_command->q_index_ls[rd_command->frame_index];
+    }
+#endif  // CONFIG_RD_COMMAND
+
+#if CONFIG_BITRATE_ACCURACY
+    if (cpi->vbr_rc_info.q_index_list_ready) {
+      q = cpi->vbr_rc_info.q_index_list[cpi->gf_frame_index];
+    }
+#endif
     av1_set_quantizer(cm, q_cfg->qm_minlevel, q_cfg->qm_maxlevel, q,
-                      q_cfg->enable_chroma_deltaq);
+                      q_cfg->enable_chroma_deltaq, q_cfg->enable_hdr_deltaq);
     av1_set_speed_features_qindex_dependent(cpi, oxcf->speed);
 
     if (q_cfg->deltaq_mode != NO_DELTA_Q || q_cfg->enable_chroma_deltaq)
       av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params,
-                         cm->seq_params.bit_depth);
+                         cm->seq_params->bit_depth);
 
     av1_set_variance_partition_thresholds(cpi, q, 0);
 
@@ -2641,14 +2679,23 @@
     // transform / motion compensation build reconstruction frame
     av1_encode_frame(cpi);
 
+#if CONFIG_FRAME_PARALLEL_ENCODE
+    // Disable mv_stats collection for parallel frames based on update flag.
+    if (!cpi->do_frame_data_update) do_mv_stats_collection = 0;
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+
     // Reset the mv_stats in case we are interrupted by an intraframe or an
     // overlay frame.
-    if (cpi->mv_stats.valid) {
+    if (cpi->ppi->mv_stats.valid && do_mv_stats_collection) {
+#if CONFIG_FRAME_PARALLEL_ENCODE
       av1_zero(cpi->mv_stats);
+#else
+      av1_zero(cpi->ppi->mv_stats);
+#endif
     }
     // Gather the mv_stats for the next frame
     if (cpi->sf.hl_sf.high_precision_mv_usage == LAST_MV_DATA &&
-        av1_frame_allows_smart_mv(cpi)) {
+        av1_frame_allows_smart_mv(cpi) && do_mv_stats_collection) {
       av1_collect_mv_stats(cpi, q);
     }
 
@@ -2656,8 +2703,9 @@
     end_timing(cpi, av1_encode_frame_time);
 #endif
 
-    aom_clear_system_state();
-
+#if CONFIG_BITRATE_ACCURACY || CONFIG_RD_COMMAND
+    const int do_dummy_pack = 1;
+#else   // CONFIG_BITRATE_ACCURACY
     // Dummy pack of the bitstream using up to date stats to get an
     // accurate estimate of output frame size to determine if we need
     // to recode.
@@ -2665,6 +2713,7 @@
         (cpi->sf.hl_sf.recode_loop >= ALLOW_RECODE_KFARFGF &&
          oxcf->rc_cfg.mode != AOM_Q) ||
         oxcf->rc_cfg.min_cr > 0;
+#endif  // CONFIG_BITRATE_ACCURACY
     if (do_dummy_pack) {
       av1_finalize_encoded_frame(cpi);
       int largest_tile_id = 0;  // Output from bitstream: unused here
@@ -2674,7 +2723,42 @@
         return AOM_CODEC_ERROR;
       }
 
+#if CONFIG_BITRATE_ACCURACY
+      cpi->vbr_rc_info.actual_coeff_bitrate_byframe[cpi->gf_frame_index] =
+          rc->coefficient_size;
+#endif
+
+      // bits used for this frame
       rc->projected_frame_size = (int)(*size) << 3;
+#if CONFIG_RD_COMMAND
+      PSNR_STATS psnr;
+      aom_calc_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr);
+      printf("q %d rdmult %d rate %d dist %" PRIu64 "\n", q, cpi->rd.RDMULT,
+             rc->projected_frame_size, psnr.sse[0]);
+      ++rd_command->frame_index;
+      if (rd_command->frame_index == rd_command->frame_count) {
+        exit(0);
+      }
+#endif  // CONFIG_RD_COMMAND
+
+#if CONFIG_BITRATE_ACCURACY
+      cpi->vbr_rc_info.actual_bitrate_byframe[cpi->gf_frame_index] =
+          rc->projected_frame_size;
+      cpi->vbr_rc_info.actual_mv_bitrate_byframe[cpi->gf_frame_index] =
+          rc->projected_frame_size -
+          cpi->vbr_rc_info.actual_coeff_bitrate_byframe[cpi->gf_frame_index];
+      cpi->ppi->tpl_data.actual_gop_bitrate += rc->projected_frame_size;
+      if (cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == KF_UPDATE) {
+        vbr_rc_set_keyframe_bitrate(&cpi->vbr_rc_info,
+                                    rc->projected_frame_size);
+      }
+
+#if 0
+      vbr_rc_info_log(&cpi->vbr_rc_info, cpi->gf_frame_index,
+                      cpi->ppi->gf_group.size, cpi->ppi->gf_group.update_type);
+#endif
+
+#endif
     }
 
 #if CONFIG_TUNE_VMAF
@@ -2697,11 +2781,20 @@
     }
 #endif
 
+#if CONFIG_BITRATE_ACCURACY || CONFIG_RD_COMMAND
+    loop = 0;  // turn off recode loop when CONFIG_BITRATE_ACCURACY is on
+#endif         // CONFIG_BITRATE_ACCURACY || CONFIG_RD_COMMAND
+
     if (loop) {
       ++loop_count;
-
+#if CONFIG_FRAME_PARALLEL_ENCODE
+      cpi->num_frame_recode =
+          (cpi->num_frame_recode < (NUM_RECODES_PER_FRAME - 1))
+              ? (cpi->num_frame_recode + 1)
+              : (NUM_RECODES_PER_FRAME - 1);
+#endif
 #if CONFIG_INTERNAL_STATS
-      ++cpi->tot_recode_hits;
+      ++cpi->frame_recode_hits;
 #endif
     }
 #if CONFIG_COLLECT_COMPONENT_TIMING
@@ -2766,8 +2859,18 @@
                                               int64_t *rate,
                                               int *largest_tile_id) {
 #if CONFIG_COLLECT_COMPONENT_TIMING
-  start_timing(cpi, encode_with_recode_loop_time);
+  start_timing(cpi, encode_with_or_without_recode_time);
 #endif
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  for (int i = 0; i < NUM_RECODES_PER_FRAME; i++) {
+    cpi->do_update_frame_probs_txtype[i] = 0;
+    cpi->do_update_frame_probs_obmc[i] = 0;
+    cpi->do_update_frame_probs_warp[i] = 0;
+    cpi->do_update_frame_probs_interpfilter[i] = 0;
+  }
+  cpi->do_update_vbr_bits_off_target_fast = 0;
+#endif
+
   int err;
 #if CONFIG_REALTIME_ONLY
   err = encode_without_recode(cpi);
@@ -2778,7 +2881,7 @@
     err = encode_with_recode_loop(cpi, size, dest);
 #endif
 #if CONFIG_COLLECT_COMPONENT_TIMING
-  end_timing(cpi, encode_with_recode_loop_time);
+  end_timing(cpi, encode_with_or_without_recode_time);
 #endif
   if (err != AOM_CODEC_OK) {
     if (err == -1) {
@@ -2801,12 +2904,12 @@
 #endif
 
   AV1_COMMON *const cm = &cpi->common;
-  SequenceHeader *const seq_params = &cm->seq_params;
+  SequenceHeader *const seq_params = cm->seq_params;
 
   // Special case code to reduce pulsing when key frames are forced at a
   // fixed interval. Note the reconstruction error if it is the frame before
   // the force key frame
-  if (cpi->rc.next_key_frame_forced && cpi->rc.frames_to_key == 1) {
+  if (cpi->ppi->p_rc.next_key_frame_forced && cpi->rc.frames_to_key == 1) {
 #if CONFIG_AV1_HIGHBITDEPTH
     if (seq_params->use_highbitdepth) {
       cpi->ambient_err = aom_highbd_get_y_sse(cpi->source, &cm->cur_frame->buf);
@@ -2889,7 +2992,7 @@
                                             uint8_t *dest,
                                             int *largest_tile_id) {
   const AV1_COMMON *const cm = &cpi->common;
-  assert(cm->seq_params.enable_superres);
+  assert(cm->seq_params->enable_superres);
   assert(av1_superres_in_recode_allowed(cpi));
   aom_codec_err_t err = AOM_CODEC_OK;
   av1_save_all_coding_context(cpi);
@@ -2901,6 +3004,10 @@
   int64_t rate2 = INT64_MAX;
   int largest_tile_id2;
   double proj_rdcost1 = DBL_MAX;
+  const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+  const FRAME_UPDATE_TYPE update_type =
+      gf_group->update_type[cpi->gf_frame_index];
+  const aom_bit_depth_t bit_depth = cm->seq_params->bit_depth;
 
   // Encode with superres.
   if (cpi->sf.hl_sf.superres_auto_search_type == SUPERRES_AUTO_ALL) {
@@ -2909,9 +3016,7 @@
     int64_t superres_rates[SCALE_NUMERATOR];
     int superres_largest_tile_ids[SCALE_NUMERATOR];
     // Use superres for Key-frames and Alt-ref frames only.
-    const GF_GROUP *const gf_group = &cpi->gf_group;
-    if (gf_group->update_type[cpi->gf_frame_index] != OVERLAY_UPDATE &&
-        gf_group->update_type[cpi->gf_frame_index] != INTNL_OVERLAY_UPDATE) {
+    if (update_type != OVERLAY_UPDATE && update_type != INTNL_OVERLAY_UPDATE) {
       for (int denom = SCALE_NUMERATOR + 1; denom <= 2 * SCALE_NUMERATOR;
            ++denom) {
         superres_cfg->superres_scale_denominator = denom;
@@ -2945,8 +3050,8 @@
     if (err != AOM_CODEC_OK) return err;
 
     // Note: Both use common rdmult based on base qindex of fullres.
-    const int64_t rdmult =
-        av1_compute_rd_mult_based_on_qindex(cpi, cm->quant_params.base_qindex);
+    const int64_t rdmult = av1_compute_rd_mult_based_on_qindex(
+        bit_depth, update_type, cm->quant_params.base_qindex);
 
     // Find the best rdcost among all superres denoms.
     int best_denom = -1;
@@ -2957,7 +3062,7 @@
       const int64_t this_rate = superres_rates[this_index];
       const int this_largest_tile_id = superres_largest_tile_ids[this_index];
       const double this_rdcost = RDCOST_DBL_WITH_NATIVE_BD_DIST(
-          rdmult, this_rate, this_sse, cm->seq_params.bit_depth);
+          rdmult, this_rate, this_sse, bit_depth);
       if (this_rdcost < proj_rdcost1) {
         sse1 = this_sse;
         rate1 = this_rate;
@@ -2966,8 +3071,8 @@
         best_denom = denom;
       }
     }
-    const double proj_rdcost2 = RDCOST_DBL_WITH_NATIVE_BD_DIST(
-        rdmult, rate2, sse2, cm->seq_params.bit_depth);
+    const double proj_rdcost2 =
+        RDCOST_DBL_WITH_NATIVE_BD_DIST(rdmult, rate2, sse2, bit_depth);
     // Re-encode with superres if it's better.
     if (proj_rdcost1 < proj_rdcost2) {
       restore_all_coding_context(cpi);
@@ -3009,12 +3114,12 @@
     if (err != AOM_CODEC_OK) return err;
 
     // Note: Both use common rdmult based on base qindex of fullres.
-    const int64_t rdmult =
-        av1_compute_rd_mult_based_on_qindex(cpi, cm->quant_params.base_qindex);
-    proj_rdcost1 = RDCOST_DBL_WITH_NATIVE_BD_DIST(rdmult, rate1, sse1,
-                                                  cm->seq_params.bit_depth);
-    const double proj_rdcost2 = RDCOST_DBL_WITH_NATIVE_BD_DIST(
-        rdmult, rate2, sse2, cm->seq_params.bit_depth);
+    const int64_t rdmult = av1_compute_rd_mult_based_on_qindex(
+        bit_depth, update_type, cm->quant_params.base_qindex);
+    proj_rdcost1 =
+        RDCOST_DBL_WITH_NATIVE_BD_DIST(rdmult, rate1, sse1, bit_depth);
+    const double proj_rdcost2 =
+        RDCOST_DBL_WITH_NATIVE_BD_DIST(rdmult, rate2, sse2, bit_depth);
     // Re-encode with superres if it's better.
     if (proj_rdcost1 < proj_rdcost2) {
       restore_all_coding_context(cpi);
@@ -3039,6 +3144,69 @@
   return err;
 }
 
+#if !CONFIG_REALTIME_ONLY
+static void subtract_stats(FIRSTPASS_STATS *section,
+                           const FIRSTPASS_STATS *frame) {
+  section->frame -= frame->frame;
+  section->weight -= frame->weight;
+  section->intra_error -= frame->intra_error;
+  section->frame_avg_wavelet_energy -= frame->frame_avg_wavelet_energy;
+  section->coded_error -= frame->coded_error;
+  section->sr_coded_error -= frame->sr_coded_error;
+  section->pcnt_inter -= frame->pcnt_inter;
+  section->pcnt_motion -= frame->pcnt_motion;
+  section->pcnt_second_ref -= frame->pcnt_second_ref;
+  section->pcnt_neutral -= frame->pcnt_neutral;
+  section->intra_skip_pct -= frame->intra_skip_pct;
+  section->inactive_zone_rows -= frame->inactive_zone_rows;
+  section->inactive_zone_cols -= frame->inactive_zone_cols;
+  section->MVr -= frame->MVr;
+  section->mvr_abs -= frame->mvr_abs;
+  section->MVc -= frame->MVc;
+  section->mvc_abs -= frame->mvc_abs;
+  section->MVrv -= frame->MVrv;
+  section->MVcv -= frame->MVcv;
+  section->mv_in_out_count -= frame->mv_in_out_count;
+  section->new_mv_count -= frame->new_mv_count;
+  section->count -= frame->count;
+  section->duration -= frame->duration;
+}
+
+static void calculate_frame_avg_haar_energy(AV1_COMP *cpi) {
+  TWO_PASS *const twopass = &cpi->ppi->twopass;
+  const FIRSTPASS_STATS *const total_stats =
+      twopass->stats_buf_ctx->total_stats;
+
+  if (is_one_pass_rt_params(cpi) ||
+      (cpi->oxcf.q_cfg.deltaq_mode != DELTA_Q_PERCEPTUAL) ||
+      (is_fp_wavelet_energy_invalid(total_stats) == 0))
+    return;
+
+  const int num_mbs = (cpi->oxcf.resize_cfg.resize_mode != RESIZE_NONE)
+                          ? cpi->initial_mbs
+                          : cpi->common.mi_params.MBs;
+  const YV12_BUFFER_CONFIG *const unfiltered_source = cpi->unfiltered_source;
+  const uint8_t *const src = unfiltered_source->y_buffer;
+  const int hbd = unfiltered_source->flags & YV12_FLAG_HIGHBITDEPTH;
+  const int stride = unfiltered_source->y_stride;
+  const BLOCK_SIZE fp_block_size =
+      get_fp_block_size(cpi->is_screen_content_type);
+  const int fp_block_size_width = block_size_wide[fp_block_size];
+  const int fp_block_size_height = block_size_high[fp_block_size];
+  const int num_unit_cols =
+      get_num_blocks(unfiltered_source->y_crop_width, fp_block_size_width);
+  const int num_unit_rows =
+      get_num_blocks(unfiltered_source->y_crop_height, fp_block_size_height);
+  const int num_8x8_cols = num_unit_cols * (fp_block_size_width / 8);
+  const int num_8x8_rows = num_unit_rows * (fp_block_size_height / 8);
+  int64_t frame_avg_wavelet_energy = av1_haar_ac_sad_mxn_uint8_input(
+      src, stride, hbd, num_8x8_rows, num_8x8_cols);
+
+  cpi->twopass_frame.frame_avg_haar_energy =
+      log(((double)frame_avg_wavelet_energy / num_mbs) + 1.0);
+}
+#endif
+
 extern void av1_print_frame_contexts(const FRAME_CONTEXT *fc,
                                      const char *filename);
 
@@ -3060,7 +3228,7 @@
 static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
                                      uint8_t *dest) {
   AV1_COMMON *const cm = &cpi->common;
-  SequenceHeader *const seq_params = &cm->seq_params;
+  SequenceHeader *const seq_params = cm->seq_params;
   CurrentFrame *const current_frame = &cm->current_frame;
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
   struct segmentation *const seg = &cm->seg;
@@ -3075,6 +3243,10 @@
     av1_set_screen_content_options(cpi, features);
   }
 
+#if !CONFIG_REALTIME_ONLY
+  calculate_frame_avg_haar_energy(cpi);
+#endif
+
   // frame type has been decided outside of this function call
   cm->cur_frame->frame_type = current_frame->frame_type;
 
@@ -3093,7 +3265,7 @@
   cpi->last_frame_type = current_frame->frame_type;
 
   if (frame_is_sframe(cm)) {
-    GF_GROUP *gf_group = &cpi->gf_group;
+    GF_GROUP *gf_group = &cpi->ppi->gf_group;
     // S frame will wipe out any previously encoded altref so we cannot place
     // an overlay frame
     gf_group->update_type[gf_group->size] = GF_UPDATE;
@@ -3115,8 +3287,6 @@
         cm->ref_frame_id[i] = display_frame_id;
     }
 
-    cpi->ppi->seq_params_locked = 1;
-
 #if DUMP_RECON_FRAMES == 1
     // NOTE(zoeliu): For debug - Output the filtered reconstructed video.
     av1_dump_filtered_recon_frames(cpi);
@@ -3130,12 +3300,9 @@
     av1_denoiser_update_ref_frame(cpi);
 #endif
 
-    refresh_reference_frames(cpi);
-
     // Since we allocate a spot for the OVERLAY frame in the gf group, we need
     // to do post-encoding update accordingly.
     av1_set_target_rate(cpi, cm->width, cm->height);
-    av1_rc_postencode_update(cpi, *size);
 
     if (is_psnr_calc_enabled(cpi)) {
       cpi->source =
@@ -3152,7 +3319,7 @@
   if (!is_stat_generation_stage(cpi) &&
       cpi->common.features.allow_screen_content_tools &&
       !frame_is_intra_only(cm)) {
-    if (cpi->common.seq_params.force_integer_mv == 2) {
+    if (cpi->common.seq_params->force_integer_mv == 2) {
       // Adaptive mode: see what previous frame encoded did
       if (cpi->unscaled_last_source != NULL) {
         features->cur_frame_force_integer_mv = av1_is_integer_mv(
@@ -3162,7 +3329,7 @@
       }
     } else {
       cpi->common.features.cur_frame_force_integer_mv =
-          cpi->common.seq_params.force_integer_mv;
+          cpi->common.seq_params->force_integer_mv;
     }
   } else {
     cpi->common.features.cur_frame_force_integer_mv = 0;
@@ -3198,6 +3365,7 @@
       av1_setup_frame_size(cpi);
       av1_rc_postencode_update_drop_frame(cpi);
       release_scaled_references(cpi);
+      cpi->is_dropped_frame = true;
       return AOM_CODEC_OK;
     }
   }
@@ -3214,7 +3382,15 @@
   }
 #endif
 
-  aom_clear_system_state();
+  if (cpi->oxcf.q_cfg.deltaq_mode == DELTA_Q_PERCEPTUAL_AI) {
+    av1_init_mb_wiener_var_buffer(cpi);
+    av1_set_mb_wiener_variance(cpi);
+  }
+
+  if (cpi->oxcf.q_cfg.deltaq_mode == DELTA_Q_USER_RATING_BASED) {
+    av1_init_mb_ur_var_buffer(cpi);
+    av1_set_mb_ur_variance(cpi);
+  }
 
 #if CONFIG_INTERNAL_STATS
   memset(cpi->mode_chosen_counts, 0,
@@ -3269,7 +3445,8 @@
       } else {
         if (cpi->svc.number_spatial_layers == 1 &&
             cpi->svc.number_temporal_layers == 1)
-          features->disable_cdf_update = cm->current_frame.frame_number & 1;
+          features->disable_cdf_update =
+              !((cm->current_frame.frame_number % 2) == 0);
         else if (cpi->svc.number_temporal_layers > 1)
           // Disable only on top temporal enhancement layer for now.
           features->disable_cdf_update = (cpi->svc.temporal_layer_id ==
@@ -3277,7 +3454,18 @@
       }
       break;
   }
-  seq_params->timing_info_present &= !seq_params->reduced_still_picture_hdr;
+
+#if CONFIG_FRAME_PARALLEL_ENCODE
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+  // Disable cdf update for the INTNL_ARF_UPDATE frame with
+  // frame_parallel_level 1.
+  if (!cpi->do_frame_data_update &&
+      cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE) {
+    assert(cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 1);
+    features->disable_cdf_update = 1;
+  }
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
 
   int largest_tile_id = 0;
   if (av1_superres_in_recode_allowed(cpi)) {
@@ -3295,8 +3483,6 @@
     cpi->superres_mode = orig_superres_mode;  // restore
   }
 
-  cpi->ppi->seq_params_locked = 1;
-
   // Update reference frame ids for reference frames this frame will overwrite
   if (seq_params->frame_id_numbers_present_flag) {
     for (int i = 0; i < REF_FRAMES; i++) {
@@ -3324,8 +3510,14 @@
     }
   }
 
-  if (frame_is_intra_only(cm) == 0) {
-    release_scaled_references(cpi);
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 0) {
+#else
+  {
+#endif
+    if (frame_is_intra_only(cm) == 0) {
+      release_scaled_references(cpi);
+    }
   }
 #if CONFIG_AV1_TEMPORAL_DENOISING
   av1_denoiser_update_ref_frame(cpi);
@@ -3335,12 +3527,6 @@
   //       for the purpose to verify no mismatch between encoder and decoder.
   if (cm->show_frame) cpi->last_show_frame_buf = cm->cur_frame;
 
-  refresh_reference_frames(cpi);
-
-#if CONFIG_ENTROPY_STATS
-  av1_accumulate_frame_counts(&aggregate_fc, &cpi->counts);
-#endif  // CONFIG_ENTROPY_STATS
-
   if (features->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
     *cm->fc = cpi->tile_data[largest_tile_id].tctx;
     av1_reset_cdf_symbol_counters(cm->fc);
@@ -3363,8 +3549,6 @@
 
   cpi->last_frame_type = current_frame->frame_type;
 
-  av1_rc_postencode_update(cpi, *size);
-
   // Clear the one shot update flags for segmentation map and mode/ref loop
   // filter deltas.
   cm->seg.update_map = 0;
@@ -3413,7 +3597,8 @@
   memcpy(&cpi->refresh_frame, &frame_params->refresh_frame,
          sizeof(cpi->refresh_frame));
 
-  if (current_frame->frame_type == KEY_FRAME && !cpi->no_show_fwd_kf) {
+  if (current_frame->frame_type == KEY_FRAME &&
+      cpi->ppi->gf_group.refbuf_state[cpi->gf_frame_index] == REFBUF_RESET) {
     current_frame->frame_number = 0;
   }
 
@@ -3422,13 +3607,20 @@
 
   current_frame->display_order_hint = current_frame->order_hint;
   current_frame->order_hint %=
-      (1 << (cm->seq_params.order_hint_info.order_hint_bits_minus_1 + 1));
+      (1 << (cm->seq_params->order_hint_info.order_hint_bits_minus_1 + 1));
+
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  current_frame->pyramid_level = get_true_pyr_level(
+      cpi->ppi->gf_group.layer_depth[cpi->gf_frame_index],
+      current_frame->display_order_hint, cpi->ppi->gf_group.max_layer_depth);
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
 
   if (is_stat_generation_stage(cpi)) {
 #if !CONFIG_REALTIME_ONLY
     av1_first_pass(cpi, frame_input->ts_duration);
 #endif
-  } else if (cpi->oxcf.pass == 0 || cpi->oxcf.pass == 2) {
+  } else if (cpi->oxcf.pass == AOM_RC_ONE_PASS ||
+             cpi->oxcf.pass >= AOM_RC_SECOND_PASS) {
     if (encode_frame_to_data_rate(cpi, &frame_results->size, dest) !=
         AOM_CODEC_OK) {
       return AOM_CODEC_ERROR;
@@ -3447,9 +3639,9 @@
   AV1_COMMON *const cm = &cpi->common;
   if (!cpi->denoise_and_model) {
     cpi->denoise_and_model = aom_denoise_and_model_alloc(
-        cm->seq_params.bit_depth, block_size, noise_level);
+        cm->seq_params->bit_depth, block_size, noise_level);
     if (!cpi->denoise_and_model) {
-      aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+      aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
                          "Error allocating denoise and model");
       return -1;
     }
@@ -3457,7 +3649,7 @@
   if (!cpi->film_grain_table) {
     cpi->film_grain_table = aom_malloc(sizeof(*cpi->film_grain_table));
     if (!cpi->film_grain_table) {
-      aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+      aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
                          "Error allocating grain table");
       return -1;
     }
@@ -3479,7 +3671,7 @@
                           YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
                           int64_t end_time) {
   AV1_COMMON *const cm = &cpi->common;
-  const SequenceHeader *const seq_params = &cm->seq_params;
+  const SequenceHeader *const seq_params = cm->seq_params;
   int res = 0;
   const int subsampling_x = sd->subsampling_x;
   const int subsampling_y = sd->subsampling_y;
@@ -3521,7 +3713,7 @@
     res = -1;
 #if CONFIG_INTERNAL_STATS
   aom_usec_timer_mark(&timer);
-  cpi->time_receive_data += aom_usec_timer_elapsed(&timer);
+  cpi->ppi->total_time_receive_data += aom_usec_timer_elapsed(&timer);
 #endif
 
   // Note: Regarding profile setting, the following checks are added to help
@@ -3533,20 +3725,20 @@
   // header.
   if ((seq_params->profile == PROFILE_0) && !seq_params->monochrome &&
       (subsampling_x != 1 || subsampling_y != 1)) {
-    aom_internal_error(&cm->error, AOM_CODEC_INVALID_PARAM,
+    aom_internal_error(cm->error, AOM_CODEC_INVALID_PARAM,
                        "Non-4:2:0 color format requires profile 1 or 2");
     res = -1;
   }
   if ((seq_params->profile == PROFILE_1) &&
       !(subsampling_x == 0 && subsampling_y == 0)) {
-    aom_internal_error(&cm->error, AOM_CODEC_INVALID_PARAM,
+    aom_internal_error(cm->error, AOM_CODEC_INVALID_PARAM,
                        "Profile 1 requires 4:4:4 color format");
     res = -1;
   }
   if ((seq_params->profile == PROFILE_2) &&
       (seq_params->bit_depth <= AOM_BITS_10) &&
       !(subsampling_x == 1 && subsampling_y == 0)) {
-    aom_internal_error(&cm->error, AOM_CODEC_INVALID_PARAM,
+    aom_internal_error(cm->error, AOM_CODEC_INVALID_PARAM,
                        "Profile 2 bit-depth <= 10 requires 4:2:2 color format");
     res = -1;
   }
@@ -3554,6 +3746,20 @@
   return res;
 }
 
+#if CONFIG_ENTROPY_STATS
+void print_entropy_stats(AV1_PRIMARY *const ppi) {
+  if (!ppi->cpi) return;
+
+  if (ppi->cpi->oxcf.pass != 1 &&
+      ppi->cpi->common.current_frame.frame_number > 0) {
+    fprintf(stderr, "Writing counts.stt\n");
+    FILE *f = fopen("counts.stt", "wb");
+    fwrite(&ppi->aggregate_fc, sizeof(ppi->aggregate_fc), 1, f);
+    fclose(f);
+  }
+}
+#endif  // CONFIG_ENTROPY_STATS
+
 #if CONFIG_INTERNAL_STATS
 extern double av1_get_blockiness(const unsigned char *img1, int img1_pitch,
                                  const unsigned char *img2, int img2_pitch,
@@ -3569,12 +3775,13 @@
 }
 
 static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) {
+  AV1_PRIMARY *const ppi = cpi->ppi;
   AV1_COMMON *const cm = &cpi->common;
   double samples = 0.0;
   const uint32_t in_bit_depth = cpi->oxcf.input_cfg.input_bit_depth;
   const uint32_t bit_depth = cpi->td.mb.e_mbd.bd;
 
-  if (cpi->use_svc &&
+  if (cpi->ppi->use_svc &&
       cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1)
     return;
 
@@ -3587,47 +3794,42 @@
     const YV12_BUFFER_CONFIG *recon = &cpi->common.cur_frame->buf;
     double y, u, v, frame_all;
 
-    cpi->count[0]++;
-    cpi->count[1]++;
-    if (cpi->b_calculate_psnr) {
+    ppi->count[0]++;
+    ppi->count[1]++;
+    if (cpi->ppi->b_calculate_psnr) {
       PSNR_STATS psnr;
       double weight[2] = { 0.0, 0.0 };
       double frame_ssim2[2] = { 0.0, 0.0 };
-      aom_clear_system_state();
 #if CONFIG_AV1_HIGHBITDEPTH
       aom_calc_highbd_psnr(orig, recon, &psnr, bit_depth, in_bit_depth);
 #else
       aom_calc_psnr(orig, recon, &psnr);
 #endif
       adjust_image_stat(psnr.psnr[1], psnr.psnr[2], psnr.psnr[3], psnr.psnr[0],
-                        &(cpi->psnr[0]));
-      cpi->total_sq_error[0] += psnr.sse[0];
-      cpi->total_samples[0] += psnr.samples[0];
+                        &(ppi->psnr[0]));
+      ppi->total_sq_error[0] += psnr.sse[0];
+      ppi->total_samples[0] += psnr.samples[0];
       samples = psnr.samples[0];
 
-      // TODO(yaowu): unify these two versions into one.
-      if (cm->seq_params.use_highbitdepth)
-        aom_highbd_calc_ssim(orig, recon, weight, bit_depth, in_bit_depth,
-                             frame_ssim2);
-      else
-        aom_calc_ssim(orig, recon, &weight[0], &frame_ssim2[0]);
+      aom_calc_ssim(orig, recon, bit_depth, in_bit_depth,
+                    cm->seq_params->use_highbitdepth, weight, frame_ssim2);
 
-      cpi->worst_ssim = AOMMIN(cpi->worst_ssim, frame_ssim2[0]);
-      cpi->summed_quality += frame_ssim2[0] * weight[0];
-      cpi->summed_weights += weight[0];
+      ppi->worst_ssim = AOMMIN(ppi->worst_ssim, frame_ssim2[0]);
+      ppi->summed_quality += frame_ssim2[0] * weight[0];
+      ppi->summed_weights += weight[0];
 
 #if CONFIG_AV1_HIGHBITDEPTH
       // Compute PSNR based on stream bit depth
       if ((cpi->source->flags & YV12_FLAG_HIGHBITDEPTH) &&
           (in_bit_depth < bit_depth)) {
         adjust_image_stat(psnr.psnr_hbd[1], psnr.psnr_hbd[2], psnr.psnr_hbd[3],
-                          psnr.psnr_hbd[0], &cpi->psnr[1]);
-        cpi->total_sq_error[1] += psnr.sse_hbd[0];
-        cpi->total_samples[1] += psnr.samples_hbd[0];
+                          psnr.psnr_hbd[0], &ppi->psnr[1]);
+        ppi->total_sq_error[1] += psnr.sse_hbd[0];
+        ppi->total_samples[1] += psnr.samples_hbd[0];
 
-        cpi->worst_ssim_hbd = AOMMIN(cpi->worst_ssim_hbd, frame_ssim2[1]);
-        cpi->summed_quality_hbd += frame_ssim2[1] * weight[1];
-        cpi->summed_weights_hbd += weight[1];
+        ppi->worst_ssim_hbd = AOMMIN(ppi->worst_ssim_hbd, frame_ssim2[1]);
+        ppi->summed_quality_hbd += frame_ssim2[1] * weight[1];
+        ppi->summed_weights_hbd += weight[1];
       }
 #endif
 
@@ -3645,48 +3847,380 @@
       }
 #endif
     }
-    if (cpi->b_calculate_blockiness) {
-      if (!cm->seq_params.use_highbitdepth) {
+    if (ppi->b_calculate_blockiness) {
+      if (!cm->seq_params->use_highbitdepth) {
         const double frame_blockiness =
             av1_get_blockiness(orig->y_buffer, orig->y_stride, recon->y_buffer,
                                recon->y_stride, orig->y_width, orig->y_height);
-        cpi->worst_blockiness = AOMMAX(cpi->worst_blockiness, frame_blockiness);
-        cpi->total_blockiness += frame_blockiness;
+        ppi->worst_blockiness = AOMMAX(ppi->worst_blockiness, frame_blockiness);
+        ppi->total_blockiness += frame_blockiness;
       }
 
-      if (cpi->b_calculate_consistency) {
-        if (!cm->seq_params.use_highbitdepth) {
+      if (ppi->b_calculate_consistency) {
+        if (!cm->seq_params->use_highbitdepth) {
           const double this_inconsistency = aom_get_ssim_metrics(
               orig->y_buffer, orig->y_stride, recon->y_buffer, recon->y_stride,
-              orig->y_width, orig->y_height, cpi->ssim_vars, &cpi->metrics, 1);
+              orig->y_width, orig->y_height, ppi->ssim_vars, &ppi->metrics, 1);
 
           const double peak = (double)((1 << in_bit_depth) - 1);
           const double consistency =
-              aom_sse_to_psnr(samples, peak, cpi->total_inconsistency);
+              aom_sse_to_psnr(samples, peak, ppi->total_inconsistency);
           if (consistency > 0.0)
-            cpi->worst_consistency =
-                AOMMIN(cpi->worst_consistency, consistency);
-          cpi->total_inconsistency += this_inconsistency;
+            ppi->worst_consistency =
+                AOMMIN(ppi->worst_consistency, consistency);
+          ppi->total_inconsistency += this_inconsistency;
         }
       }
     }
 
     frame_all =
         aom_calc_fastssim(orig, recon, &y, &u, &v, bit_depth, in_bit_depth);
-    adjust_image_stat(y, u, v, frame_all, &cpi->fastssim);
+    adjust_image_stat(y, u, v, frame_all, &ppi->fastssim);
     frame_all = aom_psnrhvs(orig, recon, &y, &u, &v, bit_depth, in_bit_depth);
-    adjust_image_stat(y, u, v, frame_all, &cpi->psnrhvs);
+    adjust_image_stat(y, u, v, frame_all, &ppi->psnrhvs);
+  }
+}
+
+void print_internal_stats(AV1_PRIMARY *const ppi) {
+  if (!ppi->cpi) return;
+  AV1_COMP *const cpi = ppi->cpi;
+
+  if (ppi->cpi->oxcf.pass != 1 &&
+      ppi->cpi->common.current_frame.frame_number > 0) {
+    char headings[512] = { 0 };
+    char results[512] = { 0 };
+    FILE *f = fopen("opsnr.stt", "a");
+    double time_encoded =
+        (cpi->time_stamps.prev_ts_end - cpi->time_stamps.first_ts_start) /
+        10000000.000;
+    double total_encode_time =
+        (ppi->total_time_receive_data + ppi->total_time_compress_data) /
+        1000.000;
+    const double dr =
+        (double)ppi->total_bytes * (double)8 / (double)1000 / time_encoded;
+    const double peak =
+        (double)((1 << ppi->cpi->oxcf.input_cfg.input_bit_depth) - 1);
+    const double target_rate =
+        (double)ppi->cpi->oxcf.rc_cfg.target_bandwidth / 1000;
+    const double rate_err = ((100.0 * (dr - target_rate)) / target_rate);
+
+    if (ppi->b_calculate_psnr) {
+      const double total_psnr = aom_sse_to_psnr(
+          (double)ppi->total_samples[0], peak, (double)ppi->total_sq_error[0]);
+      const double total_ssim =
+          100 * pow(ppi->summed_quality / ppi->summed_weights, 8.0);
+      snprintf(headings, sizeof(headings),
+               "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\t"
+               "AOMSSIM\tVPSSIMP\tFASTSIM\tPSNRHVS\t"
+               "WstPsnr\tWstSsim\tWstFast\tWstHVS\t"
+               "AVPsrnY\tAPsnrCb\tAPsnrCr");
+      snprintf(results, sizeof(results),
+               "%7.2f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
+               "%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
+               "%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
+               "%7.3f\t%7.3f\t%7.3f",
+               dr, ppi->psnr[0].stat[STAT_ALL] / ppi->count[0], total_psnr,
+               ppi->psnr[0].stat[STAT_ALL] / ppi->count[0], total_psnr,
+               total_ssim, total_ssim,
+               ppi->fastssim.stat[STAT_ALL] / ppi->count[0],
+               ppi->psnrhvs.stat[STAT_ALL] / ppi->count[0], ppi->psnr[0].worst,
+               ppi->worst_ssim, ppi->fastssim.worst, ppi->psnrhvs.worst,
+               ppi->psnr[0].stat[STAT_Y] / ppi->count[0],
+               ppi->psnr[0].stat[STAT_U] / ppi->count[0],
+               ppi->psnr[0].stat[STAT_V] / ppi->count[0]);
+
+      if (ppi->b_calculate_blockiness) {
+        SNPRINT(headings, "\t  Block\tWstBlck");
+        SNPRINT2(results, "\t%7.3f", ppi->total_blockiness / ppi->count[0]);
+        SNPRINT2(results, "\t%7.3f", ppi->worst_blockiness);
+      }
+
+      if (ppi->b_calculate_consistency) {
+        double consistency =
+            aom_sse_to_psnr((double)ppi->total_samples[0], peak,
+                            (double)ppi->total_inconsistency);
+
+        SNPRINT(headings, "\tConsist\tWstCons");
+        SNPRINT2(results, "\t%7.3f", consistency);
+        SNPRINT2(results, "\t%7.3f", ppi->worst_consistency);
+      }
+
+      SNPRINT(headings, "\t   Time\tRcErr\tAbsErr");
+      SNPRINT2(results, "\t%8.0f", total_encode_time);
+      SNPRINT2(results, " %7.2f", rate_err);
+      SNPRINT2(results, " %7.2f", fabs(rate_err));
+
+      SNPRINT(headings, "\tAPsnr611");
+      SNPRINT2(results, " %7.3f",
+               (6 * ppi->psnr[0].stat[STAT_Y] + ppi->psnr[0].stat[STAT_U] +
+                ppi->psnr[0].stat[STAT_V]) /
+                   (ppi->count[0] * 8));
+
+#if CONFIG_AV1_HIGHBITDEPTH
+      const uint32_t in_bit_depth = ppi->cpi->oxcf.input_cfg.input_bit_depth;
+      const uint32_t bit_depth = ppi->seq_params.bit_depth;
+      // Since cpi->source->flags is not available here, but total_samples[1]
+      // will be non-zero if cpi->source->flags & YV12_FLAG_HIGHBITDEPTH was
+      // true in compute_internal_stats
+      if ((ppi->total_samples[1] > 0) && (in_bit_depth < bit_depth)) {
+        const double peak_hbd = (double)((1 << bit_depth) - 1);
+        const double total_psnr_hbd =
+            aom_sse_to_psnr((double)ppi->total_samples[1], peak_hbd,
+                            (double)ppi->total_sq_error[1]);
+        const double total_ssim_hbd =
+            100 * pow(ppi->summed_quality_hbd / ppi->summed_weights_hbd, 8.0);
+        SNPRINT(headings,
+                "\t AVGPsnrH GLBPsnrH AVPsnrPH GLPsnrPH"
+                " AVPsnrYH APsnrCbH APsnrCrH WstPsnrH"
+                " AOMSSIMH VPSSIMPH WstSsimH");
+        SNPRINT2(results, "\t%7.3f",
+                 ppi->psnr[1].stat[STAT_ALL] / ppi->count[1]);
+        SNPRINT2(results, "  %7.3f", total_psnr_hbd);
+        SNPRINT2(results, "  %7.3f",
+                 ppi->psnr[1].stat[STAT_ALL] / ppi->count[1]);
+        SNPRINT2(results, "  %7.3f", total_psnr_hbd);
+        SNPRINT2(results, "  %7.3f", ppi->psnr[1].stat[STAT_Y] / ppi->count[1]);
+        SNPRINT2(results, "  %7.3f", ppi->psnr[1].stat[STAT_U] / ppi->count[1]);
+        SNPRINT2(results, "  %7.3f", ppi->psnr[1].stat[STAT_V] / ppi->count[1]);
+        SNPRINT2(results, "  %7.3f", ppi->psnr[1].worst);
+        SNPRINT2(results, "  %7.3f", total_ssim_hbd);
+        SNPRINT2(results, "  %7.3f", total_ssim_hbd);
+        SNPRINT2(results, "  %7.3f", ppi->worst_ssim_hbd);
+      }
+#endif
+      fprintf(f, "%s\n", headings);
+      fprintf(f, "%s\n", results);
+    }
+
+    fclose(f);
+
+    if (ppi->ssim_vars != NULL) {
+      aom_free(ppi->ssim_vars);
+      ppi->ssim_vars = NULL;
+    }
   }
 }
 #endif  // CONFIG_INTERNAL_STATS
 
-int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
-                            size_t *size, uint8_t *dest, int64_t *time_stamp,
-                            int64_t *time_end, int flush,
-                            const aom_rational64_t *timestamp_ratio) {
+static AOM_INLINE void update_keyframe_counters(AV1_COMP *cpi) {
+  if (cpi->common.show_frame && cpi->rc.frames_to_key) {
+#if !CONFIG_REALTIME_ONLY
+    FIRSTPASS_INFO *firstpass_info = &cpi->ppi->twopass.firstpass_info;
+    if (firstpass_info->past_stats_count > FIRSTPASS_INFO_STATS_PAST_MIN) {
+      av1_firstpass_info_move_cur_index_and_pop(firstpass_info);
+    } else {
+      // When there is not enough past stats, we move the current
+      // index without popping the past stats
+      av1_firstpass_info_move_cur_index(firstpass_info);
+    }
+#endif
+    cpi->rc.frames_since_key++;
+    cpi->rc.frames_to_key--;
+    cpi->rc.frames_to_fwd_kf--;
+  }
+}
+
+static AOM_INLINE void update_frames_till_gf_update(AV1_COMP *cpi) {
+  // TODO(weitinglin): Updating this counter for is_frame_droppable
+  // is a work-around to handle the condition when a frame is drop.
+  // We should fix the cpi->common.show_frame flag
+  // instead of checking the other condition to update the counter properly.
+  if (cpi->common.show_frame ||
+      is_frame_droppable(&cpi->svc, &cpi->ext_flags.refresh_frame)) {
+    // Decrement count down till next gf
+    if (cpi->rc.frames_till_gf_update_due > 0)
+      cpi->rc.frames_till_gf_update_due--;
+  }
+}
+
+static AOM_INLINE void update_gf_group_index(AV1_COMP *cpi) {
+  // Increment the gf group index ready for the next frame.
+  ++cpi->gf_frame_index;
+}
+
+static void update_fb_of_context_type(const AV1_COMP *const cpi,
+                                      int *const fb_of_context_type) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int current_frame_ref_type = get_current_frame_ref_type(cpi);
+
+  if (frame_is_intra_only(cm) || cm->features.error_resilient_mode ||
+      cpi->ext_flags.use_primary_ref_none) {
+    for (int i = 0; i < REF_FRAMES; i++) {
+      fb_of_context_type[i] = -1;
+    }
+    fb_of_context_type[current_frame_ref_type] =
+        cm->show_frame ? get_ref_frame_map_idx(cm, GOLDEN_FRAME)
+                       : get_ref_frame_map_idx(cm, ALTREF_FRAME);
+  }
+
+  if (!encode_show_existing_frame(cm)) {
+    // Refresh fb_of_context_type[]: see encoder.h for explanation
+    if (cm->current_frame.frame_type == KEY_FRAME) {
+      // All ref frames are refreshed, pick one that will live long enough
+      fb_of_context_type[current_frame_ref_type] = 0;
+    } else {
+      // If more than one frame is refreshed, it doesn't matter which one we
+      // pick, so pick the first. LST sometimes doesn't refresh any: this is ok.
+
+      for (int i = 0; i < REF_FRAMES; i++) {
+        if (cm->current_frame.refresh_frame_flags & (1 << i)) {
+          fb_of_context_type[current_frame_ref_type] = i;
+          break;
+        }
+      }
+    }
+  }
+}
+
+static void update_rc_counts(AV1_COMP *cpi) {
+  update_keyframe_counters(cpi);
+  update_frames_till_gf_update(cpi);
+  update_gf_group_index(cpi);
+}
+
+#if CONFIG_FRAME_PARALLEL_ENCODE
+static void update_end_of_frame_stats(AV1_COMP *cpi) {
+  if (cpi->do_frame_data_update) {
+    // Store current frame loopfilter levels in ppi, if update flag is set.
+    if (!cpi->common.show_existing_frame) {
+      AV1_COMMON *const cm = &cpi->common;
+      struct loopfilter *const lf = &cm->lf;
+      cpi->ppi->filter_level[0] = lf->filter_level[0];
+      cpi->ppi->filter_level[1] = lf->filter_level[1];
+      cpi->ppi->filter_level_u = lf->filter_level_u;
+      cpi->ppi->filter_level_v = lf->filter_level_v;
+    }
+  }
+
+  // Store frame level mv_stats from cpi to ppi.
+  cpi->ppi->mv_stats = cpi->mv_stats;
+}
+#endif
+
+void av1_post_encode_updates(AV1_COMP *const cpi,
+                             const AV1_COMP_DATA *const cpi_data) {
+  AV1_PRIMARY *const ppi = cpi->ppi;
+  AV1_COMMON *const cm = &cpi->common;
+
+#if !CONFIG_REALTIME_ONLY
+  // Update the total stats remaining structure.
+  if (cpi->twopass_frame.this_frame != NULL &&
+      ppi->twopass.stats_buf_ctx->total_left_stats) {
+    subtract_stats(ppi->twopass.stats_buf_ctx->total_left_stats,
+                   cpi->twopass_frame.this_frame);
+  }
+#endif
+
+  if (!is_stat_generation_stage(cpi) && !cpi->is_dropped_frame) {
+#if CONFIG_FRAME_PARALLEL_ENCODE
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+    // Before calling refresh_reference_frames(), copy ppi->ref_frame_map_copy
+    // to cm->ref_frame_map for frame_parallel_level 2 frame in a parallel
+    // encode set of lower layer frames.
+    // TODO(Remya): Move ref_frame_map from AV1_COMMON to AV1_PRIMARY to avoid
+    // copy.
+    if (ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 2 &&
+        ppi->gf_group.frame_parallel_level[cpi->gf_frame_index - 1] == 1 &&
+        ppi->gf_group.update_type[cpi->gf_frame_index - 1] ==
+            INTNL_ARF_UPDATE) {
+      memcpy(cm->ref_frame_map, ppi->ref_frame_map_copy,
+             sizeof(cm->ref_frame_map));
+    }
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+    refresh_reference_frames(cpi);
+#if CONFIG_FRAME_PARALLEL_ENCODE
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+    // For frame_parallel_level 1 frame in a parallel encode set of lower layer
+    // frames, store the updated cm->ref_frame_map in ppi->ref_frame_map_copy.
+    if (ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 1 &&
+        ppi->gf_group.update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE) {
+      memcpy(ppi->ref_frame_map_copy, cm->ref_frame_map,
+             sizeof(cm->ref_frame_map));
+    }
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+    av1_rc_postencode_update(cpi, cpi_data->frame_size);
+  }
+
+  if (cpi_data->pop_lookahead == 1) {
+    av1_lookahead_pop(cpi->ppi->lookahead, cpi_data->flush,
+                      cpi->compressor_stage);
+  }
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  if (cpi->common.show_frame) {
+    cpi->ppi->ts_start_last_show_frame = cpi_data->ts_frame_start;
+    cpi->ppi->ts_end_last_show_frame = cpi_data->ts_frame_end;
+  }
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+  if (ppi->level_params.keep_level_stats && !is_stat_generation_stage(cpi)) {
+    // Initialize level info at the beginning of each sequence.
+    if (cm->current_frame.frame_type == KEY_FRAME &&
+        ppi->gf_group.refbuf_state[cpi->gf_frame_index] == REFBUF_RESET) {
+      av1_init_level_info(cpi);
+    }
+    av1_update_level_info(cpi, cpi_data->frame_size, cpi_data->ts_frame_start,
+                          cpi_data->ts_frame_end);
+  }
+
+  if (!is_stat_generation_stage(cpi)) {
+#if !CONFIG_REALTIME_ONLY
+    if (!has_no_stats_stage(cpi)) av1_twopass_postencode_update(cpi);
+#endif
+    update_fb_of_context_type(cpi, ppi->fb_of_context_type);
+    update_rc_counts(cpi);
+#if CONFIG_FRAME_PARALLEL_ENCODE
+    update_end_of_frame_stats(cpi);
+#endif
+  }
+
+  if (cpi->oxcf.pass == AOM_RC_THIRD_PASS && cpi->third_pass_ctx) {
+    av1_pop_third_pass_info(cpi->third_pass_ctx);
+  }
+
+  if (ppi->use_svc) av1_save_layer_context(cpi);
+
+  // Note *size = 0 indicates a dropped frame for which psnr is not calculated
+  if (ppi->b_calculate_psnr && cpi_data->frame_size > 0) {
+    if (cm->show_existing_frame ||
+        (!is_stat_generation_stage(cpi) && cm->show_frame)) {
+      generate_psnr_packet(cpi);
+    }
+  }
+
+#if CONFIG_INTERNAL_STATS
+  if (!is_stat_generation_stage(cpi)) {
+    compute_internal_stats(cpi, (int)cpi_data->frame_size);
+  }
+#endif  // CONFIG_INTERNAL_STATS
+}
+
+int av1_get_compressed_data(AV1_COMP *cpi, AV1_COMP_DATA *const cpi_data) {
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
   AV1_COMMON *const cm = &cpi->common;
 
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  // The jmp_buf is valid only for the duration of the function that calls
+  // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+  // before it returns.
+  if (setjmp(cm->error->jmp)) {
+    cm->error->setjmp = 0;
+    return cm->error->error_code;
+  }
+  cm->error->setjmp = 1;
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+
+#if CONFIG_INTERNAL_STATS
+  cpi->frame_recode_hits = 0;
+  cpi->time_compress_data = 0;
+  cpi->bytes = 0;
+#endif
+#if CONFIG_ENTROPY_STATS
+  if (cpi->compressor_stage == ENCODE_STAGE) {
+    av1_zero(cpi->counts);
+  }
+#endif
+
 #if CONFIG_BITSTREAM_DEBUG
   assert(cpi->oxcf.max_threads <= 1 &&
          "bitstream debug tool does not support multithreading");
@@ -3694,12 +4228,14 @@
   aom_bitstream_queue_set_frame_write(cm->current_frame.order_hint * 2 +
                                       cm->show_frame);
 #endif
-  if (cpi->use_svc && cm->number_spatial_layers > 1) {
+  if (cpi->ppi->use_svc && cpi->ppi->number_spatial_layers > 1) {
     av1_one_pass_cbr_svc_start_layer(cpi);
   }
 
+  cpi->is_dropped_frame = false;
   cm->showable_frame = 0;
-  *size = 0;
+  cpi_data->frame_size = 0;
+  cpi->available_bs_size = cpi_data->cx_data_sz;
 #if CONFIG_INTERNAL_STATS
   struct aom_usec_timer cmptimer;
   aom_usec_timer_start(&cmptimer);
@@ -3714,33 +4250,43 @@
   if (oxcf->tile_cfg.enable_large_scale_tile)
     cm->features.refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED;
 
-  // Initialize fields related to forward keyframes
-  cpi->no_show_fwd_kf = 0;
-
-  if (assign_cur_frame_new_fb(cm) == NULL) return AOM_CODEC_ERROR;
+  if (assign_cur_frame_new_fb(cm) == NULL) {
+#if CONFIG_FRAME_PARALLEL_ENCODE
+    aom_internal_error(cpi->common.error, AOM_CODEC_ERROR,
+                       "Failed to allocate new cur_frame");
+#else
+    return AOM_CODEC_ERROR;
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+  }
 
 #if CONFIG_COLLECT_COMPONENT_TIMING
-  // Only accumulate 2nd pass time.
-  if (cpi->oxcf.pass == 2) start_timing(cpi, av1_encode_strategy_time);
+  // Accumulate 2nd pass time in 2-pass case or 1 pass time in 1-pass case.
+  if (cpi->oxcf.pass == 2 || cpi->oxcf.pass == 0)
+    start_timing(cpi, av1_encode_strategy_time);
 #endif
 
-  const int result =
-      av1_encode_strategy(cpi, size, dest, frame_flags, time_stamp, time_end,
-                          timestamp_ratio, flush);
+  const int result = av1_encode_strategy(
+      cpi, &cpi_data->frame_size, cpi_data->cx_data, &cpi_data->lib_flags,
+      &cpi_data->ts_frame_start, &cpi_data->ts_frame_end,
+      cpi_data->timestamp_ratio, &cpi_data->pop_lookahead, cpi_data->flush);
 
 #if CONFIG_COLLECT_COMPONENT_TIMING
-  if (cpi->oxcf.pass == 2) end_timing(cpi, av1_encode_strategy_time);
+  if (cpi->oxcf.pass == 2 || cpi->oxcf.pass == 0)
+    end_timing(cpi, av1_encode_strategy_time);
 
   // Print out timing information.
   // Note: Use "cpi->frame_component_time[0] > 100 us" to avoid showing of
   // show_existing_frame and lag-in-frames.
-  if (cpi->oxcf.pass == 2 && cpi->frame_component_time[0] > 100) {
+  if ((cpi->oxcf.pass == 2 || cpi->oxcf.pass == 0) &&
+      cpi->frame_component_time[0] > 100) {
     int i;
     uint64_t frame_total = 0, total = 0;
 
-    fprintf(stderr, "\n Frame number: %d, Frame type: %s, Show Frame: %d\n",
+    fprintf(stderr,
+            "\n Frame number: %d, Frame type: %s, Show Frame: %d, Q: %d\n",
             cm->current_frame.frame_number,
-            get_frame_type_enum(cm->current_frame.frame_type), cm->show_frame);
+            get_frame_type_enum(cm->current_frame.frame_type), cm->show_frame,
+            cm->quant_params.base_qindex);
     for (i = 0; i < kTimingComponents; i++) {
       cpi->component_time[i] += cpi->frame_component_time[i];
       // Use av1_encode_strategy_time (i = 0) as the total time.
@@ -3762,37 +4308,25 @@
 #endif
 
   if (result == -1) {
+#if CONFIG_FRAME_PARALLEL_ENCODE
+    cm->error->setjmp = 0;
+#endif
     // Returning -1 indicates no frame encoded; more input is required
     return -1;
   }
   if (result != AOM_CODEC_OK) {
+#if CONFIG_FRAME_PARALLEL_ENCODE
+    aom_internal_error(cpi->common.error, AOM_CODEC_ERROR,
+                       "Failed to encode frame");
+#else
     return AOM_CODEC_ERROR;
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
   }
 #if CONFIG_INTERNAL_STATS
   aom_usec_timer_mark(&cmptimer);
   cpi->time_compress_data += aom_usec_timer_elapsed(&cmptimer);
 #endif  // CONFIG_INTERNAL_STATS
-  // Note *size = 0 indicates a dropped frame for which psnr is not calculated
-  if (cpi->b_calculate_psnr && *size > 0) {
-    if (cm->show_existing_frame ||
-        (!is_stat_generation_stage(cpi) && cm->show_frame)) {
-      generate_psnr_packet(cpi);
-    }
-  }
 
-  if (cpi->level_params.keep_level_stats && !is_stat_generation_stage(cpi)) {
-    // Initialize level info. at the beginning of each sequence.
-    if (cm->current_frame.frame_type == KEY_FRAME && !cpi->no_show_fwd_kf) {
-      av1_init_level_info(cpi);
-    }
-    av1_update_level_info(cpi, *size, *time_stamp, *time_end);
-  }
-
-#if CONFIG_INTERNAL_STATS
-  if (!is_stat_generation_stage(cpi)) {
-    compute_internal_stats(cpi, (int)(*size));
-  }
-#endif  // CONFIG_INTERNAL_STATS
 #if CONFIG_SPEED_STATS
   if (!is_stat_generation_stage(cpi) && !cm->show_existing_frame) {
     cpi->tx_search_count += cpi->td.mb.txfm_search_info.tx_search_count;
@@ -3800,11 +4334,319 @@
   }
 #endif  // CONFIG_SPEED_STATS
 
-  aom_clear_system_state();
-
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  cm->error->setjmp = 0;
+#endif
   return AOM_CODEC_OK;
 }
 
+#if CONFIG_FRAME_PARALLEL_ENCODE
+// Populates cpi->scaled_ref_buf corresponding to frames in a parallel encode
+// set. Also sets the bitmask 'ref_buffers_used_map'.
+void av1_scale_references_fpmt(AV1_COMP *cpi, int *ref_buffers_used_map) {
+  AV1_COMMON *cm = &cpi->common;
+  MV_REFERENCE_FRAME ref_frame;
+
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    // Need to convert from AOM_REFFRAME to index into ref_mask (subtract 1).
+    if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) {
+      const YV12_BUFFER_CONFIG *const ref =
+          get_ref_frame_yv12_buf(cm, ref_frame);
+
+      if (ref == NULL) {
+        cpi->scaled_ref_buf[ref_frame - 1] = NULL;
+        continue;
+      }
+
+      // FPMT does not support scaling yet.
+      assert(ref->y_crop_width == cm->width &&
+             ref->y_crop_height == cm->height);
+
+      RefCntBuffer *buf = get_ref_frame_buf(cm, ref_frame);
+      cpi->scaled_ref_buf[ref_frame - 1] = buf;
+      for (int i = 0; i < FRAME_BUFFERS; ++i) {
+        if (&cm->buffer_pool->frame_bufs[i] == buf) {
+          *ref_buffers_used_map |= (1 << i);
+        }
+      }
+    } else {
+      if (!has_no_stats_stage(cpi)) cpi->scaled_ref_buf[ref_frame - 1] = NULL;
+    }
+  }
+}
+
+// Increments the ref_count of frame buffers referenced by cpi->scaled_ref_buf
+// corresponding to frames in a parallel encode set.
+void av1_increment_scaled_ref_counts_fpmt(BufferPool *buffer_pool,
+                                          int ref_buffers_used_map) {
+  for (int i = 0; i < FRAME_BUFFERS; ++i) {
+    if (ref_buffers_used_map & (1 << i)) {
+      ++buffer_pool->frame_bufs[i].ref_count;
+    }
+  }
+}
+
+// Releases cpi->scaled_ref_buf corresponding to frames in a parallel encode
+// set.
+void av1_release_scaled_references_fpmt(AV1_COMP *cpi) {
+  // TODO(isbs): only refresh the necessary frames, rather than all of them
+  for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+    RefCntBuffer *const buf = cpi->scaled_ref_buf[i];
+    if (buf != NULL) {
+      cpi->scaled_ref_buf[i] = NULL;
+    }
+  }
+}
+
+// Decrements the ref_count of frame buffers referenced by cpi->scaled_ref_buf
+// corresponding to frames in a parallel encode set.
+void av1_decrement_ref_counts_fpmt(BufferPool *buffer_pool,
+                                   int ref_buffers_used_map) {
+  for (int i = 0; i < FRAME_BUFFERS; ++i) {
+    if (ref_buffers_used_map & (1 << i)) {
+      --buffer_pool->frame_bufs[i].ref_count;
+    }
+  }
+}
+
+// Initialize parallel frame contexts with screen content decisions.
+void av1_init_sc_decisions(AV1_PRIMARY *const ppi) {
+  AV1_COMP *const first_cpi = ppi->cpi;
+  for (int i = 1; i < ppi->num_fp_contexts; ++i) {
+    AV1_COMP *cur_cpi = ppi->parallel_cpi[i];
+    cur_cpi->common.features.allow_screen_content_tools =
+        first_cpi->common.features.allow_screen_content_tools;
+    cur_cpi->common.features.allow_intrabc =
+        first_cpi->common.features.allow_intrabc;
+    cur_cpi->use_screen_content_tools = first_cpi->use_screen_content_tools;
+    cur_cpi->is_screen_content_type = first_cpi->is_screen_content_type;
+  }
+}
+
+AV1_COMP *av1_get_parallel_frame_enc_data(AV1_PRIMARY *const ppi,
+                                          AV1_COMP_DATA *const first_cpi_data) {
+  int cpi_idx = 0;
+
+  // Loop over parallel_cpi to find the cpi that processed the current
+  // gf_frame_index ahead of time.
+  for (int i = 1; i < ppi->num_fp_contexts; i++) {
+    if (ppi->cpi->gf_frame_index == ppi->parallel_cpi[i]->gf_frame_index) {
+      cpi_idx = i;
+      break;
+    }
+  }
+
+  assert(cpi_idx > 0);
+  assert(!ppi->parallel_cpi[cpi_idx]->common.show_existing_frame);
+
+  // Release the previously-used frame-buffer.
+  if (ppi->cpi->common.cur_frame != NULL) {
+    --ppi->cpi->common.cur_frame->ref_count;
+    ppi->cpi->common.cur_frame = NULL;
+  }
+
+  // Swap the appropriate parallel_cpi with the parallel_cpi[0].
+  ppi->cpi = ppi->parallel_cpi[cpi_idx];
+  ppi->parallel_cpi[cpi_idx] = ppi->parallel_cpi[0];
+  ppi->parallel_cpi[0] = ppi->cpi;
+
+  // Copy appropriate parallel_frames_data to local data.
+  {
+    AV1_COMP_DATA *data = &ppi->parallel_frames_data[cpi_idx - 1];
+    assert(data->frame_size > 0);
+    assert(first_cpi_data->cx_data_sz > data->frame_size);
+
+    first_cpi_data->lib_flags = data->lib_flags;
+    first_cpi_data->ts_frame_start = data->ts_frame_start;
+    first_cpi_data->ts_frame_end = data->ts_frame_end;
+    memcpy(first_cpi_data->cx_data, data->cx_data, data->frame_size);
+    first_cpi_data->frame_size = data->frame_size;
+    if (ppi->cpi->common.show_frame) {
+      first_cpi_data->pop_lookahead = 1;
+    }
+  }
+
+  return ppi->cpi;
+}
+
+// Initializes frames belonging to a parallel encode set.
+int av1_init_parallel_frame_context(const AV1_COMP_DATA *const first_cpi_data,
+                                    AV1_PRIMARY *const ppi,
+                                    int *ref_buffers_used_map) {
+  AV1_COMP *const first_cpi = ppi->cpi;
+  GF_GROUP *const gf_group = &ppi->gf_group;
+  int gf_index_start = first_cpi->gf_frame_index;
+  assert(gf_group->frame_parallel_level[gf_index_start] == 1);
+  int parallel_frame_count = 0;
+  int cur_frame_num = first_cpi->common.current_frame.frame_number;
+  int show_frame_count = first_cpi->frame_index_set.show_frame_count;
+  int frames_since_key = first_cpi->rc.frames_since_key;
+  int frames_to_key = first_cpi->rc.frames_to_key;
+  int frames_to_fwd_kf = first_cpi->rc.frames_to_fwd_kf;
+  int cur_frame_disp = cur_frame_num + gf_group->arf_src_offset[gf_index_start];
+  const FIRSTPASS_STATS *stats_in = first_cpi->twopass_frame.stats_in;
+
+  assert(*ref_buffers_used_map == 0);
+
+  // Release the previously used frame-buffer by a frame_parallel_level 1 frame.
+  if (first_cpi->common.cur_frame != NULL) {
+    --first_cpi->common.cur_frame->ref_count;
+    first_cpi->common.cur_frame = NULL;
+  }
+
+  RefFrameMapPair ref_frame_map_pairs[REF_FRAMES];
+  RefFrameMapPair first_ref_frame_map_pairs[REF_FRAMES];
+  init_ref_map_pair(first_cpi, first_ref_frame_map_pairs);
+  memcpy(ref_frame_map_pairs, first_ref_frame_map_pairs,
+         sizeof(RefFrameMapPair) * REF_FRAMES);
+
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+  // Store the reference refresh index of frame_parallel_level 1 frame in a
+  // parallel encode set of lower layer frames.
+  if (gf_group->update_type[gf_index_start] == INTNL_ARF_UPDATE) {
+    first_cpi->ref_refresh_index = av1_calc_refresh_idx_for_intnl_arf(
+        first_cpi, ref_frame_map_pairs, gf_index_start);
+    assert(first_cpi->ref_refresh_index != INVALID_IDX &&
+           first_cpi->ref_refresh_index < REF_FRAMES);
+    first_cpi->refresh_idx_available = true;
+    // Update ref_frame_map_pairs.
+    ref_frame_map_pairs[first_cpi->ref_refresh_index].disp_order =
+        gf_group->display_idx[gf_index_start];
+    ref_frame_map_pairs[first_cpi->ref_refresh_index].pyr_level =
+        gf_group->layer_depth[gf_index_start];
+  }
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+
+  // Set do_frame_data_update flag as false for frame_parallel_level 1 frame.
+  first_cpi->do_frame_data_update = false;
+  if (gf_group->arf_src_offset[gf_index_start] == 0) {
+    first_cpi->time_stamps.prev_ts_start = ppi->ts_start_last_show_frame;
+    first_cpi->time_stamps.prev_ts_end = ppi->ts_end_last_show_frame;
+  }
+
+  av1_get_ref_frames(NULL, first_ref_frame_map_pairs, cur_frame_disp,
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+                     first_cpi, gf_index_start, 1,
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+                     first_cpi->common.remapped_ref_idx);
+
+  av1_scale_references_fpmt(first_cpi, ref_buffers_used_map);
+  parallel_frame_count++;
+
+  // Iterate through the GF_GROUP to find the remaining frame_parallel_level 2
+  // frames which are part of the current parallel encode set and initialize the
+  // required cpi elements.
+  for (int i = gf_index_start + 1; i < gf_group->size; i++) {
+    // Update frame counters if previous frame was show frame or show existing
+    // frame.
+    if (gf_group->arf_src_offset[i - 1] == 0) {
+      cur_frame_num++;
+      show_frame_count++;
+      if (frames_to_fwd_kf <= 0)
+        frames_to_fwd_kf = first_cpi->oxcf.kf_cfg.fwd_kf_dist;
+      if (frames_to_key) {
+        frames_since_key++;
+        frames_to_key--;
+        frames_to_fwd_kf--;
+      }
+      stats_in++;
+    }
+    cur_frame_disp = cur_frame_num + gf_group->arf_src_offset[i];
+    if (gf_group->frame_parallel_level[i] == 2) {
+      AV1_COMP *cur_cpi = ppi->parallel_cpi[parallel_frame_count];
+      AV1_COMP_DATA *cur_cpi_data =
+          &ppi->parallel_frames_data[parallel_frame_count - 1];
+      cur_cpi->gf_frame_index = i;
+      cur_cpi->framerate = first_cpi->framerate;
+      cur_cpi->common.current_frame.frame_number = cur_frame_num;
+      cur_cpi->frame_index_set.show_frame_count = show_frame_count;
+      cur_cpi->rc.frames_since_key = frames_since_key;
+      cur_cpi->rc.frames_to_key = frames_to_key;
+      cur_cpi->rc.frames_to_fwd_kf = frames_to_fwd_kf;
+      cur_cpi->rc.active_worst_quality = first_cpi->rc.active_worst_quality;
+      cur_cpi->rc.avg_frame_bandwidth = first_cpi->rc.avg_frame_bandwidth;
+      cur_cpi->rc.max_frame_bandwidth = first_cpi->rc.max_frame_bandwidth;
+      cur_cpi->rc.min_frame_bandwidth = first_cpi->rc.min_frame_bandwidth;
+      cur_cpi->rc.intervals_till_gf_calculate_due =
+          first_cpi->rc.intervals_till_gf_calculate_due;
+      cur_cpi->mv_search_params.max_mv_magnitude =
+          first_cpi->mv_search_params.max_mv_magnitude;
+      if (gf_group->update_type[cur_cpi->gf_frame_index] == INTNL_ARF_UPDATE) {
+        cur_cpi->common.lf.mode_ref_delta_enabled = 1;
+      }
+      cur_cpi->do_frame_data_update = false;
+      // Initialize prev_ts_start and prev_ts_end for show frame(s) and show
+      // existing frame(s).
+      if (gf_group->arf_src_offset[i] == 0) {
+        // Choose source of prev frame.
+        int src_index = gf_group->src_offset[i];
+        struct lookahead_entry *prev_source = av1_lookahead_peek(
+            ppi->lookahead, src_index - 1, cur_cpi->compressor_stage);
+        // Save timestamps of prev frame.
+        cur_cpi->time_stamps.prev_ts_start = prev_source->ts_start;
+        cur_cpi->time_stamps.prev_ts_end = prev_source->ts_end;
+      }
+      cur_cpi->time_stamps.first_ts_start =
+          first_cpi->time_stamps.first_ts_start;
+
+      memcpy(cur_cpi->common.ref_frame_map, first_cpi->common.ref_frame_map,
+             sizeof(first_cpi->common.ref_frame_map));
+      cur_cpi_data->lib_flags = 0;
+      cur_cpi_data->timestamp_ratio = first_cpi_data->timestamp_ratio;
+      cur_cpi_data->flush = first_cpi_data->flush;
+      cur_cpi_data->frame_size = 0;
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+      if (gf_group->update_type[gf_index_start] == INTNL_ARF_UPDATE) {
+        // If the first frame in a parallel encode set is INTNL_ARF_UPDATE
+        // frame, initialize lib_flags of frame_parallel_level 2 frame in the
+        // set with that of frame_parallel_level 1 frame.
+        cur_cpi_data->lib_flags = first_cpi_data->lib_flags;
+        // Store the reference refresh index of frame_parallel_level 2 frame in
+        // a parallel encode set of lower layer frames.
+        cur_cpi->ref_refresh_index =
+            av1_calc_refresh_idx_for_intnl_arf(cur_cpi, ref_frame_map_pairs, i);
+        cur_cpi->refresh_idx_available = true;
+        // Skip the reference frame which will be refreshed by
+        // frame_parallel_level 1 frame in a parallel encode set of lower layer
+        // frames.
+        cur_cpi->ref_idx_to_skip = first_cpi->ref_refresh_index;
+      } else {
+        cur_cpi->ref_idx_to_skip = INVALID_IDX;
+        cur_cpi->ref_refresh_index = INVALID_IDX;
+        cur_cpi->refresh_idx_available = false;
+      }
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+      cur_cpi->twopass_frame.stats_in = stats_in;
+
+      av1_get_ref_frames(NULL, first_ref_frame_map_pairs, cur_frame_disp,
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+                         cur_cpi, i, 1,
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+                         cur_cpi->common.remapped_ref_idx);
+      av1_scale_references_fpmt(cur_cpi, ref_buffers_used_map);
+      parallel_frame_count++;
+    }
+
+    // Set do_frame_data_update to true for the last frame_parallel_level 2
+    // frame in the current parallel encode set.
+    if (i == (gf_group->size - 1) ||
+        (gf_group->frame_parallel_level[i + 1] == 0 &&
+         (gf_group->update_type[i + 1] == ARF_UPDATE ||
+          gf_group->update_type[i + 1] == INTNL_ARF_UPDATE)) ||
+        gf_group->frame_parallel_level[i + 1] == 1) {
+      ppi->parallel_cpi[parallel_frame_count - 1]->do_frame_data_update = true;
+      break;
+    }
+  }
+
+  av1_increment_scaled_ref_counts_fpmt(first_cpi->common.buffer_pool,
+                                       *ref_buffers_used_map);
+
+  // Return the number of frames in the parallel encode set.
+  return parallel_frame_count;
+}
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+
 int av1_get_preview_raw_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *dest) {
   AV1_COMMON *cm = &cpi->common;
   if (!cm->show_frame) {
@@ -3815,13 +4657,12 @@
       *dest = cm->cur_frame->buf;
       dest->y_width = cm->width;
       dest->y_height = cm->height;
-      dest->uv_width = cm->width >> cm->seq_params.subsampling_x;
-      dest->uv_height = cm->height >> cm->seq_params.subsampling_y;
+      dest->uv_width = cm->width >> cm->seq_params->subsampling_x;
+      dest->uv_height = cm->height >> cm->seq_params->subsampling_y;
       ret = 0;
     } else {
       ret = -1;
     }
-    aom_clear_system_state();
     return ret;
   }
 }
@@ -3838,12 +4679,12 @@
                                        YV12_BUFFER_CONFIG *sd) {
   const int num_planes = av1_num_planes(cm);
   if (!equal_dimensions_and_border(new_frame, sd))
-    aom_internal_error(&cm->error, AOM_CODEC_ERROR,
+    aom_internal_error(cm->error, AOM_CODEC_ERROR,
                        "Incorrect buffer dimensions");
   else
     aom_yv12_copy_frame(new_frame, sd, num_planes);
 
-  return cm->error.error_code;
+  return cm->error->error_code;
 }
 
 int av1_set_internal_size(AV1EncoderConfig *const oxcf,
@@ -4038,12 +4879,12 @@
   }
 }
 
-aom_fixed_buf_t *av1_get_global_headers(AV1_COMP *cpi) {
-  if (!cpi) return NULL;
+aom_fixed_buf_t *av1_get_global_headers(AV1_PRIMARY *ppi) {
+  if (!ppi) return NULL;
 
   uint8_t header_buf[512] = { 0 };
   const uint32_t sequence_header_size =
-      av1_write_sequence_header_obu(&cpi->common.seq_params, &header_buf[0]);
+      av1_write_sequence_header_obu(&ppi->seq_params, &header_buf[0]);
   assert(sequence_header_size <= sizeof(header_buf));
   if (sequence_header_size == 0) return NULL;
 
@@ -4054,7 +4895,8 @@
   if (payload_offset + sequence_header_size > sizeof(header_buf)) return NULL;
   memmove(&header_buf[payload_offset], &header_buf[0], sequence_header_size);
 
-  if (av1_write_obu_header(&cpi->level_params, OBU_SEQUENCE_HEADER, 0,
+  if (av1_write_obu_header(&ppi->level_params, &ppi->cpi->frame_header_count,
+                           OBU_SEQUENCE_HEADER, 0,
                            &header_buf[0]) != obu_header_size) {
     return NULL;
   }
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index 994f691..83c8bd1 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -30,11 +30,13 @@
 #include "av1/common/resize.h"
 #include "av1/common/thread_common.h"
 #include "av1/common/timing.h"
+
 #include "av1/encoder/aq_cyclicrefresh.h"
 #include "av1/encoder/av1_quantize.h"
 #include "av1/encoder/block.h"
 #include "av1/encoder/context_tree.h"
 #include "av1/encoder/encodemb.h"
+#include "av1/encoder/external_partition.h"
 #include "av1/encoder/firstpass.h"
 #include "av1/encoder/global_motion.h"
 #include "av1/encoder/level.h"
@@ -46,9 +48,11 @@
 #include "av1/encoder/speed_features.h"
 #include "av1/encoder/svc_layercontext.h"
 #include "av1/encoder/temporal_filter.h"
+#include "av1/encoder/thirdpass.h"
 #include "av1/encoder/tokenize.h"
 #include "av1/encoder/tpl_model.h"
 #include "av1/encoder/av1_noise_estimate.h"
+#include "av1/encoder/bitstream.h"
 
 #if CONFIG_INTERNAL_STATS
 #include "aom_dsp/ssim.h"
@@ -80,6 +84,17 @@
 // Number of frames required to test for scene cut detection
 #define SCENE_CUT_KEY_TEST_INTERVAL 16
 
+// Lookahead index threshold to enable temporal filtering for second arf.
+#define TF_LOOKAHEAD_IDX_THR 7
+
+#define HDR_QP_LEVELS 10
+#define CHROMA_CB_QP_SCALE 1.04
+#define CHROMA_CR_QP_SCALE 1.04
+#define CHROMA_QP_SCALE -0.46
+#define CHROMA_QP_OFFSET 9.26
+#define QP_SCALE_FACTOR 2.0
+#define DISABLE_HDR_LUMA_DELTAQ 1
+
 // Rational number with an int64 numerator
 // This structure holds a fractional value
 typedef struct aom_rational64 {
@@ -119,6 +134,26 @@
   FRAMEFLAGS_ERROR_RESILIENT = 1 << 6,
 } UENUM1BYTE(FRAMETYPE_FLAGS);
 
+#if CONFIG_FRAME_PARALLEL_ENCODE
+// 0 level frames are sometimes used for rate control purposes, but for
+// reference mapping purposes, the minimum level should be 1.
+#define MIN_PYR_LEVEL 1
+static INLINE int get_true_pyr_level(int frame_level, int frame_order,
+                                     int max_layer_depth) {
+  if (frame_order == 0) {
+    // Keyframe case
+    return MIN_PYR_LEVEL;
+  } else if (frame_level == MAX_ARF_LAYERS) {
+    // Leaves
+    return max_layer_depth;
+  } else if (frame_level == (MAX_ARF_LAYERS + 1)) {
+    // Altrefs
+    return MIN_PYR_LEVEL;
+  }
+  return AOMMAX(MIN_PYR_LEVEL, frame_level);
+}
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+
 enum {
   NO_AQ = 0,
   VARIANCE_AQ = 1,
@@ -128,9 +163,12 @@
 } UENUM1BYTE(AQ_MODE);
 enum {
   NO_DELTA_Q = 0,
-  DELTA_Q_OBJECTIVE = 1,   // Modulation to improve objective quality
-  DELTA_Q_PERCEPTUAL = 2,  // Modulation to improve perceptual quality
-  DELTA_Q_MODE_COUNT       // This should always be the last member of the enum
+  DELTA_Q_OBJECTIVE = 1,      // Modulation to improve objective quality
+  DELTA_Q_PERCEPTUAL = 2,     // Modulation to improve video perceptual quality
+  DELTA_Q_PERCEPTUAL_AI = 3,  // Perceptual quality opt for all intra mode
+  DELTA_Q_USER_RATING_BASED = 4,  // User rating based delta q mode
+  DELTA_Q_HDR = 5,    // QP adjustment based on HDR block pixel average
+  DELTA_Q_MODE_COUNT  // This should always be the last member of the enum
 } UENUM1BYTE(DELTAQ_MODE);
 
 enum {
@@ -166,7 +204,10 @@
   MOD_ENC,          // Encode stage
   MOD_LPF,          // Deblocking loop filter
   MOD_CDEF_SEARCH,  // CDEF search
+  MOD_CDEF,         // CDEF frame
   MOD_LR,           // Loop restoration filtering
+  MOD_PACK_BS,      // Pack bitstream
+  MOD_FRAME_ENC,    // Frame Parallel encode
   NUM_MT_MODULES
 } MULTI_THREADED_MODULES;
 
@@ -256,7 +297,12 @@
    */
   bool enable_cfl_intra;
   /*!
-   * Flag to indicate if D45 to D203 intra prediction modes should be enabled.
+   * Flag to indicate if directional modes should be enabled.
+   */
+  bool enable_directional_intra;
+  /*!
+   * Flag to indicate if the subset of directional modes from D45 to D203 intra
+   * should be enabled. Has no effect if directional modes are disabled.
    */
   bool enable_diagonal_intra;
   /*!
@@ -302,6 +348,10 @@
    * (mode-dependent) only.
    */
   bool use_intra_default_tx_only;
+  /*!
+   * Flag to indicate if transform size search should be enabled.
+   */
+  bool enable_tx_size_search;
 } TxfmSizeTypeCfg;
 
 /*!
@@ -409,6 +459,11 @@
   bool auto_key;
 
   /*!
+   * Indicates the forward key frame distance.
+   */
+  int fwd_kf_dist;
+
+  /*!
    * Indicates if forward keyframe reference should be enabled.
    */
   bool fwd_kf_enabled;
@@ -706,6 +761,8 @@
   DELTAQ_MODE deltaq_mode;
   // Indicates if delta quantization should be enabled in chroma planes.
   bool enable_chroma_deltaq;
+  // Indicates if delta quantization should be enabled for hdr video
+  bool enable_hdr_deltaq;
   // Indicates if encoding with quantization matrices should be enabled.
   bool using_qm;
 } QuantizationCfg;
@@ -716,7 +773,10 @@
  */
 typedef struct {
   /*!
-   * Indicates the loop filter sharpness.
+   * Controls the level at which rate-distortion optimization of transform
+   * coefficients favours sharpness in the block. Has no impact on RD when set
+   * to zero (default). For values 1-7, eob and skip block optimization are
+   * avoided and rdmult is adjusted in favour of block sharpness.
    */
   int sharpness;
 
@@ -922,14 +982,20 @@
   /*!\endcond */
   /*!
    * Indicates the current encoder pass :
-   * 0 = 1 Pass encode,
-   * 1 = First pass of two pass,
-   * 2 = Second pass of two pass.
-   *
+   * AOM_RC_ONE_PASS = One pass encode,
+   * AOM_RC_FIRST_PASS = First pass of multiple-pass
+   * AOM_RC_SECOND_PASS = Second pass of multiple-pass
+   * AOM_RC_THIRD_PASS = Third pass of multiple-pass
    */
   enum aom_enc_pass pass;
   /*!\cond */
 
+  // Total number of encoding passes.
+  int passes;
+
+  // The name of the second pass output file when passes > 2.
+  const char *two_pass_output;
+
   // Indicates if the encoding is GOOD or REALTIME.
   MODE mode;
 
@@ -945,6 +1011,9 @@
   // format.
   bool save_as_annexb;
 
+  // The path for partition stats reading and writing, used in the experiment
+  // CONFIG_PARTITION_SEARCH_ORDER.
+  const char *partition_info_path;
   /*!\endcond */
 } AV1EncoderConfig;
 
@@ -1272,6 +1341,7 @@
   TileInfo tile_info;
   DECLARE_ALIGNED(16, FRAME_CONTEXT, tctx);
   FRAME_CONTEXT *row_ctx;
+  uint64_t abs_sum_level;
   uint8_t allow_update_cdf;
   InterModeRdModel inter_mode_rd_models[BLOCK_SIZES_ALL];
   AV1EncRowMultiThreadSync row_mt_sync;
@@ -1294,21 +1364,28 @@
   PC_TREE_SHARED_BUFFERS shared_coeff_buf;
   SIMPLE_MOTION_DATA_TREE *sms_tree;
   SIMPLE_MOTION_DATA_TREE *sms_root;
-  InterModesInfo *inter_modes_info;
   uint32_t *hash_value_buffer[2][2];
   OBMCBuffer obmc_buffer;
   PALETTE_BUFFER *palette_buffer;
   CompoundTypeRdBuffers comp_rd_buffer;
   CONV_BUF_TYPE *tmp_conv_dst;
+  uint64_t abs_sum_level;
   uint8_t *tmp_pred_bufs[2];
   int intrabc_used;
   int deltaq_used;
+  int coefficient_size;
+  int max_mv_magnitude;
+  int interp_filter_selected[SWITCHABLE];
   FRAME_CONTEXT *tctx;
   VP64x64 *vt64x64;
   int32_t num_64x64_blocks;
   PICK_MODE_CONTEXT *firstpass_ctx;
   TemporalFilterData tf_data;
   TplTxfmStats tpl_txfm_stats;
+  // Pointer to the array of structures to store gradient information of each
+  // pixel in a superblock. The buffer constitutes of MAX_SB_SQUARE pixel level
+  // structures for each of the plane types (PLANE_TYPE_Y and PLANE_TYPE_UV).
+  PixelLevelGradientInfo *pixel_gradient_info;
 } ThreadData;
 
 struct EncWorkerData;
@@ -1369,6 +1446,87 @@
   /**@}*/
 } AV1EncRowMultiThreadInfo;
 
+#if CONFIG_FRAME_PARALLEL_ENCODE
+/*!
+ * \brief Max number of frames that can be encoded in a parallel encode set.
+ */
+#define MAX_PARALLEL_FRAMES 4
+/*!
+ * \brief Max number of recodes used to track the frame probabilities.
+ */
+#define NUM_RECODES_PER_FRAME 10
+
+/*!
+ * \brief Buffers to be backed up during parallel encode set to be restored
+ * later.
+ */
+typedef struct RestoreStateBuffers {
+  /*!
+   * Backup of original CDEF srcbuf.
+   */
+  uint16_t *cdef_srcbuf;
+
+  /*!
+   * Backup of original CDEF colbuf.
+   */
+  uint16_t *cdef_colbuf[MAX_MB_PLANE];
+
+  /*!
+   * Backup of original LR rst_tmpbuf.
+   */
+  int32_t *rst_tmpbuf;
+
+  /*!
+   * Backup of original LR rlbs.
+   */
+  RestorationLineBuffers *rlbs;
+} RestoreStateBuffers;
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+
+/*!
+ * \brief Primary Encoder parameters related to multi-threading.
+ */
+typedef struct PrimaryMultiThreadInfo {
+  /*!
+   * Number of workers created for multi-threading.
+   */
+  int num_workers;
+
+  /*!
+   * Number of workers used for different MT modules.
+   */
+  int num_mod_workers[NUM_MT_MODULES];
+
+  /*!
+   * Synchronization object used to launch job in the worker thread.
+   */
+  AVxWorker *workers;
+
+  /*!
+   * Data specific to each worker in encoder multi-threading.
+   * tile_thr_data[i] stores the worker data of the ith thread.
+   */
+  struct EncWorkerData *tile_thr_data;
+
+  /*!
+   * CDEF row multi-threading data.
+   */
+  AV1CdefWorkerData *cdef_worker;
+
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  /*!
+   * Primary(Level 1) Synchronization object used to launch job in the worker
+   * thread.
+   */
+  AVxWorker *p_workers[MAX_PARALLEL_FRAMES];
+
+  /*!
+   * Number of primary workers created for multi-threading.
+   */
+  int p_num_workers;
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+} PrimaryMultiThreadInfo;
+
 /*!
  * \brief Encoder parameters related to multi-threading.
  */
@@ -1384,18 +1542,6 @@
   int num_mod_workers[NUM_MT_MODULES];
 
   /*!
-   * Flag to indicate whether thread specific buffers need to be allocated for
-   * tile/row based multi-threading of first pass stage.
-   */
-  int fp_mt_buf_init_done;
-
-  /*!
-   * Flag to indicate whether thread specific buffers need to be allocated for
-   * tile/row based multi-threading of encode stage.
-   */
-  int enc_mt_buf_init_done;
-
-  /*!
    * Synchronization object used to launch job in the worker thread.
    */
   AVxWorker *workers;
@@ -1433,6 +1579,11 @@
   AV1LrSync lr_row_sync;
 
   /*!
+   * Pack bitstream multi-threading object.
+   */
+  AV1EncPackBSSync pack_bs_sync;
+
+  /*!
    * Global Motion multi-threading object.
    */
   AV1GlobalMotionSync gm_sync;
@@ -1446,6 +1597,18 @@
    * CDEF search multi-threading object.
    */
   AV1CdefSync cdef_sync;
+
+  /*!
+   * Pointer to CDEF row multi-threading data for the frame.
+   */
+  AV1CdefWorkerData *cdef_worker;
+
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  /*!
+   * Buffers to be stored/restored before/after parallel encode.
+   */
+  RestoreStateBuffers restore_state_buf;
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
 } MultiThreadInfo;
 
 /*!\cond */
@@ -1540,12 +1703,13 @@
 // Adjust the following to add new components.
 enum {
   av1_encode_strategy_time,
+  av1_get_one_pass_rt_params_time,
   av1_get_second_pass_params_time,
   denoise_and_encode_time,
   apply_filtering_time,
   av1_tpl_setup_stats_time,
   encode_frame_to_data_rate_time,
-  encode_with_recode_loop_time,
+  encode_with_or_without_recode_time,
   loop_filter_time,
   cdef_time,
   loop_restoration_time,
@@ -1556,6 +1720,7 @@
   encode_sb_row_time,
 
   rd_pick_partition_time,
+  rd_use_partition_time,
   av1_prune_partitions_time,
   none_partition_search_time,
   split_partition_search_time,
@@ -1567,10 +1732,13 @@
   rd_pick_sb_modes_time,
   av1_rd_pick_intra_mode_sb_time,
   av1_rd_pick_inter_mode_sb_time,
+  set_params_rd_pick_inter_mode_time,
+  skip_inter_mode_time,
   handle_inter_mode_time,
   evaluate_motion_mode_for_winner_candidates_time,
-  handle_intra_mode_time,
   do_tx_search_time,
+  handle_intra_mode_time,
+  refine_winner_mode_tx_time,
   av1_search_palette_mode_time,
   handle_newmv_time,
   compound_type_rd_time,
@@ -1582,6 +1750,8 @@
 static INLINE char const *get_component_name(int index) {
   switch (index) {
     case av1_encode_strategy_time: return "av1_encode_strategy_time";
+    case av1_get_one_pass_rt_params_time:
+      return "av1_get_one_pass_rt_params_time";
     case av1_get_second_pass_params_time:
       return "av1_get_second_pass_params_time";
     case denoise_and_encode_time: return "denoise_and_encode_time";
@@ -1589,7 +1759,8 @@
     case av1_tpl_setup_stats_time: return "av1_tpl_setup_stats_time";
     case encode_frame_to_data_rate_time:
       return "encode_frame_to_data_rate_time";
-    case encode_with_recode_loop_time: return "encode_with_recode_loop_time";
+    case encode_with_or_without_recode_time:
+      return "encode_with_or_without_recode_time";
     case loop_filter_time: return "loop_filter_time";
     case cdef_time: return "cdef_time";
     case loop_restoration_time: return "loop_restoration_time";
@@ -1601,6 +1772,7 @@
     case encode_sb_row_time: return "encode_sb_row_time";
 
     case rd_pick_partition_time: return "rd_pick_partition_time";
+    case rd_use_partition_time: return "rd_use_partition_time";
     case av1_prune_partitions_time: return "av1_prune_partitions_time";
     case none_partition_search_time: return "none_partition_search_time";
     case split_partition_search_time: return "split_partition_search_time";
@@ -1615,11 +1787,15 @@
       return "av1_rd_pick_intra_mode_sb_time";
     case av1_rd_pick_inter_mode_sb_time:
       return "av1_rd_pick_inter_mode_sb_time";
+    case set_params_rd_pick_inter_mode_time:
+      return "set_params_rd_pick_inter_mode_time";
+    case skip_inter_mode_time: return "skip_inter_mode_time";
     case handle_inter_mode_time: return "handle_inter_mode_time";
     case evaluate_motion_mode_for_winner_candidates_time:
       return "evaluate_motion_mode_for_winner_candidates_time";
-    case handle_intra_mode_time: return "handle_intra_mode_time";
     case do_tx_search_time: return "do_tx_search_time";
+    case handle_intra_mode_time: return "handle_intra_mode_time";
+    case refine_winner_mode_tx_time: return "refine_winner_mode_tx_time";
     case av1_search_palette_mode_time: return "av1_search_palette_mode_time";
     case handle_newmv_time: return "handle_newmv_time";
     case compound_type_rd_time: return "compound_type_rd_time";
@@ -1768,7 +1944,7 @@
   bool golden_frame;  /*!< Refresh flag for golden frame */
   bool bwd_ref_frame; /*!< Refresh flag for bwd-ref frame */
   bool alt_ref_frame; /*!< Refresh flag for alt-ref frame */
-} RefreshFrameFlagsInfo;
+} RefreshFrameInfo;
 
 /*!
  * \brief Desired dimensions for an externally triggered resize.
@@ -1966,6 +2142,17 @@
   int valid;
 } MV_STATS;
 
+typedef struct WeberStats {
+  int64_t mb_wiener_variance;
+  int64_t src_variance;
+  int64_t rec_variance;
+  int16_t src_pix_max;
+  int16_t rec_pix_max;
+  int64_t distortion;
+  int64_t satd;
+  double max_scale;
+} WeberStats;
+
 typedef struct {
   struct loopfilter lf;
   CdefInfo cdef_info;
@@ -2052,11 +2239,119 @@
 } CoeffBufferPool;
 
 /*!
+ * \brief Structure to hold data corresponding to an encoded frame.
+ */
+typedef struct AV1_COMP_DATA {
+  /*!
+   * Buffer to store packed bitstream data of a frame.
+   */
+  unsigned char *cx_data;
+
+  /*!
+   * Allocated size of the cx_data buffer.
+   */
+  size_t cx_data_sz;
+
+  /*!
+   * Size of data written in the cx_data buffer.
+   */
+  size_t frame_size;
+
+  /*!
+   * Flags for the frame.
+   */
+  unsigned int lib_flags;
+
+  /*!
+   * Time stamp for start of frame.
+   */
+  int64_t ts_frame_start;
+
+  /*!
+   * Time stamp for end of frame.
+   */
+  int64_t ts_frame_end;
+
+  /*!
+   * Flag to indicate flush call.
+   */
+  int flush;
+
+  /*!
+   * Time base for sequence.
+   */
+  const aom_rational64_t *timestamp_ratio;
+
+  /*!
+   * Decide to pop the source for this frame from input buffer queue.
+   */
+  int pop_lookahead;
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  /*!
+   * Display order hint of frame whose packed data is in cx_data buffer.
+   */
+  int frame_display_order_hint;
+#endif
+} AV1_COMP_DATA;
+
+/*!
  * \brief Top level primary encoder structure
  */
 typedef struct AV1_PRIMARY {
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  /*!
+   * Array of frame level encoder stage top level structures
+   */
+  struct AV1_COMP *parallel_cpi[MAX_PARALLEL_FRAMES];
+
+  /*!
+   * Number of frame level contexts(cpis)
+   */
+  int num_fp_contexts;
+
+  /*!
+   * Array of structures to hold data of frames encoded in a given parallel
+   * encode set.
+   */
+  struct AV1_COMP_DATA parallel_frames_data[MAX_PARALLEL_FRAMES - 1];
+
+  /*!
+   * Loopfilter levels of the previous encoded frame.
+   */
+  int filter_level[2];
+
+  /*!
+   * Chrominance component loopfilter level of the previous encoded frame.
+   */
+  int filter_level_u;
+
+  /*!
+   * V (Cr) chrominance component loopfilter level of the previous encoded
+   * frame.
+   */
+  int filter_level_v;
+
+  /*!
+   * Start time stamp of the last encoded show frame
+   */
+  int64_t ts_start_last_show_frame;
+
+  /*!
+   * End time stamp of the last encoded show frame
+   */
+  int64_t ts_end_last_show_frame;
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+  /*!
+   * Copy of cm->ref_frame_map maintained to facilitate sequential update of
+   * ref_frame_map by lower layer depth frames encoded ahead of time in a
+   * parallel encode set.
+   */
+  RefCntBuffer *ref_frame_map_copy[REF_FRAMES];
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
   /*!
    * Encode stage top level structure
+   * When CONFIG_FRAME_PARALLEL_ENCODE is enabled this is the same as
+   * parallel_cpi[0]
    */
   struct AV1_COMP *cpi;
 
@@ -2087,6 +2382,183 @@
    * When set, indicates that internal ARFs are enabled.
    */
   int internal_altref_allowed;
+
+  /*!
+   * Tell if OVERLAY frame shows existing alt_ref frame.
+   */
+  int show_existing_alt_ref;
+
+  /*!
+   * Information related to a gf group.
+   */
+  GF_GROUP gf_group;
+
+  /*!
+   * Track prior gf group state.
+   */
+  GF_STATE gf_state;
+
+  /*!
+   * Flag indicating whether look ahead processing (LAP) is enabled.
+   */
+  int lap_enabled;
+
+  /*!
+   * Parameters for AV1 bitstream levels.
+   */
+  AV1LevelParams level_params;
+
+  /*!
+   * Calculates PSNR on each frame when set to 1.
+   */
+  int b_calculate_psnr;
+
+  /*!
+   * Number of frames left to be encoded, is 0 if limit is not set.
+   */
+  int frames_left;
+
+  /*!
+   * Information related to two pass encoding.
+   */
+  TWO_PASS twopass;
+
+  /*!
+   * Rate control related parameters.
+   */
+  PRIMARY_RATE_CONTROL p_rc;
+
+  /*!
+   * Frame buffer holding the temporally filtered source frame. It can be KEY
+   * frame or ARF frame.
+   */
+  YV12_BUFFER_CONFIG alt_ref_buffer;
+
+  /*!
+   * Elements part of the sequence header, that are applicable for all the
+   * frames in the video.
+   */
+  SequenceHeader seq_params;
+
+  /*!
+   * Indicates whether to use SVC.
+   */
+  int use_svc;
+
+  /*!
+   * If true, buffer removal times are present.
+   */
+  bool buffer_removal_time_present;
+
+  /*!
+   * Number of temporal layers: may be > 1 for SVC (scalable video coding).
+   */
+  unsigned int number_temporal_layers;
+
+  /*!
+   * Number of spatial layers: may be > 1 for SVC (scalable vector coding).
+   */
+  unsigned int number_spatial_layers;
+
+  /*!
+   * Code and details about current error status.
+   */
+  struct aom_internal_error_info error;
+
+  /*!
+   * Function pointers to variants of sse/sad/variance computation functions.
+   * fn_ptr[i] indicates the list of function pointers corresponding to block
+   * size i.
+   */
+  aom_variance_fn_ptr_t fn_ptr[BLOCK_SIZES_ALL];
+
+  /*!
+   * Scaling factors used in the RD multiplier modulation.
+   * TODO(sdeng): consider merge the following arrays.
+   * tpl_rdmult_scaling_factors is a temporary buffer used to store the
+   * intermediate scaling factors which are used in the calculation of
+   * tpl_sb_rdmult_scaling_factors. tpl_rdmult_scaling_factors[i] stores the
+   * intermediate scaling factor of the ith 16 x 16 block in raster scan order.
+   */
+  double *tpl_rdmult_scaling_factors;
+
+  /*!
+   * tpl_sb_rdmult_scaling_factors[i] stores the RD multiplier scaling factor of
+   * the ith 16 x 16 block in raster scan order.
+   */
+  double *tpl_sb_rdmult_scaling_factors;
+
+  /*!
+   * Parameters related to tpl.
+   */
+  TplParams tpl_data;
+
+  /*!
+   * Motion vector stats of the previous encoded frame.
+   */
+  MV_STATS mv_stats;
+
+#if CONFIG_INTERNAL_STATS
+  /*!\cond */
+  uint64_t total_time_receive_data;
+  uint64_t total_time_compress_data;
+
+  unsigned int total_mode_chosen_counts[MAX_MODES];
+
+  int count[2];
+  uint64_t total_sq_error[2];
+  uint64_t total_samples[2];
+  ImageStat psnr[2];
+
+  double total_blockiness;
+  double worst_blockiness;
+
+  int total_bytes;
+  double summed_quality;
+  double summed_weights;
+  double summed_quality_hbd;
+  double summed_weights_hbd;
+  unsigned int total_recode_hits;
+  double worst_ssim;
+  double worst_ssim_hbd;
+
+  ImageStat fastssim;
+  ImageStat psnrhvs;
+
+  int b_calculate_blockiness;
+  int b_calculate_consistency;
+
+  double total_inconsistency;
+  double worst_consistency;
+  Ssimv *ssim_vars;
+  Metrics metrics;
+  /*!\endcond */
+#endif
+
+#if CONFIG_ENTROPY_STATS
+  /*!
+   * Aggregates frame counts for the sequence.
+   */
+  FRAME_COUNTS aggregate_fc;
+#endif  // CONFIG_ENTROPY_STATS
+
+  /*!
+   * For each type of reference frame, this contains the index of a reference
+   * frame buffer for a reference frame of the same type.  We use this to
+   * choose our primary reference frame (which is the most recent reference
+   * frame of the same type as the current frame).
+   */
+  int fb_of_context_type[REF_FRAMES];
+
+  /*!
+   * Primary Multi-threading parameters.
+   */
+  PrimaryMultiThreadInfo p_mt_info;
+
+  /*!
+   * Probabilities for pruning of various AV1 tools.
+   */
+  FrameProbInfo frame_probs;
 } AV1_PRIMARY;
 
 /*!
@@ -2143,11 +2615,6 @@
   AV1EncoderConfig oxcf;
 
   /*!
-   * When set, this flag indicates that the current frame is a forward keyframe.
-   */
-  int no_show_fwd_kf;
-
-  /*!
    * Stores the trellis optimization type at segment level.
    * optimize_seg_arr[i] stores the trellis opt type for ith segment.
    */
@@ -2197,9 +2664,9 @@
   YV12_BUFFER_CONFIG *unfiltered_source;
 
   /*!
-   * Parameters related to tpl.
+   * Skip tpl setup when tpl data from gop length decision can be reused.
    */
-  TplParams tpl_data;
+  int skip_tpl_setup_stats;
 
   /*!
    * Temporal filter context.
@@ -2207,11 +2674,6 @@
   TemporalFilterCtx tf_ctx;
 
   /*!
-   * For a still frame, this flag is set to 1 to skip partition search.
-   */
-  int partition_search_skippable_frame;
-
-  /*!
    * Variables related to forcing integer mv decisions for the current frame.
    */
   ForceIntegerMVInfo force_intpel_info;
@@ -2230,15 +2692,7 @@
   /*!
    * Refresh frame flags for golden, bwd-ref and alt-ref frames.
    */
-  RefreshFrameFlagsInfo refresh_frame;
-
-  /*!
-   * For each type of reference frame, this contains the index of a reference
-   * frame buffer for a reference frame of the same type.  We use this to
-   * choose our primary reference frame (which is the most recent reference
-   * frame of the same type as the current frame).
-   */
-  int fb_of_context_type[REF_FRAMES];
+  RefreshFrameInfo refresh_frame;
 
   /*!
    * Flags signalled by the external interface at frame level.
@@ -2340,90 +2794,25 @@
   ActiveMap active_map;
 
   /*!
-   * Function pointers to variants of sse/sad/variance computation functions.
-   * fn_ptr[i] indicates the list of function pointers corresponding to block
-   * size i.
-   */
-  aom_variance_fn_ptr_t fn_ptr[BLOCK_SIZES_ALL];
-
-  /*!
-   * Information related to two pass encoding.
-   */
-  TWO_PASS twopass;
-
-  /*!
-   * Information related to a gf group.
-   */
-  GF_GROUP gf_group;
-
-  /*!
    * The frame processing order within a GOP.
    */
   unsigned char gf_frame_index;
 
   /*!
-   * Track prior gf group state.
-   */
-  GF_STATE gf_state;
-
-  /*!
    * To control the reference frame buffer and selection.
    */
   RefBufferStack ref_buffer_stack;
 
-  /*!
-   * Frame buffer holding the temporally filtered source frame. It can be KEY
-   * frame or ARF frame.
-   */
-  YV12_BUFFER_CONFIG alt_ref_buffer;
-
-  /*!
-   * Tell if OVERLAY frame shows existing alt_ref frame.
-   */
-  int show_existing_alt_ref;
-
 #if CONFIG_INTERNAL_STATS
   /*!\cond */
-  uint64_t time_receive_data;
   uint64_t time_compress_data;
 
   unsigned int mode_chosen_counts[MAX_MODES];
-
-  int count[2];
-  uint64_t total_sq_error[2];
-  uint64_t total_samples[2];
-  ImageStat psnr[2];
-
-  double total_blockiness;
-  double worst_blockiness;
-
   int bytes;
-  double summed_quality;
-  double summed_weights;
-  double summed_quality_hbd;
-  double summed_weights_hbd;
-  unsigned int tot_recode_hits;
-  double worst_ssim;
-  double worst_ssim_hbd;
-
-  ImageStat fastssim;
-  ImageStat psnrhvs;
-
-  int b_calculate_blockiness;
-  int b_calculate_consistency;
-
-  double total_inconsistency;
-  double worst_consistency;
-  Ssimv *ssim_vars;
-  Metrics metrics;
+  unsigned int frame_recode_hits;
   /*!\endcond */
 #endif
 
-  /*!
-   * Calculates PSNR on each frame when set to 1.
-   */
-  int b_calculate_psnr;
-
 #if CONFIG_SPEED_STATS
   /*!
    * For debugging: number of transform searches we have performed.
@@ -2490,12 +2879,50 @@
    */
   VarBasedPartitionInfo vbp_info;
 
+#if CONFIG_FRAME_PARALLEL_ENCODE
   /*!
-   * Probabilities for pruning of various AV1 tools.
+   * Number of recodes in the frame.
    */
-  FrameProbInfo frame_probs;
+  int num_frame_recode;
 
   /*!
+   * Current frame probability of parallel frames, across recodes.
+   */
+  FrameProbInfo frame_new_probs[NUM_RECODES_PER_FRAME];
+
+  /*!
+   * Retain condition for transform type frame_probability calculation
+   */
+  int do_update_frame_probs_txtype[NUM_RECODES_PER_FRAME];
+
+  /*!
+   * Retain condition for obmc frame_probability calculation
+   */
+  int do_update_frame_probs_obmc[NUM_RECODES_PER_FRAME];
+
+  /*!
+   * Retain condition for warped motion frame_probability calculation
+   */
+  int do_update_frame_probs_warp[NUM_RECODES_PER_FRAME];
+
+  /*!
+   * Retain condition for interpolation filter frame_probability calculation
+   */
+  int do_update_frame_probs_interpfilter[NUM_RECODES_PER_FRAME];
+
+  /*!
+   * Retain condition for fast_extra_bits calculation.
+   */
+  int do_update_vbr_bits_off_target_fast;
+
+  /*!
+   * Updated framerate for the current parallel frame.
+   * cpi->framerate is updated with new_framerate during
+   * post encode updates for parallel frames.
+   */
+  double new_framerate;
+#endif
+  /*!
    * Multi-threading parameters.
    */
   MultiThreadInfo mt_info;
@@ -2569,6 +2996,10 @@
    * component_time[] are initialized to zero while encoder starts.
    */
   uint64_t component_time[kTimingComponents];
+  /*!
+   * Stores timing for individual components between calls of start_timing()
+   * and end_timing().
+   */
   struct aom_usec_timer component_timer[kTimingComponents];
   /*!
    * frame_component_time[] are initialized to zero at beginning of each frame.
@@ -2577,9 +3008,9 @@
 #endif
 
   /*!
-   * Parameters for AV1 bitstream levels.
+   * Count the number of OBU_FRAME and OBU_FRAME_HEADER for level calculation.
    */
-  AV1LevelParams level_params;
+  int frame_header_count;
 
   /*!
    * Whether any no-zero delta_q was actually used.
@@ -2592,20 +3023,6 @@
   RefFrameDistanceInfo ref_frame_dist_info;
 
   /*!
-   * Scaling factors used in the RD multiplier modulation.
-   * TODO(sdeng): consider merge the following arrays.
-   * tpl_rdmult_scaling_factors is a temporary buffer used to store the
-   * intermediate scaling factors which are used in the calculation of
-   * tpl_sb_rdmult_scaling_factors. tpl_rdmult_scaling_factors[i] stores the
-   * intermediate scaling factor of the ith 16 x 16 block in raster scan order.
-   */
-  double *tpl_rdmult_scaling_factors;
-  /*!
-   * tpl_sb_rdmult_scaling_factors[i] stores the RD multiplier scaling factor of
-   * the ith 16 x 16 block in raster scan order.
-   */
-  double *tpl_sb_rdmult_scaling_factors;
-  /*!
    * ssim_rdmult_scaling_factors[i] stores the RD multiplier scaling factor of
    * the ith 16 x 16 block in raster scan order. This scaling factor is used for
    * RD multiplier modulation when SSIM tuning is enabled.
@@ -2627,30 +3044,16 @@
 #endif
 
   /*!
-   * Indicates whether to use SVC.
-   */
-  int use_svc;
-  /*!
    * Parameters for scalable video coding.
    */
   SVC svc;
 
   /*!
-   * Flag indicating whether look ahead processing (LAP) is enabled.
-   */
-  int lap_enabled;
-  /*!
    * Indicates whether current processing stage is encode stage or LAP stage.
    */
   COMPRESSOR_STAGE compressor_stage;
 
   /*!
-   * Some motion vector stats from the last encoded frame to help us decide what
-   * precision to use to encode the current frame.
-   */
-  MV_STATS mv_stats;
-
-  /*!
    * Frame type of the last frame. May be used in some heuristics for speeding
    * up the encoding.
    */
@@ -2692,11 +3095,6 @@
   uint8_t *consec_zero_mv;
 
   /*!
-   * Number of frames left to be encoded, is 0 if limit is not set.
-   */
-  int frames_left;
-
-  /*!
    * Block size of first pass encoding
    */
   BLOCK_SIZE fp_block_size;
@@ -2706,6 +3104,99 @@
    * This number starts from 0 and increases whenever a super block is encoded.
    */
   int sb_counter;
+
+  /*!
+   * Available bitstream buffer size in bytes
+   */
+  size_t available_bs_size;
+
+  /*!
+   * The controller of the external partition model.
+   * It is used to do partition type selection based on external models.
+   */
+  ExtPartController ext_part_controller;
+
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  /*!
+   * A flag to indicate frames that will update their data to the primary
+   * context at the end of the encode. It is set for non-parallel frames and the
+   * last frame in encode order in a given parallel encode set.
+   */
+  bool do_frame_data_update;
+
+  /*!
+   * Motion vector stats of the current encoded frame, used to update the
+   * ppi->mv_stats during postencode.
+   */
+  MV_STATS mv_stats;
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+  /*!
+   * Stores the reference refresh index for the current frame.
+   */
+  int ref_refresh_index;
+
+  /*!
+   * A flag to indicate if the reference refresh index is available for the
+   * current frame.
+   */
+  bool refresh_idx_available;
+
+  /*!
+   * Reference frame index corresponding to the frame to be excluded from being
+   * used as a reference by frame_parallel_level 2 frame in a parallel
+   * encode set of lower layer frames.
+   */
+  int ref_idx_to_skip;
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+#if CONFIG_RD_COMMAND
+  /*!
+   *  A structure for assigning external q_index / rdmult for experiments
+   */
+  RD_COMMAND rd_command;
+#endif  // CONFIG_RD_COMMAND
+
+  /*!
+   * Buffer to store MB variance after Wiener filter.
+   */
+  WeberStats *mb_weber_stats;
+
+  /*!
+   * Block size used in the calculation of MB Wiener variance.
+   */
+  BLOCK_SIZE weber_bsize;
+
+  /*!
+   * Frame level Wiener filter normalization.
+   */
+  int64_t norm_wiener_variance;
+
+  /*!
+   * Buffer to store delta-q values for delta-q mode 4.
+   */
+  int *mb_delta_q;
+
+  /*!
+   * Flag to indicate that current frame is dropped.
+   */
+  bool is_dropped_frame;
+
+#if CONFIG_BITRATE_ACCURACY
+  /*!
+   * Structure stores information needed for bitrate accuracy experiment.
+   */
+  VBR_RATECTRL_INFO vbr_rc_info;
+#endif
+
+  /*!
+   * Frame level twopass status and control data
+   */
+  TWO_PASS_FRAME twopass_frame;
+
+  /*!
+   * Context needed for third pass encoding.
+   */
+  THIRD_PASS_DEC_CTX *third_pass_ctx;
 } AV1_COMP;
 
 /*!
@@ -2764,7 +3255,7 @@
    *  Flags which determine which reference buffers are refreshed by this
    *  frame.
    */
-  RefreshFrameFlagsInfo refresh_frame;
+  RefreshFrameInfo refresh_frame;
 
   /*!
    *  Speed level to use for this frame: Bigger number means faster.
@@ -2785,27 +3276,60 @@
 
 struct AV1_COMP *av1_create_compressor(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf,
                                        BufferPool *const pool,
-                                       FIRSTPASS_STATS *frame_stats_buf,
                                        COMPRESSOR_STAGE stage,
-                                       int num_lap_buffers,
-                                       int lap_lag_in_frames,
-                                       STATS_BUFFER_CTX *stats_buf_context);
+                                       int lap_lag_in_frames);
 
 struct AV1_PRIMARY *av1_create_primary_compressor(
-    struct aom_codec_pkt_list *pkt_list_head);
+    struct aom_codec_pkt_list *pkt_list_head, int num_lap_buffers,
+    AV1EncoderConfig *oxcf);
 
 void av1_remove_compressor(AV1_COMP *cpi);
 
 void av1_remove_primary_compressor(AV1_PRIMARY *ppi);
 
-void av1_change_config(AV1_COMP *cpi, const AV1EncoderConfig *oxcf);
+#if CONFIG_ENTROPY_STATS
+void print_entropy_stats(AV1_PRIMARY *const ppi);
+#endif
+#if CONFIG_INTERNAL_STATS
+void print_internal_stats(AV1_PRIMARY *ppi);
+#endif
+
+void av1_change_config_seq(AV1_PRIMARY *ppi, const AV1EncoderConfig *oxcf,
+                           bool *sb_size_changed);
+
+void av1_change_config(AV1_COMP *cpi, const AV1EncoderConfig *oxcf,
+                       bool sb_size_changed);
 
 void av1_check_initial_width(AV1_COMP *cpi, int use_highbitdepth,
                              int subsampling_x, int subsampling_y);
 
-void av1_init_seq_coding_tools(SequenceHeader *seq, AV1_COMMON *cm,
+void av1_init_seq_coding_tools(AV1_PRIMARY *const ppi,
                                const AV1EncoderConfig *oxcf, int use_svc);
 
+void av1_post_encode_updates(AV1_COMP *const cpi,
+                             const AV1_COMP_DATA *const cpi_data);
+
+#if CONFIG_FRAME_PARALLEL_ENCODE
+void av1_scale_references_fpmt(AV1_COMP *cpi, int *ref_buffers_used_map);
+
+void av1_increment_scaled_ref_counts_fpmt(BufferPool *buffer_pool,
+                                          int ref_buffers_used_map);
+
+void av1_release_scaled_references_fpmt(AV1_COMP *cpi);
+
+void av1_decrement_ref_counts_fpmt(BufferPool *buffer_pool,
+                                   int ref_buffers_used_map);
+
+void av1_init_sc_decisions(AV1_PRIMARY *const ppi);
+
+AV1_COMP *av1_get_parallel_frame_enc_data(AV1_PRIMARY *const ppi,
+                                          AV1_COMP_DATA *const first_cpi_data);
+
+int av1_init_parallel_frame_context(const AV1_COMP_DATA *const first_cpi_data,
+                                    AV1_PRIMARY *const ppi,
+                                    int *ref_buffers_used_map);
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+
 /*!\endcond */
 
 /*!\brief Obtain the raw frame data
@@ -2835,16 +3359,11 @@
  * \callergraph
  * This function encodes the raw frame data, and outputs the frame bit stream
  * to the designated buffer. The caller should use the output parameters
- * *time_stamp and *time_end only when this function returns AOM_CODEC_OK.
+ * cpi_data->ts_frame_start and cpi_data->ts_frame_end only when this function
+ * returns AOM_CODEC_OK.
  *
- * \param[in]    cpi         Top-level encoder structure
- * \param[in]    frame_flags Flags to decide how to encoding the frame
- * \param[in]    size        Bitstream size
- * \param[in]    dest        Bitstream output
- * \param[out]   time_stamp  Time stamp of the frame
- * \param[out]   time_end    Time end
- * \param[in]    flush       Decide to encode one frame or the rest of frames
- * \param[in]    timebase    Time base used
+ * \param[in]     cpi         Top-level encoder structure
+ * \param[in,out] cpi_data    Data corresponding to a frame encode
  *
  * \return Returns a value to indicate if the encoding is done successfully.
  * \retval #AOM_CODEC_OK
@@ -2852,10 +3371,7 @@
  *     No frame encoded; more input is required.
  * \retval #AOM_CODEC_ERROR
  */
-int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
-                            size_t *size, uint8_t *dest, int64_t *time_stamp,
-                            int64_t *time_end, int flush,
-                            const aom_rational64_t *timebase);
+int av1_get_compressed_data(AV1_COMP *cpi, AV1_COMP_DATA *const cpi_data);
 
 /*!\brief Run 1-pass/2-pass encoding
  *
@@ -2915,6 +3431,47 @@
 
 void av1_update_frame_size(AV1_COMP *cpi);
 
+#if CONFIG_FRAME_PARALLEL_ENCODE
+typedef struct {
+  int pyr_level;
+  int disp_order;
+} RefFrameMapPair;
+
+static INLINE void init_ref_map_pair(
+    AV1_COMP *cpi, RefFrameMapPair ref_frame_map_pairs[REF_FRAMES]) {
+  if (cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == KF_UPDATE) {
+    memset(ref_frame_map_pairs, -1, sizeof(*ref_frame_map_pairs) * REF_FRAMES);
+    return;
+  }
+  memset(ref_frame_map_pairs, 0, sizeof(*ref_frame_map_pairs) * REF_FRAMES);
+  for (int map_idx = 0; map_idx < REF_FRAMES; map_idx++) {
+    // Get reference frame buffer.
+    const RefCntBuffer *const buf = cpi->common.ref_frame_map[map_idx];
+    if (ref_frame_map_pairs[map_idx].disp_order == -1) continue;
+    if (buf == NULL) {
+      ref_frame_map_pairs[map_idx].disp_order = -1;
+      ref_frame_map_pairs[map_idx].pyr_level = -1;
+      continue;
+    } else if (buf->ref_count > 1) {
+      // Once the keyframe is coded, the slots in ref_frame_map will all
+      // point to the same frame. In that case, all subsequent pointers
+      // matching the current are considered "free" slots. This will find
+      // the next occurrence of the current pointer if ref_count indicates
+      // there are multiple instances of it and mark it as free.
+      for (int idx2 = map_idx + 1; idx2 < REF_FRAMES; ++idx2) {
+        const RefCntBuffer *const buf2 = cpi->common.ref_frame_map[idx2];
+        if (buf2 == buf) {
+          ref_frame_map_pairs[idx2].disp_order = -1;
+          ref_frame_map_pairs[idx2].pyr_level = -1;
+        }
+      }
+    }
+    ref_frame_map_pairs[map_idx].disp_order = (int)buf->display_order_hint;
+    ref_frame_map_pairs[map_idx].pyr_level = buf->pyramid_level;
+  }
+}
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+
 // TODO(jingning): Move these functions as primitive members for the new cpi
 // class.
 static INLINE void stack_push(int *stack, int *stack_size, int item) {
@@ -2962,7 +3519,7 @@
 }
 
 static INLINE int frame_is_kf_gf_arf(const AV1_COMP *cpi) {
-  const GF_GROUP *const gf_group = &cpi->gf_group;
+  const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
   const FRAME_UPDATE_TYPE update_type =
       gf_group->update_type[cpi->gf_frame_index];
 
@@ -3023,22 +3580,33 @@
   return lag_in_frames >= ALT_MIN_LAG && enable_auto_arf;
 }
 
+static AOM_INLINE int can_disable_altref(const GFConfig *gf_cfg) {
+  return is_altref_enabled(gf_cfg->lag_in_frames, gf_cfg->enable_auto_arf) &&
+         (gf_cfg->gf_min_pyr_height == 0);
+}
+
+// Helper function to compute number of blocks on either side of the frame.
+static INLINE int get_num_blocks(const int frame_length, const int mb_length) {
+  return (frame_length + mb_length - 1) / mb_length;
+}
+
 // Check if statistics generation stage
 static INLINE int is_stat_generation_stage(const AV1_COMP *const cpi) {
   assert(IMPLIES(cpi->compressor_stage == LAP_STAGE,
-                 cpi->oxcf.pass == 0 && cpi->lap_enabled));
-  return (cpi->oxcf.pass == 1 || (cpi->compressor_stage == LAP_STAGE));
+                 cpi->oxcf.pass == AOM_RC_ONE_PASS && cpi->ppi->lap_enabled));
+  return (cpi->oxcf.pass == AOM_RC_FIRST_PASS ||
+          (cpi->compressor_stage == LAP_STAGE));
 }
 // Check if statistics consumption stage
 static INLINE int is_stat_consumption_stage_twopass(const AV1_COMP *const cpi) {
-  return (cpi->oxcf.pass == 2);
+  return (cpi->oxcf.pass >= AOM_RC_SECOND_PASS);
 }
 
 // Check if statistics consumption stage
 static INLINE int is_stat_consumption_stage(const AV1_COMP *const cpi) {
   return (is_stat_consumption_stage_twopass(cpi) ||
-          (cpi->oxcf.pass == 0 && (cpi->compressor_stage == ENCODE_STAGE) &&
-           cpi->lap_enabled));
+          (cpi->oxcf.pass == AOM_RC_ONE_PASS &&
+           (cpi->compressor_stage == ENCODE_STAGE) && cpi->ppi->lap_enabled));
 }
 
 /*!\endcond */
@@ -3051,11 +3619,18 @@
  * \return 0 if no stats for current stage else 1
  */
 static INLINE int has_no_stats_stage(const AV1_COMP *const cpi) {
-  assert(IMPLIES(!cpi->lap_enabled, cpi->compressor_stage == ENCODE_STAGE));
-  return (cpi->oxcf.pass == 0 && !cpi->lap_enabled);
+  assert(
+      IMPLIES(!cpi->ppi->lap_enabled, cpi->compressor_stage == ENCODE_STAGE));
+  return (cpi->oxcf.pass == AOM_RC_ONE_PASS && !cpi->ppi->lap_enabled);
 }
+
 /*!\cond */
 
+static INLINE int is_one_pass_rt_params(const AV1_COMP *cpi) {
+  return has_no_stats_stage(cpi) && cpi->oxcf.mode == REALTIME &&
+         cpi->oxcf.gf_cfg.lag_in_frames == 0;
+}
+
 // Function return size of frame stats buffer
 static INLINE int get_stats_buf_size(int num_lap_buffer, int num_lag_buffer) {
   /* if lookahead is enabled return num_lap_buffers else num_lag_buffers */
@@ -3186,6 +3761,7 @@
     };
 
 static INLINE int get_ref_frame_flags(const SPEED_FEATURES *const sf,
+                                      const int use_one_pass_rt_params,
                                       const YV12_BUFFER_CONFIG **ref_frames,
                                       const int ext_ref_frame_flags) {
   // cpi->ext_flags.ref_frame_flags allows certain reference types to be
@@ -3197,12 +3773,12 @@
   for (int i = 1; i < INTER_REFS_PER_FRAME; ++i) {
     const YV12_BUFFER_CONFIG *const this_ref = ref_frames[i];
     // If this_ref has appeared before, mark the corresponding ref frame as
-    // invalid. For nonrd mode, only disable GOLDEN_FRAME if it's the same
-    // as LAST_FRAME or ALTREF_FRAME (if ALTREF is being used in nonrd).
-    int index = (sf->rt_sf.use_nonrd_pick_mode &&
-                 ref_frame_priority_order[i] == GOLDEN_FRAME)
-                    ? (1 + sf->rt_sf.use_nonrd_altref_frame)
-                    : i;
+    // invalid. For one_pass_rt mode, only disable GOLDEN_FRAME if it's the
+    // same as LAST_FRAME or ALTREF_FRAME (if ALTREF is being used in nonrd).
+    int index =
+        (use_one_pass_rt_params && ref_frame_priority_order[i] == GOLDEN_FRAME)
+            ? (1 + sf->rt_sf.use_nonrd_altref_frame)
+            : i;
     for (int j = 0; j < index; ++j) {
       if (this_ref == ref_frames[j]) {
         flags &= ~(1 << (ref_frame_priority_order[i] - 1));
@@ -3222,7 +3798,7 @@
 // Note: The OBU returned is in Low Overhead Bitstream Format. Specifically,
 // the obu_has_size_field bit is set, and the buffer contains the obu_size
 // field.
-aom_fixed_buf_t *av1_get_global_headers(AV1_COMP *cpi);
+aom_fixed_buf_t *av1_get_global_headers(AV1_PRIMARY *ppi);
 
 #define MAX_GFUBOOST_FACTOR 10.0
 #define MIN_GFUBOOST_FACTOR 4.0
@@ -3255,14 +3831,23 @@
 static AOM_INLINE int is_psnr_calc_enabled(const AV1_COMP *cpi) {
   const AV1_COMMON *const cm = &cpi->common;
 
-  return cpi->b_calculate_psnr && !is_stat_generation_stage(cpi) &&
+  return cpi->ppi->b_calculate_psnr && !is_stat_generation_stage(cpi) &&
          cm->show_frame;
 }
 
+static INLINE int is_frame_resize_pending(AV1_COMP *const cpi) {
+  ResizePendingParams *const resize_pending_params =
+      &cpi->resize_pending_params;
+  return (resize_pending_params->width && resize_pending_params->height &&
+          (cpi->common.width != resize_pending_params->width ||
+           cpi->common.height != resize_pending_params->height));
+}
+
 #if CONFIG_AV1_TEMPORAL_DENOISING
 static INLINE int denoise_svc(const struct AV1_COMP *const cpi) {
-  return (!cpi->use_svc || (cpi->use_svc && cpi->svc.spatial_layer_id >=
-                                                cpi->svc.first_layer_denoise));
+  return (!cpi->ppi->use_svc ||
+          (cpi->ppi->use_svc &&
+           cpi->svc.spatial_layer_id >= cpi->svc.first_layer_denoise));
 }
 #endif
 
diff --git a/av1/encoder/encoder_alloc.h b/av1/encoder/encoder_alloc.h
index ef24a31..01719c5 100644
--- a/av1/encoder/encoder_alloc.h
+++ b/av1/encoder/encoder_alloc.h
@@ -12,8 +12,10 @@
 #ifndef AOM_AV1_ENCODER_ENCODER_ALLOC_H_
 #define AOM_AV1_ENCODER_ENCODER_ALLOC_H_
 
+#include "av1/encoder/block.h"
 #include "av1/encoder/encoder.h"
 #include "av1/encoder/encodetxb.h"
+#include "av1/encoder/ethread.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -56,7 +58,7 @@
   TokenInfo *token_info = &cpi->token_info;
 
   if (av1_alloc_context_buffers(cm, cm->width, cm->height)) {
-    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate context buffers");
   }
 
@@ -85,7 +87,8 @@
   CHECK_MEM_ERROR(cm, cpi->td.mb.dv_costs,
                   (IntraBCMVCosts *)aom_malloc(sizeof(*cpi->td.mb.dv_costs)));
 
-  av1_setup_shared_coeff_buffer(&cpi->common, &cpi->td.shared_coeff_buf);
+  av1_setup_shared_coeff_buffer(cm->seq_params, &cpi->td.shared_coeff_buf,
+                                cm->error);
   av1_setup_sms_tree(cpi, &cpi->td);
   cpi->td.firstpass_ctx =
       av1_alloc_pmc(cpi, BLOCK_16X16, &cpi->td.shared_coeff_buf);
@@ -112,20 +115,20 @@
                   aom_calloc(mi_params->mi_rows * mi_params->mi_cols, 1));
 }
 
-static AOM_INLINE void alloc_obmc_buffers(OBMCBuffer *obmc_buffer,
-                                          AV1_COMMON *cm) {
-  CHECK_MEM_ERROR(
-      cm, obmc_buffer->wsrc,
+static AOM_INLINE void alloc_obmc_buffers(
+    OBMCBuffer *obmc_buffer, struct aom_internal_error_info *error) {
+  AOM_CHECK_MEM_ERROR(
+      error, obmc_buffer->wsrc,
       (int32_t *)aom_memalign(16, MAX_SB_SQUARE * sizeof(*obmc_buffer->wsrc)));
-  CHECK_MEM_ERROR(
-      cm, obmc_buffer->mask,
+  AOM_CHECK_MEM_ERROR(
+      error, obmc_buffer->mask,
       (int32_t *)aom_memalign(16, MAX_SB_SQUARE * sizeof(*obmc_buffer->mask)));
-  CHECK_MEM_ERROR(
-      cm, obmc_buffer->above_pred,
+  AOM_CHECK_MEM_ERROR(
+      error, obmc_buffer->above_pred,
       (uint8_t *)aom_memalign(
           16, MAX_MB_PLANE * MAX_SB_SQUARE * sizeof(*obmc_buffer->above_pred)));
-  CHECK_MEM_ERROR(
-      cm, obmc_buffer->left_pred,
+  AOM_CHECK_MEM_ERROR(
+      error, obmc_buffer->left_pred,
       (uint8_t *)aom_memalign(
           16, MAX_MB_PLANE * MAX_SB_SQUARE * sizeof(*obmc_buffer->left_pred)));
 }
@@ -143,22 +146,22 @@
 }
 
 static AOM_INLINE void alloc_compound_type_rd_buffers(
-    AV1_COMMON *const cm, CompoundTypeRdBuffers *const bufs) {
-  CHECK_MEM_ERROR(
-      cm, bufs->pred0,
+    struct aom_internal_error_info *error, CompoundTypeRdBuffers *const bufs) {
+  AOM_CHECK_MEM_ERROR(
+      error, bufs->pred0,
       (uint8_t *)aom_memalign(16, 2 * MAX_SB_SQUARE * sizeof(*bufs->pred0)));
-  CHECK_MEM_ERROR(
-      cm, bufs->pred1,
+  AOM_CHECK_MEM_ERROR(
+      error, bufs->pred1,
       (uint8_t *)aom_memalign(16, 2 * MAX_SB_SQUARE * sizeof(*bufs->pred1)));
-  CHECK_MEM_ERROR(
-      cm, bufs->residual1,
+  AOM_CHECK_MEM_ERROR(
+      error, bufs->residual1,
       (int16_t *)aom_memalign(32, MAX_SB_SQUARE * sizeof(*bufs->residual1)));
-  CHECK_MEM_ERROR(
-      cm, bufs->diff10,
+  AOM_CHECK_MEM_ERROR(
+      error, bufs->diff10,
       (int16_t *)aom_memalign(32, MAX_SB_SQUARE * sizeof(*bufs->diff10)));
-  CHECK_MEM_ERROR(cm, bufs->tmp_best_mask_buf,
-                  (uint8_t *)aom_malloc(2 * MAX_SB_SQUARE *
-                                        sizeof(*bufs->tmp_best_mask_buf)));
+  AOM_CHECK_MEM_ERROR(error, bufs->tmp_best_mask_buf,
+                      (uint8_t *)aom_malloc(2 * MAX_SB_SQUARE *
+                                            sizeof(*bufs->tmp_best_mask_buf)));
 }
 
 static AOM_INLINE void release_compound_type_rd_buffers(
@@ -193,12 +196,6 @@
   aom_free(cpi->ssim_rdmult_scaling_factors);
   cpi->ssim_rdmult_scaling_factors = NULL;
 
-  aom_free(cpi->tpl_rdmult_scaling_factors);
-  cpi->tpl_rdmult_scaling_factors = NULL;
-
-  aom_free(cpi->tpl_sb_rdmult_scaling_factors);
-  cpi->tpl_sb_rdmult_scaling_factors = NULL;
-
 #if CONFIG_TUNE_VMAF
   aom_free(cpi->vmaf_info.rdmult_scaling_factors);
   cpi->vmaf_info.rdmult_scaling_factors = NULL;
@@ -224,9 +221,6 @@
     cpi->td.mb.dv_costs = NULL;
   }
 
-  aom_free(cpi->td.mb.inter_modes_info);
-  cpi->td.mb.inter_modes_info = NULL;
-
   for (int i = 0; i < 2; i++)
     for (int j = 0; j < 2; j++) {
       aom_free(cpi->td.mb.intrabc_hash_info.hash_value_buffer[i][j]);
@@ -236,6 +230,11 @@
   aom_free(cm->tpl_mvs);
   cm->tpl_mvs = NULL;
 
+  if (cpi->td.pixel_gradient_info) {
+    aom_free(cpi->td.pixel_gradient_info);
+    cpi->td.pixel_gradient_info = NULL;
+  }
+
   if (cpi->td.vt64x64) {
     aom_free(cpi->td.vt64x64);
     cpi->td.vt64x64 = NULL;
@@ -244,7 +243,6 @@
   av1_free_pmc(cpi->td.firstpass_ctx, av1_num_planes(cm));
   cpi->td.firstpass_ctx = NULL;
 
-  av1_free_ref_frame_buffers(cm->buffer_pool);
   av1_free_txb_buf(cpi);
   av1_free_context_buffers(cm);
 
@@ -252,10 +250,17 @@
 #if !CONFIG_REALTIME_ONLY
   av1_free_restoration_buffers(cm);
 #endif
+
+  if (!is_stat_generation_stage(cpi)) {
+    int num_cdef_workers =
+        av1_get_num_mod_workers_for_alloc(&cpi->ppi->p_mt_info, MOD_CDEF);
+    av1_free_cdef_buffers(cm, &cpi->ppi->p_mt_info.cdef_worker,
+                          &cpi->mt_info.cdef_sync, num_cdef_workers);
+  }
+
   aom_free_frame_buffer(&cpi->trial_frame_rst);
   aom_free_frame_buffer(&cpi->scaled_source);
   aom_free_frame_buffer(&cpi->scaled_last_source);
-  aom_free_frame_buffer(&cpi->alt_ref_buffer);
 
   free_token_info(token_info);
 
@@ -280,21 +285,37 @@
     cpi->film_grain_table = NULL;
   }
 
-  for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) {
-    aom_free(cpi->level_params.level_info[i]);
-  }
-
-  if (cpi->use_svc) av1_free_svc_cyclic_refresh(cpi);
+  if (cpi->ppi->use_svc) av1_free_svc_cyclic_refresh(cpi);
 
   if (cpi->consec_zero_mv) {
     aom_free(cpi->consec_zero_mv);
     cpi->consec_zero_mv = NULL;
   }
+
+  aom_free(cpi->mb_weber_stats);
+  cpi->mb_weber_stats = NULL;
+
+  aom_free(cpi->mb_delta_q);
+  cpi->mb_delta_q = NULL;
+}
+
+static AOM_INLINE void allocate_gradient_info_for_hog(
+    PixelLevelGradientInfo **pixel_gradient_info, AV1_COMP *cpi) {
+  const AV1_COMMON *const cm = &cpi->common;
+
+  if (!*pixel_gradient_info) {
+    const int plane_types = PLANE_TYPES >> cm->seq_params->monochrome;
+    CHECK_MEM_ERROR(cm, *pixel_gradient_info,
+                    aom_malloc(sizeof(**pixel_gradient_info) * plane_types *
+                               MAX_SB_SQUARE));
+  }
+
+  cpi->td.mb.pixel_gradient_info = *pixel_gradient_info;
 }
 
 static AOM_INLINE void variance_partition_alloc(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
-  const int num_64x64_blocks = (cm->seq_params.sb_size == BLOCK_64X64) ? 1 : 4;
+  const int num_64x64_blocks = (cm->seq_params->sb_size == BLOCK_64X64) ? 1 : 4;
   if (cpi->td.vt64x64) {
     if (num_64x64_blocks != cpi->td.num_64x64_blocks) {
       aom_free(cpi->td.vt64x64);
@@ -310,7 +331,7 @@
 
 static AOM_INLINE void alloc_altref_frame_buffer(AV1_COMP *cpi) {
   AV1_COMMON *cm = &cpi->common;
-  const SequenceHeader *const seq_params = &cm->seq_params;
+  const SequenceHeader *const seq_params = cm->seq_params;
   const AV1EncoderConfig *oxcf = &cpi->oxcf;
 
   // When lag_in_frames <= 1, alt-ref frames are not enabled. In this case,
@@ -320,29 +341,29 @@
 
   // TODO(agrange) Check if ARF is enabled and skip allocation if not.
   if (aom_realloc_frame_buffer(
-          &cpi->alt_ref_buffer, oxcf->frm_dim_cfg.width,
+          &cpi->ppi->alt_ref_buffer, oxcf->frm_dim_cfg.width,
           oxcf->frm_dim_cfg.height, seq_params->subsampling_x,
           seq_params->subsampling_y, seq_params->use_highbitdepth,
           cpi->oxcf.border_in_pixels, cm->features.byte_alignment, NULL, NULL,
           NULL, cpi->oxcf.tool_cfg.enable_global_motion))
-    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate altref buffer");
 }
 
 static AOM_INLINE void alloc_util_frame_buffers(AV1_COMP *cpi) {
   AV1_COMMON *const cm = &cpi->common;
-  const SequenceHeader *const seq_params = &cm->seq_params;
+  const SequenceHeader *const seq_params = cm->seq_params;
   const int byte_alignment = cm->features.byte_alignment;
   if (aom_realloc_frame_buffer(
           &cpi->last_frame_uf, cm->width, cm->height, seq_params->subsampling_x,
           seq_params->subsampling_y, seq_params->use_highbitdepth,
           cpi->oxcf.border_in_pixels, byte_alignment, NULL, NULL, NULL, 0))
-    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate last frame buffer");
 
   // The frame buffer trial_frame_rst is used during loop restoration filter
   // search. Hence it is allocated only when loop restoration is used.
-  const int use_restoration = cm->seq_params.enable_restoration &&
+  const int use_restoration = cm->seq_params->enable_restoration &&
                               !cm->features.all_lossless &&
                               !cm->tiles.large_scale;
   if (use_restoration) {
@@ -351,7 +372,7 @@
             cm->superres_upscaled_height, seq_params->subsampling_x,
             seq_params->subsampling_y, seq_params->use_highbitdepth,
             AOM_RESTORATION_FRAME_BORDER, byte_alignment, NULL, NULL, NULL, 0))
-      aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+      aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
                          "Failed to allocate trial restored frame buffer");
   }
 
@@ -360,7 +381,7 @@
           seq_params->subsampling_y, seq_params->use_highbitdepth,
           cpi->oxcf.border_in_pixels, byte_alignment, NULL, NULL, NULL,
           cpi->oxcf.tool_cfg.enable_global_motion))
-    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to allocate scaled source buffer");
 
   // The frame buffer cpi->scaled_last_source is used to hold the previous
@@ -376,7 +397,7 @@
             seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels,
             byte_alignment, NULL, NULL, NULL,
             cpi->oxcf.tool_cfg.enable_global_motion))
-      aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+      aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
                          "Failed to allocate scaled last source buffer");
   }
 }
@@ -393,16 +414,16 @@
 
   if (aom_realloc_frame_buffer(
           &cpi->scaled_source, scaled_width, scaled_height,
-          cm->seq_params.subsampling_x, cm->seq_params.subsampling_y,
-          cm->seq_params.use_highbitdepth, AOM_BORDER_IN_PIXELS,
+          cm->seq_params->subsampling_x, cm->seq_params->subsampling_y,
+          cm->seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS,
           cm->features.byte_alignment, NULL, NULL, NULL,
           cpi->oxcf.tool_cfg.enable_global_motion))
-    aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+    aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
                        "Failed to reallocate scaled source buffer");
   assert(cpi->scaled_source.y_crop_width == scaled_width);
   assert(cpi->scaled_source.y_crop_height == scaled_height);
   av1_resize_and_extend_frame_nonnormative(
-      cpi->unscaled_source, &cpi->scaled_source, (int)cm->seq_params.bit_depth,
+      cpi->unscaled_source, &cpi->scaled_source, (int)cm->seq_params->bit_depth,
       num_planes);
   return &cpi->scaled_source;
 }
diff --git a/av1/encoder/encoder_utils.c b/av1/encoder/encoder_utils.c
index db6a763..66cd272 100644
--- a/av1/encoder/encoder_utils.c
+++ b/av1/encoder/encoder_utils.c
@@ -11,8 +11,6 @@
 
 #include "aom/aomcx.h"
 
-#include "aom_ports/system_state.h"
-
 #include "av1/encoder/bitstream.h"
 #include "av1/encoder/encodeframe.h"
 #include "av1/encoder/encoder.h"
@@ -25,6 +23,7 @@
 #include "av1/encoder/rdopt.h"
 #include "av1/encoder/segmentation.h"
 #include "av1/encoder/superres_scale.h"
+#include "av1/encoder/tpl_model.h"
 #include "av1/encoder/var_based_part.h"
 
 #if CONFIG_TUNE_VMAF
@@ -311,7 +310,8 @@
   const RATE_CONTROL *const rc = &cpi->rc;
   struct segmentation *const seg = &cm->seg;
 
-  int high_q = (int)(rc->avg_q > 48.0);
+  double avg_q = cpi->ppi->p_rc.avg_q;
+  int high_q = (int)(avg_q > 48.0);
   int qi_delta;
 
   // Disable and clear down for KF
@@ -343,8 +343,8 @@
       seg->update_map = 1;
       seg->update_data = 1;
 
-      qi_delta = av1_compute_qdelta(rc, rc->avg_q, rc->avg_q * 0.875,
-                                    cm->seq_params.bit_depth);
+      qi_delta = av1_compute_qdelta(rc, avg_q, avg_q * 0.875,
+                                    cm->seq_params->bit_depth);
       av1_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta - 2);
       av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_H, -2);
       av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_V, -2);
@@ -459,13 +459,13 @@
 
 #if !CONFIG_REALTIME_ONLY
 static void process_tpl_stats_frame(AV1_COMP *cpi) {
-  const GF_GROUP *const gf_group = &cpi->gf_group;
+  const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
   AV1_COMMON *const cm = &cpi->common;
 
   assert(IMPLIES(gf_group->size > 0, cpi->gf_frame_index < gf_group->size));
 
   const int tpl_idx = cpi->gf_frame_index;
-  TplParams *const tpl_data = &cpi->tpl_data;
+  TplParams *const tpl_data = &cpi->ppi->tpl_data;
   TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx];
   TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
 
@@ -495,27 +495,26 @@
     if (mc_dep_cost_base == 0) {
       tpl_frame->is_valid = 0;
     } else {
-      aom_clear_system_state();
       cpi->rd.r0 = (double)intra_cost_base / mc_dep_cost_base;
       if (is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) {
-        if (cpi->lap_enabled) {
-          double min_boost_factor = sqrt(cpi->rc.baseline_gf_interval);
+        if (cpi->ppi->lap_enabled) {
+          double min_boost_factor = sqrt(cpi->ppi->p_rc.baseline_gf_interval);
           const int gfu_boost = get_gfu_boost_from_r0_lap(
               min_boost_factor, MAX_GFUBOOST_FACTOR, cpi->rd.r0,
-              cpi->rc.num_stats_required_for_gfu_boost);
+              cpi->ppi->p_rc.num_stats_required_for_gfu_boost);
           // printf("old boost %d new boost %d\n", cpi->rc.gfu_boost,
           //        gfu_boost);
-          cpi->rc.gfu_boost = combine_prior_with_tpl_boost(
-              min_boost_factor, MAX_BOOST_COMBINE_FACTOR, cpi->rc.gfu_boost,
-              gfu_boost, cpi->rc.num_stats_used_for_gfu_boost);
+          cpi->ppi->p_rc.gfu_boost = combine_prior_with_tpl_boost(
+              min_boost_factor, MAX_BOOST_COMBINE_FACTOR,
+              cpi->ppi->p_rc.gfu_boost, gfu_boost,
+              cpi->ppi->p_rc.num_stats_used_for_gfu_boost);
         } else {
           const int gfu_boost = (int)(200.0 / cpi->rd.r0);
-          cpi->rc.gfu_boost = combine_prior_with_tpl_boost(
+          cpi->ppi->p_rc.gfu_boost = combine_prior_with_tpl_boost(
               MIN_BOOST_COMBINE_FACTOR, MAX_BOOST_COMBINE_FACTOR,
-              cpi->rc.gfu_boost, gfu_boost, cpi->rc.frames_to_key);
+              cpi->ppi->p_rc.gfu_boost, gfu_boost, cpi->rc.frames_to_key);
         }
       }
-      aom_clear_system_state();
     }
   }
 }
@@ -529,7 +528,7 @@
   av1_set_speed_features_framesize_dependent(cpi, cpi->speed);
 
 #if !CONFIG_REALTIME_ONLY
-  GF_GROUP *gf_group = &cpi->gf_group;
+  GF_GROUP *gf_group = &cpi->ppi->gf_group;
   if (cpi->oxcf.algo_cfg.enable_tpl_model &&
       is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) {
     process_tpl_stats_frame(cpi);
@@ -538,8 +537,48 @@
 #endif
 
   // Decide q and q bounds.
-  *q = av1_rc_pick_q_and_bounds(cpi, &cpi->rc, cm->width, cm->height,
-                                cpi->gf_frame_index, bottom_index, top_index);
+  *q = av1_rc_pick_q_and_bounds(cpi, cm->width, cm->height, cpi->gf_frame_index,
+                                bottom_index, top_index);
+
+#if !CONFIG_REALTIME_ONLY
+  if (cpi->oxcf.rc_cfg.mode == AOM_Q &&
+      cpi->ppi->tpl_data.tpl_frame[cpi->gf_frame_index].is_valid &&
+      is_frame_tpl_eligible(gf_group, cpi->gf_frame_index) &&
+      !is_lossless_requested(&cpi->oxcf.rc_cfg) && !frame_is_intra_only(cm)) {
+    *q = av1_tpl_get_q_index(&cpi->ppi->tpl_data, cpi->gf_frame_index,
+                             cpi->rc.active_worst_quality,
+                             cm->seq_params->bit_depth);
+    *top_index = *bottom_index = *q;
+    if (gf_group->update_type[cpi->gf_frame_index] == ARF_UPDATE)
+      cpi->ppi->p_rc.arf_q = *q;
+  }
+
+  if (cpi->oxcf.q_cfg.use_fixed_qp_offsets && cpi->oxcf.rc_cfg.mode == AOM_Q) {
+    if (is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) {
+      const double qratio_grad =
+          cpi->ppi->p_rc.baseline_gf_interval > 20 ? 0.2 : 0.3;
+      const double qstep_ratio =
+          0.2 +
+          (1.0 - (double)cpi->rc.active_worst_quality / MAXQ) * qratio_grad;
+      *q = av1_get_q_index_from_qstep_ratio(
+          cpi->rc.active_worst_quality, qstep_ratio, cm->seq_params->bit_depth);
+      *top_index = *bottom_index = *q;
+      if (gf_group->update_type[cpi->gf_frame_index] == ARF_UPDATE ||
+          gf_group->update_type[cpi->gf_frame_index] == KF_UPDATE ||
+          gf_group->update_type[cpi->gf_frame_index] == GF_UPDATE)
+        cpi->ppi->p_rc.arf_q = *q;
+    } else if (gf_group->layer_depth[cpi->gf_frame_index] <
+               gf_group->max_layer_depth) {
+      int this_height = gf_group->layer_depth[cpi->gf_frame_index];
+      int arf_q = cpi->ppi->p_rc.arf_q;
+      while (this_height > 1) {
+        arf_q = (arf_q + cpi->oxcf.rc_cfg.cq_level + 1) / 2;
+        --this_height;
+      }
+      *top_index = *bottom_index = *q = arf_q;
+    }
+  }
+#endif
 
   // Configure experimental use of segmentation for enhanced coding of
   // static regions if indicated.
@@ -564,6 +603,23 @@
   memset(pars->ar_coeffs_cb, 0, sizeof(pars->ar_coeffs_cb));
 }
 
+void av1_update_film_grain_parameters_seq(struct AV1_PRIMARY *ppi,
+                                          const AV1EncoderConfig *oxcf) {
+  SequenceHeader *const seq_params = &ppi->seq_params;
+  const TuneCfg *const tune_cfg = &oxcf->tune_cfg;
+
+  if (tune_cfg->film_grain_test_vector || tune_cfg->film_grain_table_filename ||
+      tune_cfg->content == AOM_CONTENT_FILM) {
+    seq_params->film_grain_params_present = 1;
+  } else {
+#if CONFIG_DENOISE
+    seq_params->film_grain_params_present = (oxcf->noise_level > 0);
+#else
+    seq_params->film_grain_params_present = 0;
+#endif
+  }
+}
+
 void av1_update_film_grain_parameters(struct AV1_COMP *cpi,
                                       const AV1EncoderConfig *oxcf) {
   AV1_COMMON *const cm = &cpi->common;
@@ -577,39 +633,30 @@
   }
 
   if (tune_cfg->film_grain_test_vector) {
-    cm->seq_params.film_grain_params_present = 1;
     if (cm->current_frame.frame_type == KEY_FRAME) {
       memcpy(&cm->film_grain_params,
              film_grain_test_vectors + tune_cfg->film_grain_test_vector - 1,
              sizeof(cm->film_grain_params));
       if (oxcf->tool_cfg.enable_monochrome)
         reset_film_grain_chroma_params(&cm->film_grain_params);
-      cm->film_grain_params.bit_depth = cm->seq_params.bit_depth;
-      if (cm->seq_params.color_range == AOM_CR_FULL_RANGE) {
+      cm->film_grain_params.bit_depth = cm->seq_params->bit_depth;
+      if (cm->seq_params->color_range == AOM_CR_FULL_RANGE) {
         cm->film_grain_params.clip_to_restricted_range = 0;
       }
     }
   } else if (tune_cfg->film_grain_table_filename) {
-    cm->seq_params.film_grain_params_present = 1;
-
     cpi->film_grain_table = aom_malloc(sizeof(*cpi->film_grain_table));
     memset(cpi->film_grain_table, 0, sizeof(aom_film_grain_table_t));
 
     aom_film_grain_table_read(cpi->film_grain_table,
-                              tune_cfg->film_grain_table_filename, &cm->error);
+                              tune_cfg->film_grain_table_filename, cm->error);
   } else if (tune_cfg->content == AOM_CONTENT_FILM) {
-    cm->seq_params.film_grain_params_present = 1;
-    cm->film_grain_params.bit_depth = cm->seq_params.bit_depth;
+    cm->film_grain_params.bit_depth = cm->seq_params->bit_depth;
     if (oxcf->tool_cfg.enable_monochrome)
       reset_film_grain_chroma_params(&cm->film_grain_params);
-    if (cm->seq_params.color_range == AOM_CR_FULL_RANGE)
+    if (cm->seq_params->color_range == AOM_CR_FULL_RANGE)
       cm->film_grain_params.clip_to_restricted_range = 0;
   } else {
-#if CONFIG_DENOISE
-    cm->seq_params.film_grain_params_present = (cpi->oxcf.noise_level > 0);
-#else
-    cm->seq_params.film_grain_params_present = 0;
-#endif
     memset(&cm->film_grain_params, 0, sizeof(cm->film_grain_params));
   }
 }
@@ -643,7 +690,7 @@
           if (aom_yv12_realloc_with_new_border(
                   &ref_fb->buf, AOM_BORDER_IN_PIXELS,
                   cm->features.byte_alignment, num_planes) != 0) {
-            aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+            aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
                                "Failed to allocate frame buffer");
           }
         }
@@ -652,7 +699,7 @@
         if (new_fb == NULL) {
           const int new_fb_idx = get_free_fb(cm);
           if (new_fb_idx == INVALID_IDX) {
-            aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+            aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
                                "Unable to find free frame buffer");
           }
           force_scaling = 1;
@@ -663,30 +710,30 @@
             new_fb->buf.y_crop_height != cm->height) {
           if (aom_realloc_frame_buffer(
                   &new_fb->buf, cm->width, cm->height,
-                  cm->seq_params.subsampling_x, cm->seq_params.subsampling_y,
-                  cm->seq_params.use_highbitdepth, AOM_BORDER_IN_PIXELS,
+                  cm->seq_params->subsampling_x, cm->seq_params->subsampling_y,
+                  cm->seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS,
                   cm->features.byte_alignment, NULL, NULL, NULL, 0)) {
             if (force_scaling) {
               // Release the reference acquired in the get_free_fb() call above.
               --new_fb->ref_count;
             }
-            aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+            aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR,
                                "Failed to allocate frame buffer");
           }
 #if CONFIG_AV1_HIGHBITDEPTH
-          if (use_optimized_scaler && cm->seq_params.bit_depth == AOM_BITS_8)
+          if (use_optimized_scaler && cm->seq_params->bit_depth == AOM_BITS_8)
             av1_resize_and_extend_frame(ref, &new_fb->buf, filter, phase,
                                         num_planes);
           else
             av1_resize_and_extend_frame_nonnormative(
-                ref, &new_fb->buf, (int)cm->seq_params.bit_depth, num_planes);
+                ref, &new_fb->buf, (int)cm->seq_params->bit_depth, num_planes);
 #else
           if (use_optimized_scaler)
             av1_resize_and_extend_frame(ref, &new_fb->buf, filter, phase,
                                         num_planes);
           else
             av1_resize_and_extend_frame_nonnormative(
-                ref, &new_fb->buf, (int)cm->seq_params.bit_depth, num_planes);
+                ref, &new_fb->buf, (int)cm->seq_params->bit_depth, num_planes);
 #endif
           cpi->scaled_ref_buf[ref_frame - 1] = new_fb;
           alloc_frame_mvs(cm, new_fb);
@@ -704,18 +751,23 @@
   }
 }
 
-BLOCK_SIZE av1_select_sb_size(const AV1_COMP *const cpi) {
-  const AV1_COMMON *const cm = &cpi->common;
-  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
-
+BLOCK_SIZE av1_select_sb_size(const AV1EncoderConfig *const oxcf, int width,
+                              int height, int number_spatial_layers) {
   if (oxcf->tool_cfg.superblock_size == AOM_SUPERBLOCK_SIZE_64X64)
     return BLOCK_64X64;
   if (oxcf->tool_cfg.superblock_size == AOM_SUPERBLOCK_SIZE_128X128)
     return BLOCK_128X128;
 
+  // Force 64x64 superblock size to increase resolution in perceptual
+  // AQ mode.
+  if (oxcf->mode == ALLINTRA &&
+      (oxcf->q_cfg.deltaq_mode == DELTA_Q_PERCEPTUAL_AI ||
+       oxcf->q_cfg.deltaq_mode == DELTA_Q_USER_RATING_BASED))
+    return BLOCK_64X64;
+
   assert(oxcf->tool_cfg.superblock_size == AOM_SUPERBLOCK_SIZE_DYNAMIC);
 
-  if (cpi->svc.number_spatial_layers > 1 ||
+  if (number_spatial_layers > 1 ||
       oxcf->resize_cfg.resize_mode != RESIZE_NONE) {
     // Use the configured size (top resolution) for spatial layers or
     // on resize.
@@ -731,8 +783,9 @@
   // pass encoding, which is why this heuristic is not configured as a
   // speed-feature.
   if (oxcf->superres_cfg.superres_mode == AOM_SUPERRES_NONE &&
-      oxcf->resize_cfg.resize_mode == RESIZE_NONE && oxcf->speed >= 1) {
-    return AOMMIN(cm->width, cm->height) > 480 ? BLOCK_128X128 : BLOCK_64X64;
+      oxcf->resize_cfg.resize_mode == RESIZE_NONE &&
+      (oxcf->speed >= 1 || oxcf->mode == REALTIME)) {
+    return AOMMIN(width, height) > 480 ? BLOCK_128X128 : BLOCK_64X64;
   }
 
   return BLOCK_128X128;
@@ -754,7 +807,9 @@
   if ((cm->current_frame.frame_type == KEY_FRAME && cm->show_frame) ||
       frame_is_sframe(cm)) {
     if (!cpi->ppi->seq_params_locked) {
-      set_sb_size(&cm->seq_params, av1_select_sb_size(cpi));
+      set_sb_size(cm->seq_params,
+                  av1_select_sb_size(&cpi->oxcf, cm->width, cm->height,
+                                     cpi->svc.number_spatial_layers));
     }
   } else {
     const RefCntBuffer *const primary_ref_buf = get_primary_ref_frame_buf(cm);
@@ -923,7 +978,6 @@
       cpi->sf.part_sf.fixed_partition_size;
 
   // Setup necessary params for encoding, including frame source, etc.
-  aom_clear_system_state();
 
   cpi->source =
       av1_scale_if_required(cm, cpi->unscaled_source, &cpi->scaled_source,
@@ -955,11 +1009,11 @@
     set_encoding_params_for_screen_content(cpi, pass);
     av1_set_quantizer(cm, q_cfg->qm_minlevel, q_cfg->qm_maxlevel,
                       q_for_screen_content_quick_run,
-                      q_cfg->enable_chroma_deltaq);
+                      q_cfg->enable_chroma_deltaq, q_cfg->enable_hdr_deltaq);
     av1_set_speed_features_qindex_dependent(cpi, oxcf->speed);
     if (q_cfg->deltaq_mode != NO_DELTA_Q || q_cfg->enable_chroma_deltaq)
       av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params,
-                         cm->seq_params.bit_depth);
+                         cm->seq_params->bit_depth);
 
     av1_set_variance_partition_thresholds(cpi, q_for_screen_content_quick_run,
                                           0);
@@ -1005,13 +1059,13 @@
   AV1_COMMON *const cm = &cpi->common;
   CurrentFrame *const current_frame = &cm->current_frame;
 
-  if (!cm->seq_params.reduced_still_picture_hdr &&
+  if (!cm->seq_params->reduced_still_picture_hdr &&
       encode_show_existing_frame(cm)) {
     RefCntBuffer *const frame_to_show =
         cm->ref_frame_map[cpi->existing_fb_idx_to_show];
 
     if (frame_to_show == NULL) {
-      aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+      aom_internal_error(cm->error, AOM_CODEC_UNSUP_BITSTREAM,
                          "Buffer does not contain a reconstructed frame");
     }
     assert(frame_to_show->ref_count > 0);
@@ -1019,7 +1073,7 @@
   }
 
   if (!encode_show_existing_frame(cm) &&
-      cm->seq_params.film_grain_params_present &&
+      cm->seq_params->film_grain_params_present &&
       (cm->show_frame || cm->showable_frame)) {
     // Copy the current frame's film grain params to the its corresponding
     // RefCntBuffer slot.
@@ -1049,7 +1103,6 @@
 int av1_is_integer_mv(const YV12_BUFFER_CONFIG *cur_picture,
                       const YV12_BUFFER_CONFIG *last_picture,
                       ForceIntegerMVInfo *const force_intpel_info) {
-  aom_clear_system_state();
   // check use hash ME
   int k;
 
@@ -1232,7 +1285,7 @@
   cc->lf = cm->lf;
   cc->cdef_info = cm->cdef_info;
   cc->rc = cpi->rc;
-  cc->mv_stats = cpi->mv_stats;
+  cc->mv_stats = cpi->ppi->mv_stats;
 }
 
 void av1_save_all_coding_context(AV1_COMP *cpi) {
@@ -1302,10 +1355,10 @@
       "refresh_alt_ref_frame=%d, "
       "y_stride=%4d, uv_stride=%4d, cm->width=%4d, cm->height=%4d\n\n",
       current_frame->frame_number, cpi->gf_frame_index,
-      cpi->gf_group.update_type[cpi->gf_frame_index], current_frame->order_hint,
-      cm->show_frame, cm->show_existing_frame, cpi->rc.source_alt_ref_active,
-      cpi->refresh_frame.alt_ref_frame, recon_buf->y_stride,
-      recon_buf->uv_stride, cm->width, cm->height);
+      cpi->ppi->gf_group.update_type[cpi->gf_frame_index],
+      current_frame->order_hint, cm->show_frame, cm->show_existing_frame,
+      cpi->rc.source_alt_ref_active, cpi->refresh_frame.alt_ref_frame,
+      recon_buf->y_stride, recon_buf->uv_stride, cm->width, cm->height);
 #if 0
   int ref_frame;
   printf("get_ref_frame_map_idx: [");
diff --git a/av1/encoder/encoder_utils.h b/av1/encoder/encoder_utils.h
index 40652e9..999c9bf 100644
--- a/av1/encoder/encoder_utils.h
+++ b/av1/encoder/encoder_utils.h
@@ -73,10 +73,6 @@
 
   assert(mi_size_wide[mi_params->mi_alloc_bsize] ==
          mi_size_high[mi_params->mi_alloc_bsize]);
-
-#if CONFIG_LPF_MASK
-  av1_alloc_loop_filter_mask(mi_params);
-#endif
 }
 
 static AOM_INLINE void enc_free_mi(CommonModeInfoParams *mi_params) {
@@ -125,14 +121,14 @@
 }
 
 #define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, JSDAF, JSVAF) \
-  cpi->fn_ptr[BT].sdf = SDF;                                           \
-  cpi->fn_ptr[BT].sdaf = SDAF;                                         \
-  cpi->fn_ptr[BT].vf = VF;                                             \
-  cpi->fn_ptr[BT].svf = SVF;                                           \
-  cpi->fn_ptr[BT].svaf = SVAF;                                         \
-  cpi->fn_ptr[BT].sdx4df = SDX4DF;                                     \
-  cpi->fn_ptr[BT].jsdaf = JSDAF;                                       \
-  cpi->fn_ptr[BT].jsvaf = JSVAF;
+  ppi->fn_ptr[BT].sdf = SDF;                                           \
+  ppi->fn_ptr[BT].sdaf = SDAF;                                         \
+  ppi->fn_ptr[BT].vf = VF;                                             \
+  ppi->fn_ptr[BT].svf = SVF;                                           \
+  ppi->fn_ptr[BT].svaf = SVAF;                                         \
+  ppi->fn_ptr[BT].sdx4df = SDX4DF;                                     \
+  ppi->fn_ptr[BT].jsdaf = JSDAF;                                       \
+  ppi->fn_ptr[BT].jsvaf = JSVAF;
 
 #define HIGHBD_BFP_WRAPPER(WIDTH, HEIGHT, BD)                                \
   HIGHBD_BFP(                                                                \
@@ -325,8 +321,8 @@
 #endif  // CONFIG_AV1_HIGHBITDEPTH
 
 #define HIGHBD_MBFP(BT, MCSDF, MCSVF) \
-  cpi->fn_ptr[BT].msdf = MCSDF;       \
-  cpi->fn_ptr[BT].msvf = MCSVF;
+  ppi->fn_ptr[BT].msdf = MCSDF;       \
+  ppi->fn_ptr[BT].msvf = MCSVF;
 
 #define HIGHBD_MBFP_WRAPPER(WIDTH, HEIGHT, BD)                    \
   HIGHBD_MBFP(BLOCK_##WIDTH##X##HEIGHT,                           \
@@ -386,8 +382,8 @@
 #endif
 
 #define HIGHBD_SDSFP(BT, SDSF, SDSX4DF) \
-  cpi->fn_ptr[BT].sdsf = SDSF;          \
-  cpi->fn_ptr[BT].sdsx4df = SDSX4DF;
+  ppi->fn_ptr[BT].sdsf = SDSF;          \
+  ppi->fn_ptr[BT].sdsx4df = SDSX4DF;
 
 #define HIGHBD_SDSFP_WRAPPER(WIDTH, HEIGHT, BD)                   \
   HIGHBD_SDSFP(BLOCK_##WIDTH##X##HEIGHT,                          \
@@ -487,9 +483,9 @@
               aom_highbd_obmc_sub_pixel_variance##WIDTH##x##HEIGHT)
 
 #define HIGHBD_OBFP(BT, OSDF, OVF, OSVF) \
-  cpi->fn_ptr[BT].osdf = OSDF;           \
-  cpi->fn_ptr[BT].ovf = OVF;             \
-  cpi->fn_ptr[BT].osvf = OSVF;
+  ppi->fn_ptr[BT].osdf = OSDF;           \
+  ppi->fn_ptr[BT].ovf = OVF;             \
+  ppi->fn_ptr[BT].osvf = OSVF;
 
 #define HIGHBD_OBFP_WRAPPER(WIDTH, HEIGHT, BD)                   \
   HIGHBD_OBFP(BLOCK_##WIDTH##X##HEIGHT,                          \
@@ -542,10 +538,10 @@
 MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x16)
 #endif
 
-static AOM_INLINE void highbd_set_var_fns(AV1_COMP *const cpi) {
-  AV1_COMMON *const cm = &cpi->common;
-  if (cm->seq_params.use_highbitdepth) {
-    switch (cm->seq_params.bit_depth) {
+static AOM_INLINE void highbd_set_var_fns(AV1_PRIMARY *const ppi) {
+  SequenceHeader *const seq_params = &ppi->seq_params;
+  if (seq_params->use_highbitdepth) {
+    switch (seq_params->bit_depth) {
       case AOM_BITS_8:
 #if !CONFIG_REALTIME_ONLY
         HIGHBD_BFP_WRAPPER(64, 16, 8)
@@ -850,7 +846,7 @@
 
       default:
         assert(0 &&
-               "cm->seq_params.bit_depth should be AOM_BITS_8, "
+               "cm->seq_params->bit_depth should be AOM_BITS_8, "
                "AOM_BITS_10 or AOM_BITS_12");
     }
   }
@@ -858,7 +854,7 @@
 #endif  // CONFIG_AV1_HIGHBITDEPTH
 
 static AOM_INLINE void copy_frame_prob_info(AV1_COMP *cpi) {
-  FrameProbInfo *const frame_probs = &cpi->frame_probs;
+  FrameProbInfo *const frame_probs = &cpi->ppi->frame_probs;
   if (cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats) {
     av1_copy(frame_probs->tx_type_probs, default_tx_type_probs);
   }
@@ -875,6 +871,15 @@
   }
 }
 
+static AOM_INLINE void restore_cdef_coding_context(CdefInfo *const dst,
+                                                   const CdefInfo *const src) {
+  dst->cdef_bits = src->cdef_bits;
+  dst->cdef_damping = src->cdef_damping;
+  av1_copy(dst->cdef_strengths, src->cdef_strengths);
+  av1_copy(dst->cdef_uv_strengths, src->cdef_uv_strengths);
+  dst->nb_cdef_strengths = src->nb_cdef_strengths;
+}
+
 // Coding context that only needs to be restored when recode loop includes
 // filtering (deblocking, CDEF, superres post-encode upscale and/or loop
 // restoraton).
@@ -882,9 +887,9 @@
   CODING_CONTEXT *const cc = &cpi->coding_context;
   AV1_COMMON *cm = &cpi->common;
   cm->lf = cc->lf;
-  cm->cdef_info = cc->cdef_info;
+  restore_cdef_coding_context(&cm->cdef_info, &cc->cdef_info);
   cpi->rc = cc->rc;
-  cpi->mv_stats = cc->mv_stats;
+  cpi->ppi->mv_stats = cc->mv_stats;
 }
 
 static AOM_INLINE int equal_dimensions_and_border(const YV12_BUFFER_CONFIG *a,
@@ -964,6 +969,8 @@
   }
 }
 
+void av1_update_film_grain_parameters_seq(struct AV1_PRIMARY *ppi,
+                                          const AV1EncoderConfig *oxcf);
 void av1_update_film_grain_parameters(struct AV1_COMP *cpi,
                                       const AV1EncoderConfig *oxcf);
 
@@ -972,7 +979,8 @@
 
 void av1_setup_frame(AV1_COMP *cpi);
 
-BLOCK_SIZE av1_select_sb_size(const AV1_COMP *const cpi);
+BLOCK_SIZE av1_select_sb_size(const AV1EncoderConfig *const oxcf, int width,
+                              int height, int number_spatial_layers);
 
 void av1_apply_active_map(AV1_COMP *cpi);
 
diff --git a/av1/encoder/encodetxb.c b/av1/encoder/encodetxb.c
index 7b0b281..3fcfbe2 100644
--- a/av1/encoder/encodetxb.c
+++ b/av1/encoder/encodetxb.c
@@ -26,15 +26,17 @@
 void av1_alloc_txb_buf(AV1_COMP *cpi) {
   AV1_COMMON *cm = &cpi->common;
   CoeffBufferPool *coeff_buf_pool = &cpi->coeff_buffer_pool;
-  int size = ((cm->mi_params.mi_rows >> cm->seq_params.mib_size_log2) + 1) *
-             ((cm->mi_params.mi_cols >> cm->seq_params.mib_size_log2) + 1);
+  int size = ((cm->mi_params.mi_rows >> cm->seq_params->mib_size_log2) + 1) *
+             ((cm->mi_params.mi_cols >> cm->seq_params->mib_size_log2) + 1);
   const int num_planes = av1_num_planes(cm);
-  const int subsampling_x = cm->seq_params.subsampling_x;
-  const int subsampling_y = cm->seq_params.subsampling_y;
+  const int subsampling_x = cm->seq_params->subsampling_x;
+  const int subsampling_y = cm->seq_params->subsampling_y;
+  const int luma_max_sb_square =
+      1 << num_pels_log2_lookup[cm->seq_params->sb_size];
   const int chroma_max_sb_square =
-      MAX_SB_SQUARE >> (subsampling_x + subsampling_y);
+      luma_max_sb_square >> (subsampling_x + subsampling_y);
   const int num_tcoeffs =
-      size * (MAX_SB_SQUARE + (num_planes - 1) * chroma_max_sb_square);
+      size * (luma_max_sb_square + (num_planes - 1) * chroma_max_sb_square);
   const int txb_unit_size = TX_SIZE_W_MIN * TX_SIZE_H_MIN;
 
   av1_free_txb_buf(cpi);
@@ -54,7 +56,7 @@
   for (int i = 0; i < size; i++) {
     for (int plane = 0; plane < num_planes; plane++) {
       const int max_sb_square =
-          (plane == AOM_PLANE_Y) ? MAX_SB_SQUARE : chroma_max_sb_square;
+          (plane == AOM_PLANE_Y) ? luma_max_sb_square : chroma_max_sb_square;
       cpi->coeff_buffer_base[i].tcoeff[plane] = tcoeff_ptr;
       cpi->coeff_buffer_base[i].eobs[plane] = eob_ptr;
       cpi->coeff_buffer_base[i].entropy_ctx[plane] = entropy_ctx_ptr;
@@ -487,10 +489,13 @@
           PLANE_TYPE_Y, xd, tx_size, cpi->use_screen_content_tools);
       (void)default_type;
       // TODO(kyslov): We don't always respect use_intra_default_tx_only flag in
-      // NonRD case. Specifically we ignore it in hybrid inta mode search and
-      // when picking up intra mode in nonRD inter mode search. We need to fix
-      // it in these two places. Meanwhile relieving the assert.
-      assert(tx_type == default_type || cpi->sf.rt_sf.use_nonrd_pick_mode);
+      // NonRD and REALTIME case. Specifically we ignore it in hybrid intra
+      // search, when picking up intra mode in nonRD inter mode search and in RD
+      // REALTIME mode when we limit TX type usage.
+      // We need to fix txfm cfg for these cases. Meanwhile relieving the
+      // assert.
+      assert(tx_type == default_type || cpi->sf.rt_sf.use_nonrd_pick_mode ||
+             cpi->oxcf.mode == REALTIME);
     }
   }
 
@@ -624,6 +629,7 @@
       const int coeff_ctx = coeff_contexts[pos];
       const tran_low_t v = qcoeff[pos];
       const tran_low_t level = abs(v);
+      td->abs_sum_level += level;
 
       if (allow_update_cdf) {
         if (c == eob - 1) {
@@ -719,7 +725,7 @@
 CB_COEFF_BUFFER *av1_get_cb_coeff_buffer(const struct AV1_COMP *cpi, int mi_row,
                                          int mi_col) {
   const AV1_COMMON *const cm = &cpi->common;
-  const int mib_size_log2 = cm->seq_params.mib_size_log2;
+  const int mib_size_log2 = cm->seq_params->mib_size_log2;
   const int stride = (cm->mi_params.mi_cols >> mib_size_log2) + 1;
   const int offset =
       (mi_row >> mib_size_log2) * stride + (mi_col >> mib_size_log2);
diff --git a/av1/encoder/ethread.c b/av1/encoder/ethread.c
index 0d5d383..c7a07d2 100644
--- a/av1/encoder/ethread.c
+++ b/av1/encoder/ethread.c
@@ -10,7 +10,9 @@
  */
 
 #include "av1/common/warped_motion.h"
+#include "av1/common/thread_common.h"
 
+#include "av1/encoder/bitstream.h"
 #include "av1/encoder/encodeframe.h"
 #include "av1/encoder/encoder.h"
 #include "av1/encoder/encoder_alloc.h"
@@ -53,7 +55,7 @@
 static AOM_INLINE void update_delta_lf_for_row_mt(AV1_COMP *cpi) {
   AV1_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &cpi->td.mb.e_mbd;
-  const int mib_size = cm->seq_params.mib_size;
+  const int mib_size = cm->seq_params->mib_size;
   const int frame_lf_count =
       av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
   for (int row = 0; row < cm->tiles.rows; row++) {
@@ -69,7 +71,8 @@
           const int idx_str = cm->mi_params.mi_stride * mi_row + mi_col;
           MB_MODE_INFO **mi = cm->mi_params.mi_grid_base + idx_str;
           MB_MODE_INFO *mbmi = mi[0];
-          if (mbmi->skip_txfm == 1 && (mbmi->bsize == cm->seq_params.sb_size)) {
+          if (mbmi->skip_txfm == 1 &&
+              (mbmi->bsize == cm->seq_params->sb_size)) {
             for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id)
               mbmi->delta_lf[lf_id] = xd->delta_lf[lf_id];
             mbmi->delta_lf_from_base = xd->delta_lf_from_base;
@@ -363,7 +366,7 @@
     *cur_tile_id = tile_id;
     const int unit_height = mi_size_high[fp_block_size];
     get_next_job(&tile_data[tile_id], current_mi_row,
-                 is_firstpass ? unit_height : cm->seq_params.mib_size);
+                 is_firstpass ? unit_height : cm->seq_params->mib_size);
   }
 }
 
@@ -442,13 +445,20 @@
 
   const BLOCK_SIZE fp_block_size = cpi->fp_block_size;
   int end_of_frame = 0;
+
+  // When the master thread does not have a valid job to process, xd->tile_ctx
+  // is not set and contains a NULL pointer. This can result in a NULL pointer
+  // access violation if accessed beyond the encode stage. Hence,
+  // thread_data->td->mb.e_mbd.tile_ctx is initialized with the common frame
+  // context to avoid NULL pointer access in subsequent stages.
+  thread_data->td->mb.e_mbd.tile_ctx = cm->fc;
   while (1) {
     int current_mi_row = -1;
 #if CONFIG_MULTITHREAD
     pthread_mutex_lock(enc_row_mt_mutex_);
 #endif
     if (!get_next_job(&cpi->tile_data[cur_tile_id], &current_mi_row,
-                      cm->seq_params.mib_size)) {
+                      cm->seq_params->mib_size)) {
       // No jobs are available for the current tile. Query for the status of
       // other tiles and get the next job if available
       switch_tile_and_get_next_job(cm, cpi->tile_data, &cur_tile_id,
@@ -471,6 +481,7 @@
 
     td->mb.e_mbd.tile_ctx = td->tctx;
     td->mb.tile_pb_ctx = &this_tile->tctx;
+    td->abs_sum_level = 0;
 
     if (this_tile->allow_update_cdf) {
       td->mb.row_ctx = this_tile->row_ctx;
@@ -483,7 +494,7 @@
     av1_init_above_context(&cm->above_contexts, av1_num_planes(cm), tile_row,
                            &td->mb.e_mbd);
 
-    cfl_init(&td->mb.e_mbd.cfl, &cm->seq_params);
+    cfl_init(&td->mb.e_mbd.cfl, cm->seq_params);
     if (td->mb.txfm_search_info.txb_rd_records != NULL) {
       av1_crc32c_calculator_init(
           &td->mb.txfm_search_info.txb_rd_records->mb_rd_record.crc_calculator);
@@ -493,6 +504,7 @@
 #if CONFIG_MULTITHREAD
     pthread_mutex_lock(enc_row_mt_mutex_);
 #endif
+    this_tile->abs_sum_level += td->abs_sum_level;
     row_mt_sync->num_threads_working--;
 #if CONFIG_MULTITHREAD
     pthread_mutex_unlock(enc_row_mt_mutex_);
@@ -527,16 +539,56 @@
   return 1;
 }
 
-void av1_create_second_pass_workers(AV1_COMP *cpi, int num_workers) {
-  AV1_COMMON *const cm = &cpi->common;
-  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
-  MultiThreadInfo *const mt_info = &cpi->mt_info;
+void av1_init_frame_mt(AV1_PRIMARY *ppi, AV1_COMP *cpi) {
+  cpi->mt_info.workers = ppi->p_mt_info.workers;
+  cpi->mt_info.num_workers = ppi->p_mt_info.num_workers;
+  cpi->mt_info.tile_thr_data = ppi->p_mt_info.tile_thr_data;
+  int i;
+  for (i = MOD_FP; i < NUM_MT_MODULES; i++) {
+    cpi->mt_info.num_mod_workers[i] =
+        AOMMIN(cpi->mt_info.num_workers, ppi->p_mt_info.num_mod_workers[i]);
+  }
+}
 
-  assert(mt_info->workers != NULL);
-  assert(mt_info->tile_thr_data != NULL);
+void av1_init_cdef_worker(AV1_COMP *cpi) {
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  // The allocation is done only for level 0 parallel frames. No change
+  // in config is supported in the middle of a parallel encode set, since the
+  // rest of the MT modules also do not support dynamic change of config.
+  if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) return;
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+  PrimaryMultiThreadInfo *const p_mt_info = &cpi->ppi->p_mt_info;
+  int num_cdef_workers = av1_get_num_mod_workers_for_alloc(p_mt_info, MOD_CDEF);
+
+  av1_alloc_cdef_buffers(&cpi->common, &p_mt_info->cdef_worker,
+                         &cpi->mt_info.cdef_sync, num_cdef_workers, 1);
+  cpi->mt_info.cdef_worker = &p_mt_info->cdef_worker[0];
+}
+
+#if !CONFIG_REALTIME_ONLY
+void av1_init_lr_mt_buffers(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  AV1LrSync *lr_sync = &cpi->mt_info.lr_row_sync;
+  if (lr_sync->sync_range) {
+    int num_lr_workers =
+        av1_get_num_mod_workers_for_alloc(&cpi->ppi->p_mt_info, MOD_LR);
+#if CONFIG_FRAME_PARALLEL_ENCODE
+    if (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0)
+      return;
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+    lr_sync->lrworkerdata[num_lr_workers - 1].rst_tmpbuf = cm->rst_tmpbuf;
+    lr_sync->lrworkerdata[num_lr_workers - 1].rlbs = cm->rlbs;
+  }
+}
+#endif
 
 #if CONFIG_MULTITHREAD
-  if (cpi->oxcf.row_mt == 1) {
+void av1_init_mt_sync(AV1_COMP *cpi, int is_first_pass) {
+  AV1_COMMON *const cm = &cpi->common;
+  MultiThreadInfo *const mt_info = &cpi->mt_info;
+
+  // Initialize enc row MT object.
+  if (is_first_pass || cpi->oxcf.row_mt == 1) {
     AV1EncRowMultiThreadInfo *enc_row_mt = &mt_info->enc_row_mt;
     if (enc_row_mt->mutex_ == NULL) {
       CHECK_MEM_ERROR(cm, enc_row_mt->mutex_,
@@ -544,203 +596,489 @@
       if (enc_row_mt->mutex_) pthread_mutex_init(enc_row_mt->mutex_, NULL);
     }
   }
-  AV1GlobalMotionSync *gm_sync = &mt_info->gm_sync;
-  if (gm_sync->mutex_ == NULL) {
-    CHECK_MEM_ERROR(cm, gm_sync->mutex_,
-                    aom_malloc(sizeof(*(gm_sync->mutex_))));
-    if (gm_sync->mutex_) pthread_mutex_init(gm_sync->mutex_, NULL);
-  }
-  AV1TemporalFilterSync *tf_sync = &mt_info->tf_sync;
-  if (tf_sync->mutex_ == NULL) {
-    CHECK_MEM_ERROR(cm, tf_sync->mutex_, aom_malloc(sizeof(*tf_sync->mutex_)));
-    if (tf_sync->mutex_) pthread_mutex_init(tf_sync->mutex_, NULL);
-  }
-  AV1CdefSync *cdef_sync = &mt_info->cdef_sync;
-  if (cdef_sync->mutex_ == NULL) {
-    CHECK_MEM_ERROR(cm, cdef_sync->mutex_,
-                    aom_malloc(sizeof(*(cdef_sync->mutex_))));
-    if (cdef_sync->mutex_) pthread_mutex_init(cdef_sync->mutex_, NULL);
-  }
+
+  if (!is_first_pass) {
+    // Initialize global motion MT object.
+    AV1GlobalMotionSync *gm_sync = &mt_info->gm_sync;
+    if (gm_sync->mutex_ == NULL) {
+      CHECK_MEM_ERROR(cm, gm_sync->mutex_,
+                      aom_malloc(sizeof(*(gm_sync->mutex_))));
+      if (gm_sync->mutex_) pthread_mutex_init(gm_sync->mutex_, NULL);
+    }
+#if !CONFIG_REALTIME_ONLY
+    // Initialize temporal filtering MT object.
+    AV1TemporalFilterSync *tf_sync = &mt_info->tf_sync;
+    if (tf_sync->mutex_ == NULL) {
+      CHECK_MEM_ERROR(cm, tf_sync->mutex_,
+                      aom_malloc(sizeof(*tf_sync->mutex_)));
+      if (tf_sync->mutex_) pthread_mutex_init(tf_sync->mutex_, NULL);
+    }
+#endif  // !CONFIG_REALTIME_ONLY
+    // Initialize CDEF MT object.
+    AV1CdefSync *cdef_sync = &mt_info->cdef_sync;
+    if (cdef_sync->mutex_ == NULL) {
+      CHECK_MEM_ERROR(cm, cdef_sync->mutex_,
+                      aom_malloc(sizeof(*(cdef_sync->mutex_))));
+      if (cdef_sync->mutex_) pthread_mutex_init(cdef_sync->mutex_, NULL);
+    }
+
+    // Initialize loop filter MT object.
+    AV1LfSync *lf_sync = &mt_info->lf_row_sync;
+    // Number of superblock rows
+    const int sb_rows =
+        ALIGN_POWER_OF_TWO(cm->height >> MI_SIZE_LOG2, MAX_MIB_SIZE_LOG2) >>
+        MAX_MIB_SIZE_LOG2;
+    PrimaryMultiThreadInfo *const p_mt_info = &cpi->ppi->p_mt_info;
+    int num_lf_workers = av1_get_num_mod_workers_for_alloc(p_mt_info, MOD_LPF);
+
+    if (!lf_sync->sync_range || sb_rows != lf_sync->rows ||
+        num_lf_workers > lf_sync->num_workers) {
+      av1_loop_filter_dealloc(lf_sync);
+      av1_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_lf_workers);
+    }
+
+#if !CONFIG_REALTIME_ONLY
+    // Initialize loop restoration MT object.
+    AV1LrSync *lr_sync = &mt_info->lr_row_sync;
+    int rst_unit_size;
+    if (cm->width * cm->height > 352 * 288)
+      rst_unit_size = RESTORATION_UNITSIZE_MAX;
+    else
+      rst_unit_size = (RESTORATION_UNITSIZE_MAX >> 1);
+    int num_rows_lr = av1_lr_count_units_in_tile(rst_unit_size, cm->height);
+    int num_lr_workers = av1_get_num_mod_workers_for_alloc(p_mt_info, MOD_LR);
+    if (!lr_sync->sync_range || num_rows_lr > lr_sync->rows ||
+        num_lr_workers > lr_sync->num_workers ||
+        MAX_MB_PLANE > lr_sync->num_planes) {
+      av1_loop_restoration_dealloc(lr_sync, num_lr_workers);
+      av1_loop_restoration_alloc(lr_sync, cm, num_lr_workers, num_rows_lr,
+                                 MAX_MB_PLANE, cm->width);
+    }
 #endif
 
-  for (int i = num_workers - 1; i >= 0; i--) {
-    AVxWorker *const worker = &mt_info->workers[i];
-    EncWorkerData *const thread_data = &mt_info->tile_thr_data[i];
+    // Initialization of pack bitstream MT object.
+    AV1EncPackBSSync *pack_bs_sync = &mt_info->pack_bs_sync;
+    if (pack_bs_sync->mutex_ == NULL) {
+      CHECK_MEM_ERROR(cm, pack_bs_sync->mutex_,
+                      aom_malloc(sizeof(*pack_bs_sync->mutex_)));
+      if (pack_bs_sync->mutex_) pthread_mutex_init(pack_bs_sync->mutex_, NULL);
+    }
+  }
+}
+#endif  // CONFIG_MULTITHREAD
 
-    thread_data->cpi = cpi;
-    thread_data->thread_id = i;
-    // Set the starting tile for each thread.
-    thread_data->start = i;
+// Computes the number of workers to be considered while allocating memory for a
+// multi-threaded module under FPMT.
+int av1_get_num_mod_workers_for_alloc(PrimaryMultiThreadInfo *const p_mt_info,
+                                      MULTI_THREADED_MODULES mod_name) {
+  int num_mod_workers = p_mt_info->num_mod_workers[mod_name];
+  if (p_mt_info->num_mod_workers[MOD_FRAME_ENC] > 1) {
+    // TODO(anyone): Change num_mod_workers to num_mod_workers[MOD_FRAME_ENC].
+    // As frame parallel jobs will only perform multi-threading for the encode
+    // stage, we can limit the allocations according to num_enc_workers per
+    // frame parallel encode(a.k.a num_mod_workers[MOD_FRAME_ENC]).
+    num_mod_workers = p_mt_info->num_workers;
+  }
+  return num_mod_workers;
+}
+
+void av1_init_tile_thread_data(AV1_PRIMARY *ppi, int is_first_pass) {
+  PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info;
+
+  assert(p_mt_info->workers != NULL);
+  assert(p_mt_info->tile_thr_data != NULL);
+
+  int num_workers = p_mt_info->num_workers;
+  int num_enc_workers = av1_get_num_mod_workers_for_alloc(p_mt_info, MOD_ENC);
+  for (int i = num_workers - 1; i >= 0; i--) {
+    EncWorkerData *const thread_data = &p_mt_info->tile_thr_data[i];
 
     if (i > 0) {
-      // alloc_obmc_buffers(&thread_data->td->obmc_buffer, cm);
+      // Allocate thread data.
+      AOM_CHECK_MEM_ERROR(&ppi->error, thread_data->td,
+                          aom_memalign(32, sizeof(*thread_data->td)));
+      av1_zero(*thread_data->td);
+#if CONFIG_FRAME_PARALLEL_ENCODE
+      thread_data->original_td = thread_data->td;
+#endif
 
-      // Create threads
-      if (!winterface->reset(worker))
-        aom_internal_error(&cm->error, AOM_CODEC_ERROR,
-                           "Tile encoder thread creation failed");
-    } else {
-      // Main thread acts as a worker and uses the thread data in cpi.
-      thread_data->td = &cpi->td;
+      // Set up shared coeff buffers.
+      av1_setup_shared_coeff_buffer(
+          &ppi->seq_params, &thread_data->td->shared_coeff_buf, &ppi->error);
+      AOM_CHECK_MEM_ERROR(
+          &ppi->error, thread_data->td->tmp_conv_dst,
+          aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE *
+                               sizeof(*thread_data->td->tmp_conv_dst)));
+
+      if (i < p_mt_info->num_mod_workers[MOD_FP]) {
+        // Set up firstpass PICK_MODE_CONTEXT.
+        thread_data->td->firstpass_ctx = av1_alloc_pmc(
+            ppi->cpi, BLOCK_16X16, &thread_data->td->shared_coeff_buf);
+      }
+
+      if (!is_first_pass && i < num_enc_workers) {
+        // Set up sms_tree.
+        av1_setup_sms_tree(ppi->cpi, thread_data->td);
+
+        alloc_obmc_buffers(&thread_data->td->obmc_buffer, &ppi->error);
+
+        for (int x = 0; x < 2; x++)
+          for (int y = 0; y < 2; y++)
+            AOM_CHECK_MEM_ERROR(
+                &ppi->error, thread_data->td->hash_value_buffer[x][y],
+                (uint32_t *)aom_malloc(
+                    AOM_BUFFER_SIZE_FOR_BLOCK_HASH *
+                    sizeof(*thread_data->td->hash_value_buffer[0][0])));
+
+        // Allocate frame counters in thread data.
+        AOM_CHECK_MEM_ERROR(&ppi->error, thread_data->td->counts,
+                            aom_calloc(1, sizeof(*thread_data->td->counts)));
+
+        // Allocate buffers used by palette coding mode.
+        AOM_CHECK_MEM_ERROR(
+            &ppi->error, thread_data->td->palette_buffer,
+            aom_memalign(16, sizeof(*thread_data->td->palette_buffer)));
+
+        alloc_compound_type_rd_buffers(&ppi->error,
+                                       &thread_data->td->comp_rd_buffer);
+
+        for (int j = 0; j < 2; ++j) {
+          AOM_CHECK_MEM_ERROR(
+              &ppi->error, thread_data->td->tmp_pred_bufs[j],
+              aom_memalign(32, 2 * MAX_MB_PLANE * MAX_SB_SQUARE *
+                                   sizeof(*thread_data->td->tmp_pred_bufs[j])));
+        }
+
+        const SPEED_FEATURES *sf = &ppi->cpi->sf;
+        if (sf->intra_sf.intra_pruning_with_hog ||
+            sf->intra_sf.chroma_intra_pruning_with_hog) {
+          const int plane_types = PLANE_TYPES >> ppi->seq_params.monochrome;
+          AOM_CHECK_MEM_ERROR(
+              &ppi->error, thread_data->td->pixel_gradient_info,
+              aom_malloc(sizeof(*thread_data->td->pixel_gradient_info) *
+                         plane_types * MAX_SB_SQUARE));
+        }
+
+        if (ppi->cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION) {
+          const int num_64x64_blocks =
+              (ppi->seq_params.sb_size == BLOCK_64X64) ? 1 : 4;
+          AOM_CHECK_MEM_ERROR(
+              &ppi->error, thread_data->td->vt64x64,
+              aom_malloc(sizeof(*thread_data->td->vt64x64) * num_64x64_blocks));
+        }
+      }
     }
-    winterface->sync(worker);
+
+    if (!is_first_pass && ppi->cpi->oxcf.row_mt == 1 && i < num_enc_workers) {
+      if (i == 0) {
+#if CONFIG_FRAME_PARALLEL_ENCODE
+        for (int j = 0; j < ppi->num_fp_contexts; j++) {
+          AOM_CHECK_MEM_ERROR(&ppi->error, ppi->parallel_cpi[j]->td.tctx,
+                              (FRAME_CONTEXT *)aom_memalign(
+                                  16, sizeof(*ppi->parallel_cpi[j]->td.tctx)));
+        }
+#else
+        AOM_CHECK_MEM_ERROR(
+            &ppi->error, ppi->cpi->td.tctx,
+            (FRAME_CONTEXT *)aom_memalign(16, sizeof(*ppi->cpi->td.tctx)));
+#endif
+      } else {
+        AOM_CHECK_MEM_ERROR(
+            &ppi->error, thread_data->td->tctx,
+            (FRAME_CONTEXT *)aom_memalign(16, sizeof(*thread_data->td->tctx)));
+      }
+    }
   }
 }
 
-static AOM_INLINE void create_enc_workers(AV1_COMP *cpi, int num_workers) {
-  AV1_COMMON *const cm = &cpi->common;
-  MultiThreadInfo *const mt_info = &cpi->mt_info;
-
-  assert(mt_info->workers != NULL);
-  assert(mt_info->tile_thr_data != NULL);
-
-  for (int i = num_workers - 1; i >= 0; i--) {
-    EncWorkerData *const thread_data = &mt_info->tile_thr_data[i];
-
-    if (i > 0) {
-      // Set up sms_tree.
-      av1_setup_sms_tree(cpi, thread_data->td);
-
-      alloc_obmc_buffers(&thread_data->td->obmc_buffer, cm);
-
-      CHECK_MEM_ERROR(cm, thread_data->td->inter_modes_info,
-                      (InterModesInfo *)aom_malloc(
-                          sizeof(*thread_data->td->inter_modes_info)));
-
-      for (int x = 0; x < 2; x++)
-        for (int y = 0; y < 2; y++)
-          CHECK_MEM_ERROR(
-              cm, thread_data->td->hash_value_buffer[x][y],
-              (uint32_t *)aom_malloc(
-                  AOM_BUFFER_SIZE_FOR_BLOCK_HASH *
-                  sizeof(*thread_data->td->hash_value_buffer[0][0])));
-
-      // Allocate frame counters in thread data.
-      CHECK_MEM_ERROR(cm, thread_data->td->counts,
-                      aom_calloc(1, sizeof(*thread_data->td->counts)));
-
-      // Allocate buffers used by palette coding mode.
-      CHECK_MEM_ERROR(
-          cm, thread_data->td->palette_buffer,
-          aom_memalign(16, sizeof(*thread_data->td->palette_buffer)));
-
-      alloc_compound_type_rd_buffers(cm, &thread_data->td->comp_rd_buffer);
-
-      for (int j = 0; j < 2; ++j) {
-        CHECK_MEM_ERROR(
-            cm, thread_data->td->tmp_pred_bufs[j],
-            aom_memalign(32, 2 * MAX_MB_PLANE * MAX_SB_SQUARE *
-                                 sizeof(*thread_data->td->tmp_pred_bufs[j])));
-      }
-
-      if (cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION) {
-        const int num_64x64_blocks =
-            (cm->seq_params.sb_size == BLOCK_64X64) ? 1 : 4;
-        CHECK_MEM_ERROR(
-            cm, thread_data->td->vt64x64,
-            aom_malloc(sizeof(*thread_data->td->vt64x64) * num_64x64_blocks));
-      }
-    } else {
-      thread_data->td = &cpi->td;
-    }
-    if (cpi->oxcf.row_mt == 1)
-      CHECK_MEM_ERROR(
-          cm, thread_data->td->tctx,
-          (FRAME_CONTEXT *)aom_memalign(16, sizeof(*thread_data->td->tctx)));
-  }
-  mt_info->enc_mt_buf_init_done = 1;
-}
-
-void av1_create_workers(AV1_COMP *cpi, int num_workers) {
-  AV1_COMMON *const cm = &cpi->common;
-  MultiThreadInfo *const mt_info = &cpi->mt_info;
+void av1_create_workers(AV1_PRIMARY *ppi, int num_workers) {
+  PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info;
   const AVxWorkerInterface *const winterface = aom_get_worker_interface();
 
-  CHECK_MEM_ERROR(cm, mt_info->workers,
-                  aom_malloc(num_workers * sizeof(*mt_info->workers)));
+  AOM_CHECK_MEM_ERROR(&ppi->error, p_mt_info->workers,
+                      aom_malloc(num_workers * sizeof(*p_mt_info->workers)));
 
-  CHECK_MEM_ERROR(cm, mt_info->tile_thr_data,
-                  aom_calloc(num_workers, sizeof(*mt_info->tile_thr_data)));
+  AOM_CHECK_MEM_ERROR(
+      &ppi->error, p_mt_info->tile_thr_data,
+      aom_calloc(num_workers, sizeof(*p_mt_info->tile_thr_data)));
 
   for (int i = num_workers - 1; i >= 0; i--) {
-    AVxWorker *const worker = &mt_info->workers[i];
-    EncWorkerData *const thread_data = &mt_info->tile_thr_data[i];
+    AVxWorker *const worker = &p_mt_info->workers[i];
+    EncWorkerData *const thread_data = &p_mt_info->tile_thr_data[i];
 
     winterface->init(worker);
     worker->thread_name = "aom enc worker";
 
-    if (i > 0) {
-      // Allocate thread data.
-      CHECK_MEM_ERROR(cm, thread_data->td,
-                      aom_memalign(32, sizeof(*thread_data->td)));
-      av1_zero(*thread_data->td);
-
-      // Set up shared coeff buffers.
-      av1_setup_shared_coeff_buffer(cm, &thread_data->td->shared_coeff_buf);
-      CHECK_MEM_ERROR(
-          cm, thread_data->td->tmp_conv_dst,
-          aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE *
-                               sizeof(*thread_data->td->tmp_conv_dst)));
-    }
-    ++mt_info->num_workers;
-  }
-}
-
-#if !CONFIG_REALTIME_ONLY
-static AOM_INLINE void fp_create_enc_workers(AV1_COMP *cpi, int num_workers) {
-  AV1_COMMON *const cm = &cpi->common;
-  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
-  MultiThreadInfo *const mt_info = &cpi->mt_info;
-  // For single-pass encode, threads are already created during call to
-  // av1_create_second_pass_workers(). Create threads only in the case of
-  // pass = 1.
-  const int create_workers = (mt_info->num_mod_workers[MOD_FP] == 0) ? 1 : 0;
-
-  assert(mt_info->workers != NULL);
-  assert(mt_info->tile_thr_data != NULL);
-
-#if CONFIG_MULTITHREAD
-  AV1EncRowMultiThreadInfo *enc_row_mt = &mt_info->enc_row_mt;
-  if (enc_row_mt->mutex_ == NULL) {
-    CHECK_MEM_ERROR(cm, enc_row_mt->mutex_,
-                    aom_malloc(sizeof(*(enc_row_mt->mutex_))));
-    if (enc_row_mt->mutex_) pthread_mutex_init(enc_row_mt->mutex_, NULL);
-  }
-#endif
-
-  for (int i = num_workers - 1; i >= 0; i--) {
-    AVxWorker *const worker = &mt_info->workers[i];
-    EncWorkerData *const thread_data = &mt_info->tile_thr_data[i];
-
-    thread_data->cpi = cpi;
     thread_data->thread_id = i;
     // Set the starting tile for each thread.
     thread_data->start = i;
 
     if (i > 0) {
-      // Set up firstpass PICK_MODE_CONTEXT.
-      thread_data->td->firstpass_ctx =
-          av1_alloc_pmc(cpi, BLOCK_16X16, &thread_data->td->shared_coeff_buf);
-
-      if (create_workers) {
-        // Create threads
-        if (!winterface->reset(worker))
-          aom_internal_error(&cm->error, AOM_CODEC_ERROR,
-                             "Tile encoder thread creation failed");
-      }
-    } else {
-      // Main thread acts as a worker and uses the thread data in cpi.
-      thread_data->td = &cpi->td;
+      // Create threads
+      if (!winterface->reset(worker))
+        aom_internal_error(&ppi->error, AOM_CODEC_ERROR,
+                           "Tile encoder thread creation failed");
     }
-    if (create_workers) {
-      winterface->sync(worker);
-      ++mt_info->num_mod_workers[MOD_FP];
+    winterface->sync(worker);
+
+    ++p_mt_info->num_workers;
+  }
+}
+
+#if CONFIG_FRAME_PARALLEL_ENCODE
+// This function returns 1 if frame parallel encode is supported for
+// the current configuration. Returns 0 otherwise.
+static AOM_INLINE int is_fpmt_config(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf) {
+  // FPMT is enabled for AOM_Q and AOM_VBR.
+  // TODO(Mufaddal, Aasaipriya): Test and enable multi-tile and resize config.
+  if (oxcf->rc_cfg.mode == AOM_CBR || oxcf->rc_cfg.mode == AOM_CQ) {
+    return 0;
+  }
+  if (ppi->use_svc) {
+    return 0;
+  }
+  if (oxcf->tile_cfg.enable_large_scale_tile) {
+    return 0;
+  }
+  if (oxcf->dec_model_cfg.timing_info_present) {
+    return 0;
+  }
+  if (oxcf->mode != GOOD) {
+    return 0;
+  }
+  if (oxcf->tool_cfg.error_resilient_mode) {
+    return 0;
+  }
+  if (oxcf->resize_cfg.resize_mode) {
+    return 0;
+  }
+  if (oxcf->passes == 1) {
+    return 0;
+  }
+
+  return 1;
+}
+
+// A large value for threads used to compute the max num_enc_workers
+// possible for each resolution.
+#define MAX_THREADS 100
+
+// Computes the number of frame parallel(fp) contexts to be created
+// based on the number of max_enc_workers.
+int av1_compute_num_fp_contexts(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf) {
+  ppi->p_mt_info.num_mod_workers[MOD_FRAME_ENC] = 0;
+  if (!is_fpmt_config(ppi, oxcf)) {
+    return 1;
+  }
+  int max_num_enc_workers =
+      av1_compute_num_enc_workers(ppi->parallel_cpi[0], MAX_THREADS);
+
+  // A parallel frame encode must have at least 1/4th the theoretical limit of
+  // max enc workers. TODO(Remya): Tune this value for multi-tile case.
+  int workers_per_frame = AOMMAX(1, (max_num_enc_workers + 2) / 4);
+  int max_threads = oxcf->max_threads;
+  int num_fp_contexts = max_threads / workers_per_frame;
+
+  num_fp_contexts = AOMMAX(1, AOMMIN(num_fp_contexts, MAX_PARALLEL_FRAMES));
+  if (num_fp_contexts > 1) {
+    assert(max_threads >= 2);
+    ppi->p_mt_info.num_mod_workers[MOD_FRAME_ENC] =
+        AOMMIN(max_num_enc_workers * num_fp_contexts, oxcf->max_threads);
+  }
+  return num_fp_contexts;
+}
+
+// Computes the number of workers to process each of the parallel frames.
+static AOM_INLINE int compute_num_workers_per_frame(
+    const int num_workers, const int parallel_frame_count) {
+  // Number of level 2 workers per frame context (floor division).
+  int workers_per_frame = (num_workers / parallel_frame_count);
+  return workers_per_frame;
+}
+
+// Prepare level 1 workers. This function is only called for
+// parallel_frame_count > 1. This function populates the mt_info structure of
+// frame level contexts appropriately by dividing the total number of available
+// workers amongst the frames as level 2 workers. It also populates the hook and
+// data members of level 1 workers.
+static AOM_INLINE void prepare_fpmt_workers(AV1_PRIMARY *ppi,
+                                            AV1_COMP_DATA *first_cpi_data,
+                                            AVxWorkerHook hook,
+                                            int parallel_frame_count) {
+  assert(parallel_frame_count <= ppi->num_fp_contexts &&
+         parallel_frame_count > 1);
+
+  PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info;
+  int num_workers = p_mt_info->num_workers;
+
+  int frame_idx = 0;
+  int i = 0;
+  while (i < num_workers) {
+    // Assign level 1 worker
+    AVxWorker *frame_worker = p_mt_info->p_workers[frame_idx] =
+        &p_mt_info->workers[i];
+    AV1_COMP *cur_cpi = ppi->parallel_cpi[frame_idx];
+    MultiThreadInfo *mt_info = &cur_cpi->mt_info;
+    const int num_planes = av1_num_planes(&cur_cpi->common);
+
+    // Assign start of level 2 worker pool
+    mt_info->workers = &p_mt_info->workers[i];
+    mt_info->tile_thr_data = &p_mt_info->tile_thr_data[i];
+    // Assign number of workers for each frame in the parallel encode set.
+    mt_info->num_workers = compute_num_workers_per_frame(
+        num_workers - i, parallel_frame_count - frame_idx);
+    for (int j = MOD_FP; j < NUM_MT_MODULES; j++) {
+      mt_info->num_mod_workers[j] =
+          AOMMIN(mt_info->num_workers, ppi->p_mt_info.num_mod_workers[j]);
+    }
+    if (ppi->p_mt_info.cdef_worker != NULL) {
+      mt_info->cdef_worker = &ppi->p_mt_info.cdef_worker[i];
+
+      // Back up the original cdef_worker pointers.
+      mt_info->restore_state_buf.cdef_srcbuf = mt_info->cdef_worker->srcbuf;
+      for (int plane = 0; plane < num_planes; plane++)
+        mt_info->restore_state_buf.cdef_colbuf[plane] =
+            mt_info->cdef_worker->colbuf[plane];
+    }
+#if !CONFIG_REALTIME_ONLY
+    // Back up the original LR buffers before update.
+    int idx = i + mt_info->num_workers - 1;
+    mt_info->restore_state_buf.rst_tmpbuf =
+        mt_info->lr_row_sync.lrworkerdata[idx].rst_tmpbuf;
+    mt_info->restore_state_buf.rlbs =
+        mt_info->lr_row_sync.lrworkerdata[idx].rlbs;
+
+    // Update LR buffers.
+    mt_info->lr_row_sync.lrworkerdata[idx].rst_tmpbuf =
+        cur_cpi->common.rst_tmpbuf;
+    mt_info->lr_row_sync.lrworkerdata[idx].rlbs = cur_cpi->common.rlbs;
+#endif
+
+    // At this stage, the thread specific CDEF buffers for the current frame's
+    // 'common' and 'cdef_sync' only need to be allocated. 'cdef_worker' has
+    // already been allocated across parallel frames.
+    av1_alloc_cdef_buffers(&cur_cpi->common, &p_mt_info->cdef_worker,
+                           &mt_info->cdef_sync, p_mt_info->num_workers, 0);
+
+    frame_worker->hook = hook;
+    frame_worker->data1 = cur_cpi;
+    frame_worker->data2 = (frame_idx == 0)
+                              ? first_cpi_data
+                              : &ppi->parallel_frames_data[frame_idx - 1];
+    frame_idx++;
+    i += mt_info->num_workers;
+  }
+  p_mt_info->p_num_workers = parallel_frame_count;
+}
+
+// Launch level 1 workers to perform frame parallel encode.
+static AOM_INLINE void launch_fpmt_workers(AV1_PRIMARY *ppi) {
+  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+  int num_workers = ppi->p_mt_info.p_num_workers;
+
+  for (int i = num_workers - 1; i >= 0; i--) {
+    AVxWorker *const worker = ppi->p_mt_info.p_workers[i];
+    if (i == 0)
+      winterface->execute(worker);
+    else
+      winterface->launch(worker);
+  }
+}
+
+// Synchronize level 1 workers.
+static AOM_INLINE void sync_fpmt_workers(AV1_PRIMARY *ppi) {
+  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+  int num_workers = ppi->p_mt_info.p_num_workers;
+  int had_error = 0;
+  // Points to error in the earliest display order frame in the parallel set.
+  const struct aom_internal_error_info *error;
+
+  // Encoding ends.
+  for (int i = num_workers - 1; i >= 0; i--) {
+    AVxWorker *const worker = ppi->p_mt_info.p_workers[i];
+    if (!winterface->sync(worker)) {
+      had_error = 1;
+      error = ((AV1_COMP *)worker->data1)->common.error;
     }
   }
-  mt_info->fp_mt_buf_init_done = 1;
+
+  if (had_error)
+    aom_internal_error(&ppi->error, error->error_code, error->detail);
 }
+
+// Restore worker states after parallel encode.
+static AOM_INLINE void restore_workers_after_fpmt(AV1_PRIMARY *ppi,
+                                                  int parallel_frame_count) {
+  assert(parallel_frame_count <= ppi->num_fp_contexts &&
+         parallel_frame_count > 1);
+  (void)parallel_frame_count;
+
+  PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info;
+  int num_workers = p_mt_info->num_workers;
+
+  int frame_idx = 0;
+  int i = 0;
+  while (i < num_workers) {
+    AV1_COMP *cur_cpi = ppi->parallel_cpi[frame_idx];
+    MultiThreadInfo *mt_info = &cur_cpi->mt_info;
+    const int num_planes = av1_num_planes(&cur_cpi->common);
+
+    // Restore the original cdef_worker pointers.
+    if (ppi->p_mt_info.cdef_worker != NULL) {
+      mt_info->cdef_worker->srcbuf = mt_info->restore_state_buf.cdef_srcbuf;
+      for (int plane = 0; plane < num_planes; plane++)
+        mt_info->cdef_worker->colbuf[plane] =
+            mt_info->restore_state_buf.cdef_colbuf[plane];
+    }
+#if !CONFIG_REALTIME_ONLY
+    // Restore the original LR buffers.
+    int idx = i + mt_info->num_workers - 1;
+    mt_info->lr_row_sync.lrworkerdata[idx].rst_tmpbuf =
+        mt_info->restore_state_buf.rst_tmpbuf;
+    mt_info->lr_row_sync.lrworkerdata[idx].rlbs =
+        mt_info->restore_state_buf.rlbs;
 #endif
 
+    frame_idx++;
+    i += mt_info->num_workers;
+  }
+}
+
+static int get_compressed_data_hook(void *arg1, void *arg2) {
+  AV1_COMP *cpi = (AV1_COMP *)arg1;
+  AV1_COMP_DATA *cpi_data = (AV1_COMP_DATA *)arg2;
+  int status = av1_get_compressed_data(cpi, cpi_data);
+
+  // AOM_CODEC_OK(0) means no error.
+  return !status;
+}
+
+// This function encodes the raw frame data for each frame in parallel encode
+// set, and outputs the frame bit stream to the designated buffers.
+int av1_compress_parallel_frames(AV1_PRIMARY *const ppi,
+                                 AV1_COMP_DATA *const first_cpi_data) {
+  // Bitmask for the frame buffers referenced by cpi->scaled_ref_buf
+  // corresponding to frames in the current parallel encode set.
+  int ref_buffers_used_map = 0;
+  int frames_in_parallel_set = av1_init_parallel_frame_context(
+      first_cpi_data, ppi, &ref_buffers_used_map);
+  prepare_fpmt_workers(ppi, first_cpi_data, get_compressed_data_hook,
+                       frames_in_parallel_set);
+  launch_fpmt_workers(ppi);
+  sync_fpmt_workers(ppi);
+  restore_workers_after_fpmt(ppi, frames_in_parallel_set);
+
+  // Release cpi->scaled_ref_buf corresponding to frames in the current parallel
+  // encode set.
+  for (int i = 0; i < frames_in_parallel_set; ++i) {
+    av1_release_scaled_references_fpmt(ppi->parallel_cpi[i]);
+  }
+  av1_decrement_ref_counts_fpmt(ppi->cpi->common.buffer_pool,
+                                ref_buffers_used_map);
+  return AOM_CODEC_OK;
+}
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+
 static AOM_INLINE void launch_workers(MultiThreadInfo *const mt_info,
                                       int num_workers) {
   const AVxWorkerInterface *const winterface = aom_get_worker_interface();
@@ -759,13 +1097,13 @@
   int had_error = 0;
 
   // Encoding ends.
-  for (int i = num_workers - 1; i >= 0; i--) {
+  for (int i = num_workers - 1; i > 0; i--) {
     AVxWorker *const worker = &mt_info->workers[i];
     had_error |= !winterface->sync(worker);
   }
 
   if (had_error)
-    aom_internal_error(&cm->error, AOM_CODEC_ERROR,
+    aom_internal_error(cm->error, AOM_CODEC_ERROR,
                        "Failed to encode tile data");
 }
 
@@ -817,13 +1155,24 @@
     worker->data1 = thread_data;
     worker->data2 = NULL;
 
+    thread_data->thread_id = i;
+    // Set the starting tile for each thread.
+    thread_data->start = i;
+
     thread_data->cpi = cpi;
     if (i == 0) {
       thread_data->td = &cpi->td;
+#if !CONFIG_FRAME_PARALLEL_ENCODE
     }
+#else
+    } else {
+      thread_data->td = thread_data->original_td;
+    }
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
 
     thread_data->td->intrabc_used = 0;
     thread_data->td->deltaq_used = 0;
+    thread_data->td->abs_sum_level = 0;
 
     // Before encoding a frame, copy the thread data from cpi.
     if (thread_data->td != &cpi->td) {
@@ -831,7 +1180,6 @@
       thread_data->td->rd_counts = cpi->td.rd_counts;
       thread_data->td->mb.obmc_buffer = thread_data->td->obmc_buffer;
 
-      thread_data->td->mb.inter_modes_info = thread_data->td->inter_modes_info;
       for (int x = 0; x < 2; x++) {
         for (int y = 0; y < 2; y++) {
           memcpy(thread_data->td->hash_value_buffer[x][y],
@@ -873,6 +1221,8 @@
         thread_data->td->mb.tmp_pred_bufs[j] =
             thread_data->td->tmp_pred_bufs[j];
       }
+      thread_data->td->mb.pixel_gradient_info =
+          thread_data->td->pixel_gradient_info;
 
       thread_data->td->mb.e_mbd.tmp_conv_dst = thread_data->td->mb.tmp_conv_dst;
       for (int j = 0; j < 2; ++j) {
@@ -896,10 +1246,20 @@
     worker->data1 = thread_data;
     worker->data2 = NULL;
 
+    thread_data->thread_id = i;
+    // Set the starting tile for each thread.
+    thread_data->start = i;
+
     thread_data->cpi = cpi;
     if (i == 0) {
       thread_data->td = &cpi->td;
+#if !CONFIG_FRAME_PARALLEL_ENCODE
     }
+#else
+    } else {
+      thread_data->td = thread_data->original_td;
+    }
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
 
     // Before encoding a frame, copy the thread data from cpi.
     if (thread_data->td != &cpi->td) {
@@ -955,13 +1315,14 @@
 int av1_get_max_num_workers(AV1_COMP *cpi) {
   int max_num_workers = 0;
   for (int i = MOD_FP; i < NUM_MT_MODULES; i++)
-    max_num_workers = AOMMAX(cpi->mt_info.num_mod_workers[i], max_num_workers);
+    max_num_workers =
+        AOMMAX(cpi->ppi->p_mt_info.num_mod_workers[i], max_num_workers);
   assert(max_num_workers >= 1);
   return AOMMIN(max_num_workers, cpi->oxcf.max_threads);
 }
 
 // Computes the number of workers for encoding stage (row/tile multi-threading)
-static AOM_INLINE int compute_num_enc_workers(AV1_COMP *cpi, int max_workers) {
+int av1_compute_num_enc_workers(AV1_COMP *cpi, int max_workers) {
   if (max_workers <= 1) return 1;
   if (cpi->oxcf.row_mt)
     return compute_num_enc_row_mt_workers(&cpi->common, max_workers);
@@ -981,12 +1342,8 @@
   if (cpi->allocated_tiles < tile_cols * tile_rows) av1_alloc_tile_data(cpi);
 
   av1_init_tile_data(cpi);
-  // Only run once to create threads and allocate thread data.
-  if (mt_info->enc_mt_buf_init_done == 0) {
-    create_enc_workers(cpi, num_workers);
-  } else {
-    num_workers = AOMMIN(num_workers, mt_info->num_workers);
-  }
+  num_workers = AOMMIN(num_workers, mt_info->num_workers);
+
   prepare_enc_workers(cpi, enc_worker_hook, num_workers);
   launch_workers(&cpi->mt_info, num_workers);
   sync_enc_workers(&cpi->mt_info, cm, num_workers);
@@ -1122,12 +1479,8 @@
     }
   }
 
-  // Only run once to create threads and allocate thread data.
-  if (mt_info->enc_mt_buf_init_done == 0) {
-    create_enc_workers(cpi, num_workers);
-  } else {
-    num_workers = AOMMIN(num_workers, mt_info->num_workers);
-  }
+  num_workers = AOMMIN(num_workers, mt_info->num_workers);
+
   assign_tile_to_thread(thread_id_to_tile_id, tile_cols * tile_rows,
                         num_workers);
   prepare_enc_workers(cpi, enc_row_mt_worker_hook, num_workers);
@@ -1192,9 +1545,6 @@
   }
 
   num_workers = AOMMIN(num_workers, mt_info->num_workers);
-  // Only run once to create threads and allocate thread data.
-  if (mt_info->fp_mt_buf_init_done == 0)
-    fp_create_enc_workers(cpi, num_workers);
   assign_tile_to_thread(thread_id_to_tile_id, tile_cols * tile_rows,
                         num_workers);
   fp_prepare_enc_workers(cpi, fp_enc_row_mt_worker_hook, num_workers);
@@ -1292,12 +1642,12 @@
   MACROBLOCKD *xd = &x->e_mbd;
   TplTxfmStats *tpl_txfm_stats = &thread_data->td->tpl_txfm_stats;
   CommonModeInfoParams *mi_params = &cm->mi_params;
-  BLOCK_SIZE bsize = convert_length_to_bsize(cpi->tpl_data.tpl_bsize_1d);
+  BLOCK_SIZE bsize = convert_length_to_bsize(cpi->ppi->tpl_data.tpl_bsize_1d);
   TX_SIZE tx_size = max_txsize_lookup[bsize];
   int mi_height = mi_size_high[bsize];
-  int num_active_workers = cpi->tpl_data.tpl_mt_sync.num_threads_working;
+  int num_active_workers = cpi->ppi->tpl_data.tpl_mt_sync.num_threads_working;
 
-  memset(tpl_txfm_stats, 0, sizeof(*tpl_txfm_stats));
+  av1_init_tpl_txfm_stats(tpl_txfm_stats);
 
   for (int mi_row = thread_data->start * mi_height; mi_row < mi_params->mi_rows;
        mi_row += num_active_workers * mi_height) {
@@ -1376,10 +1726,20 @@
     worker->data1 = thread_data;
     worker->data2 = NULL;
 
+    thread_data->thread_id = i;
+    // Set the starting tile for each thread.
+    thread_data->start = i;
+
     thread_data->cpi = cpi;
     if (i == 0) {
       thread_data->td = &cpi->td;
+#if !CONFIG_FRAME_PARALLEL_ENCODE
     }
+#else
+    } else {
+      thread_data->td = thread_data->original_td;
+    }
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
 
     // Before encoding a frame, copy the thread data from cpi.
     if (thread_data->td != &cpi->td) {
@@ -1394,21 +1754,17 @@
 }
 
 // Accumulate transform stats after tpl.
-static void tpl_accumulate_txfm_stats(AV1_COMP *cpi, int num_workers) {
-  double *total_abs_coeff_sum = cpi->td.tpl_txfm_stats.abs_coeff_sum;
-  int *txfm_block_count = &cpi->td.tpl_txfm_stats.txfm_block_count;
-  TplParams *tpl_data = &cpi->tpl_data;
-  int coeff_num = tpl_data->tpl_frame[tpl_data->frame_idx].coeff_num;
+static void tpl_accumulate_txfm_stats(ThreadData *main_td,
+                                      const MultiThreadInfo *mt_info,
+                                      int num_workers) {
+  TplTxfmStats *accumulated_stats = &main_td->tpl_txfm_stats;
   for (int i = num_workers - 1; i >= 0; i--) {
-    AVxWorker *const worker = &cpi->mt_info.workers[i];
+    AVxWorker *const worker = &mt_info->workers[i];
     EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
     ThreadData *td = thread_data->td;
-    if (td != &cpi->td) {
-      TplTxfmStats *tpl_txfm_stats = &td->tpl_txfm_stats;
-      *txfm_block_count += tpl_txfm_stats->txfm_block_count;
-      for (int j = 0; j < coeff_num; j++) {
-        total_abs_coeff_sum[j] += tpl_txfm_stats->abs_coeff_sum[j];
-      }
+    if (td != main_td) {
+      const TplTxfmStats *tpl_txfm_stats = &td->tpl_txfm_stats;
+      av1_accumulate_tpl_txfm_stats(tpl_txfm_stats, accumulated_stats);
     }
   }
 }
@@ -1418,7 +1774,7 @@
   AV1_COMMON *cm = &cpi->common;
   CommonModeInfoParams *mi_params = &cm->mi_params;
   MultiThreadInfo *mt_info = &cpi->mt_info;
-  TplParams *tpl_data = &cpi->tpl_data;
+  TplParams *tpl_data = &cpi->ppi->tpl_data;
   AV1TplRowMultiThreadSync *tpl_sync = &tpl_data->tpl_mt_sync;
   int mb_rows = mi_params->mb_rows;
   int num_workers =
@@ -1437,7 +1793,7 @@
   prepare_tpl_workers(cpi, tpl_worker_hook, num_workers);
   launch_workers(&cpi->mt_info, num_workers);
   sync_enc_workers(&cpi->mt_info, cm, num_workers);
-  tpl_accumulate_txfm_stats(cpi, num_workers);
+  tpl_accumulate_txfm_stats(&cpi->td, &cpi->mt_info, num_workers);
 }
 
 // Deallocate memory for temporal filter multi-thread synchronization.
@@ -1513,10 +1869,20 @@
     worker->data1 = thread_data;
     worker->data2 = NULL;
 
+    thread_data->thread_id = i;
+    // Set the starting tile for each thread.
+    thread_data->start = i;
+
     thread_data->cpi = cpi;
     if (i == 0) {
       thread_data->td = &cpi->td;
+#if !CONFIG_FRAME_PARALLEL_ENCODE
     }
+#else
+    } else {
+      thread_data->td = thread_data->original_td;
+    }
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
 
     // Before encoding a frame, copy the thread data from cpi.
     if (thread_data->td != &cpi->td) {
@@ -1699,7 +2065,20 @@
     worker->data1 = thread_data;
     worker->data2 = NULL;
 
+    thread_data->thread_id = i;
+    // Set the starting tile for each thread.
+    thread_data->start = i;
+
     thread_data->cpi = cpi;
+    if (i == 0) {
+      thread_data->td = &cpi->td;
+#if !CONFIG_FRAME_PARALLEL_ENCODE
+    }
+#else
+    } else {
+      thread_data->td = thread_data->original_td;
+    }
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
   }
 }
 
@@ -1792,6 +2171,331 @@
 }
 #endif  // !CONFIG_REALTIME_ONLY
 
+// Compare and order tiles based on absolute sum of tx coeffs.
+static int compare_tile_order(const void *a, const void *b) {
+  const PackBSTileOrder *const tile_a = (const PackBSTileOrder *)a;
+  const PackBSTileOrder *const tile_b = (const PackBSTileOrder *)b;
+
+  if (tile_a->abs_sum_level > tile_b->abs_sum_level)
+    return -1;
+  else if (tile_a->abs_sum_level == tile_b->abs_sum_level)
+    return (tile_a->tile_idx > tile_b->tile_idx ? 1 : -1);
+  else
+    return 1;
+}
+
+// Get next tile index to be processed for pack bitstream
+static AOM_INLINE int get_next_pack_bs_tile_idx(
+    AV1EncPackBSSync *const pack_bs_sync, const int num_tiles) {
+  assert(pack_bs_sync->next_job_idx <= num_tiles);
+  if (pack_bs_sync->next_job_idx == num_tiles) return -1;
+
+  return pack_bs_sync->pack_bs_tile_order[pack_bs_sync->next_job_idx++]
+      .tile_idx;
+}
+
+// Calculates bitstream chunk size based on total buffer size and tile or tile
+// group size.
+static AOM_INLINE size_t get_bs_chunk_size(int tg_or_tile_size,
+                                           const int frame_or_tg_size,
+                                           size_t *remain_buf_size,
+                                           size_t max_buf_size,
+                                           int is_last_chunk) {
+  size_t this_chunk_size;
+  assert(*remain_buf_size > 0);
+  if (is_last_chunk) {
+    this_chunk_size = *remain_buf_size;
+    *remain_buf_size = 0;
+  } else {
+    const uint64_t size_scale = (uint64_t)max_buf_size * tg_or_tile_size;
+    this_chunk_size = (size_t)(size_scale / frame_or_tg_size);
+    *remain_buf_size -= this_chunk_size;
+    assert(*remain_buf_size > 0);
+  }
+  assert(this_chunk_size > 0);
+  return this_chunk_size;
+}
+
+// Initializes params required for pack bitstream tile.
+static void init_tile_pack_bs_params(AV1_COMP *const cpi, uint8_t *const dst,
+                                     struct aom_write_bit_buffer *saved_wb,
+                                     PackBSParams *const pack_bs_params_arr,
+                                     uint8_t obu_extn_header) {
+  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+  AV1_COMMON *const cm = &cpi->common;
+  const CommonTileParams *const tiles = &cm->tiles;
+  const int num_tiles = tiles->cols * tiles->rows;
+  // Fixed size tile groups for the moment
+  const int num_tg_hdrs = cpi->num_tg;
+  // Tile group size in terms of number of tiles.
+  const int tg_size_in_tiles = (num_tiles + num_tg_hdrs - 1) / num_tg_hdrs;
+  uint8_t *tile_dst = dst;
+  uint8_t *tile_data_curr = dst;
+  // Max tile group count can not be more than MAX_TILES.
+  int tg_size_mi[MAX_TILES] = { 0 };  // Size of tile group in mi units
+  int tile_idx;
+  int tg_idx = 0;
+  int tile_count_in_tg = 0;
+  int new_tg = 1;
+
+  // Populate pack bitstream params of all tiles.
+  for (tile_idx = 0; tile_idx < num_tiles; tile_idx++) {
+    const TileInfo *const tile_info = &cpi->tile_data[tile_idx].tile_info;
+    PackBSParams *const pack_bs_params = &pack_bs_params_arr[tile_idx];
+    // Calculate tile size in mi units.
+    const int tile_size_mi = (tile_info->mi_col_end - tile_info->mi_col_start) *
+                             (tile_info->mi_row_end - tile_info->mi_row_start);
+    int is_last_tile_in_tg = 0;
+    tile_count_in_tg++;
+    if (tile_count_in_tg == tg_size_in_tiles || tile_idx == (num_tiles - 1))
+      is_last_tile_in_tg = 1;
+
+    // Populate pack bitstream params of this tile.
+    pack_bs_params->curr_tg_hdr_size = 0;
+    pack_bs_params->obu_extn_header = obu_extn_header;
+    pack_bs_params->saved_wb = saved_wb;
+    pack_bs_params->obu_header_size = 0;
+    pack_bs_params->is_last_tile_in_tg = is_last_tile_in_tg;
+    pack_bs_params->new_tg = new_tg;
+    pack_bs_params->tile_col = tile_info->tile_col;
+    pack_bs_params->tile_row = tile_info->tile_row;
+    pack_bs_params->tile_size_mi = tile_size_mi;
+    tg_size_mi[tg_idx] += tile_size_mi;
+
+    if (new_tg) new_tg = 0;
+    if (is_last_tile_in_tg) {
+      tile_count_in_tg = 0;
+      new_tg = 1;
+      tg_idx++;
+    }
+  }
+
+  assert(cpi->available_bs_size > 0);
+  size_t tg_buf_size[MAX_TILES] = { 0 };
+  size_t max_buf_size = cpi->available_bs_size;
+  size_t remain_buf_size = max_buf_size;
+  const int frame_size_mi = cm->mi_params.mi_rows * cm->mi_params.mi_cols;
+
+  tile_idx = 0;
+  // Prepare obu, tile group and frame header of each tile group.
+  for (tg_idx = 0; tg_idx < cpi->num_tg; tg_idx++) {
+    PackBSParams *const pack_bs_params = &pack_bs_params_arr[tile_idx];
+    int is_last_tg = tg_idx == cpi->num_tg - 1;
+    // Prorate bitstream buffer size based on tile group size and available
+    // buffer size. This buffer will be used to store headers and tile data.
+    tg_buf_size[tg_idx] =
+        get_bs_chunk_size(tg_size_mi[tg_idx], frame_size_mi, &remain_buf_size,
+                          max_buf_size, is_last_tg);
+
+    pack_bs_params->dst = tile_dst;
+    pack_bs_params->tile_data_curr = tile_dst;
+
+    // Write obu, tile group and frame header at first tile in the tile
+    // group.
+    av1_write_obu_tg_tile_headers(cpi, xd, pack_bs_params, tile_idx);
+    tile_dst += tg_buf_size[tg_idx];
+
+    // Exclude headers from tile group buffer size.
+    tg_buf_size[tg_idx] -= pack_bs_params->curr_tg_hdr_size;
+    tile_idx += tg_size_in_tiles;
+  }
+
+  tg_idx = 0;
+  // Calculate bitstream buffer size of each tile in the tile group.
+  for (tile_idx = 0; tile_idx < num_tiles; tile_idx++) {
+    PackBSParams *const pack_bs_params = &pack_bs_params_arr[tile_idx];
+
+    if (pack_bs_params->new_tg) {
+      max_buf_size = tg_buf_size[tg_idx];
+      remain_buf_size = max_buf_size;
+    }
+
+    // Prorate bitstream buffer size of this tile based on tile size and
+    // available buffer size. For this proration, header size is not accounted.
+    const size_t tile_buf_size = get_bs_chunk_size(
+        pack_bs_params->tile_size_mi, tg_size_mi[tg_idx], &remain_buf_size,
+        max_buf_size, pack_bs_params->is_last_tile_in_tg);
+    pack_bs_params->tile_buf_size = tile_buf_size;
+
+    // Update base address of bitstream buffer for tile and tile group.
+    if (pack_bs_params->new_tg) {
+      tile_dst = pack_bs_params->dst;
+      tile_data_curr = pack_bs_params->tile_data_curr;
+      // Account header size in first tile of a tile group.
+      pack_bs_params->tile_buf_size += pack_bs_params->curr_tg_hdr_size;
+    } else {
+      pack_bs_params->dst = tile_dst;
+      pack_bs_params->tile_data_curr = tile_data_curr;
+    }
+
+    if (pack_bs_params->is_last_tile_in_tg) tg_idx++;
+    tile_dst += pack_bs_params->tile_buf_size;
+  }
+}
+
+// Worker hook function of pack bitstream multithreading.
+static int pack_bs_worker_hook(void *arg1, void *arg2) {
+  EncWorkerData *const thread_data = (EncWorkerData *)arg1;
+  PackBSParams *const pack_bs_params = (PackBSParams *)arg2;
+  AV1_COMP *const cpi = thread_data->cpi;
+  AV1_COMMON *const cm = &cpi->common;
+  AV1EncPackBSSync *const pack_bs_sync = &cpi->mt_info.pack_bs_sync;
+  const CommonTileParams *const tiles = &cm->tiles;
+  const int num_tiles = tiles->cols * tiles->rows;
+
+  while (1) {
+#if CONFIG_MULTITHREAD
+    pthread_mutex_lock(pack_bs_sync->mutex_);
+#endif
+    const int tile_idx = get_next_pack_bs_tile_idx(pack_bs_sync, num_tiles);
+#if CONFIG_MULTITHREAD
+    pthread_mutex_unlock(pack_bs_sync->mutex_);
+#endif
+    if (tile_idx == -1) break;
+    TileDataEnc *this_tile = &cpi->tile_data[tile_idx];
+    thread_data->td->mb.e_mbd.tile_ctx = &this_tile->tctx;
+
+    av1_pack_tile_info(cpi, thread_data->td, &pack_bs_params[tile_idx]);
+  }
+
+  return 1;
+}
+
+// Prepares thread data and workers of pack bitstream multithreading.
+static void prepare_pack_bs_workers(AV1_COMP *const cpi,
+                                    PackBSParams *const pack_bs_params,
+                                    AVxWorkerHook hook, const int num_workers) {
+  MultiThreadInfo *const mt_info = &cpi->mt_info;
+  for (int i = num_workers - 1; i >= 0; i--) {
+    AVxWorker *worker = &mt_info->workers[i];
+    EncWorkerData *const thread_data = &mt_info->tile_thr_data[i];
+    if (i == 0) {
+      thread_data->td = &cpi->td;
+#if !CONFIG_FRAME_PARALLEL_ENCODE
+    }
+#else
+    } else {
+      thread_data->td = thread_data->original_td;
+    }
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+
+    if (thread_data->td != &cpi->td) thread_data->td->mb = cpi->td.mb;
+
+    thread_data->cpi = cpi;
+    thread_data->start = i;
+    thread_data->thread_id = i;
+    av1_reset_pack_bs_thread_data(thread_data->td);
+
+    worker->hook = hook;
+    worker->data1 = thread_data;
+    worker->data2 = pack_bs_params;
+  }
+
+  AV1_COMMON *const cm = &cpi->common;
+  AV1EncPackBSSync *const pack_bs_sync = &mt_info->pack_bs_sync;
+  const uint16_t num_tiles = cm->tiles.rows * cm->tiles.cols;
+  pack_bs_sync->next_job_idx = 0;
+
+  PackBSTileOrder *const pack_bs_tile_order = pack_bs_sync->pack_bs_tile_order;
+  // Reset tile order data of pack bitstream
+  av1_zero_array(pack_bs_tile_order, num_tiles);
+
+  // Populate pack bitstream tile order structure
+  for (uint16_t tile_idx = 0; tile_idx < num_tiles; tile_idx++) {
+    pack_bs_tile_order[tile_idx].abs_sum_level =
+        cpi->tile_data[tile_idx].abs_sum_level;
+    pack_bs_tile_order[tile_idx].tile_idx = tile_idx;
+  }
+
+  // Sort tiles in descending order based on tile area.
+  qsort(pack_bs_tile_order, num_tiles, sizeof(*pack_bs_tile_order),
+        compare_tile_order);
+}
+
+// Accumulates data after pack bitsteam processing.
+static void accumulate_pack_bs_data(
+    AV1_COMP *const cpi, const PackBSParams *const pack_bs_params_arr,
+    uint8_t *const dst, uint32_t *total_size, const FrameHeaderInfo *fh_info,
+    int *const largest_tile_id, unsigned int *max_tile_size,
+    uint32_t *const obu_header_size, uint8_t **tile_data_start,
+    const int num_workers) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const CommonTileParams *const tiles = &cm->tiles;
+  const int tile_count = tiles->cols * tiles->rows;
+  // Fixed size tile groups for the moment
+  size_t curr_tg_data_size = 0;
+  int is_first_tg = 1;
+  uint8_t *curr_tg_start = dst;
+  size_t src_offset = 0;
+  size_t dst_offset = 0;
+
+  for (int tile_idx = 0; tile_idx < tile_count; tile_idx++) {
+    // PackBSParams stores all parameters required to pack tile and header
+    // info.
+    const PackBSParams *const pack_bs_params = &pack_bs_params_arr[tile_idx];
+    uint32_t tile_size = 0;
+
+    if (pack_bs_params->new_tg) {
+      curr_tg_start = dst + *total_size;
+      curr_tg_data_size = pack_bs_params->curr_tg_hdr_size;
+      *tile_data_start += pack_bs_params->curr_tg_hdr_size;
+      *obu_header_size = pack_bs_params->obu_header_size;
+    }
+    curr_tg_data_size +=
+        pack_bs_params->buf.size + (pack_bs_params->is_last_tile_in_tg ? 0 : 4);
+
+    if (pack_bs_params->buf.size > *max_tile_size) {
+      *largest_tile_id = tile_idx;
+      *max_tile_size = (unsigned int)pack_bs_params->buf.size;
+    }
+    tile_size +=
+        (uint32_t)pack_bs_params->buf.size + *pack_bs_params->total_size;
+
+    // Pack all the chunks of tile bitstreams together
+    if (tile_idx != 0) memmove(dst + dst_offset, dst + src_offset, tile_size);
+
+    if (pack_bs_params->is_last_tile_in_tg)
+      av1_write_last_tile_info(
+          cpi, fh_info, pack_bs_params->saved_wb, &curr_tg_data_size,
+          curr_tg_start, &tile_size, tile_data_start, largest_tile_id,
+          &is_first_tg, *obu_header_size, pack_bs_params->obu_extn_header);
+    src_offset += pack_bs_params->tile_buf_size;
+    dst_offset += tile_size;
+    *total_size += tile_size;
+  }
+
+  // Accumulate thread data
+  MultiThreadInfo *const mt_info = &cpi->mt_info;
+  for (int idx = num_workers - 1; idx >= 0; idx--) {
+    ThreadData const *td = mt_info->tile_thr_data[idx].td;
+    av1_accumulate_pack_bs_thread_data(cpi, td);
+  }
+}
+
+void av1_write_tile_obu_mt(
+    AV1_COMP *const cpi, uint8_t *const dst, uint32_t *total_size,
+    struct aom_write_bit_buffer *saved_wb, uint8_t obu_extn_header,
+    const FrameHeaderInfo *fh_info, int *const largest_tile_id,
+    unsigned int *max_tile_size, uint32_t *const obu_header_size,
+    uint8_t **tile_data_start, const int num_workers) {
+  MultiThreadInfo *const mt_info = &cpi->mt_info;
+
+  PackBSParams pack_bs_params[MAX_TILES];
+  uint32_t tile_size[MAX_TILES] = { 0 };
+
+  for (int tile_idx = 0; tile_idx < MAX_TILES; tile_idx++)
+    pack_bs_params[tile_idx].total_size = &tile_size[tile_idx];
+
+  init_tile_pack_bs_params(cpi, dst, saved_wb, pack_bs_params, obu_extn_header);
+  prepare_pack_bs_workers(cpi, pack_bs_params, pack_bs_worker_hook,
+                          num_workers);
+  launch_workers(mt_info, num_workers);
+  sync_enc_workers(mt_info, &cpi->common, num_workers);
+  accumulate_pack_bs_data(cpi, pack_bs_params, dst, total_size, fh_info,
+                          largest_tile_id, max_tile_size, obu_header_size,
+                          tile_data_start, num_workers);
+}
+
 // Deallocate memory for CDEF search multi-thread synchronization.
 void av1_cdef_mt_dealloc(AV1CdefSync *cdef_sync) {
   (void)cdef_sync;
@@ -1820,6 +2524,9 @@
 
 // Initializes cdef_sync parameters.
 static AOM_INLINE void cdef_reset_job_info(AV1CdefSync *cdef_sync) {
+#if CONFIG_MULTITHREAD
+  if (cdef_sync->mutex_) pthread_mutex_init(cdef_sync->mutex_, NULL);
+#endif  // CONFIG_MULTITHREAD
   cdef_sync->end_of_frame = 0;
   cdef_sync->fbr = 0;
   cdef_sync->fbc = 0;
@@ -1904,8 +2611,8 @@
   // For single-pass encode, using no. of workers as per tf block size was not
   // found to improve speed. Hence the thread assignment for single-pass encode
   // is kept based on compute_num_enc_workers().
-  if (cpi->oxcf.pass != 2)
-    return (compute_num_enc_workers(cpi, cpi->oxcf.max_threads));
+  if (cpi->oxcf.pass < AOM_RC_SECOND_PASS)
+    return (av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads));
 
   if (cpi->oxcf.max_threads <= 1) return 1;
 
@@ -1918,44 +2625,56 @@
 
 // Computes num_workers for tpl multi-threading.
 static AOM_INLINE int compute_num_tpl_workers(AV1_COMP *cpi) {
-  return compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
+  return av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
 }
 
 // Computes num_workers for loop filter multi-threading.
 static AOM_INLINE int compute_num_lf_workers(AV1_COMP *cpi) {
-  return compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
+  return av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
 }
 
 // Computes num_workers for cdef multi-threading.
 static AOM_INLINE int compute_num_cdef_workers(AV1_COMP *cpi) {
-  return compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
+  return av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
 }
 
 // Computes num_workers for loop-restoration multi-threading.
 static AOM_INLINE int compute_num_lr_workers(AV1_COMP *cpi) {
-  return compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
+  return av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
+}
+
+// Computes num_workers for pack bitstream multi-threading.
+static AOM_INLINE int compute_num_pack_bs_workers(AV1_COMP *cpi) {
+  if (cpi->oxcf.max_threads <= 1) return 1;
+  return compute_num_enc_tile_mt_workers(&cpi->common, cpi->oxcf.max_threads);
 }
 
 int compute_num_mod_workers(AV1_COMP *cpi, MULTI_THREADED_MODULES mod_name) {
   int num_mod_workers = 0;
   switch (mod_name) {
     case MOD_FP:
-      if (cpi->oxcf.pass == 2)
+      if (cpi->oxcf.pass >= AOM_RC_SECOND_PASS)
         num_mod_workers = 0;
       else
-        num_mod_workers = compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
+        num_mod_workers =
+            av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
       break;
     case MOD_TF: num_mod_workers = compute_num_tf_workers(cpi); break;
     case MOD_TPL: num_mod_workers = compute_num_tpl_workers(cpi); break;
     case MOD_GME: num_mod_workers = 1; break;
     case MOD_ENC:
-      num_mod_workers = compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
+      num_mod_workers = av1_compute_num_enc_workers(cpi, cpi->oxcf.max_threads);
       break;
     case MOD_LPF: num_mod_workers = compute_num_lf_workers(cpi); break;
     case MOD_CDEF_SEARCH:
       num_mod_workers = compute_num_cdef_workers(cpi);
       break;
+    case MOD_CDEF: num_mod_workers = compute_num_cdef_workers(cpi); break;
     case MOD_LR: num_mod_workers = compute_num_lr_workers(cpi); break;
+    case MOD_PACK_BS: num_mod_workers = compute_num_pack_bs_workers(cpi); break;
+    case MOD_FRAME_ENC:
+      num_mod_workers = cpi->ppi->p_mt_info.num_mod_workers[MOD_FRAME_ENC];
+      break;
     default: assert(0); break;
   }
   return (num_mod_workers);
@@ -1963,6 +2682,6 @@
 // Computes the number of workers for each MT modules in the encoder
 void av1_compute_num_workers_for_mt(AV1_COMP *cpi) {
   for (int i = MOD_FP; i < NUM_MT_MODULES; i++)
-    cpi->mt_info.num_mod_workers[i] =
+    cpi->ppi->p_mt_info.num_mod_workers[i] =
         compute_num_mod_workers(cpi, (MULTI_THREADED_MODULES)i);
 }
diff --git a/av1/encoder/ethread.h b/av1/encoder/ethread.h
index 55e7f7b..bd75664 100644
--- a/av1/encoder/ethread.h
+++ b/av1/encoder/ethread.h
@@ -22,6 +22,9 @@
 typedef struct EncWorkerData {
   struct AV1_COMP *cpi;
   struct ThreadData *td;
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  struct ThreadData *original_td;
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
   int start;
   int thread_id;
 } EncWorkerData;
@@ -78,15 +81,46 @@
 
 int av1_get_max_num_workers(AV1_COMP *cpi);
 
-void av1_create_workers(AV1_COMP *cpi, int num_workers);
+void av1_create_workers(AV1_PRIMARY *ppi, int num_workers);
 
-void av1_create_second_pass_workers(AV1_COMP *cpi, int num_workers);
+void av1_init_frame_mt(AV1_PRIMARY *ppi, AV1_COMP *cpi);
+
+void av1_init_cdef_worker(AV1_COMP *cpi);
+
+#if !CONFIG_REALTIME_ONLY
+void av1_init_lr_mt_buffers(AV1_COMP *cpi);
+#endif
+
+#if CONFIG_MULTITHREAD
+void av1_init_mt_sync(AV1_COMP *cpi, int is_first_pass);
+#endif  // CONFIG_MULTITHREAD
+
+int av1_get_num_mod_workers_for_alloc(PrimaryMultiThreadInfo *const p_mt_info,
+                                      MULTI_THREADED_MODULES mod_name);
+
+void av1_init_tile_thread_data(AV1_PRIMARY *ppi, int is_first_pass);
 
 void av1_cdef_mse_calc_frame_mt(AV1_COMMON *cm, MultiThreadInfo *mt_info,
                                 CdefSearchCtx *cdef_search_ctx);
 
 void av1_cdef_mt_dealloc(AV1CdefSync *cdef_sync);
 
+void av1_write_tile_obu_mt(
+    AV1_COMP *const cpi, uint8_t *const dst, uint32_t *total_size,
+    struct aom_write_bit_buffer *saved_wb, uint8_t obu_extn_header,
+    const FrameHeaderInfo *fh_info, int *const largest_tile_id,
+    unsigned int *max_tile_size, uint32_t *const obu_header_size,
+    uint8_t **tile_data_start, const int num_workers);
+
+int av1_compute_num_enc_workers(AV1_COMP *cpi, int max_workers);
+
+#if CONFIG_FRAME_PARALLEL_ENCODE
+int av1_compute_num_fp_contexts(AV1_PRIMARY *ppi, AV1EncoderConfig *oxcf);
+
+int av1_compress_parallel_frames(AV1_PRIMARY *const ppi,
+                                 AV1_COMP_DATA *const first_cpi_data);
+#endif
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/av1/encoder/external_partition.c b/av1/encoder/external_partition.c
new file mode 100644
index 0000000..79f8b4c
--- /dev/null
+++ b/av1/encoder/external_partition.c
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/common.h"
+#include "av1/encoder/external_partition.h"
+
+aom_codec_err_t av1_ext_part_create(aom_ext_part_funcs_t funcs,
+                                    aom_ext_part_config_t config,
+                                    ExtPartController *ext_part_controller) {
+  if (ext_part_controller == NULL) {
+    return AOM_CODEC_INVALID_PARAM;
+  }
+  ext_part_controller->funcs = funcs;
+  ext_part_controller->config = config;
+  const aom_ext_part_status_t status = ext_part_controller->funcs.create_model(
+      ext_part_controller->funcs.priv, &ext_part_controller->config,
+      &ext_part_controller->model);
+  if (status == AOM_EXT_PART_ERROR) {
+    return AOM_CODEC_ERROR;
+  } else if (status == AOM_EXT_PART_TEST) {
+    ext_part_controller->test_mode = 1;
+    ext_part_controller->ready = 0;
+    return AOM_CODEC_OK;
+  }
+  assert(status == AOM_EXT_PART_OK);
+  ext_part_controller->ready = 1;
+  return AOM_CODEC_OK;
+}
+
+aom_codec_err_t av1_ext_part_init(ExtPartController *ext_part_controller) {
+  if (ext_part_controller == NULL) {
+    return AOM_CODEC_INVALID_PARAM;
+  }
+  av1_zero(ext_part_controller);
+  return AOM_CODEC_OK;
+}
+
+aom_codec_err_t av1_ext_part_delete(ExtPartController *ext_part_controller) {
+  if (ext_part_controller == NULL) {
+    return AOM_CODEC_INVALID_PARAM;
+  }
+  if (ext_part_controller->ready) {
+    const aom_ext_part_status_t status =
+        ext_part_controller->funcs.delete_model(ext_part_controller->model);
+    if (status != AOM_EXT_PART_OK) {
+      return AOM_CODEC_ERROR;
+    }
+  }
+  return av1_ext_part_init(ext_part_controller);
+}
+
+bool av1_ext_part_get_partition_decision(ExtPartController *ext_part_controller,
+                                         aom_partition_decision_t *decision) {
+  assert(ext_part_controller != NULL);
+  assert(ext_part_controller->ready);
+  assert(decision != NULL);
+  const aom_ext_part_status_t status =
+      ext_part_controller->funcs.get_partition_decision(
+          ext_part_controller->model, decision);
+  if (status != AOM_EXT_PART_OK) return false;
+  return true;
+}
+
+bool av1_ext_part_send_partition_stats(ExtPartController *ext_part_controller,
+                                       const aom_partition_stats_t *stats) {
+  assert(ext_part_controller != NULL);
+  assert(ext_part_controller->ready);
+  assert(stats != NULL);
+  const aom_ext_part_status_t status =
+      ext_part_controller->funcs.send_partition_stats(
+          ext_part_controller->model, stats);
+  if (status != AOM_EXT_PART_OK) return false;
+  return true;
+}
+
+bool av1_ext_part_send_features(ExtPartController *ext_part_controller,
+                                const aom_partition_features_t *features) {
+  assert(ext_part_controller != NULL);
+  assert(ext_part_controller->ready);
+  assert(features != NULL);
+  const aom_ext_part_status_t status = ext_part_controller->funcs.send_features(
+      ext_part_controller->model, features);
+  if (status != AOM_EXT_PART_OK) return false;
+  return true;
+}
+
+aom_ext_part_decision_mode_t av1_get_ext_part_decision_mode(
+    const ExtPartController *ext_part_controller) {
+  return ext_part_controller->funcs.decision_mode;
+}
diff --git a/av1/encoder/external_partition.h b/av1/encoder/external_partition.h
new file mode 100644
index 0000000..f74973e
--- /dev/null
+++ b/av1/encoder/external_partition.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_EXTERNAL_PARTITION_H_
+#define AOM_AV1_ENCODER_EXTERNAL_PARTITION_H_
+
+#include <stdbool.h>
+
+#include "aom/aom_codec.h"
+#include "aom/aom_external_partition.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*!\cond */
+
+typedef struct ExtPartController {
+  int ready;
+  int test_mode;
+  aom_ext_part_config_t config;
+  aom_ext_part_model_t model;
+  aom_ext_part_funcs_t funcs;
+} ExtPartController;
+
+aom_codec_err_t av1_ext_part_create(aom_ext_part_funcs_t funcs,
+                                    aom_ext_part_config_t config,
+                                    ExtPartController *ext_part_controller);
+
+aom_codec_err_t av1_ext_part_init(ExtPartController *ext_part_controller);
+
+aom_codec_err_t av1_ext_part_delete(ExtPartController *ext_part_controller);
+
+bool av1_ext_part_get_partition_decision(ExtPartController *ext_part_controller,
+                                         aom_partition_decision_t *decision);
+
+bool av1_ext_part_send_partition_stats(ExtPartController *ext_part_controller,
+                                       const aom_partition_stats_t *stats);
+
+bool av1_ext_part_send_features(ExtPartController *ext_part_controller,
+                                const aom_partition_features_t *features);
+
+aom_ext_part_decision_mode_t av1_get_ext_part_decision_mode(
+    const ExtPartController *ext_part_controller);
+
+/*!\endcond */
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_EXTERNAL_PARTITION_H_
diff --git a/av1/encoder/firstpass.c b/av1/encoder/firstpass.c
index 08fc498..af1e8c1 100644
--- a/av1/encoder/firstpass.c
+++ b/av1/encoder/firstpass.c
@@ -20,13 +20,13 @@
 #include "aom_dsp/variance.h"
 #include "aom_mem/aom_mem.h"
 #include "aom_ports/mem.h"
-#include "aom_ports/system_state.h"
 #include "aom_scale/aom_scale.h"
 #include "aom_scale/yv12config.h"
 
 #include "av1/common/entropymv.h"
 #include "av1/common/quant_common.h"
 #include "av1/common/reconinter.h"  // av1_setup_dst_planes()
+#include "av1/common/reconintra.h"
 #include "av1/common/txb_common.h"
 #include "av1/encoder/aq_variance.h"
 #include "av1/encoder/av1_quantize.h"
@@ -54,6 +54,8 @@
 #define NCOUNT_INTRA_THRESH 8192
 #define NCOUNT_INTRA_FACTOR 3
 
+#define INVALID_FP_STATS_TO_PREDICT_FLAT_GOP -1
+
 static AOM_INLINE void output_stats(FIRSTPASS_STATS *stats,
                                     struct aom_codec_pkt_list *pktlist) {
   struct aom_codec_cx_pkt pkt;
@@ -108,6 +110,9 @@
   section->new_mv_count = 0.0;
   section->count = 0.0;
   section->duration = 1.0;
+  section->is_flash = 0;
+  section->noise_var = 0;
+  section->cor_coeff = 1.0;
 }
 
 void av1_accumulate_stats(FIRSTPASS_STATS *section,
@@ -177,8 +182,8 @@
 }
 
 void av1_end_first_pass(AV1_COMP *cpi) {
-  if (cpi->twopass.stats_buf_ctx->total_stats && !cpi->lap_enabled)
-    output_stats(cpi->twopass.stats_buf_ctx->total_stats,
+  if (cpi->ppi->twopass.stats_buf_ctx->total_stats && !cpi->ppi->lap_enabled)
+    output_stats(cpi->ppi->twopass.stats_buf_ctx->total_stats,
                  cpi->ppi->output_pkt_list);
 }
 
@@ -262,15 +267,12 @@
   const BLOCK_SIZE bsize = xd->mi[0]->bsize;
   const int new_mv_mode_penalty = NEW_MV_MODE_PENALTY;
   const int sr = get_search_range(&cpi->initial_dimensions);
-  const int step_param = 3 + sr;
+  const int step_param = cpi->sf.fp_sf.reduce_mv_step_param + sr;
 
   const search_site_config *first_pass_search_sites =
       cpi->mv_search_params.search_site_cfg[SS_CFG_FPF];
   const int fine_search_interval =
       cpi->is_screen_content_type && cpi->common.features.allow_intrabc;
-  if (fine_search_interval) {
-    av1_set_speed_features_framesize_independent(cpi, cpi->oxcf.speed);
-  }
   FULLPEL_MOTION_SEARCH_PARAMS ms_params;
   av1_make_default_fullpel_ms_params(&ms_params, cpi, x, bsize, ref_mv,
                                      first_pass_search_sites,
@@ -282,7 +284,7 @@
                                   &this_best_mv, NULL);
 
   if (tmp_err < INT_MAX) {
-    aom_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[bsize];
+    aom_variance_fn_ptr_t v_fn_ptr = cpi->ppi->fn_ptr[bsize];
     const MSBuffers *ms_buffers = &ms_params.ms_buffers;
     tmp_err = av1_get_mvpred_sse(&ms_params.mv_cost_params, this_best_mv,
                                  &v_fn_ptr, ms_buffers->src, ms_buffers->ref) +
@@ -329,7 +331,6 @@
 }
 
 static int find_fp_qindex(aom_bit_depth_t bit_depth) {
-  aom_clear_system_state();
   return av1_find_qindex(FIRST_PASS_Q, bit_depth, 0, QINDEX_RANGE - 1);
 }
 
@@ -356,6 +357,79 @@
   return raw_err_stdev;
 }
 
+static AOM_INLINE int calc_wavelet_energy(const AV1EncoderConfig *oxcf) {
+  return oxcf->q_cfg.deltaq_mode == DELTA_Q_PERCEPTUAL;
+}
+typedef struct intra_pred_block_pass1_args {
+  const SequenceHeader *seq_params;
+  MACROBLOCK *x;
+} intra_pred_block_pass1_args;
+
+static INLINE void copy_rect(uint8_t *dst, int dstride, const uint8_t *src,
+                             int sstride, int width, int height, int use_hbd) {
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (use_hbd) {
+    aom_highbd_convolve_copy(CONVERT_TO_SHORTPTR(src), sstride,
+                             CONVERT_TO_SHORTPTR(dst), dstride, width, height);
+  } else {
+    aom_convolve_copy(src, sstride, dst, dstride, width, height);
+  }
+#else
+  (void)use_hbd;
+  aom_convolve_copy(src, sstride, dst, dstride, width, height);
+#endif
+}
+
+static void first_pass_intra_pred_and_calc_diff(int plane, int block,
+                                                int blk_row, int blk_col,
+                                                BLOCK_SIZE plane_bsize,
+                                                TX_SIZE tx_size, void *arg) {
+  (void)block;
+  struct intra_pred_block_pass1_args *const args = arg;
+  MACROBLOCK *const x = args->x;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MACROBLOCKD_PLANE *const pd = &xd->plane[plane];
+  MACROBLOCK_PLANE *const p = &x->plane[plane];
+  const int dst_stride = pd->dst.stride;
+  uint8_t *dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2];
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
+  const SequenceHeader *seq_params = args->seq_params;
+  const int src_stride = p->src.stride;
+  uint8_t *src = &p->src.buf[(blk_row * src_stride + blk_col) << MI_SIZE_LOG2];
+
+  av1_predict_intra_block(
+      xd, seq_params->sb_size, seq_params->enable_intra_edge_filter, pd->width,
+      pd->height, tx_size, mbmi->mode, 0, 0, FILTER_INTRA_MODES, src,
+      src_stride, dst, dst_stride, blk_col, blk_row, plane);
+
+  av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size);
+}
+
+static void first_pass_predict_intra_block_for_luma_plane(
+    const SequenceHeader *seq_params, MACROBLOCK *x, BLOCK_SIZE bsize) {
+  assert(bsize < BLOCK_SIZES_ALL);
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const int plane = AOM_PLANE_Y;
+  const MACROBLOCKD_PLANE *const pd = &xd->plane[plane];
+  const int ss_x = pd->subsampling_x;
+  const int ss_y = pd->subsampling_y;
+  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y);
+  const int dst_stride = pd->dst.stride;
+  uint8_t *dst = pd->dst.buf;
+  const MACROBLOCK_PLANE *const p = &x->plane[plane];
+  const int src_stride = p->src.stride;
+  const uint8_t *src = p->src.buf;
+
+  intra_pred_block_pass1_args args = { seq_params, x };
+  av1_foreach_transformed_block_in_plane(
+      xd, plane_bsize, plane, first_pass_intra_pred_and_calc_diff, &args);
+
+  // copy source data to recon buffer, as the recon buffer will be used as a
+  // reference frame subsequently.
+  copy_rect(dst, dst_stride, src, src_stride, block_size_wide[bsize],
+            block_size_high[bsize], seq_params->use_highbitdepth);
+}
+
 #define UL_INTRA_THRESH 50
 #define INVALID_ROW -1
 // Computes and returns the intra pred error of a block.
@@ -389,16 +463,14 @@
     const int qindex, FRAME_STATS *const stats) {
   const AV1_COMMON *const cm = &cpi->common;
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
-  const SequenceHeader *const seq_params = &cm->seq_params;
+  const SequenceHeader *const seq_params = cm->seq_params;
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   const int unit_scale = mi_size_wide[fp_block_size];
-  const int use_dc_pred = (unit_col || unit_row) && (!unit_col || !unit_row);
   const int num_planes = av1_num_planes(cm);
   const BLOCK_SIZE bsize =
       get_bsize(mi_params, fp_block_size, unit_row, unit_col);
 
-  aom_clear_system_state();
   set_mi_offsets(mi_params, xd, unit_row * unit_scale, unit_col * unit_scale);
   xd->plane[0].dst.buf = this_frame->y_buffer + y_offset;
   xd->plane[1].dst.buf = this_frame->u_buffer + uv_offset;
@@ -413,9 +485,12 @@
   xd->mi[0]->segment_id = 0;
   xd->lossless[xd->mi[0]->segment_id] = (qindex == 0);
   xd->mi[0]->mode = DC_PRED;
-  xd->mi[0]->tx_size = use_dc_pred ? max_txsize_lookup[bsize] : TX_4X4;
+  xd->mi[0]->tx_size = TX_4X4;
 
-  av1_encode_intra_block_plane(cpi, x, bsize, 0, DRY_RUN_NORMAL, 0);
+  if (cpi->sf.fp_sf.disable_recon)
+    first_pass_predict_intra_block_for_luma_plane(seq_params, x, bsize);
+  else
+    av1_encode_intra_block_plane(cpi, x, bsize, 0, DRY_RUN_NORMAL, 0);
   int this_intra_error = aom_get_mb_ss(x->plane[0].src_diff);
   if (seq_params->use_highbitdepth) {
     switch (seq_params->bit_depth) {
@@ -436,7 +511,6 @@
     stats->image_data_start_row = unit_row;
   }
 
-  aom_clear_system_state();
   double log_intra = log(this_intra_error + 1.0);
   if (log_intra < 10.0) {
     stats->intra_factor += 1.0 + ((10.0 - log_intra) * 0.05);
@@ -481,16 +555,22 @@
   // Accumulate the intra error.
   stats->intra_error += (int64_t)this_intra_error;
 
-  const int hbd = is_cur_buf_hbd(xd);
-  const int stride = x->plane[0].src.stride;
-  const int num_8x8_rows = block_size_high[fp_block_size] / 8;
-  const int num_8x8_cols = block_size_wide[fp_block_size] / 8;
-  const uint8_t *buf = x->plane[0].src.buf;
-  for (int r8 = 0; r8 < num_8x8_rows; ++r8) {
-    for (int c8 = 0; c8 < num_8x8_cols; ++c8) {
-      stats->frame_avg_wavelet_energy += av1_haar_ac_sad_8x8_uint8_input(
-          buf + c8 * 8 + r8 * 8 * stride, stride, hbd);
-    }
+  // Stats based on wavelet energy is used in the following cases :
+  // 1. ML model which predicts if a flat structure (golden-frame only structure
+  // without ALT-REF and Internal-ARFs) is better. This ML model is enabled in
+  // constant quality mode under certain conditions.
+  // 2. Delta qindex mode is set as DELTA_Q_PERCEPTUAL.
+  // Thus, wavelet energy calculation is enabled for the above cases.
+  if (calc_wavelet_energy(&cpi->oxcf)) {
+    const int hbd = is_cur_buf_hbd(xd);
+    const int stride = x->plane[0].src.stride;
+    const int num_8x8_rows = block_size_high[fp_block_size] / 8;
+    const int num_8x8_cols = block_size_wide[fp_block_size] / 8;
+    const uint8_t *buf = x->plane[0].src.buf;
+    stats->frame_avg_wavelet_energy += av1_haar_ac_sad_mxn_uint8_input(
+        buf, stride, hbd, num_8x8_rows, num_8x8_cols);
+  } else {
+    stats->frame_avg_wavelet_energy = INVALID_FP_STATS_TO_PREDICT_FLAT_GOP;
   }
 
   return this_intra_error;
@@ -517,13 +597,13 @@
 static void accumulate_mv_stats(const MV best_mv, const FULLPEL_MV mv,
                                 const int mb_row, const int mb_col,
                                 const int mb_rows, const int mb_cols,
-                                MV *last_mv, FRAME_STATS *stats) {
+                                MV *last_non_zero_mv, FRAME_STATS *stats) {
   if (is_zero_mv(&best_mv)) return;
 
   ++stats->mv_count;
   // Non-zero vector, was it different from the last non zero vector?
-  if (!is_equal_mv(&best_mv, last_mv)) ++stats->new_mv_count;
-  *last_mv = best_mv;
+  if (!is_equal_mv(&best_mv, last_non_zero_mv)) ++stats->new_mv_count;
+  *last_non_zero_mv = best_mv;
 
   // Does the row vector point inwards or outwards?
   if (mb_row < mb_rows / 2) {
@@ -556,7 +636,6 @@
   }
 }
 
-#define LOW_MOTION_ERROR_THRESH 25
 // Computes and returns the inter prediction error from the last frame.
 // Computes inter prediction errors from the golden and alt ref frams and
 // Updates stats accordingly.
@@ -564,7 +643,6 @@
 //   cpi: the encoder setting. Only a few params in it will be used.
 //   last_frame: the frame buffer of the last frame.
 //   golden_frame: the frame buffer of the golden frame.
-//   alt_ref_frame: the frame buffer of the alt ref frame.
 //   unit_row: row index in the unit of first pass block size.
 //   unit_col: column index in the unit of first pass block size.
 //   recon_yoffset: the y offset of the reconstructed  frame buffer,
@@ -572,13 +650,13 @@
 //   recont_uvoffset: the u/v offset of the reconstructed frame buffer,
 //                    indicating the starting point of the current block.
 //   src_yoffset: the y offset of the source frame buffer.
-//   alt_ref_frame_offset: the y offset of the alt ref frame buffer.
 //   fp_block_size: first pass block size.
 //   this_intra_error: the intra prediction error of this block.
 //   raw_motion_err_counts: the count of raw motion vectors.
 //   raw_motion_err_list: the array that records the raw motion error.
-//   best_ref_mv: best reference mv found so far.
-//   last_mv: last mv.
+//   ref_mv: the reference used to start the motion search
+//   best_mv: the best mv found
+//   last_non_zero_mv: the last non zero mv found in this tile row.
 //   stats: frame encoding stats.
 //  Modifies:
 //    raw_motion_err_list
@@ -589,13 +667,12 @@
 //    this_inter_error
 static int firstpass_inter_prediction(
     AV1_COMP *cpi, ThreadData *td, const YV12_BUFFER_CONFIG *const last_frame,
-    const YV12_BUFFER_CONFIG *const golden_frame,
-    const YV12_BUFFER_CONFIG *const alt_ref_frame, const int unit_row,
+    const YV12_BUFFER_CONFIG *const golden_frame, const int unit_row,
     const int unit_col, const int recon_yoffset, const int recon_uvoffset,
-    const int src_yoffset, const int alt_ref_frame_yoffset,
-    const BLOCK_SIZE fp_block_size, const int this_intra_error,
-    const int raw_motion_err_counts, int *raw_motion_err_list, MV *best_ref_mv,
-    MV *last_mv, FRAME_STATS *stats) {
+    const int src_yoffset, const BLOCK_SIZE fp_block_size,
+    const int this_intra_error, const int raw_motion_err_counts,
+    int *raw_motion_err_list, const MV ref_mv, MV *best_mv,
+    MV *last_non_zero_mv, FRAME_STATS *stats) {
   int this_inter_error = this_intra_error;
   AV1_COMMON *const cm = &cpi->common;
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
@@ -613,7 +690,6 @@
   const int unit_cols = get_unit_cols(fp_block_size, mi_params->mb_cols);
   // Assume 0,0 motion with no mv overhead.
   FULLPEL_MV mv = kZeroFullMv;
-  FULLPEL_MV tmp_mv = kZeroFullMv;
   xd->plane[0].pre[0].buf = last_frame->y_buffer + recon_yoffset;
   // Set up limit values for motion vectors to prevent them extending
   // outside the UMV borders.
@@ -636,16 +712,17 @@
       is_high_bitdepth, bitdepth, bsize, &x->plane[0].src,
       &unscaled_last_source_buf_2d);
   raw_motion_err_list[raw_motion_err_counts] = raw_motion_error;
+  const FIRST_PASS_SPEED_FEATURES *const fp_sf = &cpi->sf.fp_sf;
 
-  // TODO(pengchong): Replace the hard-coded threshold
-  if (raw_motion_error > LOW_MOTION_ERROR_THRESH || cpi->oxcf.speed <= 2) {
+  if (raw_motion_error > fp_sf->skip_motion_search_threshold) {
     // Test last reference frame using the previous best mv as the
     // starting point (best reference) for the search.
-    first_pass_motion_search(cpi, x, best_ref_mv, &mv, &motion_error);
+    first_pass_motion_search(cpi, x, &ref_mv, &mv, &motion_error);
 
     // If the current best reference mv is not centered on 0,0 then do a
     // 0,0 based search as well.
-    if (!is_zero_mv(best_ref_mv)) {
+    if ((fp_sf->skip_zeromv_motion_search == 0) && !is_zero_mv(&ref_mv)) {
+      FULLPEL_MV tmp_mv = kZeroFullMv;
       int tmp_err = INT_MAX;
       first_pass_motion_search(cpi, x, &kZeroMv, &tmp_mv, &tmp_err);
 
@@ -658,6 +735,7 @@
     // Motion search in 2nd reference frame.
     int gf_motion_error = motion_error;
     if ((current_frame->frame_number > 1) && golden_frame != NULL) {
+      FULLPEL_MV tmp_mv = kZeroFullMv;
       // Assume 0,0 motion with no mv overhead.
       xd->plane[0].pre[0].buf = golden_frame->y_buffer + recon_yoffset;
       xd->plane[0].pre[0].stride = golden_frame->y_stride;
@@ -681,48 +759,18 @@
       stats->sr_coded_error += motion_error;
     }
 
-    // Motion search in 3rd reference frame.
-    int alt_motion_error = motion_error;
-    if (alt_ref_frame != NULL) {
-      xd->plane[0].pre[0].buf = alt_ref_frame->y_buffer + alt_ref_frame_yoffset;
-      xd->plane[0].pre[0].stride = alt_ref_frame->y_stride;
-      alt_motion_error =
-          get_prediction_error_bitdepth(is_high_bitdepth, bitdepth, bsize,
-                                        &x->plane[0].src, &xd->plane[0].pre[0]);
-      first_pass_motion_search(cpi, x, &kZeroMv, &tmp_mv, &alt_motion_error);
-    }
-    if (alt_motion_error < motion_error && alt_motion_error < gf_motion_error &&
-        alt_motion_error < this_intra_error) {
-      ++stats->third_ref_count;
-    }
-    // In accumulating a score for the 3rd reference frame take the
-    // best of the motion predicted score and the intra coded error
-    // (just as will be done for) accumulation of "coded_error" for
-    // the last frame.
-    if (alt_ref_frame != NULL) {
-      stats->tr_coded_error += AOMMIN(alt_motion_error, this_intra_error);
-    } else {
-      // TODO(chengchen): I believe logically this should also be changed to
-      // stats->tr_coded_error += AOMMIN(alt_motion_error, this_intra_error).
-      stats->tr_coded_error += motion_error;
-    }
-
     // Reset to last frame as reference buffer.
     xd->plane[0].pre[0].buf = last_frame->y_buffer + recon_yoffset;
     xd->plane[1].pre[0].buf = last_frame->u_buffer + recon_uvoffset;
     xd->plane[2].pre[0].buf = last_frame->v_buffer + recon_uvoffset;
   } else {
     stats->sr_coded_error += motion_error;
-    stats->tr_coded_error += motion_error;
   }
 
   // Start by assuming that intra mode is best.
-  best_ref_mv->row = 0;
-  best_ref_mv->col = 0;
+  *best_mv = kZeroMv;
 
   if (motion_error <= this_intra_error) {
-    aom_clear_system_state();
-
     // Keep a count of cases where the inter and intra were very close
     // and very low. This helps with scene cut detection for example in
     // cropped clips with black bars at the sides or top and bottom.
@@ -737,33 +785,55 @@
           (double)motion_error / DOUBLE_DIVIDE_CHECK((double)this_intra_error);
     }
 
-    const MV best_mv = get_mv_from_fullmv(&mv);
+    *best_mv = get_mv_from_fullmv(&mv);
     this_inter_error = motion_error;
     xd->mi[0]->mode = NEWMV;
-    xd->mi[0]->mv[0].as_mv = best_mv;
+    xd->mi[0]->mv[0].as_mv = *best_mv;
     xd->mi[0]->tx_size = TX_4X4;
     xd->mi[0]->ref_frame[0] = LAST_FRAME;
     xd->mi[0]->ref_frame[1] = NONE_FRAME;
-    av1_enc_build_inter_predictor(cm, xd, unit_row * unit_scale,
-                                  unit_col * unit_scale, NULL, bsize,
-                                  AOM_PLANE_Y, AOM_PLANE_Y);
-    av1_encode_sby_pass1(cpi, x, bsize);
-    stats->sum_mvr += best_mv.row;
-    stats->sum_mvr_abs += abs(best_mv.row);
-    stats->sum_mvc += best_mv.col;
-    stats->sum_mvc_abs += abs(best_mv.col);
-    stats->sum_mvrs += best_mv.row * best_mv.row;
-    stats->sum_mvcs += best_mv.col * best_mv.col;
+
+    if (fp_sf->disable_recon == 0) {
+      av1_enc_build_inter_predictor(cm, xd, unit_row * unit_scale,
+                                    unit_col * unit_scale, NULL, bsize,
+                                    AOM_PLANE_Y, AOM_PLANE_Y);
+      av1_encode_sby_pass1(cpi, x, bsize);
+    }
+    stats->sum_mvr += best_mv->row;
+    stats->sum_mvr_abs += abs(best_mv->row);
+    stats->sum_mvc += best_mv->col;
+    stats->sum_mvc_abs += abs(best_mv->col);
+    stats->sum_mvrs += best_mv->row * best_mv->row;
+    stats->sum_mvcs += best_mv->col * best_mv->col;
     ++stats->inter_count;
 
-    *best_ref_mv = best_mv;
-    accumulate_mv_stats(best_mv, mv, unit_row, unit_col, unit_rows, unit_cols,
-                        last_mv, stats);
+    accumulate_mv_stats(*best_mv, mv, unit_row, unit_col, unit_rows, unit_cols,
+                        last_non_zero_mv, stats);
   }
 
   return this_inter_error;
 }
 
+// Normalize the first pass stats.
+// Error / counters are normalized to each MB.
+// MVs are normalized to the width/height of the frame.
+static void normalize_firstpass_stats(FIRSTPASS_STATS *fps,
+                                      double num_mbs_16x16, double f_w,
+                                      double f_h) {
+  fps->coded_error /= num_mbs_16x16;
+  fps->sr_coded_error /= num_mbs_16x16;
+  fps->intra_error /= num_mbs_16x16;
+  fps->frame_avg_wavelet_energy /= num_mbs_16x16;
+
+  fps->MVr /= f_h;
+  fps->mvr_abs /= f_h;
+  fps->MVc /= f_w;
+  fps->mvc_abs /= f_w;
+  fps->MVrv /= (f_h * f_h);
+  fps->MVcv /= (f_w * f_w);
+  fps->new_mv_count /= num_mbs_16x16;
+}
+
 // Updates the first pass stats of this frame.
 // Input:
 //   cpi: the encoder setting. Only a few params in it will be used.
@@ -784,7 +854,7 @@
                                    const int frame_number,
                                    const int64_t ts_duration,
                                    const BLOCK_SIZE fp_block_size) {
-  TWO_PASS *twopass = &cpi->twopass;
+  TWO_PASS *twopass = &cpi->ppi->twopass;
   AV1_COMMON *const cm = &cpi->common;
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
   FIRSTPASS_STATS *this_frame_stats = twopass->stats_buf_ctx->stats_in_end;
@@ -806,18 +876,19 @@
   fps.frame = frame_number;
   fps.coded_error = (double)(stats->coded_error >> 8) + min_err;
   fps.sr_coded_error = (double)(stats->sr_coded_error >> 8) + min_err;
-  fps.tr_coded_error = (double)(stats->tr_coded_error >> 8) + min_err;
   fps.intra_error = (double)(stats->intra_error >> 8) + min_err;
   fps.frame_avg_wavelet_energy = (double)stats->frame_avg_wavelet_energy;
   fps.count = 1.0;
   fps.pcnt_inter = (double)stats->inter_count / num_mbs;
   fps.pcnt_second_ref = (double)stats->second_ref_count / num_mbs;
-  fps.pcnt_third_ref = (double)stats->third_ref_count / num_mbs;
   fps.pcnt_neutral = (double)stats->neutral_count / num_mbs;
   fps.intra_skip_pct = (double)stats->intra_skip_count / num_mbs;
   fps.inactive_zone_rows = (double)stats->image_data_start_row;
   fps.inactive_zone_cols = (double)0;  // TODO(paulwilkins): fix
   fps.raw_error_stdev = raw_err_stdev;
+  fps.is_flash = 0;
+  fps.noise_var = (double)0;
+  fps.cor_coeff = (double)1.0;
 
   if (stats->mv_count > 0) {
     fps.MVr = (double)stats->sum_mvr / stats->mv_count;
@@ -850,19 +921,25 @@
   // cpi->source_time_stamp.
   fps.duration = (double)ts_duration;
 
+  normalize_firstpass_stats(&fps, num_mbs_16X16, cm->width, cm->height);
+
   // We will store the stats inside the persistent twopass struct (and NOT the
   // local variable 'fps'), and then cpi->output_pkt_list will point to it.
   *this_frame_stats = fps;
-  if (!cpi->lap_enabled)
+  if (!cpi->ppi->lap_enabled) {
     output_stats(this_frame_stats, cpi->ppi->output_pkt_list);
-  if (cpi->twopass.stats_buf_ctx->total_stats != NULL) {
-    av1_accumulate_stats(cpi->twopass.stats_buf_ctx->total_stats, &fps);
+  } else {
+    av1_firstpass_info_push(&twopass->firstpass_info, this_frame_stats);
+  }
+  if (cpi->ppi->twopass.stats_buf_ctx->total_stats != NULL) {
+    av1_accumulate_stats(cpi->ppi->twopass.stats_buf_ctx->total_stats, &fps);
   }
   /*In the case of two pass, first pass uses it as a circular buffer,
    * when LAP is enabled it is used as a linear buffer*/
   twopass->stats_buf_ctx->stats_in_end++;
-  if ((cpi->oxcf.pass == 1) && (twopass->stats_buf_ctx->stats_in_end >=
-                                twopass->stats_buf_ctx->stats_in_buf_end)) {
+  if ((cpi->oxcf.pass == AOM_RC_FIRST_PASS) &&
+      (twopass->stats_buf_ctx->stats_in_end >=
+       twopass->stats_buf_ctx->stats_in_buf_end)) {
     twopass->stats_buf_ctx->stats_in_end =
         twopass->stats_buf_ctx->stats_in_start;
   }
@@ -919,8 +996,6 @@
       stats.sum_mvr += mb_stat.sum_mvr;
       stats.sum_mvr_abs += mb_stat.sum_mvr_abs;
       stats.sum_mvrs += mb_stat.sum_mvrs;
-      stats.third_ref_count += mb_stat.third_ref_count;
-      stats.tr_coded_error += mb_stat.tr_coded_error;
     }
   }
   return stats;
@@ -987,7 +1062,8 @@
   const int num_planes = av1_num_planes(&cpi->common);
   for (int plane = 0; plane < num_planes; plane++) {
     const int subsampling_xy =
-        plane ? cm->seq_params.subsampling_x + cm->seq_params.subsampling_y : 0;
+        plane ? cm->seq_params->subsampling_x + cm->seq_params->subsampling_y
+              : 0;
     const int sb_size = MAX_SB_SQUARE >> subsampling_xy;
     CHECK_MEM_ERROR(
         cm, cpi->td.mb.plane[plane].src_diff,
@@ -1014,8 +1090,7 @@
   MACROBLOCK *const x = &td->mb;
   AV1_COMMON *const cm = &cpi->common;
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
-  CurrentFrame *const current_frame = &cm->current_frame;
-  const SequenceHeader *const seq_params = &cm->seq_params;
+  const SequenceHeader *const seq_params = cm->seq_params;
   const int num_planes = av1_num_planes(cm);
   MACROBLOCKD *const xd = &x->e_mbd;
   TileInfo *tile = &tile_data->tile_info;
@@ -1038,18 +1113,6 @@
       get_ref_frame_yv12_buf(cm, LAST_FRAME);
   const YV12_BUFFER_CONFIG *golden_frame =
       get_ref_frame_yv12_buf(cm, GOLDEN_FRAME);
-  const YV12_BUFFER_CONFIG *alt_ref_frame = NULL;
-  const int alt_ref_offset =
-      FIRST_PASS_ALT_REF_DISTANCE -
-      (current_frame->frame_number % FIRST_PASS_ALT_REF_DISTANCE);
-  if (alt_ref_offset < FIRST_PASS_ALT_REF_DISTANCE) {
-    const struct lookahead_entry *const alt_ref_frame_buffer =
-        av1_lookahead_peek(cpi->ppi->lookahead, alt_ref_offset,
-                           cpi->compressor_stage);
-    if (alt_ref_frame_buffer != NULL) {
-      alt_ref_frame = &alt_ref_frame_buffer->img;
-    }
-  }
   YV12_BUFFER_CONFIG *const this_frame = &cm->cur_frame->buf;
 
   PICK_MODE_CONTEXT *ctx = td->firstpass_ctx;
@@ -1084,11 +1147,6 @@
                     (unit_col_start * fp_block_size_width);
   int recon_uvoffset = (unit_row * recon_uv_stride * uv_mb_height) +
                        (unit_col_start * uv_mb_height);
-  int alt_ref_frame_yoffset =
-      (alt_ref_frame != NULL)
-          ? (unit_row * alt_ref_frame->y_stride * fp_block_size_height) +
-                (unit_col_start * fp_block_size_width)
-          : -1;
 
   // Set up limit values for motion vectors to prevent them extending
   // outside the UMV borders.
@@ -1120,10 +1178,10 @@
 
     if (!frame_is_intra_only(cm)) {
       const int this_inter_error = firstpass_inter_prediction(
-          cpi, td, last_frame, golden_frame, alt_ref_frame, unit_row, unit_col,
-          recon_yoffset, recon_uvoffset, src_yoffset, alt_ref_frame_yoffset,
-          fp_block_size, this_intra_error, raw_motion_err_counts,
-          raw_motion_err_list, &best_ref_mv, &last_mv, mb_stats);
+          cpi, td, last_frame, golden_frame, unit_row, unit_col, recon_yoffset,
+          recon_uvoffset, src_yoffset, fp_block_size, this_intra_error,
+          raw_motion_err_counts, raw_motion_err_list, best_ref_mv, &best_ref_mv,
+          &last_mv, mb_stats);
       if (unit_col_in_tile == 0) {
         *first_top_mv = last_mv;
       }
@@ -1131,7 +1189,6 @@
       ++raw_motion_err_counts;
     } else {
       mb_stats->sr_coded_error += this_intra_error;
-      mb_stats->tr_coded_error += this_intra_error;
       mb_stats->coded_error += this_intra_error;
     }
 
@@ -1143,7 +1200,6 @@
     recon_yoffset += fp_block_size_width;
     src_yoffset += fp_block_size_width;
     recon_uvoffset += uv_mb_height;
-    alt_ref_frame_yoffset += fp_block_size_width;
     mb_stats++;
 
     (*(enc_row_mt->sync_write_ptr))(row_mt_sync, unit_row_in_tile,
@@ -1156,7 +1212,7 @@
   AV1_COMMON *const cm = &cpi->common;
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
   CurrentFrame *const current_frame = &cm->current_frame;
-  const SequenceHeader *const seq_params = &cm->seq_params;
+  const SequenceHeader *const seq_params = cm->seq_params;
   const int num_planes = av1_num_planes(cm);
   MACROBLOCKD *const xd = &x->e_mbd;
   const int qindex = find_fp_qindex(seq_params->bit_depth);
@@ -1165,9 +1221,14 @@
     FeatureFlags *const features = &cm->features;
     av1_set_screen_content_options(cpi, features);
   }
+
+  // Prepare the speed features
+  av1_set_speed_features_framesize_independent(cpi, cpi->oxcf.speed);
+
   // Unit size for the first pass encoding.
   const BLOCK_SIZE fp_block_size =
-      cpi->is_screen_content_type ? BLOCK_8X8 : BLOCK_16X16;
+      get_fp_block_size(cpi->is_screen_content_type);
+
   // Number of rows in the unit size.
   // Note mi_params->mb_rows and mi_params->mb_cols are in the unit of 16x16.
   const int unit_rows = get_unit_rows(fp_block_size, mi_params->mb_rows);
@@ -1203,7 +1264,6 @@
   assert(frame_is_intra_only(cm) || (last_frame != NULL));
 
   av1_setup_frame_size(cpi);
-  aom_clear_system_state();
 
   set_mi_offsets(mi_params, xd, 0, 0);
   xd->mi[0]->bsize = fp_block_size;
@@ -1211,9 +1271,9 @@
   // Do not use periodic key frames.
   cpi->rc.frames_to_key = INT_MAX;
 
-  av1_set_quantizer(cm, cpi->oxcf.q_cfg.qm_minlevel,
-                    cpi->oxcf.q_cfg.qm_maxlevel, qindex,
-                    cpi->oxcf.q_cfg.enable_chroma_deltaq);
+  av1_set_quantizer(
+      cm, cpi->oxcf.q_cfg.qm_minlevel, cpi->oxcf.q_cfg.qm_maxlevel, qindex,
+      cpi->oxcf.q_cfg.enable_chroma_deltaq, cpi->oxcf.q_cfg.enable_hdr_deltaq);
 
   av1_setup_block_planes(xd, seq_params->subsampling_x,
                          seq_params->subsampling_y, num_planes);
@@ -1268,7 +1328,7 @@
                       (stats.image_data_start_row * unit_cols * 2));
   }
 
-  TWO_PASS *twopass = &cpi->twopass;
+  TWO_PASS *twopass = &cpi->ppi->twopass;
   const int num_mbs_16X16 = (cpi->oxcf.resize_cfg.resize_mode != RESIZE_NONE)
                                 ? cpi->initial_mbs
                                 : mi_params->MBs;
@@ -1319,3 +1379,121 @@
 
   ++current_frame->frame_number;
 }
+
+aom_codec_err_t av1_firstpass_info_init(FIRSTPASS_INFO *firstpass_info,
+                                        FIRSTPASS_STATS *ext_stats_buf,
+                                        int ext_stats_buf_size) {
+  assert(IMPLIES(ext_stats_buf == NULL, ext_stats_buf_size == 0));
+  if (ext_stats_buf == NULL) {
+    firstpass_info->stats_buf = firstpass_info->static_stats_buf;
+    firstpass_info->stats_buf_size =
+        sizeof(firstpass_info->static_stats_buf) /
+        sizeof(firstpass_info->static_stats_buf[0]);
+    firstpass_info->start_index = 0;
+    firstpass_info->cur_index = 0;
+    firstpass_info->stats_count = 0;
+    firstpass_info->future_stats_count = 0;
+    firstpass_info->past_stats_count = 0;
+    av1_zero(firstpass_info->total_stats);
+    if (ext_stats_buf_size == 0) {
+      return AOM_CODEC_OK;
+    } else {
+      return AOM_CODEC_ERROR;
+    }
+  } else {
+    firstpass_info->stats_buf = ext_stats_buf;
+    firstpass_info->stats_buf_size = ext_stats_buf_size;
+    firstpass_info->start_index = 0;
+    firstpass_info->cur_index = 0;
+    firstpass_info->stats_count = firstpass_info->stats_buf_size;
+    firstpass_info->future_stats_count = firstpass_info->stats_count;
+    firstpass_info->past_stats_count = 0;
+    av1_zero(firstpass_info->total_stats);
+    for (int i = 0; i < firstpass_info->stats_count; ++i) {
+      av1_accumulate_stats(&firstpass_info->total_stats,
+                           &firstpass_info->stats_buf[i]);
+    }
+  }
+  return AOM_CODEC_OK;
+}
+
+aom_codec_err_t av1_firstpass_info_move_cur_index(
+    FIRSTPASS_INFO *firstpass_info) {
+  assert(firstpass_info->future_stats_count +
+             firstpass_info->past_stats_count ==
+         firstpass_info->stats_count);
+  if (firstpass_info->future_stats_count > 1) {
+    firstpass_info->cur_index =
+        (firstpass_info->cur_index + 1) % firstpass_info->stats_buf_size;
+    --firstpass_info->future_stats_count;
+    ++firstpass_info->past_stats_count;
+    return AOM_CODEC_OK;
+  } else {
+    return AOM_CODEC_ERROR;
+  }
+}
+
+aom_codec_err_t av1_firstpass_info_pop(FIRSTPASS_INFO *firstpass_info) {
+  if (firstpass_info->stats_count > 0 && firstpass_info->past_stats_count > 0) {
+    const int next_start =
+        (firstpass_info->start_index + 1) % firstpass_info->stats_buf_size;
+    firstpass_info->start_index = next_start;
+    --firstpass_info->stats_count;
+    --firstpass_info->past_stats_count;
+    return AOM_CODEC_OK;
+  } else {
+    return AOM_CODEC_ERROR;
+  }
+}
+
+aom_codec_err_t av1_firstpass_info_move_cur_index_and_pop(
+    FIRSTPASS_INFO *firstpass_info) {
+  aom_codec_err_t ret = av1_firstpass_info_move_cur_index(firstpass_info);
+  if (ret != AOM_CODEC_OK) return ret;
+  ret = av1_firstpass_info_pop(firstpass_info);
+  return ret;
+}
+
+aom_codec_err_t av1_firstpass_info_push(FIRSTPASS_INFO *firstpass_info,
+                                        const FIRSTPASS_STATS *input_stats) {
+  if (firstpass_info->stats_count < firstpass_info->stats_buf_size) {
+    const int next_index =
+        (firstpass_info->start_index + firstpass_info->stats_count) %
+        firstpass_info->stats_buf_size;
+    firstpass_info->stats_buf[next_index] = *input_stats;
+    ++firstpass_info->stats_count;
+    ++firstpass_info->future_stats_count;
+    av1_accumulate_stats(&firstpass_info->total_stats, input_stats);
+    return AOM_CODEC_OK;
+  } else {
+    return AOM_CODEC_ERROR;
+  }
+}
+
+const FIRSTPASS_STATS *av1_firstpass_info_peek(
+    const FIRSTPASS_INFO *firstpass_info, int offset_from_cur) {
+  if (offset_from_cur >= -firstpass_info->past_stats_count &&
+      offset_from_cur < firstpass_info->future_stats_count) {
+    const int index = (firstpass_info->cur_index + offset_from_cur) %
+                      firstpass_info->stats_buf_size;
+    return &firstpass_info->stats_buf[index];
+  } else {
+    return NULL;
+  }
+}
+
+int av1_firstpass_info_future_count(const FIRSTPASS_INFO *firstpass_info,
+                                    int offset_from_cur) {
+  if (offset_from_cur < firstpass_info->future_stats_count) {
+    return firstpass_info->future_stats_count - offset_from_cur;
+  }
+  return 0;
+}
+
+int av1_firstpass_info_past_count(const FIRSTPASS_INFO *firstpass_info,
+                                  int offset_from_cur) {
+  if (offset_from_cur >= -firstpass_info->past_stats_count) {
+    return offset_from_cur + firstpass_info->past_stats_count;
+  }
+  return 0;
+}
diff --git a/av1/encoder/firstpass.h b/av1/encoder/firstpass.h
index e3706d9..88f9b27 100644
--- a/av1/encoder/firstpass.h
+++ b/av1/encoder/firstpass.h
@@ -29,9 +29,14 @@
 #define MIN_MV_IN_OUT 0.4
 
 #define VLOW_MOTION_THRESHOLD 950
+struct ThreadData;
 
 /*!
  * \brief The stucture of acummulated frame stats in the first pass.
+ *
+ * Errors (coded_error, intra_error, etc.) and counters (new_mv_count) are
+ * normalized to each MB. MV related stats (MVc, MVr, etc.) are normalized to
+ * the frame width and height. See function normalize_firstpass_stats.
  */
 typedef struct {
   /*!
@@ -62,10 +67,6 @@
    */
   double sr_coded_error;
   /*!
-   * Best of intra pred error and inter pred error using altref frame as ref.
-   */
-  double tr_coded_error;
-  /*!
    * Percentage of blocks with inter pred error < intra pred error.
    */
   double pcnt_inter;
@@ -80,10 +81,6 @@
    */
   double pcnt_second_ref;
   /*!
-   * Percentage of blocks where altref frame was better than intra, last, golden
-   */
-  double pcnt_third_ref;
-  /*!
    * Percentage of blocks where intra and inter prediction errors were very
    * close. Note that this is a 'weighted count', that is, the so blocks may be
    * weighted by how close the two errors were.
@@ -152,23 +149,199 @@
    * standard deviation for (0, 0) motion prediction error
    */
   double raw_error_stdev;
+  /*!
+   * Whether the frame contains a flash
+   */
+  int64_t is_flash;
+  /*!
+   * Estimated noise variance
+   */
+  double noise_var;
+  /*!
+   * Correlation coefficient with the previous frame
+   */
+  double cor_coeff;
 } FIRSTPASS_STATS;
 
-/*!\cond */
+// We want to keep one past stats for key frame detection
+// in test_candidate_kf()
+#define FIRSTPASS_INFO_STATS_PAST_MIN 1
 
+// The size of static buffer used in FIRSTPASS_INFO.
+#define FIRSTPASS_INFO_STATIC_BUF_SIZE \
+  (MAX_LAP_BUFFERS + FIRSTPASS_INFO_STATS_PAST_MIN)
+
+/*!
+ * \brief  Data structure used for managing first pass stats
+ */
+typedef struct {
+  /*!
+   * A static buffer that will be used when no ext_stats_buf is assigned. The
+   * ext_stats_buf is assigned through av1_firstpass_info_init() when the user
+   * already has a pre-existing firstpass stats that is stored in an external
+   * buffer. The ext_stats_buf is usually used in two pass mode. When using one
+   * pass mode, we generate "firstpass" stats and encode the video in the same
+   * pass. In this scenario, the stats will be pushed and popped from
+   * static_stats_buf.
+   */
+  FIRSTPASS_STATS static_stats_buf[FIRSTPASS_INFO_STATIC_BUF_SIZE];
+  /*!
+   * A pointer to first pass stats.
+   * Note that this buffer will be used as ring buffer.
+   */
+  FIRSTPASS_STATS *stats_buf;
+  /*!
+   * size of stats_buf
+   */
+  int stats_buf_size;
+  /*!
+   * start index of the available frame stats
+   * Note that start_index doesn't always point to
+   * current frame's stats because we need to
+   * keep past stats as well. To access current
+   * frame's stats, please use cur_index.
+   */
+  int start_index;
+
+  /*!
+   * count available stats stored in stats_buf
+   * the following condition should stay true
+   * stats_count = future_stats_count + past_stats_count
+   */
+  int stats_count;
+
+  /*!
+   *  index of the current frame's stats
+   */
+  int cur_index;
+
+  /*!
+   * count available future stats including current stats
+   */
+  int future_stats_count;
+
+  /*!
+   * count available past stats EXCLUDING current stats
+   */
+  int past_stats_count;
+
+  /*!
+   * Accumulation of the stats being pushed into firstpass_info
+   */
+  FIRSTPASS_STATS total_stats;
+} FIRSTPASS_INFO;
+
+/*!\brief Init firstpass_info
+ *
+ * If using ext_stats_buf, the buffer needs to stay available during encoding
+ * process.
+ *
+ * \ingroup rate_control
+ * \param[out]   firstpass_info      struct of firstpass_info.
+ * \param[in]    ext_stats_buf       external stats buffer. Pass in NULL if
+ *                                   choosing to use internal static_stats_buf.
+ * \param[in]    ext_stats_buf_size  external stats buffer size. Pass in 0 if
+ *                                   choosing to use internal static_stats_buf.
+ * \return status */
+aom_codec_err_t av1_firstpass_info_init(FIRSTPASS_INFO *firstpass_info,
+                                        FIRSTPASS_STATS *ext_stats_buf,
+                                        int ext_stats_buf_size);
+
+/*!\brief Move cur_index by 1
+ *
+ * \ingroup rate_control
+ * \param[out]   firstpass_info      struct of firstpass_info.
+ * \return status
+ */
+aom_codec_err_t av1_firstpass_info_move_cur_index(
+    FIRSTPASS_INFO *firstpass_info);
+
+/*!\brief Pop a stats from firstpass_info
+ *
+ * \ingroup rate_control
+ * \param[out]   firstpass_info      struct of firstpass_info.
+ * \return status
+ */
+aom_codec_err_t av1_firstpass_info_pop(FIRSTPASS_INFO *firstpass_info);
+
+/*!\brief Move cur_index by 1 and pop a stats from firstpass_info
+ *
+ * \ingroup rate_control
+ * \param[out]   firstpass_info      struct of firstpass_info.
+ * \return status
+ */
+aom_codec_err_t av1_firstpass_info_move_cur_index_and_pop(
+    FIRSTPASS_INFO *firstpass_info);
+
+/*!\brief Push a stats into firstpass_info
+ *
+ * Note that the input stats will be copied into firstpass_info.
+ * \ingroup rate_control
+ * \param[out]  firstpass_info      struct of firstpass_info.
+ * \param[in]   input_stats         input stats
+ * \return status
+ */
+aom_codec_err_t av1_firstpass_info_push(FIRSTPASS_INFO *firstpass_info,
+                                        const FIRSTPASS_STATS *input_stats);
+
+/*!\brief Peek at a stats from firstpass_info
+ *
+ * The target index is as follows.
+ * (cur_index + offset_from_cur) % firstpass_info->stats_buf_size
+ *
+ * \ingroup rate_control
+ * \param[in]  firstpass_info      struct of firstpass_info.
+ * \param[in]  offset_from_cur  index offset from cur_index.
+ * \return pointer to the stats. The pointer will be NULL if
+ *         offset_from_cur is invalid.
+ */
+const FIRSTPASS_STATS *av1_firstpass_info_peek(
+    const FIRSTPASS_INFO *firstpass_info, int offset_from_cur);
+
+/*!\brief Count the future stats from the target in firstpass_info
+ * Note that the target stats will be counted as well.
+ * The target index is as follows.
+ * (cur_index + offset_from_cur) % firstpass_info->stats_buf_size
+ *
+ * \ingroup rate_control
+ * \param[in]  firstpass_info    struct of firstpass_info.
+ * \param[in]  offset_from_cur  target stats's offset
+ *                               from cur_index.
+ * \return Number of stats in the future after the target stats
+ *         including itself.
+ */
+int av1_firstpass_info_future_count(const FIRSTPASS_INFO *firstpass_info,
+                                    int offset_from_cur);
+
+/*!\brief Count the past stats before the target in firstpass_info
+ * Note that the target stats will NOT be counted.
+ * The target index is as follows.
+ * (cur_index + offset_from_cur) % firstpass_info->stats_buf_size
+ *
+ * \ingroup rate_control
+ * \param[in]  firstpass_info    struct of firstpass_info.
+ * \param[in]  offset_from_cur  target stats's index offset
+ *                               from cur_index.
+ * \return Number of stats in the past before the target stats
+ *         excluding itself.
+ */
+int av1_firstpass_info_past_count(const FIRSTPASS_INFO *firstpass_info,
+                                  int offset_from_cur);
+
+/*!\cond */
 #define FC_ANIMATION_THRESH 0.15
 enum {
   FC_NORMAL = 0,
   FC_GRAPHICS_ANIMATION = 1,
   FRAME_CONTENT_TYPES = 2
 } UENUM1BYTE(FRAME_CONTENT_TYPE);
-
 /*!\endcond */
+
 /*!
  * \brief  Data related to the current GF/ARF group and the
  * individual frames within the group
  */
-typedef struct {
+typedef struct GF_GROUP {
   /*!\cond */
   // Frame update type, e.g. ARF/GF/LF/Overlay
   FRAME_UPDATE_TYPE update_type[MAX_STATIC_GF_GROUP_LENGTH];
@@ -181,7 +354,7 @@
   int max_layer_depth;
   int max_layer_depth_allowed;
   // This is currently only populated for AOM_Q mode
-  unsigned char q_val[MAX_STATIC_GF_GROUP_LENGTH];
+  int q_val[MAX_STATIC_GF_GROUP_LENGTH];
   int bit_allocation[MAX_STATIC_GF_GROUP_LENGTH];
   // The frame coding type - inter/intra frame
   FRAME_TYPE frame_type[MAX_STATIC_GF_GROUP_LENGTH];
@@ -189,6 +362,31 @@
   REFBUF_STATE refbuf_state[MAX_STATIC_GF_GROUP_LENGTH];
   int arf_index;  // the index in the gf group of ARF, if no arf, then -1
   int size;       // The total length of a GOP
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  // Indicates the level of parallelism in frame parallel encodes.
+  // 0 : frame is independently encoded (not part of parallel encodes).
+  // 1 : frame is the first in encode order in a given parallel encode set.
+  // 2 : frame occurs later in encode order in a given parallel encode set.
+  int frame_parallel_level[MAX_STATIC_GF_GROUP_LENGTH];
+  // Indicates whether a frame should act as non-reference frame.
+  // 0 : frame is a reference frame.
+  // 1 : frame is a non-reference frame.
+  int is_frame_non_ref[MAX_STATIC_GF_GROUP_LENGTH];
+
+  // The offset into lookahead_ctx for choosing
+  // source of frame parallel encodes.
+  int src_offset[MAX_STATIC_GF_GROUP_LENGTH];
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+  // Stores the display order hint of each frame in the current GF_GROUP.
+  int display_idx[MAX_STATIC_GF_GROUP_LENGTH];
+  // Stores the display order hint of the frames not to be
+  // refreshed by the current frame.
+  int skip_frame_refresh[MAX_STATIC_GF_GROUP_LENGTH][REF_FRAMES];
+  // Stores the display order hint of the frame to be excluded during reference
+  // assignment.
+  int skip_frame_as_ref[MAX_STATIC_GF_GROUP_LENGTH];
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
   /*!\endcond */
 } GF_GROUP;
 /*!\cond */
@@ -219,24 +417,20 @@
   // here.
   FIRSTPASS_STATS *frame_stats_arr[MAX_LAP_BUFFERS + 1];
   int frame_stats_next_idx;  // Index to next unused element in frame_stats_arr.
-  const FIRSTPASS_STATS *stats_in;
   STATS_BUFFER_CTX *stats_buf_ctx;
+  FIRSTPASS_INFO firstpass_info;  // This is the first pass data structure
+                                  // intended to replace stats_in
   int first_pass_done;
   int64_t bits_left;
   double modified_error_min;
   double modified_error_max;
   double modified_error_left;
-  double mb_av_energy;
-  double frame_avg_haar_energy;
-
-  // An indication of the content type of the current frame
-  FRAME_CONTENT_TYPE fr_content_type;
 
   // Projected total bits available for a key frame group of frames
   int64_t kf_group_bits;
 
   // Error score of frames still to be coded in kf group
-  int64_t kf_group_error_left;
+  double kf_group_error_left;
 
   // Over time correction for bits per macro block estimation
   double bpm_factor;
@@ -255,6 +449,21 @@
   /*!\endcond */
 } TWO_PASS;
 
+/*!
+ * \brief Frame level Two pass status and control data.
+ */
+typedef struct {
+  /*!\cond */
+  const FIRSTPASS_STATS *stats_in;
+  // Pointer to the stats of the current frame.
+  const FIRSTPASS_STATS *this_frame;
+  double mb_av_energy;
+  // An indication of the content type of the current frame
+  FRAME_CONTENT_TYPE fr_content_type;
+  double frame_avg_haar_energy;
+  /*!\endcond */
+} TWO_PASS_FRAME;
+
 /*!\cond */
 
 // This structure contains several key parameters to be accumulated for this
@@ -268,8 +477,6 @@
   int64_t coded_error;
   // Best of intra pred error and inter pred error using golden frame as ref.
   int64_t sr_coded_error;
-  // Best of intra pred error and inter pred error using altref frame as ref.
-  int64_t tr_coded_error;
   // Count of motion vector.
   int mv_count;
   // Count of blocks that pick inter prediction (inter pred error is smaller
@@ -277,8 +484,6 @@
   int inter_count;
   // Count of blocks that pick second ref (golden frame).
   int second_ref_count;
-  // Count of blocks that pick third ref (altref frame).
-  int third_ref_count;
   // Count of blocks where the inter and intra are very close and very low.
   double neutral_count;
   // Count of blocks where intra error is very small.
@@ -325,6 +530,16 @@
 struct AV1EncoderConfig;
 struct TileDataEnc;
 
+static INLINE int is_fp_wavelet_energy_invalid(
+    const FIRSTPASS_STATS *fp_stats) {
+  assert(fp_stats != NULL);
+  return (fp_stats->frame_avg_wavelet_energy < 0);
+}
+
+static INLINE BLOCK_SIZE get_fp_block_size(int is_screen_content_type) {
+  return (is_screen_content_type ? BLOCK_8X8 : BLOCK_16X16);
+}
+
 int av1_get_unit_rows_in_tile(TileInfo tile, const BLOCK_SIZE fp_block_size);
 int av1_get_unit_cols_in_tile(TileInfo tile, const BLOCK_SIZE fp_block_size);
 
diff --git a/av1/encoder/global_motion_facade.c b/av1/encoder/global_motion_facade.c
index 3a7af51..f91ef04 100644
--- a/av1/encoder/global_motion_facade.c
+++ b/av1/encoder/global_motion_facade.c
@@ -10,7 +10,6 @@
  */
 
 #include "aom_dsp/binary_codes_writer.h"
-#include "aom_ports/system_state.h"
 
 #include "av1/encoder/corner_detect.h"
 #include "av1/encoder/encoder.h"
@@ -102,16 +101,14 @@
   assert(ref_buf[frame] != NULL);
   TransformationType model;
 
-  aom_clear_system_state();
-
   // TODO(sarahparker, debargha): Explore do_adaptive_gm_estimation = 1
   const int do_adaptive_gm_estimation = 0;
 
   const int ref_frame_dist = get_relative_dist(
-      &cm->seq_params.order_hint_info, cm->current_frame.order_hint,
+      &cm->seq_params->order_hint_info, cm->current_frame.order_hint,
       cm->cur_frame->ref_order_hints[frame - LAST_FRAME]);
   const GlobalMotionEstimationType gm_estimation_type =
-      cm->seq_params.order_hint_info.enable_order_hint &&
+      cm->seq_params->order_hint_info.enable_order_hint &&
               abs(ref_frame_dist) <= 2 && do_adaptive_gm_estimation
           ? GLOBAL_MOTION_DISFLOW_BASED
           : GLOBAL_MOTION_FEATURE_BASED;
@@ -126,7 +123,7 @@
 
     av1_compute_global_motion(model, src_buffer, src_width, src_height,
                               src_stride, src_corners, num_src_corners,
-                              ref_buf[frame], cpi->common.seq_params.bit_depth,
+                              ref_buf[frame], cpi->common.seq_params->bit_depth,
                               gm_estimation_type, inliers_by_motion,
                               params_by_motion, RANSAC_NUM_MOTIONS);
     int64_t ref_frame_error = 0;
@@ -197,8 +194,6 @@
 
     if (cm->global_motion[frame].wmtype != IDENTITY) break;
   }
-
-  aom_clear_system_state();
 }
 
 // Computes global motion for the given reference frame.
@@ -284,7 +279,7 @@
   AV1_COMMON *const cm = &cpi->common;
   int *num_past_ref_frames = &num_ref_frames[0];
   int *num_future_ref_frames = &num_ref_frames[1];
-  const GF_GROUP *gf_group = &cpi->gf_group;
+  const GF_GROUP *gf_group = &cpi->ppi->gf_group;
   int ref_pruning_enabled = is_frame_eligible_for_ref_pruning(
       gf_group, cpi->sf.inter_sf.selective_ref_frame, 1, cpi->gf_frame_index);
 
@@ -368,7 +363,7 @@
     // The source buffer is 16-bit, so we need to convert to 8 bits for the
     // following code. We cache the result until the source frame is released.
     gm_info->src_buffer =
-        av1_downconvert_frame(source, cpi->common.seq_params.bit_depth);
+        av1_downconvert_frame(source, cpi->common.seq_params->bit_depth);
   }
 
   gm_info->segment_map_w =
diff --git a/av1/encoder/gop_structure.c b/av1/encoder/gop_structure.c
index 0e4968a..2e53c70 100644
--- a/av1/encoder/gop_structure.c
+++ b/av1/encoder/gop_structure.c
@@ -18,20 +18,432 @@
 #include "aom/aom_codec.h"
 #include "aom/aom_encoder.h"
 
-#include "aom_ports/system_state.h"
-
 #include "av1/common/av1_common_int.h"
 
 #include "av1/encoder/encoder.h"
 #include "av1/encoder/firstpass.h"
 #include "av1/encoder/gop_structure.h"
 
+#if CONFIG_FRAME_PARALLEL_ENCODE
+// This function sets gf_group->frame_parallel_level for LF_UPDATE frames based
+// on the value of parallel_frame_count.
+static void set_frame_parallel_level(int *frame_parallel_level,
+                                     int *parallel_frame_count,
+                                     int max_parallel_frames) {
+  assert(*parallel_frame_count > 0);
+  // parallel_frame_count > 1 indicates subsequent frame(s) in the current
+  // parallel encode set.
+  *frame_parallel_level = 1 + (*parallel_frame_count > 1);
+  // Update the count of no. of parallel frames.
+  (*parallel_frame_count)++;
+  if (*parallel_frame_count > max_parallel_frames) *parallel_frame_count = 1;
+}
+
+// This function sets gf_group->src_offset based on frame_parallel_level.
+// Outputs are gf_group->src_offset and first_frame_index
+static void set_src_offset(GF_GROUP *const gf_group, int *first_frame_index,
+                           int cur_frame_idx, int frame_ind) {
+  if (gf_group->frame_parallel_level[frame_ind] > 0) {
+    if (gf_group->frame_parallel_level[frame_ind] == 1) {
+      *first_frame_index = cur_frame_idx;
+    }
+
+    // Obtain the offset of the frame at frame_ind in the lookahead queue by
+    // subtracting the display order hints of the current frame from the display
+    // order hint of the first frame in parallel encoding set (at
+    // first_frame_index).
+    gf_group->src_offset[frame_ind] =
+        (cur_frame_idx + gf_group->arf_src_offset[frame_ind]) -
+        *first_frame_index;
+  }
+}
+
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+// Sets the GF_GROUP params for LF_UPDATE frames.
+static AOM_INLINE void set_params_for_leaf_frames(
+    const TWO_PASS *twopass, const TWO_PASS_FRAME *twopass_frame,
+    const PRIMARY_RATE_CONTROL *p_rc, FRAME_INFO *frame_info,
+    GF_GROUP *const gf_group, int *cur_frame_idx, int *frame_ind,
+    int *parallel_frame_count, int max_parallel_frames,
+    int do_frame_parallel_encode, int *first_frame_index, int *cur_disp_index,
+    int layer_depth, int start, int end) {
+  gf_group->update_type[*frame_ind] = LF_UPDATE;
+  gf_group->arf_src_offset[*frame_ind] = 0;
+  gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx;
+  gf_group->layer_depth[*frame_ind] = MAX_ARF_LAYERS;
+  gf_group->frame_type[*frame_ind] = INTER_FRAME;
+  gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE;
+  gf_group->max_layer_depth = AOMMAX(gf_group->max_layer_depth, layer_depth);
+  gf_group->display_idx[*frame_ind] = (*cur_disp_index);
+  gf_group->arf_boost[*frame_ind] =
+      av1_calc_arf_boost(twopass, twopass_frame, p_rc, frame_info, start,
+                         end - start, 0, NULL, NULL, 0);
+  ++(*cur_disp_index);
+
+  // Set the level of parallelism for the LF_UPDATE frame.
+  if (do_frame_parallel_encode) {
+    set_frame_parallel_level(&gf_group->frame_parallel_level[*frame_ind],
+                             parallel_frame_count, max_parallel_frames);
+    // Set LF_UPDATE frames as non-reference frames.
+    gf_group->is_frame_non_ref[*frame_ind] = 1;
+  }
+  set_src_offset(gf_group, first_frame_index, *cur_frame_idx, *frame_ind);
+
+  ++(*frame_ind);
+  ++(*cur_frame_idx);
+}
+
+// Sets the GF_GROUP params for INTNL_OVERLAY_UPDATE frames.
+static AOM_INLINE void set_params_for_intnl_overlay_frames(
+    GF_GROUP *const gf_group, int *cur_frame_idx, int *frame_ind,
+    int *first_frame_index, int *cur_disp_index, int layer_depth) {
+  gf_group->update_type[*frame_ind] = INTNL_OVERLAY_UPDATE;
+  gf_group->arf_src_offset[*frame_ind] = 0;
+  gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx;
+  gf_group->layer_depth[*frame_ind] = layer_depth;
+  gf_group->frame_type[*frame_ind] = INTER_FRAME;
+  gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE;
+  gf_group->display_idx[*frame_ind] = (*cur_disp_index);
+  ++(*cur_disp_index);
+
+  set_src_offset(gf_group, first_frame_index, *cur_frame_idx, *frame_ind);
+  ++(*frame_ind);
+  ++(*cur_frame_idx);
+}
+
+// Sets the GF_GROUP params for INTNL_ARF_UPDATE frames.
+static AOM_INLINE void set_params_for_internal_arfs(
+    const TWO_PASS *twopass, const TWO_PASS_FRAME *twopass_frame,
+    const PRIMARY_RATE_CONTROL *p_rc, FRAME_INFO *frame_info,
+    GF_GROUP *const gf_group, int *cur_frame_idx, int *frame_ind,
+    int *parallel_frame_count, int max_parallel_frames,
+    int do_frame_parallel_encode, int *first_frame_index, int depth_thr,
+    int *cur_disp_idx, int layer_depth, int arf_src_offset, int offset,
+    int f_frames, int b_frames) {
+  gf_group->update_type[*frame_ind] = INTNL_ARF_UPDATE;
+  gf_group->arf_src_offset[*frame_ind] = arf_src_offset;
+  gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx;
+  gf_group->layer_depth[*frame_ind] = layer_depth;
+  gf_group->frame_type[*frame_ind] = INTER_FRAME;
+  gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE;
+  gf_group->display_idx[*frame_ind] =
+      (*cur_disp_idx) + gf_group->arf_src_offset[*frame_ind];
+  gf_group->arf_boost[*frame_ind] =
+      av1_calc_arf_boost(twopass, twopass_frame, p_rc, frame_info, offset,
+                         f_frames, b_frames, NULL, NULL, 0);
+
+  if (do_frame_parallel_encode) {
+    if (depth_thr != INT_MAX) {
+      assert(depth_thr == 3 || depth_thr == 4);
+      assert(IMPLIES(depth_thr == 3, layer_depth == 4));
+      assert(IMPLIES(depth_thr == 4, layer_depth == 5));
+      // Set frame_parallel_level of the first frame in the given layer to 1.
+      if (gf_group->layer_depth[(*frame_ind) - 1] != layer_depth) {
+        gf_group->frame_parallel_level[*frame_ind] = 1;
+      } else {
+        // Set frame_parallel_level of the consecutive frame in the same given
+        // layer to 2.
+        assert(gf_group->frame_parallel_level[(*frame_ind) - 1] == 1);
+        gf_group->frame_parallel_level[*frame_ind] = 2;
+        // Store the display order hints of the past 2 INTNL_ARF_UPDATE
+        // frames which would not have been displayed at the time of the encode
+        // of current frame.
+        gf_group->skip_frame_refresh[*frame_ind][0] =
+            gf_group->display_idx[(*frame_ind) - 1];
+        gf_group->skip_frame_refresh[*frame_ind][1] =
+            gf_group->display_idx[(*frame_ind) - 2];
+        // Set the display_idx of frame_parallel_level 1 frame in
+        // gf_group->skip_frame_as_ref.
+        gf_group->skip_frame_as_ref[*frame_ind] =
+            gf_group->display_idx[(*frame_ind) - 1];
+      }
+    }
+    // If max_parallel_frames is not exceeded and if the frame will not be
+    // temporally filtered, encode the next internal ARF frame in parallel.
+    if (*parallel_frame_count > 1 &&
+        *parallel_frame_count <= max_parallel_frames) {
+      if (gf_group->arf_src_offset[*frame_ind] < TF_LOOKAHEAD_IDX_THR)
+        gf_group->frame_parallel_level[*frame_ind] = 2;
+      *parallel_frame_count = 1;
+    }
+  }
+  set_src_offset(gf_group, first_frame_index, *cur_frame_idx, *frame_ind);
+  ++(*frame_ind);
+}
+
 // Set parameters for frames between 'start' and 'end' (excluding both).
-static void set_multi_layer_params(const TWO_PASS *twopass,
-                                   GF_GROUP *const gf_group, RATE_CONTROL *rc,
-                                   FRAME_INFO *frame_info, int start, int end,
-                                   int *cur_frame_idx, int *frame_ind,
-                                   int layer_depth) {
+static void set_multi_layer_params_for_fp(
+    const TWO_PASS *twopass, const TWO_PASS_FRAME *twopass_frame,
+    GF_GROUP *const gf_group, const PRIMARY_RATE_CONTROL *p_rc,
+    RATE_CONTROL *rc, FRAME_INFO *frame_info, int start, int end,
+    int *cur_frame_idx, int *frame_ind, int *parallel_frame_count,
+    int max_parallel_frames, int do_frame_parallel_encode,
+    int *first_frame_index, int depth_thr, int *cur_disp_idx, int layer_depth) {
+  const int num_frames_to_process = end - start;
+
+  // Either we are at the last level of the pyramid, or we don't have enough
+  // frames between 'l' and 'r' to create one more level.
+  if (layer_depth > gf_group->max_layer_depth_allowed ||
+      num_frames_to_process < 3) {
+    // Leaf nodes.
+    while (start < end) {
+      set_params_for_leaf_frames(twopass, twopass_frame, p_rc, frame_info,
+                                 gf_group, cur_frame_idx, frame_ind,
+                                 parallel_frame_count, max_parallel_frames,
+                                 do_frame_parallel_encode, first_frame_index,
+                                 cur_disp_idx, layer_depth, start, end);
+      ++start;
+    }
+  } else {
+    const int m = (start + end - 1) / 2;
+
+    // Internal ARF.
+    int arf_src_offset = m - start;
+    set_params_for_internal_arfs(
+        twopass, twopass_frame, p_rc, frame_info, gf_group, cur_frame_idx,
+        frame_ind, parallel_frame_count, max_parallel_frames,
+        do_frame_parallel_encode, first_frame_index, INT_MAX, cur_disp_idx,
+        layer_depth, arf_src_offset, m, end - m, m - start);
+
+    // If encode reordering is enabled, configure the multi-layers accordingly
+    // and return. For e.g., the encode order for gf-interval 16 after
+    // reordering would be 0-> 16-> 8-> 4-> 2-> 6-> 1-> 3-> 5-> 7-> 12-> 10->
+    // 14-> 9-> 11-> 13-> 15.
+    if (layer_depth >= depth_thr) {
+      int m1 = (m + start - 1) / 2;
+      int m2 = (m + 1 + end) / 2;
+      int arf_src_offsets[2] = { m1 - start, m2 - start };
+      // Parameters to compute arf_boost.
+      int offset[2] = { m1, m2 };
+      int f_frames[2] = { m - m1, end - m2 };
+      int b_frames[2] = { m1 - start, m2 - (m + 1) };
+
+      // Set GF_GROUP params for INTNL_ARF_UPDATE frames which are reordered.
+      for (int i = 0; i < 2; i++) {
+        set_params_for_internal_arfs(
+            twopass, twopass_frame, p_rc, frame_info, gf_group, cur_frame_idx,
+            frame_ind, parallel_frame_count, max_parallel_frames,
+            do_frame_parallel_encode, first_frame_index, depth_thr,
+            cur_disp_idx, layer_depth + 1, arf_src_offsets[i], offset[i],
+            f_frames[i], b_frames[i]);
+      }
+
+      // Initialize the start and end indices to configure LF_UPDATE frames.
+      int start_idx[4] = { start, m1 + 1, m + 1, end - 1 };
+      int end_idx[4] = { m1, m, m2, end };
+      int layer_depth_for_intnl_overlay[4] = { layer_depth + 1, layer_depth,
+                                               layer_depth + 1, INVALID_IDX };
+
+      // Set GF_GROUP params for the rest of LF_UPDATE and INTNL_OVERLAY_UPDATE
+      // frames after reordering.
+      for (int i = 0; i < 4; i++) {
+        set_multi_layer_params_for_fp(
+            twopass, twopass_frame, gf_group, p_rc, rc, frame_info,
+            start_idx[i], end_idx[i], cur_frame_idx, frame_ind,
+            parallel_frame_count, max_parallel_frames, do_frame_parallel_encode,
+            first_frame_index, depth_thr, cur_disp_idx, layer_depth + 2);
+        if (layer_depth_for_intnl_overlay[i] != INVALID_IDX)
+          set_params_for_intnl_overlay_frames(
+              gf_group, cur_frame_idx, frame_ind, first_frame_index,
+              cur_disp_idx, layer_depth_for_intnl_overlay[i]);
+      }
+      return;
+    }
+
+    // Frames displayed before this internal ARF.
+    set_multi_layer_params_for_fp(
+        twopass, twopass_frame, gf_group, p_rc, rc, frame_info, start, m,
+        cur_frame_idx, frame_ind, parallel_frame_count, max_parallel_frames,
+        do_frame_parallel_encode, first_frame_index, depth_thr, cur_disp_idx,
+        layer_depth + 1);
+
+    // Overlay for internal ARF.
+    set_params_for_intnl_overlay_frames(gf_group, cur_frame_idx, frame_ind,
+                                        first_frame_index, cur_disp_idx,
+                                        layer_depth);
+
+    // Frames displayed after this internal ARF.
+    set_multi_layer_params_for_fp(
+        twopass, twopass_frame, gf_group, p_rc, rc, frame_info, m + 1, end,
+        cur_frame_idx, frame_ind, parallel_frame_count, max_parallel_frames,
+        do_frame_parallel_encode, first_frame_index, depth_thr, cur_disp_idx,
+        layer_depth + 1);
+  }
+}
+
+// Structure for bookkeeping start, end and display indices to configure
+// INTNL_ARF_UPDATE frames.
+typedef struct {
+  int start;
+  int end;
+  int display_index;
+} FRAME_REORDER_INFO;
+
+// Updates the stats required to configure the GF_GROUP.
+static AOM_INLINE void fill_arf_frame_stats(FRAME_REORDER_INFO *arf_frame_stats,
+                                            int arf_frame_index,
+                                            int display_idx, int start,
+                                            int end) {
+  arf_frame_stats[arf_frame_index].start = start;
+  arf_frame_stats[arf_frame_index].end = end;
+  arf_frame_stats[arf_frame_index].display_index = display_idx;
+}
+
+// Sets GF_GROUP params for INTNL_ARF_UPDATE frames. Also populates
+// doh_gf_index_map and arf_frame_stats.
+static AOM_INLINE void set_params_for_internal_arfs_in_gf14(
+    GF_GROUP *const gf_group, FRAME_REORDER_INFO *arf_frame_stats,
+    int *cur_frame_idx, int *cur_disp_idx, int *frame_ind,
+    int *count_arf_frames, int *doh_gf_index_map, int start, int end,
+    int layer_depth, int layer_with_parallel_encodes) {
+  int index = (start + end - 1) / 2;
+  gf_group->update_type[*frame_ind] = INTNL_ARF_UPDATE;
+  gf_group->arf_src_offset[*frame_ind] = index - 1;
+  gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx;
+  gf_group->layer_depth[*frame_ind] = layer_depth;
+  gf_group->frame_type[*frame_ind] = INTER_FRAME;
+  gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE;
+  gf_group->display_idx[*frame_ind] =
+      (*cur_disp_idx) + gf_group->arf_src_offset[*frame_ind];
+
+  // Update the display index of the current frame with its gf index.
+  doh_gf_index_map[index] = *frame_ind;
+  if (layer_with_parallel_encodes) {
+    assert(layer_depth == 4);
+    // Set frame_parallel_level of the first frame in the given layer depth
+    // to 1.
+    if (gf_group->layer_depth[(*frame_ind) - 1] != layer_depth) {
+      gf_group->frame_parallel_level[*frame_ind] = 1;
+    } else {
+      // Set frame_parallel_level of the consecutive frame in the same given
+      // layer depth to 2.
+      assert(gf_group->frame_parallel_level[(*frame_ind) - 1] == 1);
+      gf_group->frame_parallel_level[*frame_ind] = 2;
+      // Set the display_idx of frame_parallel_level 1 frame in
+      // gf_group->skip_frame_as_ref.
+      gf_group->skip_frame_as_ref[*frame_ind] =
+          gf_group->display_idx[(*frame_ind) - 1];
+    }
+  }
+  ++(*frame_ind);
+
+  // Update arf_frame_stats.
+  fill_arf_frame_stats(arf_frame_stats, *count_arf_frames, index, start, end);
+  ++(*count_arf_frames);
+}
+
+// Sets GF_GROUP params for all INTNL_ARF_UPDATE frames in the given layer
+// depth.
+static AOM_INLINE void set_params_for_cur_layer_frames(
+    GF_GROUP *const gf_group, FRAME_REORDER_INFO *arf_frame_stats,
+    int *cur_frame_idx, int *cur_disp_idx, int *frame_ind,
+    int *count_arf_frames, int *doh_gf_index_map, int num_dir, int node_start,
+    int node_end, int layer_depth) {
+  assert(num_dir < 3);
+  int start, end;
+  // Iterate through the nodes in the previous layer depth.
+  for (int i = node_start; i < node_end; i++) {
+    // For each node, check if a frame can be coded as INTNL_ARF_UPDATE frame on
+    // either direction.
+    for (int dir = 0; dir < num_dir; dir++) {
+      // Checks for a frame to the left of current node.
+      if (dir == 0) {
+        start = arf_frame_stats[i].start;
+        end = arf_frame_stats[i].display_index;
+      } else {
+        // Checks for a frame to the right of current node.
+        start = arf_frame_stats[i].display_index + 1;
+        end = arf_frame_stats[i].end;
+      }
+      const int num_frames_to_process = end - start;
+      // Checks if a frame can be coded as INTNL_ARF_UPDATE frame. If
+      // num_frames_to_process is less than 3, then there are not enough frames
+      // between 'start' and 'end' to create another level.
+      if (num_frames_to_process >= 3) {
+        // Flag to indicate the lower layer depths for which parallel encoding
+        // is enabled. Currently enabled for layer 4 frames.
+        int layer_with_parallel_encodes = layer_depth == 4;
+        set_params_for_internal_arfs_in_gf14(
+            gf_group, arf_frame_stats, cur_frame_idx, cur_disp_idx, frame_ind,
+            count_arf_frames, doh_gf_index_map, start, end, layer_depth,
+            layer_with_parallel_encodes);
+      }
+    }
+  }
+}
+
+// Configures multi-layers of the GF_GROUP when consecutive encode of frames in
+// the same layer depth is enabled.
+static AOM_INLINE void set_multi_layer_params_for_gf14(
+    const TWO_PASS *twopass, const TWO_PASS_FRAME *twopass_frame,
+    const PRIMARY_RATE_CONTROL *p_rc, FRAME_INFO *frame_info,
+    GF_GROUP *const gf_group, FRAME_REORDER_INFO *arf_frame_stats,
+    int *cur_frame_idx, int *frame_ind, int *count_arf_frames,
+    int *doh_gf_index_map, int *parallel_frame_count, int *first_frame_index,
+    int *cur_disp_index, int gf_interval, int layer_depth,
+    int max_parallel_frames) {
+  assert(layer_depth == 2);
+  assert(gf_group->max_layer_depth_allowed >= 4);
+  int layer, node_start, node_end = 0;
+  // Maximum layer depth excluding LF_UPDATE frames is 4 since applicable only
+  // for gf-interval 14.
+  const int max_layer_depth = 4;
+  // Iterate through each layer depth starting from 2 till 'max_layer_depth'.
+  for (layer = layer_depth; layer <= max_layer_depth; layer++) {
+    // 'node_start' and 'node_end' indicate the number of nodes from the
+    // previous layer depth to be considered. It also corresponds to the indices
+    // of arf_frame_stats.
+    node_start = node_end;
+    node_end = (*count_arf_frames);
+    // 'num_dir' indicates the number of directions to traverse w.r.t. a given
+    // node in order to choose an INTNL_ARF_UPDATE frame. Layer depth 2 would
+    // have only one frame and hence needs to traverse only in the left
+    // direction w.r.t the node in the previous layer.
+    int num_dir = layer == 2 ? 1 : 2;
+    set_params_for_cur_layer_frames(gf_group, arf_frame_stats, cur_frame_idx,
+                                    cur_disp_index, frame_ind, count_arf_frames,
+                                    doh_gf_index_map, num_dir, node_start,
+                                    node_end, layer);
+  }
+
+  for (int i = 1; i < gf_interval; i++) {
+    // Since doh_gf_index_map is already populated for all INTNL_ARF_UPDATE
+    // frames in the GF_GROUP, any frame with INVALID_IDX would correspond to an
+    // LF_UPDATE frame.
+    if (doh_gf_index_map[i] == INVALID_IDX) {
+      // LF_UPDATE frames.
+      // TODO(Remya): Correct start and end parameters passed to
+      // set_params_for_leaf_frames() once encode reordering for gf-interval 14
+// is enabled for parallel encode of lower layer frames.
+      set_params_for_leaf_frames(
+          twopass, twopass_frame, p_rc, frame_info, gf_group, cur_frame_idx,
+          frame_ind, parallel_frame_count, max_parallel_frames, 1,
+          first_frame_index, cur_disp_index, layer, 0, 0);
+    } else {
+      // In order to obtain the layer depths of INTNL_OVERLAY_UPDATE frames, get
+      // the gf index of corresponding INTNL_ARF_UPDATE frames.
+      int intnl_arf_index = doh_gf_index_map[i];
+      int ld = gf_group->layer_depth[intnl_arf_index];
+      set_params_for_intnl_overlay_frames(gf_group, cur_frame_idx, frame_ind,
+                                          first_frame_index, cur_disp_index,
+                                          ld);
+    }
+  }
+}
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+
+// Set parameters for frames between 'start' and 'end' (excluding both).
+static void set_multi_layer_params(
+    const TWO_PASS *twopass, const TWO_PASS_FRAME *twopass_frame,
+    GF_GROUP *const gf_group, const PRIMARY_RATE_CONTROL *p_rc,
+    RATE_CONTROL *rc, FRAME_INFO *frame_info, int start, int end,
+    int *cur_frame_idx, int *frame_ind,
+#if CONFIG_FRAME_PARALLEL_ENCODE
+    int *parallel_frame_count, int max_parallel_frames,
+    int do_frame_parallel_encode, int *first_frame_index,
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+    int layer_depth) {
   const int num_frames_to_process = end - start;
 
   // Either we are at the last level of the pyramid, or we don't have enough
@@ -44,12 +456,23 @@
       gf_group->arf_src_offset[*frame_ind] = 0;
       gf_group->cur_frame_idx[*frame_ind] = *cur_frame_idx;
       gf_group->layer_depth[*frame_ind] = MAX_ARF_LAYERS;
-      gf_group->arf_boost[*frame_ind] = av1_calc_arf_boost(
-          twopass, rc, frame_info, start, end - start, 0, NULL, NULL);
+      gf_group->arf_boost[*frame_ind] =
+          av1_calc_arf_boost(twopass, twopass_frame, p_rc, frame_info, start,
+                             end - start, 0, NULL, NULL, 0);
       gf_group->frame_type[*frame_ind] = INTER_FRAME;
       gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE;
       gf_group->max_layer_depth =
           AOMMAX(gf_group->max_layer_depth, layer_depth);
+#if CONFIG_FRAME_PARALLEL_ENCODE
+      // Set the level of parallelism for the LF_UPDATE frame.
+      if (do_frame_parallel_encode) {
+        set_frame_parallel_level(&gf_group->frame_parallel_level[*frame_ind],
+                                 parallel_frame_count, max_parallel_frames);
+        // Set LF_UPDATE frames as non-reference frames.
+        gf_group->is_frame_non_ref[*frame_ind] = 1;
+      }
+      set_src_offset(gf_group, first_frame_index, *cur_frame_idx, *frame_ind);
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
       ++(*frame_ind);
       ++(*cur_frame_idx);
       ++start;
@@ -65,14 +488,34 @@
     gf_group->frame_type[*frame_ind] = INTER_FRAME;
     gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE;
 
+#if CONFIG_FRAME_PARALLEL_ENCODE
+    if (do_frame_parallel_encode) {
+      // If max_parallel_frames is not exceeded and if the frame will not be
+      // temporally filtered, encode the next internal ARF frame in parallel.
+      if (*parallel_frame_count > 1 &&
+          *parallel_frame_count <= max_parallel_frames) {
+        if (gf_group->arf_src_offset[*frame_ind] < TF_LOOKAHEAD_IDX_THR)
+          gf_group->frame_parallel_level[*frame_ind] = 2;
+        *parallel_frame_count = 1;
+      }
+    }
+    set_src_offset(gf_group, first_frame_index, *cur_frame_idx, *frame_ind);
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+
     // Get the boost factor for intermediate ARF frames.
-    gf_group->arf_boost[*frame_ind] = av1_calc_arf_boost(
-        twopass, rc, frame_info, m, end - m, m - start, NULL, NULL);
+    gf_group->arf_boost[*frame_ind] =
+        av1_calc_arf_boost(twopass, twopass_frame, p_rc, frame_info, m, end - m,
+                           m - start, NULL, NULL, 0);
     ++(*frame_ind);
 
     // Frames displayed before this internal ARF.
-    set_multi_layer_params(twopass, gf_group, rc, frame_info, start, m,
-                           cur_frame_idx, frame_ind, layer_depth + 1);
+    set_multi_layer_params(twopass, twopass_frame, gf_group, p_rc, rc,
+                           frame_info, start, m, cur_frame_idx, frame_ind,
+#if CONFIG_FRAME_PARALLEL_ENCODE
+                           parallel_frame_count, max_parallel_frames,
+                           do_frame_parallel_encode, first_frame_index,
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+                           layer_depth + 1);
 
     // Overlay for internal ARF.
     gf_group->update_type[*frame_ind] = INTNL_OVERLAY_UPDATE;
@@ -82,12 +525,21 @@
     gf_group->layer_depth[*frame_ind] = layer_depth;
     gf_group->frame_type[*frame_ind] = INTER_FRAME;
     gf_group->refbuf_state[*frame_ind] = REFBUF_UPDATE;
+
+#if CONFIG_FRAME_PARALLEL_ENCODE
+    set_src_offset(gf_group, first_frame_index, *cur_frame_idx, *frame_ind);
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
     ++(*frame_ind);
     ++(*cur_frame_idx);
 
     // Frames displayed after this internal ARF.
-    set_multi_layer_params(twopass, gf_group, rc, frame_info, m + 1, end,
-                           cur_frame_idx, frame_ind, layer_depth + 1);
+    set_multi_layer_params(twopass, twopass_frame, gf_group, p_rc, rc,
+                           frame_info, m + 1, end, cur_frame_idx, frame_ind,
+#if CONFIG_FRAME_PARALLEL_ENCODE
+                           parallel_frame_count, max_parallel_frames,
+                           do_frame_parallel_encode, first_frame_index,
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+                           layer_depth + 1);
   }
 }
 
@@ -95,71 +547,220 @@
     AV1_COMP *cpi, TWO_PASS *twopass, GF_GROUP *const gf_group,
     RATE_CONTROL *rc, FRAME_INFO *const frame_info, int gf_interval,
     FRAME_UPDATE_TYPE first_frame_update_type) {
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
   int frame_index = 0;
   int cur_frame_index = 0;
 
-  // Keyframe / Overlay frame / Golden frame.
-  assert(first_frame_update_type == KF_UPDATE ||
-         first_frame_update_type == OVERLAY_UPDATE ||
-         first_frame_update_type == GF_UPDATE);
+#if CONFIG_FRAME_PARALLEL_ENCODE
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+  // Set the display order hint for the first frame in the GF_GROUP.
+  int cur_disp_index = (first_frame_update_type == KF_UPDATE)
+                           ? 0
+                           : cpi->common.current_frame.frame_number;
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
 
-  if (first_frame_update_type == KF_UPDATE &&
-      cpi->oxcf.kf_cfg.enable_keyframe_filtering > 1) {
-    gf_group->update_type[frame_index] = ARF_UPDATE;
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  // Initialize gf_group->frame_parallel_level, gf_group->is_frame_non_ref and
+  // gf_group->src_offset to 0.
+  memset(
+      gf_group->frame_parallel_level, 0,
+      sizeof(gf_group->frame_parallel_level[0]) * MAX_STATIC_GF_GROUP_LENGTH);
+  memset(gf_group->is_frame_non_ref, 0,
+         sizeof(gf_group->is_frame_non_ref[0]) * MAX_STATIC_GF_GROUP_LENGTH);
+  memset(gf_group->src_offset, 0,
+         sizeof(gf_group->src_offset[0]) * MAX_STATIC_GF_GROUP_LENGTH);
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+  // Initialize gf_group->skip_frame_refresh and gf_group->skip_frame_as_ref
+  // with INVALID_IDX.
+  memset(gf_group->skip_frame_refresh, INVALID_IDX,
+         sizeof(gf_group->skip_frame_refresh[0][0]) *
+             MAX_STATIC_GF_GROUP_LENGTH * REF_FRAMES);
+  memset(gf_group->skip_frame_as_ref, INVALID_IDX,
+         sizeof(gf_group->skip_frame_as_ref[0]) * MAX_STATIC_GF_GROUP_LENGTH);
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+
+  const int kf_decomp = cpi->oxcf.kf_cfg.enable_keyframe_filtering > 1;
+  if (first_frame_update_type == KF_UPDATE) {
+    gf_group->update_type[frame_index] = kf_decomp ? ARF_UPDATE : KF_UPDATE;
     gf_group->arf_src_offset[frame_index] = 0;
     gf_group->cur_frame_idx[frame_index] = cur_frame_index;
     gf_group->layer_depth[frame_index] = 0;
     gf_group->frame_type[frame_index] = KEY_FRAME;
     gf_group->refbuf_state[frame_index] = REFBUF_RESET;
     gf_group->max_layer_depth = 0;
+#if CONFIG_FRAME_PARALLEL_ENCODE
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+    gf_group->display_idx[frame_index] = cur_disp_index;
+    if (!kf_decomp) cur_disp_index++;
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
     ++frame_index;
 
-    gf_group->update_type[frame_index] = OVERLAY_UPDATE;
+    if (kf_decomp) {
+      gf_group->update_type[frame_index] = OVERLAY_UPDATE;
+      gf_group->arf_src_offset[frame_index] = 0;
+      gf_group->cur_frame_idx[frame_index] = cur_frame_index;
+      gf_group->layer_depth[frame_index] = 0;
+      gf_group->frame_type[frame_index] = INTER_FRAME;
+      gf_group->refbuf_state[frame_index] = REFBUF_UPDATE;
+      gf_group->max_layer_depth = 0;
+#if CONFIG_FRAME_PARALLEL_ENCODE
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+      gf_group->display_idx[frame_index] = cur_disp_index;
+      cur_disp_index++;
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+      ++frame_index;
+    }
+    cur_frame_index++;
+  }
+
+  if (first_frame_update_type == GF_UPDATE) {
+    gf_group->update_type[frame_index] = GF_UPDATE;
     gf_group->arf_src_offset[frame_index] = 0;
     gf_group->cur_frame_idx[frame_index] = cur_frame_index;
     gf_group->layer_depth[frame_index] = 0;
     gf_group->frame_type[frame_index] = INTER_FRAME;
     gf_group->refbuf_state[frame_index] = REFBUF_UPDATE;
     gf_group->max_layer_depth = 0;
-    ++frame_index;
-    cur_frame_index++;
-  } else if (first_frame_update_type != OVERLAY_UPDATE) {
-    gf_group->update_type[frame_index] = first_frame_update_type;
-    gf_group->arf_src_offset[frame_index] = 0;
-    gf_group->cur_frame_idx[frame_index] = cur_frame_index;
-    gf_group->layer_depth[frame_index] =
-        first_frame_update_type == OVERLAY_UPDATE ? MAX_ARF_LAYERS + 1 : 0;
-    gf_group->frame_type[frame_index] =
-        (first_frame_update_type == KF_UPDATE) ? KEY_FRAME : INTER_FRAME;
-    gf_group->refbuf_state[frame_index] =
-        (first_frame_update_type == KF_UPDATE) ? REFBUF_RESET : REFBUF_UPDATE;
-    gf_group->max_layer_depth = 0;
+#if CONFIG_FRAME_PARALLEL_ENCODE
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+    gf_group->display_idx[frame_index] = cur_disp_index;
+    cur_disp_index++;
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
     ++frame_index;
     ++cur_frame_index;
   }
 
   // ALTREF.
   const int use_altref = gf_group->max_layer_depth_allowed > 0;
-  int is_fwd_kf = (gf_interval == cpi->rc.frames_to_key);
+  int is_fwd_kf = rc->frames_to_fwd_kf == gf_interval;
+
   if (use_altref) {
     gf_group->update_type[frame_index] = ARF_UPDATE;
     gf_group->arf_src_offset[frame_index] = gf_interval - cur_frame_index;
     gf_group->cur_frame_idx[frame_index] = cur_frame_index;
     gf_group->layer_depth[frame_index] = 1;
-    gf_group->arf_boost[frame_index] = cpi->rc.gfu_boost;
+    gf_group->arf_boost[frame_index] = cpi->ppi->p_rc.gfu_boost;
     gf_group->frame_type[frame_index] = is_fwd_kf ? KEY_FRAME : INTER_FRAME;
     gf_group->refbuf_state[frame_index] = REFBUF_UPDATE;
     gf_group->max_layer_depth = 1;
     gf_group->arf_index = frame_index;
+#if CONFIG_FRAME_PARALLEL_ENCODE
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+    gf_group->display_idx[frame_index] =
+        cur_disp_index + gf_group->arf_src_offset[frame_index];
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
     ++frame_index;
   } else {
     gf_group->arf_index = -1;
   }
 
+  // Flag to indicate if multi-layer configuration is complete.
+  int is_multi_layer_configured = 0;
+
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  // Running count of no. of frames that are part of a given parallel
+  // encode set in a gf_group. Value of 1 indicates no parallel encode.
+  int parallel_frame_count = 1;
+  // Enable parallel encode of frames if gf_group has a multi-layer pyramid
+  // structure with minimum 4 layers.
+  int do_frame_parallel_encode = (cpi->ppi->num_fp_contexts > 1 && use_altref &&
+                                  gf_group->max_layer_depth_allowed >= 4);
+
+  int first_frame_index = cur_frame_index;
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+  if (do_frame_parallel_encode) {
+    // construct_multi_layer_gf_structure() takes the input parameter
+    // 'gf_interval' as p_rc->baseline_gf_interval - 1. The code below computes
+    // the actual GF_GROUP length by compensating for this offset.
+    int actual_gf_length = ((first_frame_update_type == KF_UPDATE) ||
+                            (first_frame_update_type == GF_UPDATE))
+                               ? gf_interval
+                               : gf_interval + 1;
+
+    // In order to facilitate parallel encoding of frames in lower layer depths,
+    // encode reordering is done. Currently encode reordering is enabled only
+    // for gf-intervals 16 and 32. NOTE: Since the buffer holding the
+    // reference frames is of size 8 (ref_frame_map[REF_FRAMES]), there is a
+    // limitation on the number of hidden frames possible at any given point and
+    // hence the reordering is enabled only for gf-intervals 16 and 32.
+    // Disabling encode reordering for gf-interval 14 since some cross-frame
+    // dependencies related to temporal filtering for FPMT are currently not
+    // handled.
+    int disable_gf14_reorder = 1;
+    if (actual_gf_length == 14 && !disable_gf14_reorder) {
+      // This array holds the gf index of INTNL_ARF_UPDATE frames in the slot
+      // corresponding to their display order hint. This is used while
+      // configuring the LF_UPDATE frames and INTNL_OVERLAY_UPDATE frames.
+      int doh_gf_index_map[FIXED_GF_INTERVAL];
+      // Initialize doh_gf_index_map with INVALID_IDX.
+      memset(&doh_gf_index_map[0], INVALID_IDX,
+             (sizeof(doh_gf_index_map[0]) * FIXED_GF_INTERVAL));
+
+      FRAME_REORDER_INFO arf_frame_stats[REF_FRAMES - 1];
+      // Store the stats corresponding to layer 1 frame.
+      fill_arf_frame_stats(arf_frame_stats, 0, actual_gf_length, 1,
+                           actual_gf_length);
+      int count_arf_frames = 1;
+
+      // Sets multi-layer params for gf-interval 14 to consecutively encode
+      // frames in the same layer depth, i.e., encode order would be 0-> 14->
+      // 7-> 3-> 10-> 5-> 12-> 1-> 2-> 4-> 6-> 8-> 9-> 11-> 13.
+      // TODO(Remya): Set GF_GROUP param 'arf_boost' for all frames.
+      set_multi_layer_params_for_gf14(
+          twopass, &cpi->twopass_frame, p_rc, frame_info, gf_group,
+          arf_frame_stats, &cur_frame_index, &frame_index, &count_arf_frames,
+          doh_gf_index_map, &parallel_frame_count, &first_frame_index,
+          &cur_disp_index, actual_gf_length, use_altref + 1,
+          cpi->ppi->num_fp_contexts);
+
+      // Set gf_group->skip_frame_refresh.
+      for (int i = 0; i < actual_gf_length; i++) {
+        int count = 0;
+        if (gf_group->update_type[i] == INTNL_ARF_UPDATE) {
+          for (int j = 0; j < i; j++) {
+            // Store the display order hint of the frames which would not
+            // have been displayed at the encode call of frame 'i'.
+            if ((gf_group->display_idx[j] < gf_group->display_idx[i]) &&
+                gf_group->update_type[j] == INTNL_ARF_UPDATE) {
+              gf_group->skip_frame_refresh[i][count++] =
+                  gf_group->display_idx[j];
+            }
+          }
+        }
+      }
+    } else {
+      // Set layer depth threshold for reordering as per the gf length.
+      int depth_thr =
+          (actual_gf_length == 16) ? 3 : (actual_gf_length == 32) ? 4 : INT_MAX;
+
+      set_multi_layer_params_for_fp(
+          twopass, &cpi->twopass_frame, gf_group, p_rc, rc, frame_info,
+          cur_frame_index, gf_interval, &cur_frame_index, &frame_index,
+          &parallel_frame_count, cpi->ppi->num_fp_contexts,
+          do_frame_parallel_encode, &first_frame_index, depth_thr,
+          &cur_disp_index, use_altref + 1);
+    }
+    is_multi_layer_configured = 1;
+  }
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+
   // Rest of the frames.
-  set_multi_layer_params(twopass, gf_group, rc, frame_info, cur_frame_index,
-                         gf_interval, &cur_frame_index, &frame_index,
-                         use_altref + 1);
+  if (!is_multi_layer_configured)
+    set_multi_layer_params(twopass, &cpi->twopass_frame, gf_group, p_rc, rc,
+                           frame_info, cur_frame_index, gf_interval,
+                           &cur_frame_index, &frame_index,
+#if CONFIG_FRAME_PARALLEL_ENCODE
+                           &parallel_frame_count, cpi->ppi->num_fp_contexts,
+                           do_frame_parallel_encode, &first_frame_index,
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+                           use_altref + 1);
 
   if (use_altref) {
     gf_group->update_type[frame_index] = OVERLAY_UPDATE;
@@ -167,9 +768,14 @@
     gf_group->cur_frame_idx[frame_index] = cur_frame_index;
     gf_group->layer_depth[frame_index] = MAX_ARF_LAYERS;
     gf_group->arf_boost[frame_index] = NORMAL_BOOST;
-    gf_group->frame_type[frame_index] = is_fwd_kf ? KEY_FRAME : INTER_FRAME;
+    gf_group->frame_type[frame_index] = INTER_FRAME;
     gf_group->refbuf_state[frame_index] =
         is_fwd_kf ? REFBUF_RESET : REFBUF_UPDATE;
+#if CONFIG_FRAME_PARALLEL_ENCODE
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+    gf_group->display_idx[frame_index] = cur_disp_index;
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
     ++frame_index;
   } else {
     for (; cur_frame_index <= gf_interval; ++cur_frame_index) {
@@ -181,25 +787,97 @@
       gf_group->frame_type[frame_index] = INTER_FRAME;
       gf_group->refbuf_state[frame_index] = REFBUF_UPDATE;
       gf_group->max_layer_depth = AOMMAX(gf_group->max_layer_depth, 2);
+#if CONFIG_FRAME_PARALLEL_ENCODE
+      set_src_offset(gf_group, &first_frame_index, cur_frame_index,
+                     frame_index);
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+      gf_group->display_idx[frame_index] = cur_disp_index;
+      cur_disp_index++;
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
       ++frame_index;
     }
   }
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  if (do_frame_parallel_encode) {
+    // Iterate through the gf_group and reset frame_parallel_level to 0 in case
+    // a frame is marked as frame_parallel_level 1 with no subsequent
+    // frame_parallel_level 2 frame(s).
+    int level1_frame_idx = INT_MAX;
+    int level2_frame_count = 0;
+    for (int frame_idx = 0; frame_idx < frame_index; frame_idx++) {
+      if (gf_group->frame_parallel_level[frame_idx] == 1) {
+        // Set frame_parallel_level to 0 if only one frame is present in a
+        // parallel encode set.
+        if (level1_frame_idx != INT_MAX && !level2_frame_count)
+          gf_group->frame_parallel_level[level1_frame_idx] = 0;
+        // Book-keep frame_idx of frame_parallel_level 1 frame and reset the
+        // count of frame_parallel_level 2 frames in the corresponding parallel
+        // encode set.
+        level1_frame_idx = frame_idx;
+        level2_frame_count = 0;
+      }
+      if (gf_group->frame_parallel_level[frame_idx] == 2) level2_frame_count++;
+    }
+    // If frame_parallel_level is set to 1 for the last LF_UPDATE
+    // frame in the gf_group, reset it to zero since there are no subsequent
+    // frames in the gf_group.
+    if (gf_group->frame_parallel_level[frame_index - 2] == 1) {
+      assert(gf_group->update_type[frame_index - 2] == LF_UPDATE);
+      gf_group->frame_parallel_level[frame_index - 2] = 0;
+    }
+  }
+#endif
+
+  for (int gf_idx = frame_index; gf_idx < MAX_STATIC_GF_GROUP_LENGTH;
+       ++gf_idx) {
+    gf_group->update_type[gf_idx] = LF_UPDATE;
+    gf_group->arf_src_offset[gf_idx] = 0;
+    gf_group->cur_frame_idx[gf_idx] = gf_idx;
+    gf_group->layer_depth[gf_idx] = MAX_ARF_LAYERS;
+    gf_group->arf_boost[gf_idx] = NORMAL_BOOST;
+    gf_group->frame_type[gf_idx] = INTER_FRAME;
+    gf_group->refbuf_state[gf_idx] = REFBUF_UPDATE;
+    gf_group->max_layer_depth = AOMMAX(gf_group->max_layer_depth, 2);
+  }
+
   return frame_index;
 }
 
+static void set_ld_layer_depth(GF_GROUP *gf_group, int gop_length) {
+  int log_gop_length = 0;
+  while ((1 << log_gop_length) < gop_length) {
+    ++log_gop_length;
+  }
+
+  for (int gf_index = 0; gf_index < gf_group->size; ++gf_index) {
+    int count = 0;
+    // Count the number of trailing zero bits in gf_index
+    for (; count < MAX_ARF_LAYERS; ++count) {
+      if ((gf_index >> count) & 0x01) break;
+    }
+    gf_group->layer_depth[gf_index] = AOMMAX(log_gop_length - count, 0);
+  }
+}
+
 void av1_gop_setup_structure(AV1_COMP *cpi) {
   RATE_CONTROL *const rc = &cpi->rc;
-  GF_GROUP *const gf_group = &cpi->gf_group;
-  TWO_PASS *const twopass = &cpi->twopass;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+  TWO_PASS *const twopass = &cpi->ppi->twopass;
   FRAME_INFO *const frame_info = &cpi->frame_info;
   const int key_frame = rc->frames_since_key == 0;
-  const FRAME_UPDATE_TYPE first_frame_update_type =
-      key_frame
-          ? KF_UPDATE
-          : cpi->gf_state.arf_gf_boost_lst || (rc->baseline_gf_interval == 1)
-                ? OVERLAY_UPDATE
-                : GF_UPDATE;
+  FRAME_UPDATE_TYPE first_frame_update_type = ARF_UPDATE;
+
+  if (key_frame)
+    first_frame_update_type = KF_UPDATE;
+  else if (!cpi->ppi->gf_state.arf_gf_boost_lst)
+    first_frame_update_type = GF_UPDATE;
+
   gf_group->size = construct_multi_layer_gf_structure(
-      cpi, twopass, gf_group, rc, frame_info, rc->baseline_gf_interval - 1,
+      cpi, twopass, gf_group, rc, frame_info, p_rc->baseline_gf_interval - 1,
       first_frame_update_type);
+
+  if (gf_group->max_layer_depth_allowed == 0)
+    set_ld_layer_depth(gf_group, p_rc->baseline_gf_interval);
 }
diff --git a/av1/encoder/gop_structure.h b/av1/encoder/gop_structure.h
index 6cfca22..3d37e2b 100644
--- a/av1/encoder/gop_structure.h
+++ b/av1/encoder/gop_structure.h
@@ -66,10 +66,12 @@
                             int64_t gf_group_bits);
 
 /*!\cond */
-int av1_calc_arf_boost(const TWO_PASS *twopass, const RATE_CONTROL *rc,
-                       FRAME_INFO *frame_info, int offset, int f_frames,
-                       int b_frames, int *num_fpstats_used,
-                       int *num_fpstats_required);
+int av1_calc_arf_boost(const TWO_PASS *twopass,
+                       const TWO_PASS_FRAME *twopass_frame,
+                       const PRIMARY_RATE_CONTROL *p_rc, FRAME_INFO *frame_info,
+                       int offset, int f_frames, int b_frames,
+                       int *num_fpstats_used, int *num_fpstats_required,
+                       int project_gfu_boost);
 /*!\endcond */
 
 #ifdef __cplusplus
diff --git a/av1/encoder/hybrid_fwd_txfm.c b/av1/encoder/hybrid_fwd_txfm.c
index 08c167a..eda5ddf 100644
--- a/av1/encoder/hybrid_fwd_txfm.c
+++ b/av1/encoder/hybrid_fwd_txfm.c
@@ -14,6 +14,7 @@
 #include "config/aom_dsp_rtcd.h"
 
 #include "av1/common/idct.h"
+#include "av1/common/blockd.h"
 #include "av1/encoder/hybrid_fwd_txfm.h"
 
 /* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
@@ -313,3 +314,26 @@
     default: assert(0); break;
   }
 }
+
+void av1_quick_txfm(int use_hadamard, TX_SIZE tx_size, BitDepthInfo bd_info,
+                    const int16_t *src_diff, int src_stride,
+                    tran_low_t *coeff) {
+  if (use_hadamard) {
+    switch (tx_size) {
+      case TX_4X4: aom_hadamard_4x4(src_diff, src_stride, coeff); break;
+      case TX_8X8: aom_hadamard_8x8(src_diff, src_stride, coeff); break;
+      case TX_16X16: aom_hadamard_16x16(src_diff, src_stride, coeff); break;
+      case TX_32X32: aom_hadamard_32x32(src_diff, src_stride, coeff); break;
+      default: assert(0);
+    }
+  } else {
+    TxfmParam txfm_param;
+    txfm_param.tx_type = DCT_DCT;
+    txfm_param.tx_size = tx_size;
+    txfm_param.lossless = 0;
+    txfm_param.bd = bd_info.bit_depth;
+    txfm_param.is_hbd = bd_info.use_highbitdepth_buf;
+    txfm_param.tx_set_type = EXT_TX_SET_ALL16;
+    av1_fwd_txfm(src_diff, coeff, src_stride, &txfm_param);
+  }
+}
diff --git a/av1/encoder/hybrid_fwd_txfm.h b/av1/encoder/hybrid_fwd_txfm.h
index daabc71..30f8a22 100644
--- a/av1/encoder/hybrid_fwd_txfm.h
+++ b/av1/encoder/hybrid_fwd_txfm.h
@@ -24,6 +24,15 @@
 void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff,
                          int diff_stride, TxfmParam *txfm_param);
 
+/*!\brief Apply Hadamard or DCT transform
+ *
+ * \callergraph
+ * DCT and Hadamard transforms are commonly used for quick RD score estimation.
+ * The coeff buffer's size should be equal to the number of pixels
+ * corresponding to tx_size.
+ */
+void av1_quick_txfm(int use_hadamard, TX_SIZE tx_size, BitDepthInfo bd_info,
+                    const int16_t *src_diff, int src_stride, tran_low_t *coeff);
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/av1/encoder/interp_search.c b/av1/encoder/interp_search.c
index 6d4f4ec..755e6e1 100644
--- a/av1/encoder/interp_search.c
+++ b/av1/encoder/interp_search.c
@@ -178,7 +178,7 @@
   mbmi->interp_filters = filter_sets[filter_idx];
   const int tmp_rs =
       get_switchable_rate(x, mbmi->interp_filters, switchable_ctx,
-                          cm->seq_params.enable_dual_filter);
+                          cm->seq_params->enable_dual_filter);
 
   int64_t min_rd = RDCOST(x->rdmult, tmp_rs, 0);
   if (min_rd > *rd) {
@@ -450,14 +450,13 @@
 
   if (cpi->sf.interp_sf.adaptive_interp_filter_search == 2) {
     const FRAME_UPDATE_TYPE update_type =
-        get_frame_update_type(&cpi->gf_group, cpi->gf_frame_index);
+        get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
     const int ctx0 = av1_get_pred_context_switchable_interp(xd, 0);
     const int ctx1 = av1_get_pred_context_switchable_interp(xd, 1);
     const int *switchable_interp_p0 =
-        cpi->frame_probs.switchable_interp_probs[update_type][ctx0];
+        cpi->ppi->frame_probs.switchable_interp_probs[update_type][ctx0];
     const int *switchable_interp_p1 =
-        cpi->frame_probs.switchable_interp_probs[update_type][ctx1];
-
+        cpi->ppi->frame_probs.switchable_interp_probs[update_type][ctx1];
     static const int thr[7] = { 0, 8, 8, 8, 8, 0, 8 };
     const int thresh = thr[update_type];
     for (i = 0; i < SWITCHABLE_FILTERS; i++) {
@@ -684,7 +683,7 @@
   switchable_ctx[1] = av1_get_pred_context_switchable_interp(xd, 1);
   *switchable_rate =
       get_switchable_rate(x, mbmi->interp_filters, switchable_ctx,
-                          cm->seq_params.enable_dual_filter);
+                          cm->seq_params->enable_dual_filter);
 
   // Do MC evaluation for default filter_type.
   // Luma MC
@@ -748,7 +747,7 @@
   restore_dst_buf(xd, *tmp_dst, num_planes);
   const BUFFER_SET *dst_bufs[2] = { tmp_dst, orig_dst };
   // Evaluate dual interp filters
-  if (cm->seq_params.enable_dual_filter) {
+  if (cm->seq_params->enable_dual_filter) {
     if (cpi->sf.interp_sf.use_fast_interpolation_filter_search) {
       fast_dual_interp_filter_rd(x, cpi, tile_data, bsize, orig_dst, rd,
                                  &rd_stats_luma, &rd_stats, switchable_rate,
diff --git a/av1/encoder/interp_search.h b/av1/encoder/interp_search.h
index 1ee26d1..902b699 100644
--- a/av1/encoder/interp_search.h
+++ b/av1/encoder/interp_search.h
@@ -37,7 +37,7 @@
 
 /*!\brief Miscellaneous arguments for inter mode search.
  */
-typedef struct {
+typedef struct HandleInterModeArgs {
   /*!
    * Buffer for the above predictor in OBMC
    */
@@ -139,6 +139,16 @@
    * Estimated cmp mode.
    */
   int cmp_mode[MODE_CTX_REF_FRAMES];
+  /*!
+   * The best sse during single new_mv search. Note that the sse here comes from
+   * single_motion_search, and not from interpolation_filter_search. This has
+   * two implications:
+   * 1. The mv used to calculate the sse here may not be the one that yields
+   *    the best sse found in handle_inter_mode.
+   * 2. Even if the mvs agree, the sse here can differ from the sse in \ref
+   *    MACROBLOCK::pred_sse due to different interpolation filter used.
+   */
+  unsigned int best_single_sse_in_refs[REF_FRAMES];
 } HandleInterModeArgs;
 
 /*!\cond */
diff --git a/av1/encoder/intra_mode_search.c b/av1/encoder/intra_mode_search.c
index 807fa66..25a6e2c 100644
--- a/av1/encoder/intra_mode_search.c
+++ b/av1/encoder/intra_mode_search.c
@@ -32,6 +32,31 @@
   UV_D113_PRED,   UV_D45_PRED,
 };
 
+// The bitmask corresponds to the filter intra modes as defined in enums.h
+// FILTER_INTRA_MODE enumeration type. Setting a bit to 0 in the mask means to
+// disable the evaluation of corresponding filter intra mode. The table
+// av1_derived_filter_intra_mode_used_flag is used when speed feature
+// prune_filter_intra_level is 1. The evaluated filter intra modes are union
+// of the following:
+// 1) FILTER_DC_PRED
+// 2) mode that corresponds to best mode so far of DC_PRED, V_PRED, H_PRED,
+// D157_PRED and PAETH_PRED. (Eg: FILTER_V_PRED if best mode so far is V_PRED).
+static const uint8_t av1_derived_filter_intra_mode_used_flag[INTRA_MODES] = {
+  0x01,  // DC_PRED:           0000 0001
+  0x03,  // V_PRED:            0000 0011
+  0x05,  // H_PRED:            0000 0101
+  0x01,  // D45_PRED:          0000 0001
+  0x01,  // D135_PRED:         0000 0001
+  0x01,  // D113_PRED:         0000 0001
+  0x09,  // D157_PRED:         0000 1001
+  0x01,  // D203_PRED:         0000 0001
+  0x01,  // D67_PRED:          0000 0001
+  0x01,  // SMOOTH_PRED:       0000 0001
+  0x01,  // SMOOTH_V_PRED:     0000 0001
+  0x01,  // SMOOTH_H_PRED:     0000 0001
+  0x11   // PAETH_PRED:        0001 0001
+};
+
 // The bitmask corresponds to the chroma intra modes as defined in enums.h
 // UV_PREDICTION_MODE enumeration type. Setting a bit to 0 in the mask means to
 // disable the evaluation of corresponding chroma intra mode. The table
@@ -60,59 +85,6 @@
 };
 /*!\endcond */
 
-/*!\brief Calculate the rdcost of a given luma intra angle
- *
- * \ingroup intra_mode_search
- * \callergraph
- * This function runs rd calculation for a given luma intra prediction angle.
- * This is used to select the best angle delta.
- *
- * \return Returns the rdcost of the angle and updates the mbmi if the
- * new rdcost is better.
- */
-static int64_t calc_rd_given_intra_angle(
-    const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mode_cost,
-    int64_t best_rd_in, int8_t angle_delta, int max_angle_delta, int *rate,
-    RD_STATS *rd_stats, int *best_angle_delta, TX_SIZE *best_tx_size,
-    int64_t *best_rd, int64_t *best_model_rd, uint8_t *best_tx_type_map,
-    uint8_t *best_blk_skip, int skip_model_rd) {
-  RD_STATS tokenonly_rd_stats;
-  int64_t this_rd;
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = xd->mi[0];
-  const int n4 = bsize_to_num_blk(bsize);
-  assert(!is_inter_block(mbmi));
-  mbmi->angle_delta[PLANE_TYPE_Y] = angle_delta;
-  if (!skip_model_rd) {
-    if (model_intra_yrd_and_prune(cpi, x, bsize, best_model_rd)) {
-      return INT64_MAX;
-    }
-  }
-  av1_pick_uniform_tx_size_type_yrd(cpi, x, &tokenonly_rd_stats, bsize,
-                                    best_rd_in);
-  if (tokenonly_rd_stats.rate == INT_MAX) return INT64_MAX;
-
-  int this_rate =
-      mode_cost + tokenonly_rd_stats.rate +
-      x->mode_costs
-          .angle_delta_cost[mbmi->mode - V_PRED][max_angle_delta + angle_delta];
-  this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
-
-  if (this_rd < *best_rd) {
-    memcpy(best_blk_skip, x->txfm_search_info.blk_skip,
-           sizeof(best_blk_skip[0]) * n4);
-    av1_copy_array(best_tx_type_map, xd->tx_type_map, n4);
-    *best_rd = this_rd;
-    *best_angle_delta = mbmi->angle_delta[PLANE_TYPE_Y];
-    *best_tx_size = mbmi->tx_size;
-    *rate = this_rate;
-    rd_stats->rate = tokenonly_rd_stats.rate;
-    rd_stats->dist = tokenonly_rd_stats.dist;
-    rd_stats->skip_txfm = tokenonly_rd_stats.skip_txfm;
-  }
-  return this_rd;
-}
-
 /*!\brief Search for the best filter_intra mode when coding intra frame.
  *
  * \ingroup intra_mode_search
@@ -125,8 +97,12 @@
                                     int *rate, int *rate_tokenonly,
                                     int64_t *distortion, int *skippable,
                                     BLOCK_SIZE bsize, int mode_cost,
+                                    PREDICTION_MODE best_mode_so_far,
                                     int64_t *best_rd, int64_t *best_model_rd,
                                     PICK_MODE_CONTEXT *ctx) {
+  // Skip the evaluation of filter intra modes.
+  if (cpi->sf.intra_sf.prune_filter_intra_level == 2) return 0;
+
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = xd->mi[0];
   int filter_intra_selected_flag = 0;
@@ -150,6 +126,11 @@
     RD_STATS tokenonly_rd_stats;
     mbmi->filter_intra_mode_info.filter_intra_mode = mode;
 
+    if ((cpi->sf.intra_sf.prune_filter_intra_level == 1) &&
+        !(av1_derived_filter_intra_mode_used_flag[best_mode_so_far] &
+          (1 << mode)))
+      continue;
+
     // Skip the evaluation of modes that do not match with the winner mode in
     // x->mb_mode_cache.
     if (x->use_mb_mode_cache &&
@@ -259,6 +240,42 @@
   }
 }
 
+void set_y_mode_and_delta_angle(const int mode_idx, MB_MODE_INFO *const mbmi) {
+  if (mode_idx < INTRA_MODE_END) {
+    mbmi->mode = intra_rd_search_mode_order[mode_idx];
+    mbmi->angle_delta[PLANE_TYPE_Y] = 0;
+  } else {
+    mbmi->mode = (mode_idx - INTRA_MODE_END) / (MAX_ANGLE_DELTA * 2) + V_PRED;
+    int angle_delta = (mode_idx - INTRA_MODE_END) % (MAX_ANGLE_DELTA * 2);
+    mbmi->angle_delta[PLANE_TYPE_Y] =
+        (angle_delta < 3 ? (angle_delta - 3) : (angle_delta - 2));
+  }
+}
+
+int prune_intra_y_mode(int64_t this_model_rd, int64_t *best_model_rd,
+                       int64_t top_intra_model_rd[], int model_cnt_allowed) {
+  const double thresh_best = 1.50;
+  const double thresh_top = 1.00;
+  for (int i = 0; i < model_cnt_allowed; i++) {
+    if (this_model_rd < top_intra_model_rd[i]) {
+      for (int j = model_cnt_allowed - 1; j > i; j--) {
+        top_intra_model_rd[j] = top_intra_model_rd[j - 1];
+      }
+      top_intra_model_rd[i] = this_model_rd;
+      break;
+    }
+  }
+  if (top_intra_model_rd[model_cnt_allowed - 1] != INT64_MAX &&
+      this_model_rd > thresh_top * top_intra_model_rd[model_cnt_allowed - 1])
+    return 1;
+
+  if (this_model_rd != INT64_MAX &&
+      this_model_rd > thresh_best * (*best_model_rd))
+    return 1;
+  if (this_model_rd < *best_model_rd) *best_model_rd = this_model_rd;
+  return 0;
+}
+
 // Run RD calculation with given chroma intra prediction angle., and return
 // the RD cost. Update the best mode info. if the RD cost is the best so far.
 static int64_t pick_intra_angle_routine_sbuv(
@@ -388,7 +405,7 @@
   int64_t cfl_cost;
   if (fast_mode) {
     cfl_cost =
-        intra_model_rd(cm, x, plane, plane_bsize, tx_size, /*use_hardamard=*/0);
+        intra_model_rd(cm, x, plane, plane_bsize, tx_size, /*use_hadamard=*/0);
   } else {
     av1_init_rd_stats(rd_stats);
     av1_txfm_rd_in_plane(x, cpi, rd_stats, INT64_MAX, 0, plane, plane_bsize,
@@ -404,8 +421,7 @@
 static void cfl_pick_plane_parameter(const AV1_COMP *const cpi, MACROBLOCK *x,
                                      int plane, TX_SIZE tx_size,
                                      int cfl_search_range,
-                                     int cfl_rate_arr[CFL_MAGS_SIZE],
-                                     int64_t cfl_dist_arr[CFL_MAGS_SIZE]) {
+                                     RD_STATS cfl_rd_arr[CFL_MAGS_SIZE]) {
   assert(cfl_search_range >= 1 && cfl_search_range <= CFL_MAGS_SIZE);
   MACROBLOCKD *const xd = &x->e_mbd;
 
@@ -443,27 +459,20 @@
   }
 
   for (int cfl_idx = 0; cfl_idx < CFL_MAGS_SIZE; ++cfl_idx) {
-    cfl_rate_arr[cfl_idx] = INT_MAX;
-    cfl_dist_arr[cfl_idx] = INT64_MAX;
+    av1_invalid_rd_stats(&cfl_rd_arr[cfl_idx]);
   }
 
   int fast_mode = 0;
   int start_cfl_idx = est_best_cfl_idx;
-  RD_STATS rd_stats;
   cfl_compute_rd(cpi, x, plane, tx_size, plane_bsize, start_cfl_idx, fast_mode,
-                 &rd_stats);
-  cfl_rate_arr[start_cfl_idx] = rd_stats.rate;
-  cfl_dist_arr[start_cfl_idx] = rd_stats.dist;
-  // TODO(angiebird): simplify this search loop.
+                 &cfl_rd_arr[start_cfl_idx]);
   for (int si = 0; si < 2; ++si) {
     const int dir = dir_ls[si];
     for (int i = 1; i < cfl_search_range; ++i) {
       int cfl_idx = start_cfl_idx + dir * i;
       if (cfl_idx < 0 || cfl_idx >= CFL_MAGS_SIZE) break;
       cfl_compute_rd(cpi, x, plane, tx_size, plane_bsize, cfl_idx, fast_mode,
-                     &rd_stats);
-      cfl_rate_arr[cfl_idx] = rd_stats.rate;
-      cfl_dist_arr[cfl_idx] = rd_stats.dist;
+                     &cfl_rd_arr[cfl_idx]);
     }
   }
   xd->cfl.use_dc_pred_cache = 0;
@@ -471,49 +480,56 @@
   xd->cfl.dc_pred_is_cached[1] = 0;
 }
 
-/*!\brief Pick the optimal parameters for Chroma to Luma (CFL) compoenent
+/*!\brief Pick the optimal parameters for Chroma to Luma (CFL) component
  *
  * \ingroup intra_mode_search
  * \callergraph
  *
+ * This function will use DCT_DCT followed by computing SATD (sum of absolute
+ * transformed differences) to estimate the RD score and find the best possible
+ * CFL parameter.
+ *
+ * Then the function will apply a full RD search near the best possible CFL
+ * parameter to find the best actual CFL parameter.
+ *
  * Side effect:
  * We use ths buffers in x->plane[] and xd->plane[] as throw-away buffers for RD
  * search.
  *
- * \return CFL mode overhead
+ * \param[in] x                Encoder prediction block structure.
+ * \param[in] cpi              Top-level encoder instance structure.
+ * \param[in] tx_size          Transform size.
+ * \param[in] ref_best_rd      Reference best RD.
+ * \param[in] cfl_search_range The search range of full RD search near the
+ *                             estimated best CFL parameter.
+ *
+ * \param[out]   best_rd_stats          RD stats of the best CFL parameter
+ * \param[out]   best_cfl_alpha_idx     Best CFL alpha index
+ * \param[out]   best_cfl_alpha_signs   Best CFL joint signs
+ *
  */
 static int cfl_rd_pick_alpha(MACROBLOCK *const x, const AV1_COMP *const cpi,
                              TX_SIZE tx_size, int64_t ref_best_rd,
-                             int cfl_search_range, int *token_rate,
+                             int cfl_search_range, RD_STATS *best_rd_stats,
                              uint8_t *best_cfl_alpha_idx,
                              int8_t *best_cfl_alpha_signs) {
   assert(cfl_search_range >= 1 && cfl_search_range <= CFL_MAGS_SIZE);
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = xd->mi[0];
   const ModeCosts *mode_costs = &x->mode_costs;
-  const int64_t mode_rd = RDCOST(
-      x->rdmult,
-      mode_costs->intra_uv_mode_cost[CFL_ALLOWED][mbmi->mode][UV_CFL_PRED], 0);
-  int cfl_rate_arr_u[CFL_MAGS_SIZE];
-  int cfl_rate_arr_v[CFL_MAGS_SIZE];
-  int64_t cfl_dist_arr_u[CFL_MAGS_SIZE];
-  int64_t cfl_dist_arr_v[CFL_MAGS_SIZE];
-  cfl_pick_plane_parameter(cpi, x, 1, tx_size, cfl_search_range, cfl_rate_arr_u,
-                           cfl_dist_arr_u);
-  cfl_pick_plane_parameter(cpi, x, 2, tx_size, cfl_search_range, cfl_rate_arr_v,
-                           cfl_dist_arr_v);
-  int64_t best_rd = ref_best_rd;
-  int best_token_rate = INT_MAX;
-  int best_joint_sign = 0;
-  int best_cfl_alpha_u = 0;
-  int best_cfl_alpha_v = 0;
+  RD_STATS cfl_rd_arr_u[CFL_MAGS_SIZE];
+  RD_STATS cfl_rd_arr_v[CFL_MAGS_SIZE];
+
+  av1_invalid_rd_stats(best_rd_stats);
+
+  cfl_pick_plane_parameter(cpi, x, 1, tx_size, cfl_search_range, cfl_rd_arr_u);
+  cfl_pick_plane_parameter(cpi, x, 2, tx_size, cfl_search_range, cfl_rd_arr_v);
+
   for (int ui = 0; ui < CFL_MAGS_SIZE; ++ui) {
-    if (cfl_rate_arr_u[ui] == INT_MAX) continue;
+    if (cfl_rd_arr_u[ui].rate == INT_MAX) continue;
     int cfl_alpha_u;
     CFL_SIGN_TYPE cfl_sign_u;
     cfl_idx_to_sign_and_alpha(ui, &cfl_sign_u, &cfl_alpha_u);
     for (int vi = 0; vi < CFL_MAGS_SIZE; ++vi) {
-      if (cfl_rate_arr_v[vi] == INT_MAX) continue;
+      if (cfl_rd_arr_v[vi].rate == INT_MAX) continue;
       int cfl_alpha_v;
       CFL_SIGN_TYPE cfl_sign_v;
       cfl_idx_to_sign_and_alpha(vi, &cfl_sign_v, &cfl_alpha_v);
@@ -521,38 +537,32 @@
       // valid parameter for CFL
       if (cfl_sign_u == CFL_SIGN_ZERO && cfl_sign_v == CFL_SIGN_ZERO) continue;
       int joint_sign = cfl_sign_u * CFL_SIGNS + cfl_sign_v - 1;
-
-      int64_t dist = cfl_dist_arr_u[ui] + cfl_dist_arr_v[vi];
-      int this_token_rate = cfl_rate_arr_u[ui] + cfl_rate_arr_v[vi];
-      int rate = this_token_rate;
-      rate += mode_costs->cfl_cost[joint_sign][CFL_PRED_U][cfl_alpha_u];
-      rate += mode_costs->cfl_cost[joint_sign][CFL_PRED_V][cfl_alpha_v];
-      int64_t this_rd = RDCOST(x->rdmult, rate, dist) + mode_rd;
-      if (this_rd < best_rd) {
-        best_token_rate = this_token_rate;
-        best_rd = this_rd;
-        best_joint_sign = joint_sign;
-        best_cfl_alpha_u = cfl_alpha_u;
-        best_cfl_alpha_v = cfl_alpha_v;
+      RD_STATS rd_stats = cfl_rd_arr_u[ui];
+      av1_merge_rd_stats(&rd_stats, &cfl_rd_arr_v[vi]);
+      if (rd_stats.rate != INT_MAX) {
+        rd_stats.rate +=
+            mode_costs->cfl_cost[joint_sign][CFL_PRED_U][cfl_alpha_u];
+        rd_stats.rate +=
+            mode_costs->cfl_cost[joint_sign][CFL_PRED_V][cfl_alpha_v];
+      }
+      av1_rd_cost_update(x->rdmult, &rd_stats);
+      if (rd_stats.rdcost < best_rd_stats->rdcost) {
+        *best_rd_stats = rd_stats;
+        *best_cfl_alpha_idx =
+            (cfl_alpha_u << CFL_ALPHABET_SIZE_LOG2) + cfl_alpha_v;
+        *best_cfl_alpha_signs = joint_sign;
       }
     }
   }
-  int best_rate_overhead = INT_MAX;
-  if (best_rd < ref_best_rd) {
-    int mode_cost_u =
-        mode_costs->cfl_cost[best_joint_sign][CFL_PRED_U][best_cfl_alpha_u];
-    int mode_cost_v =
-        mode_costs->cfl_cost[best_joint_sign][CFL_PRED_V][best_cfl_alpha_v];
-    best_rate_overhead = mode_cost_u + mode_cost_v;
-    *best_cfl_alpha_idx =
-        (best_cfl_alpha_u << CFL_ALPHABET_SIZE_LOG2) + best_cfl_alpha_v;
-    *best_cfl_alpha_signs = best_joint_sign;
-    *token_rate = best_token_rate;
-  } else {
+  if (best_rd_stats->rdcost >= ref_best_rd) {
+    av1_invalid_rd_stats(best_rd_stats);
+    // Set invalid CFL parameters here since the rdcost is not better than
+    // ref_best_rd.
     *best_cfl_alpha_idx = 0;
     *best_cfl_alpha_signs = 0;
+    return 0;
   }
-  return best_rate_overhead;
+  return 1;
 }
 
 int64_t av1_rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
@@ -606,6 +616,9 @@
 
     if (is_diagonal_mode && !cpi->oxcf.intra_mode_cfg.enable_diagonal_intra)
       continue;
+    if (is_directional_mode &&
+        !cpi->oxcf.intra_mode_cfg.enable_directional_intra)
+      continue;
 
     if (!(cpi->sf.intra_sf.intra_uv_mode_mask[txsize_sqr_up_map[max_tx_size]] &
           (1 << mode)))
@@ -624,22 +637,19 @@
     mbmi->uv_mode = mode;
 
     // Init variables for cfl and angle delta
-    int cfl_alpha_rate = 0;
-    int cfl_token_rate = INT_MAX;
     const SPEED_FEATURES *sf = &cpi->sf;
+    mbmi->angle_delta[PLANE_TYPE_UV] = 0;
     if (mode == UV_CFL_PRED) {
       if (!is_cfl_allowed(xd) || !intra_mode_cfg->enable_cfl_intra) continue;
       assert(!is_directional_mode);
       const TX_SIZE uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd);
-      cfl_alpha_rate = cfl_rd_pick_alpha(
-          x, cpi, uv_tx_size, best_rd, sf->intra_sf.cfl_search_range,
-          &cfl_token_rate, &mbmi->cfl_alpha_idx, &mbmi->cfl_alpha_signs);
-      if (cfl_alpha_rate == INT_MAX) continue;
-    }
-    mbmi->angle_delta[PLANE_TYPE_UV] = 0;
-
-    if (is_directional_mode && av1_use_angle_delta(mbmi->bsize) &&
-        intra_mode_cfg->enable_angle_delta) {
+      if (!cfl_rd_pick_alpha(x, cpi, uv_tx_size, best_rd,
+                             sf->intra_sf.cfl_search_range, &tokenonly_rd_stats,
+                             &mbmi->cfl_alpha_idx, &mbmi->cfl_alpha_signs)) {
+        continue;
+      }
+    } else if (is_directional_mode && av1_use_angle_delta(mbmi->bsize) &&
+               intra_mode_cfg->enable_angle_delta) {
       if (sf->intra_sf.chroma_intra_pruning_with_hog &&
           !intra_search_state.dir_mode_skip_mask_ready) {
         static const float thresh[2][4] = {
@@ -649,7 +659,7 @@
         const int is_chroma = 1;
         const int is_intra_frame = frame_is_intra_only(cm);
         prune_intra_mode_with_hog(
-            x, bsize,
+            x, bsize, cm->seq_params->sb_size,
             thresh[is_intra_frame]
                   [sf->intra_sf.chroma_intra_pruning_with_hog - 1],
             intra_search_state.directional_mode_skip_mask, is_chroma);
@@ -672,15 +682,9 @@
       }
     }
     const int mode_cost =
-        mode_costs->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][mode] +
-        cfl_alpha_rate;
+        mode_costs->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][mode];
     this_rate = tokenonly_rd_stats.rate +
                 intra_mode_info_cost_uv(cpi, x, mbmi, bsize, mode_cost);
-    if (mode == UV_CFL_PRED) {
-      assert(is_cfl_allowed(xd) && intra_mode_cfg->enable_cfl_intra);
-      assert(IMPLIES(!xd->lossless[mbmi->segment_id],
-                     cfl_token_rate == tokenonly_rd_stats.rate));
-    }
     this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
 
     if (this_rd < best_rd) {
@@ -726,8 +730,7 @@
   const int num_planes = av1_num_planes(cm);
   MACROBLOCKD *const xd = &x->e_mbd;
   int rate2 = 0;
-  int64_t distortion2 = 0, best_rd_palette = best_rd, this_rd,
-          best_model_rd_palette = INT64_MAX;
+  int64_t distortion2 = 0, best_rd_palette = best_rd, this_rd;
   int skippable = 0;
   uint8_t *const best_palette_color_map =
       x->palette_buffer->best_palette_color_map;
@@ -749,11 +752,11 @@
 
   RD_STATS rd_stats_y;
   av1_invalid_rd_stats(&rd_stats_y);
-  av1_rd_pick_palette_intra_sby(
-      cpi, x, bsize, intra_mode_cost[DC_PRED], &best_mbmi_palette,
-      best_palette_color_map, &best_rd_palette, &best_model_rd_palette,
-      &rd_stats_y.rate, NULL, &rd_stats_y.dist, &rd_stats_y.skip_txfm, NULL,
-      ctx, best_blk_skip, best_tx_type_map);
+  av1_rd_pick_palette_intra_sby(cpi, x, bsize, intra_mode_cost[DC_PRED],
+                                &best_mbmi_palette, best_palette_color_map,
+                                &best_rd_palette, &rd_stats_y.rate, NULL,
+                                &rd_stats_y.dist, &rd_stats_y.skip_txfm, NULL,
+                                ctx, best_blk_skip, best_tx_type_map);
   if (rd_stats_y.rate == INT_MAX || pmi->palette_size[0] == 0) {
     this_rd_cost->rdcost = INT64_MAX;
     return skippable;
@@ -859,81 +862,6 @@
   return 0;
 }
 
-/*!\brief Search for the best angle delta for luma prediction
- *
- * \ingroup intra_mode_search
- * \callergraph
- * Given a luma directional intra prediction mode, this function will try to
- * estimate the best delta_angle.
- *
- * \return Returns the new rdcost of the best intra angle.
- */
-static int64_t rd_pick_intra_angle_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
-                                       int *rate, RD_STATS *rd_stats,
-                                       BLOCK_SIZE bsize, int mode_cost,
-                                       int64_t best_rd, int64_t *best_model_rd,
-                                       int skip_model_rd_for_zero_deg) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi = xd->mi[0];
-  assert(!is_inter_block(mbmi));
-
-  int best_angle_delta = 0;
-  int64_t rd_cost[2 * (MAX_ANGLE_DELTA + 2)];
-  TX_SIZE best_tx_size = mbmi->tx_size;
-  uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
-  uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
-
-  for (int i = 0; i < 2 * (MAX_ANGLE_DELTA + 2); ++i) rd_cost[i] = INT64_MAX;
-
-  int first_try = 1;
-  for (int angle_delta = 0; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
-    for (int i = 0; i < 2; ++i) {
-      const int64_t best_rd_in =
-          (best_rd == INT64_MAX) ? INT64_MAX
-                                 : (best_rd + (best_rd >> (first_try ? 3 : 5)));
-      const int64_t this_rd = calc_rd_given_intra_angle(
-          cpi, x, bsize, mode_cost, best_rd_in, (1 - 2 * i) * angle_delta,
-          MAX_ANGLE_DELTA, rate, rd_stats, &best_angle_delta, &best_tx_size,
-          &best_rd, best_model_rd, best_tx_type_map, best_blk_skip,
-          (skip_model_rd_for_zero_deg & !angle_delta));
-      rd_cost[2 * angle_delta + i] = this_rd;
-      if (first_try && this_rd == INT64_MAX) return best_rd;
-      first_try = 0;
-      if (angle_delta == 0) {
-        rd_cost[1] = this_rd;
-        break;
-      }
-    }
-  }
-
-  assert(best_rd != INT64_MAX);
-  for (int angle_delta = 1; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
-    for (int i = 0; i < 2; ++i) {
-      int skip_search = 0;
-      const int64_t rd_thresh = best_rd + (best_rd >> 5);
-      if (rd_cost[2 * (angle_delta + 1) + i] > rd_thresh &&
-          rd_cost[2 * (angle_delta - 1) + i] > rd_thresh)
-        skip_search = 1;
-      if (!skip_search) {
-        calc_rd_given_intra_angle(
-            cpi, x, bsize, mode_cost, best_rd, (1 - 2 * i) * angle_delta,
-            MAX_ANGLE_DELTA, rate, rd_stats, &best_angle_delta, &best_tx_size,
-            &best_rd, best_model_rd, best_tx_type_map, best_blk_skip, 0);
-      }
-    }
-  }
-
-  if (rd_stats->rate != INT_MAX) {
-    mbmi->tx_size = best_tx_size;
-    mbmi->angle_delta[PLANE_TYPE_Y] = best_angle_delta;
-    const int n4 = bsize_to_num_blk(bsize);
-    memcpy(x->txfm_search_info.blk_skip, best_blk_skip,
-           sizeof(best_blk_skip[0]) * n4);
-    av1_copy_array(xd->tx_type_map, best_tx_type_map, n4);
-  }
-  return best_rd;
-}
-
 /*!\brief Search for the best filter_intra mode when coding inter frame.
  *
  * \ingroup intra_mode_search
@@ -1002,11 +930,14 @@
   }
 }
 
+// Evaluate a given luma intra-mode in inter frames.
 int av1_handle_intra_y_mode(IntraModeSearchState *intra_search_state,
                             const AV1_COMP *cpi, MACROBLOCK *x,
                             BLOCK_SIZE bsize, unsigned int ref_frame_cost,
                             const PICK_MODE_CONTEXT *ctx, RD_STATS *rd_stats_y,
-                            int64_t best_rd, int *mode_cost_y, int64_t *rd_y) {
+                            int64_t best_rd, int *mode_cost_y, int64_t *rd_y,
+                            int64_t *best_model_rd,
+                            int64_t top_intra_model_rd[]) {
   const AV1_COMMON *cm = &cpi->common;
   const SPEED_FEATURES *const sf = &cpi->sf;
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -1021,7 +952,7 @@
   int known_rate = mode_cost;
   const int intra_cost_penalty = av1_get_intra_cost_penalty(
       cm->quant_params.base_qindex, cm->quant_params.y_dc_delta_q,
-      cm->seq_params.bit_depth);
+      cm->seq_params->bit_depth);
 
   if (mode != DC_PRED && mode != PAETH_PRED) known_rate += intra_cost_penalty;
   known_rate += AOMMIN(mode_costs->skip_txfm_cost[skip_ctx][0],
@@ -1039,34 +970,40 @@
         !intra_search_state->dir_mode_skip_mask_ready) {
       const float thresh[4] = { -1.2f, 0.0f, 0.0f, 1.2f };
       const int is_chroma = 0;
-      prune_intra_mode_with_hog(
-          x, bsize, thresh[sf->intra_sf.intra_pruning_with_hog - 1],
-          intra_search_state->directional_mode_skip_mask, is_chroma);
+      prune_intra_mode_with_hog(x, bsize, cm->seq_params->sb_size,
+                                thresh[sf->intra_sf.intra_pruning_with_hog - 1],
+                                intra_search_state->directional_mode_skip_mask,
+                                is_chroma);
       intra_search_state->dir_mode_skip_mask_ready = 1;
     }
     if (intra_search_state->directional_mode_skip_mask[mode]) return 0;
-    av1_init_rd_stats(rd_stats_y);
-    rd_stats_y->rate = INT_MAX;
-    int64_t model_rd = INT64_MAX;
-    int rate_dummy;
-    rd_pick_intra_angle_sby(cpi, x, &rate_dummy, rd_stats_y, bsize, mode_cost,
-                            best_rd, &model_rd, 0);
-
-  } else {
-    av1_init_rd_stats(rd_stats_y);
-    mbmi->angle_delta[PLANE_TYPE_Y] = 0;
-    av1_pick_uniform_tx_size_type_yrd(cpi, x, rd_stats_y, bsize, best_rd);
   }
+  const TX_SIZE tx_size = AOMMIN(TX_32X32, max_txsize_lookup[bsize]);
+  const int64_t this_model_rd =
+      intra_model_rd(&cpi->common, x, 0, bsize, tx_size, /*use_hadamard=*/1);
+  if (prune_intra_y_mode(this_model_rd, best_model_rd, top_intra_model_rd,
+                         sf->intra_sf.top_intra_model_count_allowed))
+    return 0;
+  av1_init_rd_stats(rd_stats_y);
+  av1_pick_uniform_tx_size_type_yrd(cpi, x, rd_stats_y, bsize, best_rd);
 
   // Pick filter intra modes.
   if (mode == DC_PRED && av1_filter_intra_allowed_bsize(cm, bsize)) {
     int try_filter_intra = 1;
     int64_t best_rd_so_far = INT64_MAX;
     if (rd_stats_y->rate != INT_MAX) {
-      const int tmp_rate = rd_stats_y->rate +
-                           mode_costs->filter_intra_cost[bsize][0] + mode_cost;
+      // best_rd_so_far is the rdcost of DC_PRED without using filter_intra.
+      // Later, in filter intra search, best_rd_so_far is used for comparison.
+      mbmi->filter_intra_mode_info.use_filter_intra = 0;
+      const int tmp_rate =
+          rd_stats_y->rate +
+          intra_mode_info_cost_y(cpi, x, mbmi, bsize, mode_cost);
       best_rd_so_far = RDCOST(x->rdmult, tmp_rate, rd_stats_y->dist);
       try_filter_intra = (best_rd_so_far / 2) <= best_rd;
+    } else if (sf->intra_sf.skip_filter_intra_in_inter_frames >= 1) {
+      // As rd cost of luma intra dc mode is more than best_rd (i.e.,
+      // rd_stats_y->rate = INT_MAX), skip the evaluation of filter intra modes.
+      try_filter_intra = 0;
     }
 
     if (try_filter_intra) {
@@ -1149,6 +1086,84 @@
   return 1;
 }
 
+DECLARE_ALIGNED(16, static const uint8_t, all_zeros[MAX_SB_SIZE]) = { 0 };
+DECLARE_ALIGNED(16, static const uint16_t,
+                highbd_all_zeros[MAX_SB_SIZE]) = { 0 };
+// Returns a factor to be applied to the RD value based on how well the
+// reconstructed block variance matches the source variance.
+static double intra_rd_variance_factor(const AV1_COMP *cpi, MACROBLOCK *x,
+                                       BLOCK_SIZE bs) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  double variance_rd_factor = 1.0;
+  double src_var = 0.0;
+  double rec_var = 0.0;
+  double var_diff = 0.0;
+  double threshold = 1.0 - (0.25 * cpi->oxcf.speed);
+  unsigned int sse;
+  int i, j;
+  int right_overflow =
+      (xd->mb_to_right_edge < 0) ? ((-xd->mb_to_right_edge) >> 3) : 0;
+  int bottom_overflow =
+      (xd->mb_to_bottom_edge < 0) ? ((-xd->mb_to_bottom_edge) >> 3) : 0;
+
+  const int bw = MI_SIZE * mi_size_wide[bs] - right_overflow;
+  const int bh = MI_SIZE * mi_size_high[bs] - bottom_overflow;
+  const int blocks = (bw * bh) / 16;
+
+  for (i = 0; i < bh; i += 4) {
+    for (j = 0; j < bw; j += 4) {
+      if (is_cur_buf_hbd(xd)) {
+        src_var +=
+            log(1.0 + cpi->ppi->fn_ptr[BLOCK_4X4].vf(
+                          x->plane[0].src.buf + i * x->plane[0].src.stride + j,
+                          x->plane[0].src.stride,
+                          CONVERT_TO_BYTEPTR(highbd_all_zeros), 0, &sse) /
+                          16);
+        rec_var += log(
+            1.0 + cpi->ppi->fn_ptr[BLOCK_4X4].vf(
+                      xd->plane[0].dst.buf + i * xd->plane[0].dst.stride + j,
+                      xd->plane[0].dst.stride,
+                      CONVERT_TO_BYTEPTR(highbd_all_zeros), 0, &sse) /
+                      16);
+      } else {
+        src_var +=
+            log(1.0 + cpi->ppi->fn_ptr[BLOCK_4X4].vf(
+                          x->plane[0].src.buf + i * x->plane[0].src.stride + j,
+                          x->plane[0].src.stride, all_zeros, 0, &sse) /
+                          16);
+        rec_var += log(
+            1.0 + cpi->ppi->fn_ptr[BLOCK_4X4].vf(
+                      xd->plane[0].dst.buf + i * xd->plane[0].dst.stride + j,
+                      xd->plane[0].dst.stride, all_zeros, 0, &sse) /
+                      16);
+      }
+    }
+  }
+  src_var /= (double)blocks;
+  rec_var /= (double)blocks;
+
+  // Only take action when the spatial complexity is low
+  if ((rec_var < threshold) || (src_var < threshold)) {
+    // Dont allow 0 to prevent / 0 below.
+    src_var += 0.000001;
+    rec_var += 0.000001;
+
+    // Heavier weigth if the reconstruction has lower variance.
+    if (src_var >= rec_var) {
+      var_diff = (src_var - rec_var) * 2;
+      variance_rd_factor = 1.0 + (var_diff / src_var);
+    } else {
+      var_diff = (rec_var - src_var) / 2;
+      variance_rd_factor = 1.0 + (var_diff / src_var);
+    }
+
+    // Limit adjustment;
+    variance_rd_factor = AOMMIN(3.0, variance_rd_factor);
+  }
+
+  return variance_rd_factor;
+}
+
 // Finds the best non-intrabc mode on an intra frame.
 int64_t av1_rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
                                    int *rate, int *rate_tokenonly,
@@ -1188,7 +1203,8 @@
     const float thresh[4] = { -1.2f, -1.2f, -0.6f, 0.4f };
     const int is_chroma = 0;
     prune_intra_mode_with_hog(
-        x, bsize, thresh[cpi->sf.intra_sf.intra_pruning_with_hog - 1],
+        x, bsize, cpi->common.seq_params->sb_size,
+        thresh[cpi->sf.intra_sf.intra_pruning_with_hog - 1],
         directional_mode_skip_mask, is_chroma);
   }
   mbmi->filter_intra_mode_info.use_filter_intra = 0;
@@ -1198,20 +1214,29 @@
   set_mode_eval_params(cpi, x, MODE_EVAL);
 
   MB_MODE_INFO best_mbmi = *mbmi;
-  av1_zero_array(x->winner_mode_stats, MAX_WINNER_MODE_COUNT_INTRA);
+  zero_winner_mode_stats(bsize, MAX_WINNER_MODE_COUNT_INTRA,
+                         x->winner_mode_stats);
   x->winner_mode_count = 0;
 
   // Searches the intra-modes except for intrabc, palette, and filter_intra.
-  for (int mode_idx = INTRA_MODE_START; mode_idx < INTRA_MODE_END; ++mode_idx) {
+  int64_t top_intra_model_rd[TOP_INTRA_MODEL_COUNT];
+  for (int i = 0; i < TOP_INTRA_MODEL_COUNT; i++) {
+    top_intra_model_rd[i] = INT64_MAX;
+  }
+  for (int mode_idx = INTRA_MODE_START; mode_idx < LUMA_MODE_COUNT;
+       ++mode_idx) {
+    set_y_mode_and_delta_angle(mode_idx, mbmi);
     RD_STATS this_rd_stats;
     int this_rate, this_rate_tokenonly, s;
     int is_diagonal_mode;
     int64_t this_distortion, this_rd;
-    mbmi->mode = intra_rd_search_mode_order[mode_idx];
 
     is_diagonal_mode = av1_is_diagonal_mode(mbmi->mode);
     if (is_diagonal_mode && !cpi->oxcf.intra_mode_cfg.enable_diagonal_intra)
       continue;
+    if (av1_is_directional_mode(mbmi->mode) &&
+        !cpi->oxcf.intra_mode_cfg.enable_directional_intra)
+      continue;
 
     // The smooth prediction mode appears to be more frequently picked
     // than horizontal / vertical smooth prediction modes. Hence treat
@@ -1225,15 +1250,15 @@
       continue;
 
     // The functionality of filter intra modes and smooth prediction
-    // overlap. Retain the smooth prediction if filter intra modes are
-    // disabled.
+    // overlap. Hence smooth prediction is pruned only if all the
+    // filter intra modes are enabled.
     if (cpi->sf.intra_sf.disable_smooth_intra &&
-        !cpi->sf.intra_sf.disable_filter_intra && mbmi->mode == SMOOTH_PRED)
+        cpi->sf.intra_sf.prune_filter_intra_level == 0 &&
+        mbmi->mode == SMOOTH_PRED)
       continue;
     if (!cpi->oxcf.intra_mode_cfg.enable_paeth_intra &&
         mbmi->mode == PAETH_PRED)
       continue;
-    mbmi->angle_delta[PLANE_TYPE_Y] = 0;
 
     // Skip the evaluation of modes that do not match with the winner mode in
     // x->mb_mode_cache.
@@ -1241,24 +1266,27 @@
 
     is_directional_mode = av1_is_directional_mode(mbmi->mode);
     if (is_directional_mode && directional_mode_skip_mask[mbmi->mode]) continue;
-    if (is_directional_mode && av1_use_angle_delta(bsize) &&
-        cpi->oxcf.intra_mode_cfg.enable_angle_delta) {
-      // Searches through the best angle_delta if this option is available.
-      this_rd_stats.rate = INT_MAX;
-      rd_pick_intra_angle_sby(cpi, x, &this_rate, &this_rd_stats, bsize,
-                              bmode_costs[mbmi->mode], best_rd, &best_model_rd,
-                              1);
-    } else {
-      if (model_intra_yrd_and_prune(cpi, x, bsize, &best_model_rd)) {
-        continue;
-      }
+    if (is_directional_mode && av1_use_angle_delta(bsize) == 0 &&
+        mbmi->angle_delta[PLANE_TYPE_Y] != 0)
+      continue;
 
-      // Builds the actual prediction. The prediction from
-      // model_intra_yrd_and_prune was just an estimation that did not take into
-      // account the effect of txfm pipeline, so we need to redo it for real
-      // here.
-      av1_pick_uniform_tx_size_type_yrd(cpi, x, &this_rd_stats, bsize, best_rd);
-    }
+    // Use intra_y_mode_mask speed feature to skip intra mode evaluation.
+    if (!(cpi->sf.intra_sf.intra_y_mode_mask[max_txsize_lookup[bsize]] &
+          (1 << mbmi->mode)))
+      continue;
+
+    const TX_SIZE tx_size = AOMMIN(TX_32X32, max_txsize_lookup[bsize]);
+    const int64_t this_model_rd =
+        intra_model_rd(&cpi->common, x, 0, bsize, tx_size, /*use_hadamard=*/1);
+    if (prune_intra_y_mode(this_model_rd, &best_model_rd, top_intra_model_rd,
+                           cpi->sf.intra_sf.top_intra_model_count_allowed))
+      continue;
+
+    // Builds the actual prediction. The prediction from
+    // model_intra_yrd_and_prune was just an estimation that did not take into
+    // account the effect of txfm pipeline, so we need to redo it for real
+    // here.
+    av1_pick_uniform_tx_size_type_yrd(cpi, x, &this_rd_stats, bsize, best_rd);
     this_rate_tokenonly = this_rd_stats.rate;
     this_distortion = this_rd_stats.dist;
     s = this_rd_stats.skip_txfm;
@@ -1276,6 +1304,12 @@
         this_rd_stats.rate +
         intra_mode_info_cost_y(cpi, x, mbmi, bsize, bmode_costs[mbmi->mode]);
     this_rd = RDCOST(x->rdmult, this_rate, this_distortion);
+
+    // Visual quality adjustment based on recon vs source variance.
+    if ((cpi->oxcf.mode == ALLINTRA) && (this_rd != INT64_MAX)) {
+      this_rd = (int64_t)(this_rd * intra_rd_variance_factor(cpi, x, bsize));
+    }
+
     // Collect mode stats for multiwinner mode processing
     const int txfm_search_done = 1;
     store_winner_mode_stats(
@@ -1301,16 +1335,16 @@
   if (try_palette) {
     av1_rd_pick_palette_intra_sby(
         cpi, x, bsize, bmode_costs[DC_PRED], &best_mbmi, best_palette_color_map,
-        &best_rd, &best_model_rd, rate, rate_tokenonly, distortion, skippable,
-        &beat_best_rd, ctx, ctx->blk_skip, ctx->tx_type_map);
+        &best_rd, rate, rate_tokenonly, distortion, skippable, &beat_best_rd,
+        ctx, ctx->blk_skip, ctx->tx_type_map);
   }
 
   // Searches filter_intra
-  if (beat_best_rd && av1_filter_intra_allowed_bsize(&cpi->common, bsize) &&
-      !cpi->sf.intra_sf.disable_filter_intra) {
+  if (beat_best_rd && av1_filter_intra_allowed_bsize(&cpi->common, bsize)) {
     if (rd_pick_filter_intra_sby(cpi, x, rate, rate_tokenonly, distortion,
                                  skippable, bsize, bmode_costs[DC_PRED],
-                                 &best_rd, &best_model_rd, ctx)) {
+                                 best_mbmi.mode, &best_rd, &best_model_rd,
+                                 ctx)) {
       best_mbmi = *mbmi;
     }
   }
diff --git a/av1/encoder/intra_mode_search.h b/av1/encoder/intra_mode_search.h
index cc2a87b0..5a52440 100644
--- a/av1/encoder/intra_mode_search.h
+++ b/av1/encoder/intra_mode_search.h
@@ -95,6 +95,9 @@
  * \param[out]       mode_cost_y        The cost needed to signal the current
  *                                      intra mode.
  * \param[out]       rd_y               The rdcost of the chosen mode.
+ * \param[in]        best_model_rd      Best model RD seen for this block so far
+ * \param[in]        top_intra_model_rd Top intra model RD seen for this
+ *                                      block so far.
  *
  * \return Returns 1 if a valid intra mode is found, 0 otherwise.
  * The corresponding values in x->e_mbd.mi[0], rd_stats_y, mode_cost_y, and
@@ -106,7 +109,9 @@
                             const AV1_COMP *cpi, MACROBLOCK *x,
                             BLOCK_SIZE bsize, unsigned int ref_frame_cost,
                             const PICK_MODE_CONTEXT *ctx, RD_STATS *rd_stats_y,
-                            int64_t best_rd, int *mode_cost_y, int64_t *rd_y);
+                            int64_t best_rd, int *mode_cost_y, int64_t *rd_y,
+                            int64_t *best_model_rd,
+                            int64_t top_intra_model_rd[]);
 
 /*!\brief Search through all chroma intra-modes for inter frames.
  *
@@ -262,6 +267,29 @@
   intra_search_state->rate_uv_intra = INT_MAX;
 }
 
+/*! \brief set the luma intra mode and delta angles for a given mode index.
+ * The total number of luma intra mode is LUMA_MODE_COUNT = 61.
+ * The first 13 modes are from DC_PRED to PAETH_PRED, followed by directional
+ * modes. Each of the main 8 directional modes have 6 = MAX_ANGLE_DELTA * 2
+ * delta angles.
+ * \param[in]    mode_idx           mode index in intra mode decision
+ *                                  process.
+ * \param[in]    mbmi               Pointer to structure holding
+ *                                  the mode info for the current macroblock.
+ */
+void set_y_mode_and_delta_angle(const int mode_idx, MB_MODE_INFO *const mbmi);
+
+/*! \brief prune luma intra mode    based on the model rd.
+ * \param[in]    this_model_rd      model rd for current mode.
+ * \param[in]    best_model_rd      Best model RD seen for this block so
+ *                                  far.
+ * \param[in]    top_intra_model_rd Top intra model RD seen for this
+ *                                  block so far.
+ * \param[in]    model_cnt_allowed  The number of top intra model RD allowed.
+ */
+int prune_intra_y_mode(int64_t this_model_rd, int64_t *best_model_rd,
+                       int64_t top_intra_model_rd[], int model_cnt_allowed);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/av1/encoder/intra_mode_search_utils.h b/av1/encoder/intra_mode_search_utils.h
index 17f54eb..e6eab3f 100644
--- a/av1/encoder/intra_mode_search_utils.h
+++ b/av1/encoder/intra_mode_search_utils.h
@@ -22,6 +22,7 @@
 #include "av1/common/reconintra.h"
 
 #include "av1/encoder/encoder.h"
+#include "av1/encoder/encodeframe.h"
 #include "av1/encoder/model_rd.h"
 #include "av1/encoder/palette.h"
 #include "av1/encoder/hybrid_fwd_txfm.h"
@@ -135,8 +136,13 @@
 }
 #undef FIX_PREC_BITS
 
-static AOM_INLINE void generate_hog(const uint8_t *src, int stride, int rows,
-                                    int cols, float *hist) {
+// Normalizes the hog data.
+static AOM_INLINE void normalize_hog(float total, float *hist) {
+  for (int i = 0; i < BINS; ++i) hist[i] /= total;
+}
+
+static AOM_INLINE void lowbd_generate_hog(const uint8_t *src, int stride,
+                                          int rows, int cols, float *hist) {
   float total = 0.1f;
   src += stride;
   for (int r = 1; r < rows - 1; ++r) {
@@ -145,7 +151,7 @@
       const uint8_t *below = &src[c + stride];
       const uint8_t *left = &src[c - 1];
       const uint8_t *right = &src[c + 1];
-      // Calculate gradient using Sobel fitlers.
+      // Calculate gradient using Sobel filters.
       const int dx = (right[-stride] + 2 * right[0] + right[stride]) -
                      (left[-stride] + 2 * left[0] + left[stride]);
       const int dy = (below[-1] + 2 * below[0] + below[1]) -
@@ -166,13 +172,49 @@
     src += stride;
   }
 
-  for (int i = 0; i < BINS; ++i) hist[i] /= total;
+  normalize_hog(total, hist);
 }
 
-static AOM_INLINE void generate_hog_hbd(const uint8_t *src8, int stride,
-                                        int rows, int cols, float *hist) {
+// Computes and stores pixel level gradient information of a given superblock
+// for LBD encode.
+static AOM_INLINE void lowbd_compute_gradient_info_sb(MACROBLOCK *const x,
+                                                      BLOCK_SIZE sb_size,
+                                                      PLANE_TYPE plane) {
+  PixelLevelGradientInfo *const grad_info_sb =
+      x->pixel_gradient_info + plane * MAX_SB_SQUARE;
+  const uint8_t *src = x->plane[plane].src.buf;
+  const int stride = x->plane[plane].src.stride;
+  const int ss_x = x->e_mbd.plane[plane].subsampling_x;
+  const int ss_y = x->e_mbd.plane[plane].subsampling_y;
+  const int sb_height = block_size_high[sb_size] >> ss_y;
+  const int sb_width = block_size_wide[sb_size] >> ss_x;
+  src += stride;
+  for (int r = 1; r < sb_height - 1; ++r) {
+    for (int c = 1; c < sb_width - 1; ++c) {
+      const uint8_t *above = &src[c - stride];
+      const uint8_t *below = &src[c + stride];
+      const uint8_t *left = &src[c - 1];
+      const uint8_t *right = &src[c + 1];
+      // Calculate gradient using Sobel filters.
+      const int dx = (right[-stride] + 2 * right[0] + right[stride]) -
+                     (left[-stride] + 2 * left[0] + left[stride]);
+      const int dy = (below[-1] + 2 * below[0] + below[1]) -
+                     (above[-1] + 2 * above[0] + above[1]);
+      grad_info_sb[r * sb_width + c].is_dx_zero = (dx == 0);
+      grad_info_sb[r * sb_width + c].abs_dx_abs_dy_sum =
+          (uint16_t)(abs(dx) + abs(dy));
+      grad_info_sb[r * sb_width + c].hist_bin_idx =
+          (dx != 0) ? get_hist_bin_idx(dx, dy) : -1;
+    }
+    src += stride;
+  }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static AOM_INLINE void highbd_generate_hog(const uint8_t *src8, int stride,
+                                           int rows, int cols, float *hist) {
   float total = 0.1f;
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
   src += stride;
   for (int r = 1; r < rows - 1; ++r) {
     for (int c = 1; c < cols - 1; ++c) {
@@ -180,7 +222,7 @@
       const uint16_t *below = &src[c + stride];
       const uint16_t *left = &src[c - 1];
       const uint16_t *right = &src[c + 1];
-      // Calculate gradient using Sobel fitlers.
+      // Calculate gradient using Sobel filters.
       const int dx = (right[-stride] + 2 * right[0] + right[stride]) -
                      (left[-stride] + 2 * left[0] + left[stride]);
       const int dy = (below[-1] + 2 * below[0] + below[1]) -
@@ -201,11 +243,151 @@
     src += stride;
   }
 
-  for (int i = 0; i < BINS; ++i) hist[i] /= total;
+  normalize_hog(total, hist);
+}
+
+// Computes and stores pixel level gradient information of a given superblock
+// for HBD encode.
+static AOM_INLINE void highbd_compute_gradient_info_sb(MACROBLOCK *const x,
+                                                       BLOCK_SIZE sb_size,
+                                                       PLANE_TYPE plane) {
+  PixelLevelGradientInfo *const grad_info_sb =
+      x->pixel_gradient_info + plane * MAX_SB_SQUARE;
+  const uint16_t *src = CONVERT_TO_SHORTPTR(x->plane[plane].src.buf);
+  const int stride = x->plane[plane].src.stride;
+  const int ss_x = x->e_mbd.plane[plane].subsampling_x;
+  const int ss_y = x->e_mbd.plane[plane].subsampling_y;
+  const int sb_height = block_size_high[sb_size] >> ss_y;
+  const int sb_width = block_size_wide[sb_size] >> ss_x;
+  src += stride;
+  for (int r = 1; r < sb_height - 1; ++r) {
+    for (int c = 1; c < sb_width - 1; ++c) {
+      const uint16_t *above = &src[c - stride];
+      const uint16_t *below = &src[c + stride];
+      const uint16_t *left = &src[c - 1];
+      const uint16_t *right = &src[c + 1];
+      // Calculate gradient using Sobel filters.
+      const int dx = (right[-stride] + 2 * right[0] + right[stride]) -
+                     (left[-stride] + 2 * left[0] + left[stride]);
+      const int dy = (below[-1] + 2 * below[0] + below[1]) -
+                     (above[-1] + 2 * above[0] + above[1]);
+      grad_info_sb[r * sb_width + c].is_dx_zero = (dx == 0);
+      grad_info_sb[r * sb_width + c].abs_dx_abs_dy_sum =
+          (uint16_t)(abs(dx) + abs(dy));
+      grad_info_sb[r * sb_width + c].hist_bin_idx =
+          (dx != 0) ? get_hist_bin_idx(dx, dy) : -1;
+    }
+    src += stride;
+  }
+}
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+static AOM_INLINE void generate_hog(const uint8_t *src8, int stride, int rows,
+                                    int cols, float *hist, int highbd) {
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (highbd) {
+    highbd_generate_hog(src8, stride, rows, cols, hist);
+    return;
+  }
+#else
+  (void)highbd;
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+  lowbd_generate_hog(src8, stride, rows, cols, hist);
+}
+
+static AOM_INLINE void compute_gradient_info_sb(MACROBLOCK *const x,
+                                                BLOCK_SIZE sb_size,
+                                                PLANE_TYPE plane) {
+#if CONFIG_AV1_HIGHBITDEPTH
+  if (is_cur_buf_hbd(&x->e_mbd)) {
+    highbd_compute_gradient_info_sb(x, sb_size, plane);
+    return;
+  }
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+  lowbd_compute_gradient_info_sb(x, sb_size, plane);
+}
+
+// Function to generate pixel level gradient information for a given superblock.
+// Sets the flags 'is_sb_gradient_cached' for the specific plane-type if
+// gradient info is generated for that plane.
+static AOM_INLINE void produce_gradients_for_sb(AV1_COMP *cpi, MACROBLOCK *x,
+                                                BLOCK_SIZE sb_size, int mi_row,
+                                                int mi_col) {
+  const SPEED_FEATURES *sf = &cpi->sf;
+  // Initialise flags related to hog data caching.
+  x->is_sb_gradient_cached[PLANE_TYPE_Y] = false;
+  x->is_sb_gradient_cached[PLANE_TYPE_UV] = false;
+
+  // SB level caching of gradient data may not help in speedup for the following
+  // cases:
+  // (1) Inter frames (due to early intra gating)
+  // (2) When partition_search_type is not SEARCH_PARTITION
+  // Hence, gradient data is computed at block level in such cases.
+
+  if (!frame_is_intra_only(&cpi->common) ||
+      sf->part_sf.partition_search_type != SEARCH_PARTITION)
+    return;
+
+  const int num_planes = av1_num_planes(&cpi->common);
+
+  av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, sb_size);
+
+  if (sf->intra_sf.intra_pruning_with_hog) {
+    compute_gradient_info_sb(x, sb_size, PLANE_TYPE_Y);
+    x->is_sb_gradient_cached[PLANE_TYPE_Y] = true;
+  }
+  if (sf->intra_sf.chroma_intra_pruning_with_hog && num_planes > 1) {
+    compute_gradient_info_sb(x, sb_size, PLANE_TYPE_UV);
+    x->is_sb_gradient_cached[PLANE_TYPE_UV] = true;
+  }
+}
+
+// Reuses the pixel level gradient data generated at superblock level for block
+// level histogram computation.
+static AOM_INLINE void generate_hog_using_gradient_cache(const MACROBLOCK *x,
+                                                         int rows, int cols,
+                                                         BLOCK_SIZE sb_size,
+                                                         PLANE_TYPE plane,
+                                                         float *hist) {
+  float total = 0.1f;
+  const int ss_x = x->e_mbd.plane[plane].subsampling_x;
+  const int ss_y = x->e_mbd.plane[plane].subsampling_y;
+  const int sb_width = block_size_wide[sb_size] >> ss_x;
+
+  // Derive the offset from the starting of the superblock in order to locate
+  // the block level gradient data in the cache.
+  const int mi_row_in_sb = x->e_mbd.mi_row & (mi_size_high[sb_size] - 1);
+  const int mi_col_in_sb = x->e_mbd.mi_col & (mi_size_wide[sb_size] - 1);
+  const int block_offset_in_grad_cache =
+      sb_width * (mi_row_in_sb << (MI_SIZE_LOG2 - ss_y)) +
+      (mi_col_in_sb << (MI_SIZE_LOG2 - ss_x));
+  const PixelLevelGradientInfo *grad_info_blk = x->pixel_gradient_info +
+                                                plane * MAX_SB_SQUARE +
+                                                block_offset_in_grad_cache;
+
+  // Retrieve the cached gradient information and generate the histogram.
+  for (int r = 1; r < rows - 1; ++r) {
+    for (int c = 1; c < cols - 1; ++c) {
+      const uint16_t abs_dx_abs_dy_sum =
+          grad_info_blk[r * sb_width + c].abs_dx_abs_dy_sum;
+      if (!abs_dx_abs_dy_sum) continue;
+      total += abs_dx_abs_dy_sum;
+      const bool is_dx_zero = grad_info_blk[r * sb_width + c].is_dx_zero;
+      if (is_dx_zero) {
+        hist[0] += abs_dx_abs_dy_sum >> 1;
+        hist[BINS - 1] += abs_dx_abs_dy_sum >> 1;
+      } else {
+        const int8_t idx = grad_info_blk[r * sb_width + c].hist_bin_idx;
+        assert(idx >= 0 && idx < BINS);
+        hist[idx] += abs_dx_abs_dy_sum;
+      }
+    }
+  }
+  normalize_hog(total, hist);
 }
 
 static INLINE void collect_hog_data(const MACROBLOCK *x, BLOCK_SIZE bsize,
-                                    int plane, float *hog) {
+                                    BLOCK_SIZE sb_size, int plane, float *hog) {
   const MACROBLOCKD *xd = &x->e_mbd;
   const struct macroblockd_plane *const pd = &xd->plane[plane];
   const int ss_x = pd->subsampling_x;
@@ -218,12 +400,15 @@
   const int cols =
       ((xd->mb_to_right_edge >= 0) ? bw : (xd->mb_to_right_edge >> 3) + bw) >>
       ss_x;
-  const int src_stride = x->plane[plane].src.stride;
-  const uint8_t *src = x->plane[plane].src.buf;
-  if (is_cur_buf_hbd(xd)) {
-    generate_hog_hbd(src, src_stride, rows, cols, hog);
+
+  // If gradient data is already generated at SB level, reuse the cached data.
+  // Otherwise, compute the data.
+  if (x->is_sb_gradient_cached[plane]) {
+    generate_hog_using_gradient_cache(x, rows, cols, sb_size, plane, hog);
   } else {
-    generate_hog(src, src_stride, rows, cols, hog);
+    const uint8_t *src = x->plane[plane].src.buf;
+    const int src_stride = x->plane[plane].src.stride;
+    generate_hog(src, src_stride, rows, cols, hog, is_cur_buf_hbd(xd));
   }
 
   // Scale the hog so the luma and chroma are on the same scale
@@ -233,17 +418,14 @@
 }
 
 static AOM_INLINE void prune_intra_mode_with_hog(
-    const MACROBLOCK *x, BLOCK_SIZE bsize, float th,
+    const MACROBLOCK *x, BLOCK_SIZE bsize, BLOCK_SIZE sb_size, float th,
     uint8_t *directional_mode_skip_mask, int is_chroma) {
-  aom_clear_system_state();
-
   const int plane = is_chroma ? AOM_PLANE_U : AOM_PLANE_Y;
   float hist[BINS] = { 0.0f };
-  collect_hog_data(x, bsize, plane, hist);
+  collect_hog_data(x, bsize, sb_size, plane, hist);
 
   // Make prediction for each of the mode
   float scores[DIRECTIONAL_MODES] = { 0.0f };
-  aom_clear_system_state();
   av1_nn_predict(hist, &av1_intra_hog_model_nnconfig, 1, scores);
   for (UV_PREDICTION_MODE uv_mode = UV_V_PRED; uv_mode <= UV_D67_PRED;
        uv_mode++) {
@@ -251,8 +433,6 @@
       directional_mode_skip_mask[uv_mode] = 1;
     }
   }
-
-  aom_clear_system_state();
 }
 #undef BINS
 
@@ -306,7 +486,7 @@
       const int n_cache = av1_get_palette_cache(xd, 0, color_cache);
       palette_mode_cost +=
           av1_palette_color_cost_y(&mbmi->palette_mode_info, color_cache,
-                                   n_cache, cpi->common.seq_params.bit_depth);
+                                   n_cache, cpi->common.seq_params->bit_depth);
       palette_mode_cost +=
           av1_cost_color_map(x, 0, bsize, mbmi->tx_size, PALETTE_MAP);
       total_rate += palette_mode_cost;
@@ -366,7 +546,7 @@
       uint16_t color_cache[2 * PALETTE_MAX_SIZE];
       const int n_cache = av1_get_palette_cache(xd, 1, color_cache);
       palette_mode_cost += av1_palette_color_cost_uv(
-          pmi, color_cache, n_cache, cpi->common.seq_params.bit_depth);
+          pmi, color_cache, n_cache, cpi->common.seq_params->bit_depth);
       palette_mode_cost +=
           av1_cost_color_map(x, 1, bsize, mbmi->tx_size, PALETTE_MAP);
       total_rate += palette_mode_cost;
@@ -383,34 +563,6 @@
   return total_rate;
 }
 
-/*!\brief Apply Hadamard or DCT transform
- *
- * \callergraph
- */
-static void av1_quick_txfm(int use_hadamard, TX_SIZE tx_size, int is_hbd,
-                           int bd, const int16_t *src_diff, int src_stride,
-                           tran_low_t *coeff) {
-  if (use_hadamard) {
-    switch (tx_size) {
-      case TX_4X4: aom_hadamard_4x4(src_diff, src_stride, coeff); break;
-      case TX_8X8: aom_hadamard_8x8(src_diff, src_stride, coeff); break;
-      case TX_16X16: aom_hadamard_16x16(src_diff, src_stride, coeff); break;
-      case TX_32X32: aom_hadamard_32x32(src_diff, src_stride, coeff); break;
-      default: assert(0);
-    }
-  } else {
-    assert(IMPLIES(!is_hbd, bd == 8));
-    TxfmParam txfm_param;
-    txfm_param.tx_type = DCT_DCT;
-    txfm_param.tx_size = tx_size;
-    txfm_param.lossless = 0;
-    txfm_param.bd = bd;
-    txfm_param.is_hbd = is_hbd;
-    txfm_param.tx_set_type = EXT_TX_SET_ALL16;
-    av1_fwd_txfm(src_diff, coeff, src_stride, &txfm_param);
-  }
-}
-
 /*!\cond */
 // Makes a quick intra prediction and estimate the rdcost with a model without
 // going through the whole txfm/quantize/itxfm process.
@@ -418,6 +570,7 @@
                               int plane, BLOCK_SIZE plane_bsize,
                               TX_SIZE tx_size, int use_hadamard) {
   MACROBLOCKD *const xd = &x->e_mbd;
+  const BitDepthInfo bd_info = get_bit_depth_info(xd);
   int row, col;
   assert(!is_inter_block(xd->mi[0]));
   const int stepr = tx_size_high_unit[tx_size];
@@ -438,11 +591,11 @@
       // used in this for loop, therefore we don't need to properly add offset
       // to the buffers.
       av1_subtract_block(
-          xd, txbh, txbw, p->src_diff, block_size_wide[plane_bsize],
+          bd_info, txbh, txbw, p->src_diff, block_size_wide[plane_bsize],
           p->src.buf + (((row * p->src.stride) + col) << 2), p->src.stride,
           pd->dst.buf + (((row * pd->dst.stride) + col) << 2), pd->dst.stride);
-      av1_quick_txfm(use_hadamard, tx_size, is_cur_buf_hbd(xd), xd->bd,
-                     p->src_diff, block_size_wide[plane_bsize], p->coeff);
+      av1_quick_txfm(use_hadamard, tx_size, bd_info, p->src_diff,
+                     block_size_wide[plane_bsize], p->coeff);
       satd_cost += aom_satd(p->coeff, tx_size_2d[tx_size]);
     }
   }
diff --git a/av1/encoder/k_means_template.h b/av1/encoder/k_means_template.h
index 84c52a2..e794caf 100644
--- a/av1/encoder/k_means_template.h
+++ b/av1/encoder/k_means_template.h
@@ -13,6 +13,7 @@
 #include <stdint.h>
 #include <string.h>
 
+#include "av1/common/blockd.h"
 #include "av1/encoder/palette.h"
 #include "av1/encoder/random.h"
 
@@ -93,7 +94,9 @@
 void RENAME(av1_k_means)(const int *data, int *centroids, uint8_t *indices,
                          int n, int k, int max_itr) {
   int pre_centroids[2 * PALETTE_MAX_SIZE];
-  uint8_t pre_indices[MAX_SB_SQUARE];
+  uint8_t pre_indices[MAX_PALETTE_BLOCK_WIDTH * MAX_PALETTE_BLOCK_HEIGHT];
+
+  assert(n <= MAX_PALETTE_BLOCK_WIDTH * MAX_PALETTE_BLOCK_HEIGHT);
 
 #if AV1_K_MEANS_DIM - 2
   av1_calc_indices_dim1(data, centroids, indices, n, k);
diff --git a/av1/encoder/level.c b/av1/encoder/level.c
index 7a74c46..b4a8751 100644
--- a/av1/encoder/level.c
+++ b/av1/encoder/level.c
@@ -9,8 +9,6 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "aom_ports/system_state.h"
-
 #include "av1/encoder/encoder.h"
 #include "av1/encoder/level.h"
 
@@ -353,7 +351,7 @@
     if (spatial_layer_dimensions_present_flag) {
       assert(0 && "Spatial layer dimensions not supported yet.");
     } else {
-      const SequenceHeader *const seq_params = &cm->seq_params;
+      const SequenceHeader *const seq_params = cm->seq_params;
       const int max_frame_width = seq_params->max_frame_width;
       const int max_frame_height = seq_params->max_frame_height;
       luma_samples = max_frame_width * max_frame_height;
@@ -467,13 +465,11 @@
 // op_index is the operating point index.
 void av1_decoder_model_init(const AV1_COMP *const cpi, AV1_LEVEL level,
                             int op_index, DECODER_MODEL *const decoder_model) {
-  aom_clear_system_state();
-
   decoder_model->status = DECODER_MODEL_OK;
   decoder_model->level = level;
 
   const AV1_COMMON *const cm = &cpi->common;
-  const SequenceHeader *const seq_params = &cm->seq_params;
+  const SequenceHeader *const seq_params = cm->seq_params;
   decoder_model->bit_rate = get_max_bitrate(
       av1_level_defs + level, seq_params->tier[op_index], seq_params->profile);
 
@@ -529,8 +525,6 @@
                                      DECODER_MODEL *const decoder_model) {
   if (!decoder_model || decoder_model->status != DECODER_MODEL_OK) return;
 
-  aom_clear_system_state();
-
   const AV1_COMMON *const cm = &cpi->common;
   const int luma_pic_size = cm->superres_upscaled_width * cm->height;
   const int show_existing_frame = cm->show_existing_frame;
@@ -690,7 +684,7 @@
 void av1_init_level_info(AV1_COMP *cpi) {
   for (int op_index = 0; op_index < MAX_NUM_OPERATING_POINTS; ++op_index) {
     AV1LevelInfo *const this_level_info =
-        cpi->level_params.level_info[op_index];
+        cpi->ppi->level_params.level_info[op_index];
     if (!this_level_info) continue;
     memset(this_level_info, 0, sizeof(*this_level_info));
     AV1LevelSpec *const level_spec = &this_level_info->level_spec;
@@ -1048,7 +1042,7 @@
 void av1_update_level_info(AV1_COMP *cpi, size_t size, int64_t ts_start,
                            int64_t ts_end) {
   AV1_COMMON *const cm = &cpi->common;
-  const AV1LevelParams *const level_params = &cpi->level_params;
+  const AV1LevelParams *const level_params = &cpi->ppi->level_params;
 
   const int upscaled_width = cm->superres_upscaled_width;
   const int width = cm->width;
@@ -1057,7 +1051,7 @@
   const int tile_rows = cm->tiles.rows;
   const int tiles = tile_cols * tile_rows;
   const int luma_pic_size = upscaled_width * height;
-  const int frame_header_count = level_params->frame_header_count;
+  const int frame_header_count = cpi->frame_header_count;
   const int show_frame = cm->show_frame;
   const int show_existing_frame = cm->show_existing_frame;
 
@@ -1070,12 +1064,11 @@
                  &min_cropped_tile_width, &min_cropped_tile_height,
                  &tile_width_is_valid);
 
-  aom_clear_system_state();
   const double compression_ratio = av1_get_compression_ratio(cm, size);
 
   const int temporal_layer_id = cm->temporal_layer_id;
   const int spatial_layer_id = cm->spatial_layer_id;
-  const SequenceHeader *const seq_params = &cm->seq_params;
+  const SequenceHeader *const seq_params = cm->seq_params;
   const BITSTREAM_PROFILE profile = seq_params->profile;
   const int is_still_picture = seq_params->still_picture;
   // update level_stats
@@ -1148,7 +1141,7 @@
       if (fail_id != TARGET_LEVEL_OK) {
         const int target_level_major = 2 + (target_level >> 2);
         const int target_level_minor = target_level & 3;
-        aom_internal_error(&cm->error, AOM_CODEC_ERROR,
+        aom_internal_error(cm->error, AOM_CODEC_ERROR,
                            "Failed to encode to the target level %d_%d. %s",
                            target_level_major, target_level_minor,
                            level_fail_messages[fail_id]);
diff --git a/av1/encoder/level.h b/av1/encoder/level.h
index 5e0cce2..2800e3d 100644
--- a/av1/encoder/level.h
+++ b/av1/encoder/level.h
@@ -164,8 +164,6 @@
   uint32_t keep_level_stats;
   // Level information for each operating point.
   AV1LevelInfo *level_info[MAX_NUM_OPERATING_POINTS];
-  // Count the number of OBU_FRAME and OBU_FRAME_HEADER for level calculation.
-  int frame_header_count;
 } AV1LevelParams;
 
 static INLINE int is_in_operating_point(int operating_point,
diff --git a/av1/encoder/lookahead.h b/av1/encoder/lookahead.h
index af79eb4..c9e1c9a 100644
--- a/av1/encoder/lookahead.h
+++ b/av1/encoder/lookahead.h
@@ -25,8 +25,8 @@
 #endif
 
 /*!\cond */
-#define MAX_LAG_BUFFERS 35
-#define MAX_LAP_BUFFERS 35
+#define MAX_LAG_BUFFERS 48
+#define MAX_LAP_BUFFERS 48
 #define MAX_TOTAL_BUFFERS (MAX_LAG_BUFFERS + MAX_LAP_BUFFERS)
 #define LAP_LAG_IN_FRAMES 17
 
diff --git a/av1/encoder/mcomp.c b/av1/encoder/mcomp.c
index 5083084..1a53c23 100644
--- a/av1/encoder/mcomp.c
+++ b/av1/encoder/mcomp.c
@@ -95,7 +95,7 @@
 
   // High level params
   ms_params->bsize = bsize;
-  ms_params->vfp = &cpi->fn_ptr[bsize];
+  ms_params->vfp = &cpi->ppi->fn_ptr[bsize];
 
   init_ms_buffers(&ms_params->ms_buffers, x);
 
@@ -167,7 +167,7 @@
                       x->errorperbit, x->sadperbit);
 
   // Subpel variance params
-  ms_params->var_params.vfp = &cpi->fn_ptr[bsize];
+  ms_params->var_params.vfp = &cpi->ppi->fn_ptr[bsize];
   ms_params->var_params.subpel_search_type =
       cpi->sf.mv_sf.use_accurate_subpel_search;
   ms_params->var_params.w = block_size_wide[bsize];
@@ -290,6 +290,9 @@
 
 static INLINE int mv_err_cost_(const MV *mv,
                                const MV_COST_PARAMS *mv_cost_params) {
+  if (mv_cost_params->mv_cost_type == MV_COST_NONE) {
+    return 0;
+  }
   return mv_err_cost(mv, mv_cost_params->ref_mv, mv_cost_params->mvjcost,
                      mv_cost_params->mvcost, mv_cost_params->error_per_bit,
                      mv_cost_params->mv_cost_type);
@@ -1830,7 +1833,7 @@
       const MV dv = { GET_MV_SUBPEL(ref_block_hash.y - y_pos),
                       GET_MV_SUBPEL(ref_block_hash.x - x_pos) };
       if (!av1_is_dv_valid(dv, &cpi->common, xd, mi_row, mi_col, bsize,
-                           cpi->common.seq_params.mib_size_log2))
+                           cpi->common.seq_params->mib_size_log2))
         continue;
 
       FULLPEL_MV hash_mv;
@@ -1957,8 +1960,8 @@
   if (xd->bd != 8) {
     unsigned int sad;
     best_int_mv->as_fullmv = kZeroFullMv;
-    sad = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride,
-                                 xd->plane[0].pre[0].buf, ref_stride);
+    sad = cpi->ppi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride,
+                                      xd->plane[0].pre[0].buf, ref_stride);
 
     if (scaled_ref_frame) {
       int i;
@@ -2001,7 +2004,8 @@
   FULLPEL_MV this_mv = best_int_mv->as_fullmv;
   src_buf = x->plane[0].src.buf;
   ref_buf = get_buf_from_fullmv(&xd->plane[0].pre[0], &this_mv);
-  best_sad = cpi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride);
+  best_sad =
+      cpi->ppi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride);
 
   {
     const uint8_t *const pos[4] = {
@@ -2011,7 +2015,8 @@
       ref_buf + ref_stride,
     };
 
-    cpi->fn_ptr[bsize].sdx4df(src_buf, src_stride, pos, ref_stride, this_sad);
+    cpi->ppi->fn_ptr[bsize].sdx4df(src_buf, src_stride, pos, ref_stride,
+                                   this_sad);
   }
 
   for (idx = 0; idx < 4; ++idx) {
@@ -2034,7 +2039,8 @@
 
   ref_buf = get_buf_from_fullmv(&xd->plane[0].pre[0], &this_mv);
 
-  tmp_sad = cpi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride);
+  tmp_sad =
+      cpi->ppi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride);
   if (best_sad > tmp_sad) {
     best_int_mv->as_fullmv = this_mv;
     best_sad = tmp_sad;
@@ -2265,7 +2271,6 @@
 
 // Gets the address of the ref buffer at subpel location (r, c), rounded to the
 // nearest fullpel precision toward - \infty
-
 static INLINE const uint8_t *get_buf_from_mv(const struct buf_2d *buf,
                                              const MV mv) {
   const int offset = (mv.row >> 3) * buf->stride + (mv.col >> 3);
diff --git a/av1/encoder/ml.c b/av1/encoder/ml.c
index 57228ec..5078fb1 100644
--- a/av1/encoder/ml.c
+++ b/av1/encoder/ml.c
@@ -16,7 +16,7 @@
 #include "av1/encoder/ml.h"
 
 void av1_nn_output_prec_reduce(float *const output, int num_output) {
-  const int prec_bits = 11;
+  const int prec_bits = 9;
   const int prec = 1 << prec_bits;
   const float inv_prec = (float)(1.0 / prec);
   for (int i = 0; i < num_output; i++) {
@@ -143,14 +143,44 @@
   // Softmax function is invariant to adding the same constant
   // to all input values, so we subtract the maximum input to avoid
   // possible overflow.
-  float max_inp = input[0];
-  for (int i = 1; i < n; i++) max_inp = AOMMAX(max_inp, input[i]);
+  float max_input = input[0];
+  for (int i = 1; i < n; i++) max_input = AOMMAX(max_input, input[i]);
   float sum_out = 0.0f;
   for (int i = 0; i < n; i++) {
     // Clamp to range [-10.0, 0.0] to prevent FE_UNDERFLOW errors.
-    const float normalized_input = AOMMAX(input[i] - max_inp, -10.0f);
-    output[i] = (float)exp(normalized_input);
+    const float normalized_input = AOMMAX(input[i] - max_input, -10.0f);
+    output[i] = expf(normalized_input);
     sum_out += output[i];
   }
   for (int i = 0; i < n; i++) output[i] /= sum_out;
 }
+
+static AOM_INLINE float approx_exp(float y) {
+#define A ((1 << 23) / 0.69314718056f)  // (1 << 23) / ln(2)
+#define B \
+  127  // Offset for the exponent according to IEEE floating point standard.
+#define C 60801  // Magic number controls the accuracy of approximation
+  union {
+    float as_float;
+    int32_t as_int32;
+  } container;
+  container.as_int32 = ((int32_t)(y * A)) + ((B << 23) - C);
+  return container.as_float;
+#undef A
+#undef B
+#undef C
+}
+
+void av1_nn_fast_softmax_16_c(const float *input, float *output) {
+  const int kNumClasses = 16;
+  float max_input = input[0];
+  for (int i = 1; i < kNumClasses; i++) max_input = AOMMAX(max_input, input[i]);
+  float sum_out = 0.0f;
+  for (int i = 0; i < kNumClasses; i++) {
+    // Clamp to range [-10.0, 0.0] to prevent FE_UNDERFLOW errors.
+    const float normalized_input = AOMMAX(input[i] - max_input, -10.0f);
+    output[i] = approx_exp(normalized_input);
+    sum_out += output[i];
+  }
+  for (int i = 0; i < kNumClasses; i++) output[i] /= sum_out;
+}
diff --git a/av1/encoder/ml.h b/av1/encoder/ml.h
index 62d543d..566f927 100644
--- a/av1/encoder/ml.h
+++ b/av1/encoder/ml.h
@@ -71,6 +71,9 @@
 // output[i] = exp(input[i]) / sum_{k \in [0,n)}(exp(input[k]))
 void av1_nn_softmax(const float *input, float *output, int n);
 
+// A faster but less accurate version of av1_nn_softmax(input, output, 16)
+void av1_nn_fast_softmax_16_c(const float *input, float *output);
+
 // Applies a precision reduction to output of av1_nn_predict to prevent
 // mismatches between C and SIMD implementations.
 void av1_nn_output_prec_reduce(float *const output, int num_output);
diff --git a/av1/encoder/model_rd.h b/av1/encoder/model_rd.h
index c353c8f..db5ede4 100644
--- a/av1/encoder/model_rd.h
+++ b/av1/encoder/model_rd.h
@@ -17,7 +17,6 @@
 #include "av1/encoder/encoder.h"
 #include "av1/encoder/pustats.h"
 #include "av1/encoder/rdopt_utils.h"
-#include "aom_ports/system_state.h"
 #include "config/aom_dsp_rtcd.h"
 
 #ifdef __cplusplus
@@ -134,7 +133,6 @@
     if (dist) *dist = 0;
     return;
   }
-  aom_clear_system_state();
   const double sse_norm = (double)sse / num_samples;
   const double qstepsqr = (double)qstep * qstep;
   const double xqr = log2(sse_norm / qstepsqr);
@@ -145,7 +143,6 @@
   const double dist_f = dist_by_sse_norm_f * sse_norm;
   int rate_i = (int)(AOMMAX(0.0, rate_f * num_samples) + 0.5);
   int64_t dist_i = (int64_t)(AOMMAX(0.0, dist_f * num_samples) + 0.5);
-  aom_clear_system_state();
 
   // Check if skip is better
   if (rate_i == 0) {
diff --git a/av1/encoder/motion_search_facade.c b/av1/encoder/motion_search_facade.c
index 198bcd8..f95092e 100644
--- a/av1/encoder/motion_search_facade.c
+++ b/av1/encoder/motion_search_facade.c
@@ -9,12 +9,11 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "aom_ports/system_state.h"
-
 #include "av1/common/reconinter.h"
 
 #include "av1/encoder/encodemv.h"
 #include "av1/encoder/encoder.h"
+#include "av1/encoder/interp_search.h"
 #include "av1/encoder/mcomp.h"
 #include "av1/encoder/motion_search_facade.h"
 #include "av1/encoder/partition_strategy.h"
@@ -41,7 +40,7 @@
 // Allow more mesh searches for screen content type on the ARF.
 static int use_fine_search_interval(const AV1_COMP *const cpi) {
   return cpi->is_screen_content_type &&
-         cpi->gf_group.update_type[cpi->gf_frame_index] == ARF_UPDATE &&
+         cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == ARF_UPDATE &&
          cpi->oxcf.speed <= 2;
 }
 
@@ -62,15 +61,15 @@
   const int mi_col = xd->mi_col;
 
   const BLOCK_SIZE tpl_bsize =
-      convert_length_to_bsize(cpi->tpl_data.tpl_bsize_1d);
+      convert_length_to_bsize(cpi->ppi->tpl_data.tpl_bsize_1d);
   const int tplw = mi_size_wide[tpl_bsize];
   const int tplh = mi_size_high[tpl_bsize];
   const int nw = mi_size_wide[bsize] / tplw;
   const int nh = mi_size_high[bsize] / tplh;
 
   if (nw >= 1 && nh >= 1) {
-    const int of_h = mi_row % mi_size_high[cm->seq_params.sb_size];
-    const int of_w = mi_col % mi_size_wide[cm->seq_params.sb_size];
+    const int of_h = mi_row % mi_size_high[cm->seq_params->sb_size];
+    const int of_w = mi_col % mi_size_wide[cm->seq_params->sb_size];
     const int start = of_h / tplh * sb_enc->tpl_stride + of_w / tplw;
     int valid = 1;
 
@@ -119,7 +118,8 @@
 void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
                               BLOCK_SIZE bsize, int ref_idx, int *rate_mv,
                               int search_range, inter_mode_info *mode_info,
-                              int_mv *best_mv) {
+                              int_mv *best_mv,
+                              struct HandleInterModeArgs *const args) {
   MACROBLOCKD *xd = &x->e_mbd;
   const AV1_COMMON *cm = &cpi->common;
   const MotionVectorSearchParams *mv_search_params = &cpi->mv_search_params;
@@ -243,13 +243,9 @@
     }
   }
 
-  // Terminate search with the current ref_idx if we have already encountered
-  // another ref_mv in the drl such that:
-  //  1. The other drl has the same fullpel_mv during the SIMPLE_TRANSLATION
-  //     search process as the current fullpel_mv.
-  //  2. The rate needed to encode the current fullpel_mv is larger than that
-  //     for the other ref_mv.
-  if (cpi->sf.inter_sf.skip_repeated_full_newmv &&
+  // Terminate search with the current ref_idx based on fullpel mv, rate cost,
+  // and other known costs.
+  if (cpi->sf.inter_sf.skip_newmv_in_drl >= 2 &&
       mbmi->motion_mode == SIMPLE_TRANSLATION &&
       best_mv->as_int != INVALID_MV) {
     int_mv this_mv;
@@ -260,6 +256,7 @@
                         mv_costs->mv_cost_stack, MV_COST_WEIGHT);
     mode_info[ref_mv_idx].full_search_mv.as_int = this_mv.as_int;
     mode_info[ref_mv_idx].full_mv_rate = this_mv_rate;
+    mode_info[ref_mv_idx].full_mv_bestsme = bestsme;
 
     for (int prev_ref_idx = 0; prev_ref_idx < ref_mv_idx; ++prev_ref_idx) {
       // Check if the motion search result same as previous results
@@ -280,6 +277,19 @@
           return;
         }
       }
+
+      // Terminate the evaluation of current ref_mv_idx based on bestsme and
+      // drl_cost.
+      const int psme = mode_info[prev_ref_idx].full_mv_bestsme;
+      if (psme == INT_MAX) continue;
+      const int thr =
+          cpi->sf.inter_sf.skip_newmv_in_drl == 3 ? (psme + (psme >> 2)) : psme;
+      if (cpi->sf.inter_sf.skip_newmv_in_drl >= 3 &&
+          mode_info[ref_mv_idx].full_mv_bestsme > thr &&
+          mode_info[prev_ref_idx].drl_cost < mode_info[ref_mv_idx].drl_cost) {
+        best_mv->as_int = INVALID_MV;
+        return;
+      }
     }
   }
 
@@ -289,6 +299,8 @@
 
   const int use_fractional_mv =
       bestsme < INT_MAX && cpi->common.features.cur_frame_force_integer_mv == 0;
+  int best_mv_rate = 0;
+  int mv_rate_calculated = 0;
   if (use_fractional_mv) {
     int_mv fractional_ms_list[3];
     av1_set_fractional_mv(fractional_ms_list);
@@ -337,9 +349,10 @@
             subpel_start_mv = get_mv_from_fullmv(&second_best_mv.as_fullmv);
             if (av1_is_subpelmv_in_range(&ms_params.mv_limits,
                                          subpel_start_mv)) {
+              unsigned int sse;
               const int this_var = mv_search_params->find_fractional_mv_step(
                   xd, cm, &ms_params, subpel_start_mv, &this_best_mv, &dis,
-                  &x->pred_sse[ref], fractional_ms_list);
+                  &sse, fractional_ms_list);
 
               if (!cpi->sf.mv_sf.disable_second_mv) {
                 // If cpi->sf.mv_sf.disable_second_mv is 0, use actual rd cost
@@ -358,11 +371,17 @@
                 int64_t tmp_rd =
                     RDCOST(x->rdmult, tmp_rd_stats.rate + tmp_mv_rate,
                            tmp_rd_stats.dist);
-                if (tmp_rd < rd) best_mv->as_mv = this_best_mv;
+                if (tmp_rd < rd) {
+                  best_mv->as_mv = this_best_mv;
+                  x->pred_sse[ref] = sse;
+                }
               } else {
                 // If cpi->sf.mv_sf.disable_second_mv = 1, use var to decide the
                 // best MV.
-                if (this_var < best_mv_var) best_mv->as_mv = this_best_mv;
+                if (this_var < best_mv_var) {
+                  best_mv->as_mv = this_best_mv;
+                  x->pred_sse[ref] = sse;
+                }
               }
             }
           }
@@ -379,9 +398,52 @@
         break;
       default: assert(0 && "Invalid motion mode!\n");
     }
+
+    // Terminate search with the current ref_idx based on subpel mv and rate
+    // cost.
+    if (cpi->sf.inter_sf.skip_newmv_in_drl >= 1 && args != NULL &&
+        mbmi->motion_mode == SIMPLE_TRANSLATION &&
+        best_mv->as_int != INVALID_MV) {
+      const int ref_mv_idx = mbmi->ref_mv_idx;
+      best_mv_rate =
+          av1_mv_bit_cost(&best_mv->as_mv, &ref_mv, mv_costs->nmv_joint_cost,
+                          mv_costs->mv_cost_stack, MV_COST_WEIGHT);
+      mv_rate_calculated = 1;
+
+      for (int prev_ref_idx = 0; prev_ref_idx < ref_mv_idx; ++prev_ref_idx) {
+        if (!args->single_newmv_valid[prev_ref_idx][ref]) continue;
+        // Check if the motion vectors are the same.
+        if (best_mv->as_int == args->single_newmv[prev_ref_idx][ref].as_int) {
+          // Skip this evaluation if the previous one is skipped.
+          if (mode_info[prev_ref_idx].skip) {
+            mode_info[ref_mv_idx].skip = 1;
+            break;
+          }
+          // Compare the rate costs that we currently know.
+          const int prev_rate_cost =
+              args->single_newmv_rate[prev_ref_idx][ref] +
+              mode_info[prev_ref_idx].drl_cost;
+          const int this_rate_cost =
+              best_mv_rate + mode_info[ref_mv_idx].drl_cost;
+
+          if (prev_rate_cost <= this_rate_cost) {
+            // If the current rate_cost is worse than the previous rate_cost,
+            // then we terminate the search for this ref_mv_idx.
+            mode_info[ref_mv_idx].skip = 1;
+            break;
+          }
+        }
+      }
+    }
   }
-  *rate_mv = av1_mv_bit_cost(&best_mv->as_mv, &ref_mv, mv_costs->nmv_joint_cost,
-                             mv_costs->mv_cost_stack, MV_COST_WEIGHT);
+
+  if (mv_rate_calculated) {
+    *rate_mv = best_mv_rate;
+  } else {
+    *rate_mv =
+        av1_mv_bit_cost(&best_mv->as_mv, &ref_mv, mv_costs->nmv_joint_cost,
+                        mv_costs->mv_cost_stack, MV_COST_WEIGHT);
+  }
 }
 
 int av1_joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
@@ -895,8 +957,6 @@
   av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
                                 AOM_PLANE_Y, AOM_PLANE_Y);
 
-  aom_clear_system_state();
-
   if (scaled_ref_frame) {
     xd->plane[AOM_PLANE_Y].pre[ref_idx] = backup_yv12;
   }
@@ -920,7 +980,7 @@
   const uint8_t *dst = xd->plane[0].dst.buf;
   const int dst_stride = xd->plane[0].dst.stride;
 
-  *var = cpi->fn_ptr[bsize].vf(src, src_stride, dst, dst_stride, sse);
+  *var = cpi->ppi->fn_ptr[bsize].vf(src, src_stride, dst, dst_stride, sse);
 
   return best_mv;
 }
diff --git a/av1/encoder/motion_search_facade.h b/av1/encoder/motion_search_facade.h
index 5736f2b..bf81fe2 100644
--- a/av1/encoder/motion_search_facade.h
+++ b/av1/encoder/motion_search_facade.h
@@ -21,20 +21,19 @@
 // TODO(any): rename this struct to something else. There is already another
 // struct called inter_modes_info, which makes this terribly confusing.
 typedef struct {
-  int64_t rd;
   int drl_cost;
-
-  int rate_mv;
-  int_mv mv;
-
   int_mv full_search_mv;
   int full_mv_rate;
+  int full_mv_bestsme;
+  int skip;
 } inter_mode_info;
 
+struct HandleInterModeArgs;
 void av1_single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
                               BLOCK_SIZE bsize, int ref_idx, int *rate_mv,
                               int search_range, inter_mode_info *mode_info,
-                              int_mv *best_mv);
+                              int_mv *best_mv,
+                              struct HandleInterModeArgs *const args);
 
 int av1_joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
                             BLOCK_SIZE bsize, int_mv *cur_mv,
diff --git a/av1/encoder/mv_prec.c b/av1/encoder/mv_prec.c
index cc81d72..3ff80c8 100644
--- a/av1/encoder/mv_prec.c
+++ b/av1/encoder/mv_prec.c
@@ -11,8 +11,6 @@
 
 #include "config/aom_config.h"
 
-#include "aom_ports/system_state.h"
-
 #include "av1/encoder/encodemv.h"
 #if !CONFIG_REALTIME_ONLY
 #include "av1/encoder/misc_model_weights.h"
@@ -139,7 +137,6 @@
   const MV lp_diff = use_hp ? truncated_diff : diff;
   const int lp_mv_joint = av1_get_mv_joint(&lp_diff);
 
-  aom_clear_system_state();
   const int mv_joint_rate = get_symbol_cost(joint_cdf, mv_joint);
   const int hp_mv_joint_rate = get_symbol_cost(joint_cdf, hp_mv_joint);
   const int lp_mv_joint_rate = get_symbol_cost(joint_cdf, lp_mv_joint);
@@ -230,7 +227,7 @@
   const int y_stride = cpi->source->y_stride;
   const int px_row = 4 * mi_row, px_col = 4 * mi_col;
   const int buf_is_hbd = cpi->source->flags & YV12_FLAG_HIGHBITDEPTH;
-  const int bd = cm->seq_params.bit_depth;
+  const int bd = cm->seq_params->bit_depth;
   if (buf_is_hbd) {
     uint16_t *source_buf =
         CONVERT_TO_SHORTPTR(cpi->source->y_buffer) + px_row * y_stride + px_col;
@@ -339,8 +336,8 @@
   const int mi_row_end = tile_info->mi_row_end;
   const int mi_col_start = tile_info->mi_col_start;
   const int mi_col_end = tile_info->mi_col_end;
-  const int sb_size_mi = cm->seq_params.mib_size;
-  BLOCK_SIZE sb_size = cm->seq_params.sb_size;
+  const int sb_size_mi = cm->seq_params->mib_size;
+  BLOCK_SIZE sb_size = cm->seq_params->sb_size;
   for (int mi_row = mi_row_start; mi_row < mi_row_end; mi_row += sb_size_mi) {
     for (int mi_col = mi_col_start; mi_col < mi_col_end; mi_col += sb_size_mi) {
       collect_mv_stats_sb(mv_stats, cpi, mi_row, mi_col, sb_size);
@@ -349,7 +346,12 @@
 }
 
 void av1_collect_mv_stats(AV1_COMP *cpi, int current_q) {
-  MV_STATS *mv_stats = &cpi->mv_stats;
+  MV_STATS *mv_stats;
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  mv_stats = &cpi->mv_stats;
+#else
+  mv_stats = &cpi->ppi->mv_stats;
+#endif
   const AV1_COMMON *cm = &cpi->common;
   const int tile_cols = cm->tiles.cols;
   const int tile_rows = cm->tiles.rows;
@@ -376,7 +378,6 @@
   const AV1_COMMON *cm = &cpi->common;
   const int order_hint = cpi->common.current_frame.order_hint;
   const int order_diff = order_hint - mv_stats->order;
-  aom_clear_system_state();
   const float area = (float)(cm->width * cm->height);
   float features[MV_PREC_FEATURE_SIZE] = {
     (float)current_q,
@@ -420,8 +421,12 @@
   }
 #if !CONFIG_REALTIME_ONLY
   else if (cpi->sf.hl_sf.high_precision_mv_usage == LAST_MV_DATA &&
-           av1_frame_allows_smart_mv(cpi) && cpi->mv_stats.valid) {
+           av1_frame_allows_smart_mv(cpi) && cpi->ppi->mv_stats.valid) {
+#if CONFIG_FRAME_PARALLEL_ENCODE
     use_hp = get_smart_mv_prec(cpi, &cpi->mv_stats, qindex);
+#else
+    use_hp = get_smart_mv_prec(cpi, &cpi->ppi->mv_stats, qindex);
+#endif
   }
 #endif  // !CONFIG_REALTIME_ONLY
 
diff --git a/av1/encoder/mv_prec.h b/av1/encoder/mv_prec.h
index 05a95ee..11dcdd8 100644
--- a/av1/encoder/mv_prec.h
+++ b/av1/encoder/mv_prec.h
@@ -22,7 +22,7 @@
 
 static AOM_INLINE int av1_frame_allows_smart_mv(const AV1_COMP *cpi) {
   const int gf_group_index = cpi->gf_frame_index;
-  const int gf_update_type = cpi->gf_group.update_type[gf_group_index];
+  const int gf_update_type = cpi->ppi->gf_group.update_type[gf_group_index];
   return !frame_is_intra_only(&cpi->common) &&
          !(gf_update_type == INTNL_OVERLAY_UPDATE ||
            gf_update_type == OVERLAY_UPDATE);
diff --git a/av1/encoder/nonrd_pickmode.c b/av1/encoder/nonrd_pickmode.c
index bc7ee02..4b12353 100644
--- a/av1/encoder/nonrd_pickmode.c
+++ b/av1/encoder/nonrd_pickmode.c
@@ -23,7 +23,6 @@
 #include "aom_mem/aom_mem.h"
 #include "aom_ports/aom_timer.h"
 #include "aom_ports/mem.h"
-#include "aom_ports/system_state.h"
 
 #include "av1/encoder/model_rd.h"
 #include "av1/common/mvref_common.h"
@@ -48,15 +47,24 @@
   PREDICTION_MODE best_mode;
   TX_SIZE best_tx_size;
   MV_REFERENCE_FRAME best_ref_frame;
+  MV_REFERENCE_FRAME best_second_ref_frame;
   uint8_t best_mode_skip_txfm;
   uint8_t best_mode_initial_skip_flag;
   int_interpfilters best_pred_filter;
+  MOTION_MODE best_motion_mode;
+  WarpedMotionParams wm_params;
+  int num_proj_ref;
 } BEST_PICKMODE;
 
 typedef struct {
   MV_REFERENCE_FRAME ref_frame;
   PREDICTION_MODE pred_mode;
 } REF_MODE;
+
+typedef struct {
+  InterpFilter filter_x;
+  InterpFilter filter_y;
+} INTER_FILTER;
 /*!\endcond */
 
 static const int pos_shift_16x16[4][4] = {
@@ -94,6 +102,14 @@
 static const PREDICTION_MODE intra_mode_list[] = { DC_PRED, V_PRED, H_PRED,
                                                    SMOOTH_PRED };
 
+static const INTER_FILTER filters_ref_set[9] = {
+  { EIGHTTAP_REGULAR, EIGHTTAP_REGULAR }, { EIGHTTAP_SMOOTH, EIGHTTAP_SMOOTH },
+  { EIGHTTAP_REGULAR, EIGHTTAP_SMOOTH },  { EIGHTTAP_SMOOTH, EIGHTTAP_REGULAR },
+  { MULTITAP_SHARP, MULTITAP_SHARP },     { EIGHTTAP_REGULAR, MULTITAP_SHARP },
+  { MULTITAP_SHARP, EIGHTTAP_REGULAR },   { EIGHTTAP_SMOOTH, MULTITAP_SHARP },
+  { MULTITAP_SHARP, EIGHTTAP_SMOOTH }
+};
+
 static INLINE int mode_offset(const PREDICTION_MODE mode) {
   if (mode >= NEARESTMV) {
     return INTER_OFFSET(mode);
@@ -119,11 +135,15 @@
 static INLINE void init_best_pickmode(BEST_PICKMODE *bp) {
   bp->best_mode = NEARESTMV;
   bp->best_ref_frame = LAST_FRAME;
+  bp->best_second_ref_frame = NONE_FRAME;
   bp->best_tx_size = TX_8X8;
   bp->best_pred_filter = av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
   bp->best_mode_skip_txfm = 0;
   bp->best_mode_initial_skip_flag = 0;
   bp->best_pred = NULL;
+  bp->best_motion_mode = SIMPLE_TRANSLATION;
+  bp->num_proj_ref = 0;
+  memset(&bp->wm_params, 0, sizeof(bp->wm_params));
 }
 
 /*!\brief Runs Motion Estimation for a specific block and specific ref frame.
@@ -353,6 +373,8 @@
   (void)tile_data;
 
   x->pred_mv_sad[ref_frame] = INT_MAX;
+  x->pred_mv0_sad[ref_frame] = INT_MAX;
+  x->pred_mv1_sad[ref_frame] = INT_MAX;
   frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
   // TODO(kyslov) this needs various further optimizations. to be continued..
   assert(yv12 != NULL);
@@ -407,110 +429,6 @@
   }
 }
 
-static void estimate_comp_ref_frame_costs(
-    const AV1_COMMON *cm, const MACROBLOCKD *xd, const ModeCosts *mode_costs,
-    int segment_id, unsigned int (*ref_costs_comp)[REF_FRAMES]) {
-  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
-    for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame)
-      memset(ref_costs_comp[ref_frame], 0,
-             REF_FRAMES * sizeof((*ref_costs_comp)[0]));
-  } else {
-    int intra_inter_ctx = av1_get_intra_inter_context(xd);
-    unsigned int base_cost = mode_costs->intra_inter_cost[intra_inter_ctx][1];
-
-    if (cm->current_frame.reference_mode != SINGLE_REFERENCE) {
-      // Similar to single ref, determine cost of compound ref frames.
-      // cost_compound_refs = cost_first_ref + cost_second_ref
-      const int bwdref_comp_ctx_p = av1_get_pred_context_comp_bwdref_p(xd);
-      const int bwdref_comp_ctx_p1 = av1_get_pred_context_comp_bwdref_p1(xd);
-      const int ref_comp_ctx_p = av1_get_pred_context_comp_ref_p(xd);
-      const int ref_comp_ctx_p1 = av1_get_pred_context_comp_ref_p1(xd);
-      const int ref_comp_ctx_p2 = av1_get_pred_context_comp_ref_p2(xd);
-
-      const int comp_ref_type_ctx = av1_get_comp_reference_type_context(xd);
-      unsigned int ref_bicomp_costs[REF_FRAMES] = { 0 };
-
-      ref_bicomp_costs[LAST_FRAME] = ref_bicomp_costs[LAST2_FRAME] =
-          ref_bicomp_costs[LAST3_FRAME] = ref_bicomp_costs[GOLDEN_FRAME] =
-              base_cost + mode_costs->comp_ref_type_cost[comp_ref_type_ctx][1];
-      ref_bicomp_costs[BWDREF_FRAME] = ref_bicomp_costs[ALTREF2_FRAME] = 0;
-      ref_bicomp_costs[ALTREF_FRAME] = 0;
-
-      // cost of first ref frame
-      ref_bicomp_costs[LAST_FRAME] +=
-          mode_costs->comp_ref_cost[ref_comp_ctx_p][0][0];
-      ref_bicomp_costs[LAST2_FRAME] +=
-          mode_costs->comp_ref_cost[ref_comp_ctx_p][0][0];
-      ref_bicomp_costs[LAST3_FRAME] +=
-          mode_costs->comp_ref_cost[ref_comp_ctx_p][0][1];
-      ref_bicomp_costs[GOLDEN_FRAME] +=
-          mode_costs->comp_ref_cost[ref_comp_ctx_p][0][1];
-
-      ref_bicomp_costs[LAST_FRAME] +=
-          mode_costs->comp_ref_cost[ref_comp_ctx_p1][1][0];
-      ref_bicomp_costs[LAST2_FRAME] +=
-          mode_costs->comp_ref_cost[ref_comp_ctx_p1][1][1];
-
-      ref_bicomp_costs[LAST3_FRAME] +=
-          mode_costs->comp_ref_cost[ref_comp_ctx_p2][2][0];
-      ref_bicomp_costs[GOLDEN_FRAME] +=
-          mode_costs->comp_ref_cost[ref_comp_ctx_p2][2][1];
-
-      // cost of second ref frame
-      ref_bicomp_costs[BWDREF_FRAME] +=
-          mode_costs->comp_bwdref_cost[bwdref_comp_ctx_p][0][0];
-      ref_bicomp_costs[ALTREF2_FRAME] +=
-          mode_costs->comp_bwdref_cost[bwdref_comp_ctx_p][0][0];
-      ref_bicomp_costs[ALTREF_FRAME] +=
-          mode_costs->comp_bwdref_cost[bwdref_comp_ctx_p][0][1];
-
-      ref_bicomp_costs[BWDREF_FRAME] +=
-          mode_costs->comp_bwdref_cost[bwdref_comp_ctx_p1][1][0];
-      ref_bicomp_costs[ALTREF2_FRAME] +=
-          mode_costs->comp_bwdref_cost[bwdref_comp_ctx_p1][1][1];
-
-      // cost: if one ref frame is forward ref, the other ref is backward ref
-      for (int ref0 = LAST_FRAME; ref0 <= GOLDEN_FRAME; ++ref0) {
-        for (int ref1 = BWDREF_FRAME; ref1 <= ALTREF_FRAME; ++ref1) {
-          ref_costs_comp[ref0][ref1] =
-              ref_bicomp_costs[ref0] + ref_bicomp_costs[ref1];
-        }
-      }
-
-      // cost: if both ref frames are the same side.
-      const int uni_comp_ref_ctx_p = av1_get_pred_context_uni_comp_ref_p(xd);
-      const int uni_comp_ref_ctx_p1 = av1_get_pred_context_uni_comp_ref_p1(xd);
-      const int uni_comp_ref_ctx_p2 = av1_get_pred_context_uni_comp_ref_p2(xd);
-      ref_costs_comp[LAST_FRAME][LAST2_FRAME] =
-          base_cost + mode_costs->comp_ref_type_cost[comp_ref_type_ctx][0] +
-          mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] +
-          mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][0];
-      ref_costs_comp[LAST_FRAME][LAST3_FRAME] =
-          base_cost + mode_costs->comp_ref_type_cost[comp_ref_type_ctx][0] +
-          mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] +
-          mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][1] +
-          mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p2][2][0];
-      ref_costs_comp[LAST_FRAME][GOLDEN_FRAME] =
-          base_cost + mode_costs->comp_ref_type_cost[comp_ref_type_ctx][0] +
-          mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] +
-          mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][1] +
-          mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p2][2][1];
-      ref_costs_comp[BWDREF_FRAME][ALTREF_FRAME] =
-          base_cost + mode_costs->comp_ref_type_cost[comp_ref_type_ctx][0] +
-          mode_costs->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][1];
-    } else {
-      for (int ref0 = LAST_FRAME; ref0 <= GOLDEN_FRAME; ++ref0) {
-        for (int ref1 = BWDREF_FRAME; ref1 <= ALTREF_FRAME; ++ref1)
-          ref_costs_comp[ref0][ref1] = 512;
-      }
-      ref_costs_comp[LAST_FRAME][LAST2_FRAME] = 512;
-      ref_costs_comp[LAST_FRAME][LAST3_FRAME] = 512;
-      ref_costs_comp[LAST_FRAME][GOLDEN_FRAME] = 512;
-      ref_costs_comp[BWDREF_FRAME][ALTREF_FRAME] = 512;
-    }
-  }
-}
-
 static TX_SIZE calculate_tx_size(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
                                  MACROBLOCK *const x, unsigned int var,
                                  unsigned int sse) {
@@ -729,9 +647,9 @@
               (puv->dequant_QTX[1] * puv->dequant_QTX[1]) >> 3;
           av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, i,
                                         i);
-          var_uv[j] = cpi->fn_ptr[uv_bsize].vf(puv->src.buf, puv->src.stride,
-                                               puvd->dst.buf, puvd->dst.stride,
-                                               &sse_uv[j]);
+          var_uv[j] = cpi->ppi->fn_ptr[uv_bsize].vf(
+              puv->src.buf, puv->src.stride, puvd->dst.buf, puvd->dst.stride,
+              &sse_uv[j]);
           if ((var_uv[j] < uv_ac_thr || var_uv[j] == 0) &&
               (sse_uv[j] - var_uv[j] < uv_dc_thr || sse_uv[j] == var_uv[j]))
             skip_uv[j] = 1;
@@ -776,8 +694,8 @@
   int rate;
   int64_t dist;
 
-  unsigned int var = cpi->fn_ptr[bsize].vf(p->src.buf, p->src.stride,
-                                           pd->dst.buf, pd->dst.stride, &sse);
+  unsigned int var = cpi->ppi->fn_ptr[bsize].vf(
+      p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, &sse);
   xd->mi[0]->tx_size = calculate_tx_size(cpi, bsize, x, var, sse);
 
   if (calculate_rd) {
@@ -839,13 +757,14 @@
   int eob_cost = 0;
   const int bw = 4 * num_4x4_w;
   const int bh = 4 * num_4x4_h;
+  const int use_hbd = is_cur_buf_hbd(xd);
 
   (void)mi_row;
   (void)mi_col;
   (void)cpi;
 
 #if CONFIG_AV1_HIGHBITDEPTH
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+  if (use_hbd) {
     aom_highbd_subtract_block(bh, bw, p->src_diff, bw, p->src.buf,
                               p->src.stride, pd->dst.buf, pd->dst.stride,
                               x->e_mbd.bd);
@@ -866,14 +785,15 @@
       if (c < max_blocks_wide) {
         const SCAN_ORDER *const scan_order = &av1_scan_orders[tx_size][DCT_DCT];
         const int block_offset = BLOCK_OFFSET(block);
+        int16_t *const low_coeff = (int16_t *)p->coeff + block_offset;
+        int16_t *const low_qcoeff = (int16_t *)p->qcoeff + block_offset;
+        int16_t *const low_dqcoeff = (int16_t *)p->dqcoeff + block_offset;
 #if CONFIG_AV1_HIGHBITDEPTH
         tran_low_t *const coeff = p->coeff + block_offset;
         tran_low_t *const qcoeff = p->qcoeff + block_offset;
         tran_low_t *const dqcoeff = p->dqcoeff + block_offset;
 #else
-        int16_t *const low_coeff = (int16_t *)p->coeff + block_offset;
-        int16_t *const low_qcoeff = (int16_t *)p->qcoeff + block_offset;
-        int16_t *const low_dqcoeff = (int16_t *)p->dqcoeff + block_offset;
+        (void)use_hbd;
 #endif
         uint16_t *const eob = &p->eobs[block];
         const int diff_stride = bw;
@@ -887,48 +807,74 @@
           case TX_32X32:
             assert(0);  // Not used
             break;
+
 #if CONFIG_AV1_HIGHBITDEPTH
           case TX_16X16:
-            aom_hadamard_16x16(src_diff, diff_stride, coeff);
-            av1_quantize_fp(coeff, 16 * 16, p->zbin_QTX, p->round_fp_QTX,
-                            p->quant_fp_QTX, p->quant_shift_QTX, qcoeff,
-                            dqcoeff, p->dequant_QTX, eob, scan_order->scan,
-                            scan_order->iscan);
+            if (use_hbd) {
+              aom_hadamard_16x16(src_diff, diff_stride, coeff);
+              av1_quantize_fp(coeff, 16 * 16, p->zbin_QTX, p->round_fp_QTX,
+                              p->quant_fp_QTX, p->quant_shift_QTX, qcoeff,
+                              dqcoeff, p->dequant_QTX, eob, scan_order->scan,
+                              scan_order->iscan);
+            } else {
+              aom_hadamard_lp_16x16(src_diff, diff_stride, low_coeff);
+              av1_quantize_lp(low_coeff, 16 * 16, p->round_fp_QTX,
+                              p->quant_fp_QTX, low_qcoeff, low_dqcoeff,
+                              p->dequant_QTX, eob, scan_order->scan,
+                              scan_order->iscan);
+            }
             break;
           case TX_8X8:
-            aom_hadamard_8x8(src_diff, diff_stride, coeff);
-            av1_quantize_fp(coeff, 8 * 8, p->zbin_QTX, p->round_fp_QTX,
-                            p->quant_fp_QTX, p->quant_shift_QTX, qcoeff,
-                            dqcoeff, p->dequant_QTX, eob, scan_order->scan,
-                            scan_order->iscan);
+            if (use_hbd) {
+              aom_hadamard_8x8(src_diff, diff_stride, coeff);
+              av1_quantize_fp(coeff, 8 * 8, p->zbin_QTX, p->round_fp_QTX,
+                              p->quant_fp_QTX, p->quant_shift_QTX, qcoeff,
+                              dqcoeff, p->dequant_QTX, eob, scan_order->scan,
+                              scan_order->iscan);
+            } else {
+              aom_hadamard_lp_8x8(src_diff, diff_stride, low_coeff);
+              av1_quantize_lp(low_coeff, 8 * 8, p->round_fp_QTX,
+                              p->quant_fp_QTX, low_qcoeff, low_dqcoeff,
+                              p->dequant_QTX, eob, scan_order->scan,
+                              scan_order->iscan);
+            }
             break;
           default:
             assert(tx_size == TX_4X4);
-            aom_fdct4x4(src_diff, coeff, diff_stride);
-            av1_quantize_fp(coeff, 4 * 4, p->zbin_QTX, p->round_fp_QTX,
-                            p->quant_fp_QTX, p->quant_shift_QTX, qcoeff,
-                            dqcoeff, p->dequant_QTX, eob, scan_order->scan,
-                            scan_order->iscan);
+            if (use_hbd) {
+              aom_fdct4x4(src_diff, coeff, diff_stride);
+              av1_quantize_fp(coeff, 4 * 4, p->zbin_QTX, p->round_fp_QTX,
+                              p->quant_fp_QTX, p->quant_shift_QTX, qcoeff,
+                              dqcoeff, p->dequant_QTX, eob, scan_order->scan,
+                              scan_order->iscan);
+            } else {
+              aom_fdct4x4_lp(src_diff, low_coeff, diff_stride);
+              av1_quantize_lp(low_coeff, 4 * 4, p->round_fp_QTX,
+                              p->quant_fp_QTX, low_qcoeff, low_dqcoeff,
+                              p->dequant_QTX, eob, scan_order->scan,
+                              scan_order->iscan);
+            }
             break;
 #else
           case TX_16X16:
             aom_hadamard_lp_16x16(src_diff, diff_stride, low_coeff);
             av1_quantize_lp(low_coeff, 16 * 16, p->round_fp_QTX,
                             p->quant_fp_QTX, low_qcoeff, low_dqcoeff,
-                            p->dequant_QTX, eob, scan_order->scan);
+                            p->dequant_QTX, eob, scan_order->scan,
+                            scan_order->iscan);
             break;
           case TX_8X8:
             aom_hadamard_lp_8x8(src_diff, diff_stride, low_coeff);
             av1_quantize_lp(low_coeff, 8 * 8, p->round_fp_QTX, p->quant_fp_QTX,
                             low_qcoeff, low_dqcoeff, p->dequant_QTX, eob,
-                            scan_order->scan);
+                            scan_order->scan, scan_order->iscan);
             break;
           default:
             assert(tx_size == TX_4X4);
             aom_fdct4x4_lp(src_diff, low_coeff, diff_stride);
             av1_quantize_lp(low_coeff, 4 * 4, p->round_fp_QTX, p->quant_fp_QTX,
                             low_qcoeff, low_dqcoeff, p->dequant_QTX, eob,
-                            scan_order->scan);
+                            scan_order->scan, scan_order->iscan);
             break;
 #endif
         }
@@ -957,18 +903,32 @@
         const int block_offset = BLOCK_OFFSET(block);
         uint16_t *const eob = &p->eobs[block];
 #if CONFIG_AV1_HIGHBITDEPTH
-        int64_t dummy;
-        tran_low_t *const coeff = p->coeff + block_offset;
-        tran_low_t *const qcoeff = p->qcoeff + block_offset;
-        tran_low_t *const dqcoeff = p->dqcoeff + block_offset;
+        if (use_hbd) {
+          int64_t dummy;
+          tran_low_t *const coeff = p->coeff + block_offset;
+          tran_low_t *const qcoeff = p->qcoeff + block_offset;
+          tran_low_t *const dqcoeff = p->dqcoeff + block_offset;
 
-        if (*eob == 1)
-          this_rdc->rate += (int)abs(qcoeff[0]);
-        else if (*eob > 1)
-          this_rdc->rate += aom_satd(qcoeff, step << 4);
+          if (*eob == 1)
+            this_rdc->rate += (int)abs(qcoeff[0]);
+          else if (*eob > 1)
+            this_rdc->rate += aom_satd(qcoeff, step << 4);
 
-        this_rdc->dist +=
-            av1_block_error(coeff, dqcoeff, step << 4, &dummy) >> 2;
+          this_rdc->dist +=
+              av1_block_error(coeff, dqcoeff, step << 4, &dummy) >> 2;
+        } else {
+          int16_t *const low_coeff = (int16_t *)p->coeff + block_offset;
+          int16_t *const low_qcoeff = (int16_t *)p->qcoeff + block_offset;
+          int16_t *const low_dqcoeff = (int16_t *)p->dqcoeff + block_offset;
+
+          if (*eob == 1)
+            this_rdc->rate += (int)abs(low_qcoeff[0]);
+          else if (*eob > 1)
+            this_rdc->rate += aom_satd_lp(low_qcoeff, step << 4);
+
+          this_rdc->dist +=
+              av1_block_error_lp(low_coeff, low_dqcoeff, step << 4) >> 2;
+        }
 #else
         int16_t *const low_coeff = (int16_t *)p->coeff + block_offset;
         int16_t *const low_qcoeff = (int16_t *)p->qcoeff + block_offset;
@@ -1171,8 +1131,8 @@
     unsigned int var;
     if (!x->color_sensitivity[i - 1]) continue;
 
-    var = cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf,
-                             pd->dst.stride, &sse);
+    var = cpi->ppi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf,
+                                  pd->dst.stride, &sse);
     assert(sse >= var);
     tot_sse += sse;
 
@@ -1251,12 +1211,12 @@
 
   (void)block;
 
-  p->src.buf = &src_buf_base[4 * (row * src_stride + col)];
-  pd->dst.buf = &dst_buf_base[4 * (row * dst_stride + col)];
-
   av1_predict_intra_block_facade(cm, xd, plane, col, row, tx_size);
   av1_invalid_rd_stats(&this_rdc);
 
+  p->src.buf = &src_buf_base[4 * (row * src_stride + col)];
+  pd->dst.buf = &dst_buf_base[4 * (row * dst_stride + col)];
+
   if (plane == 0) {
     block_yrd(cpi, x, 0, 0, &this_rdc, &args->skippable, bsize_tx,
               AOMMIN(tx_size, TX_16X16));
@@ -1500,14 +1460,17 @@
 }
 
 #define FILTER_SEARCH_SIZE 2
+
 /*!\brief Searches for the best intrpolation filter
  *
  * \ingroup nonrd_mode_search
  * \callgraph
  * \callergraph
- * Iterates through subset of possible interpolation filters (currently
- * only EIGHTTAP_REGULAR and EIGTHTAP_SMOOTH in both directions) and selects
- * the one that gives lowest RD cost. RD cost is calculated using curvfit model
+ * Iterates through subset of possible interpolation filters (EIGHTTAP_REGULAR,
+ * EIGHTTAP_SMOOTH, MULTITAP_SHARP, depending on FILTER_SEARCH_SIZE) and selects
+ * the one that gives lowest RD cost. RD cost is calculated using curvfit model.
+ * Support for dual filters (different filters in the x & y directions) is
+ * allowed if sf.interp_sf.disable_dual_filter = 0.
  *
  * \param[in]    cpi                  Top-level encoder structure
  * \param[in]    x                    Pointer to structure holding all the
@@ -1542,19 +1505,22 @@
   struct macroblockd_plane *const pd = &xd->plane[0];
   MB_MODE_INFO *const mi = xd->mi[0];
   const int bw = block_size_wide[bsize];
-  RD_STATS pf_rd_stats[FILTER_SEARCH_SIZE] = { 0 };
-  TX_SIZE pf_tx_size[FILTER_SEARCH_SIZE] = { 0 };
+  int dim_factor =
+      (cpi->sf.interp_sf.disable_dual_filter == 0) ? FILTER_SEARCH_SIZE : 1;
+  RD_STATS pf_rd_stats[FILTER_SEARCH_SIZE * FILTER_SEARCH_SIZE] = { 0 };
+  TX_SIZE pf_tx_size[FILTER_SEARCH_SIZE * FILTER_SEARCH_SIZE] = { 0 };
   PRED_BUFFER *current_pred = *this_mode_pred;
   int best_skip = 0;
   int best_early_term = 0;
   int64_t best_cost = INT64_MAX;
   int best_filter_index = -1;
-  InterpFilter filters[FILTER_SEARCH_SIZE] = { EIGHTTAP_REGULAR,
-                                               EIGHTTAP_SMOOTH };
-  for (int i = 0; i < FILTER_SEARCH_SIZE; ++i) {
+  for (int i = 0; i < FILTER_SEARCH_SIZE * FILTER_SEARCH_SIZE; ++i) {
     int64_t cost;
-    InterpFilter filter = filters[i];
-    mi->interp_filters = av1_broadcast_interp_filter(filter);
+    if (cpi->sf.interp_sf.disable_dual_filter &&
+        filters_ref_set[i].filter_x != filters_ref_set[i].filter_y)
+      continue;
+    mi->interp_filters.as_filters.x_filter = filters_ref_set[i].filter_x;
+    mi->interp_filters.as_filters.y_filter = filters_ref_set[i].filter_y;
     av1_enc_build_inter_predictor_y(xd, mi_row, mi_col);
     if (use_model_yrd_large)
       model_skip_for_sb_y_large(cpi, bsize, mi_row, mi_col, x, xd,
@@ -1562,7 +1528,7 @@
     else
       model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rd_stats[i], 1);
     pf_rd_stats[i].rate += av1_get_switchable_rate(
-        x, xd, cm->features.interp_filter, cm->seq_params.enable_dual_filter);
+        x, xd, cm->features.interp_filter, cm->seq_params->enable_dual_filter);
     cost = RDCOST(x->rdmult, pf_rd_stats[i].rate, pf_rd_stats[i].dist);
     pf_tx_size[i] = mi->tx_size;
     if (cost < best_cost) {
@@ -1581,11 +1547,15 @@
       }
     }
   }
-  assert(best_filter_index >= 0 && best_filter_index < FILTER_SEARCH_SIZE);
+  assert(best_filter_index >= 0 &&
+         best_filter_index < dim_factor * FILTER_SEARCH_SIZE);
   if (reuse_inter_pred && *this_mode_pred != current_pred)
     free_pred_buffer(current_pred);
 
-  mi->interp_filters = av1_broadcast_interp_filter(filters[best_filter_index]);
+  mi->interp_filters.as_filters.x_filter =
+      filters_ref_set[best_filter_index].filter_x;
+  mi->interp_filters.as_filters.y_filter =
+      filters_ref_set[best_filter_index].filter_y;
   mi->tx_size = pf_tx_size[best_filter_index];
   this_rdc->rate = pf_rd_stats[best_filter_index].rate;
   this_rdc->dist = pf_rd_stats[best_filter_index].dist;
@@ -1595,10 +1565,203 @@
   if (reuse_inter_pred) {
     pd->dst.buf = (*this_mode_pred)->data;
     pd->dst.stride = (*this_mode_pred)->stride;
-  } else if (best_filter_index < FILTER_SEARCH_SIZE - 1) {
+  } else if (best_filter_index < dim_factor * FILTER_SEARCH_SIZE - 1) {
     av1_enc_build_inter_predictor_y(xd, mi_row, mi_col);
   }
 }
+#if !CONFIG_REALTIME_ONLY
+#define MOTION_MODE_SEARCH_SIZE 2
+
+static AOM_INLINE int is_warped_mode_allowed(const AV1_COMMON *cm,
+                                             MACROBLOCK *const x,
+                                             const MB_MODE_INFO *mbmi) {
+  const FeatureFlags *const features = &cm->features;
+  const MACROBLOCKD *xd = &x->e_mbd;
+
+  if (has_second_ref(mbmi)) return 0;
+  MOTION_MODE last_motion_mode_allowed = SIMPLE_TRANSLATION;
+
+  if (features->switchable_motion_mode) {
+    // Determine which motion modes to search if more than SIMPLE_TRANSLATION
+    // is allowed.
+    last_motion_mode_allowed = motion_mode_allowed(
+        xd->global_motion, xd, mbmi, features->allow_warped_motion);
+  }
+
+  if (last_motion_mode_allowed == WARPED_CAUSAL) {
+    return 1;
+  }
+
+  return 0;
+}
+
+static void calc_num_proj_ref(AV1_COMP *cpi, MACROBLOCK *x, MB_MODE_INFO *mi) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const FeatureFlags *const features = &cm->features;
+
+  mi->num_proj_ref = 1;
+  WARP_SAMPLE_INFO *const warp_sample_info =
+      &x->warp_sample_info[mi->ref_frame[0]];
+  int *pts0 = warp_sample_info->pts;
+  int *pts_inref0 = warp_sample_info->pts_inref;
+  MOTION_MODE last_motion_mode_allowed = SIMPLE_TRANSLATION;
+
+  if (features->switchable_motion_mode) {
+    // Determine which motion modes to search if more than SIMPLE_TRANSLATION
+    // is allowed.
+    last_motion_mode_allowed = motion_mode_allowed(
+        xd->global_motion, xd, mi, features->allow_warped_motion);
+  }
+
+  if (last_motion_mode_allowed == WARPED_CAUSAL) {
+    if (warp_sample_info->num < 0) {
+      warp_sample_info->num = av1_findSamples(cm, xd, pts0, pts_inref0);
+    }
+    mi->num_proj_ref = warp_sample_info->num;
+  }
+}
+
+static void search_motion_mode(AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *this_rdc,
+                               int mi_row, int mi_col, BLOCK_SIZE bsize,
+                               int *this_early_term, int use_model_yrd_large,
+                               int *rate_mv) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const FeatureFlags *const features = &cm->features;
+  MB_MODE_INFO *const mi = xd->mi[0];
+  RD_STATS pf_rd_stats[MOTION_MODE_SEARCH_SIZE] = { 0 };
+  int best_skip = 0;
+  int best_early_term = 0;
+  int64_t best_cost = INT64_MAX;
+  int best_mode_index = -1;
+  const int interp_filter = features->interp_filter;
+
+  const MOTION_MODE motion_modes[MOTION_MODE_SEARCH_SIZE] = {
+    SIMPLE_TRANSLATION, WARPED_CAUSAL
+  };
+  int mode_search_size = is_warped_mode_allowed(cm, x, mi) ? 2 : 1;
+
+  WARP_SAMPLE_INFO *const warp_sample_info =
+      &x->warp_sample_info[mi->ref_frame[0]];
+  int *pts0 = warp_sample_info->pts;
+  int *pts_inref0 = warp_sample_info->pts_inref;
+
+  const int total_samples = mi->num_proj_ref;
+  if (total_samples == 0) {
+    // Do not search WARPED_CAUSAL if there are no samples to use to determine
+    // warped parameters.
+    mode_search_size = 1;
+  }
+
+  const MB_MODE_INFO base_mbmi = *mi;
+  MB_MODE_INFO best_mbmi;
+
+  for (int i = 0; i < mode_search_size; ++i) {
+    int64_t cost = INT64_MAX;
+    MOTION_MODE motion_mode = motion_modes[i];
+    *mi = base_mbmi;
+    mi->motion_mode = motion_mode;
+    if (motion_mode == SIMPLE_TRANSLATION) {
+      mi->interp_filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+
+      av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0, 0);
+      if (use_model_yrd_large)
+        model_skip_for_sb_y_large(cpi, bsize, mi_row, mi_col, x, xd,
+                                  &pf_rd_stats[i], this_early_term, 1);
+      else
+        model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rd_stats[i], 1);
+      pf_rd_stats[i].rate +=
+          av1_get_switchable_rate(x, xd, cm->features.interp_filter,
+                                  cm->seq_params->enable_dual_filter);
+      cost = RDCOST(x->rdmult, pf_rd_stats[i].rate, pf_rd_stats[i].dist);
+    } else if (motion_mode == WARPED_CAUSAL) {
+      int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
+      const ModeCosts *mode_costs = &x->mode_costs;
+      mi->wm_params.wmtype = DEFAULT_WMTYPE;
+      mi->interp_filters =
+          av1_broadcast_interp_filter(av1_unswitchable_filter(interp_filter));
+
+      memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0));
+      memcpy(pts_inref, pts_inref0, total_samples * 2 * sizeof(*pts_inref0));
+      // Select the samples according to motion vector difference
+      if (mi->num_proj_ref > 1) {
+        mi->num_proj_ref = av1_selectSamples(&mi->mv[0].as_mv, pts, pts_inref,
+                                             mi->num_proj_ref, bsize);
+      }
+
+      // Compute the warped motion parameters with a least squares fit
+      //  using the collected samples
+      if (!av1_find_projection(mi->num_proj_ref, pts, pts_inref, bsize,
+                               mi->mv[0].as_mv.row, mi->mv[0].as_mv.col,
+                               &mi->wm_params, mi_row, mi_col)) {
+        if (mi->mode == NEWMV) {
+          const int_mv mv0 = mi->mv[0];
+          const WarpedMotionParams wm_params0 = mi->wm_params;
+          const int num_proj_ref0 = mi->num_proj_ref;
+
+          const int_mv ref_mv = av1_get_ref_mv(x, 0);
+          SUBPEL_MOTION_SEARCH_PARAMS ms_params;
+          av1_make_default_subpel_ms_params(&ms_params, cpi, x, bsize,
+                                            &ref_mv.as_mv, NULL);
+
+          // Refine MV in a small range.
+          av1_refine_warped_mv(xd, cm, &ms_params, bsize, pts0, pts_inref0,
+                               total_samples);
+          if (mi->mv[0].as_int == ref_mv.as_int) {
+            continue;
+          }
+
+          if (mv0.as_int != mi->mv[0].as_int) {
+            // Keep the refined MV and WM parameters.
+            int tmp_rate_mv = av1_mv_bit_cost(
+                &mi->mv[0].as_mv, &ref_mv.as_mv, x->mv_costs->nmv_joint_cost,
+                x->mv_costs->mv_cost_stack, MV_COST_WEIGHT);
+            *rate_mv = tmp_rate_mv;
+          } else {
+            // Restore the old MV and WM parameters.
+            mi->mv[0] = mv0;
+            mi->wm_params = wm_params0;
+            mi->num_proj_ref = num_proj_ref0;
+          }
+        }
+        // Build the warped predictor
+        av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0,
+                                      av1_num_planes(cm) - 1);
+        if (use_model_yrd_large)
+          model_skip_for_sb_y_large(cpi, bsize, mi_row, mi_col, x, xd,
+                                    &pf_rd_stats[i], this_early_term, 1);
+        else
+          model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rd_stats[i], 1);
+
+        pf_rd_stats[i].rate +=
+            mode_costs->motion_mode_cost[bsize][mi->motion_mode];
+        cost = RDCOST(x->rdmult, pf_rd_stats[i].rate, pf_rd_stats[i].dist);
+      } else {
+        cost = INT64_MAX;
+      }
+    }
+    if (cost < best_cost) {
+      best_mode_index = i;
+      best_cost = cost;
+      best_skip = pf_rd_stats[i].skip_txfm;
+      best_early_term = *this_early_term;
+      best_mbmi = *mi;
+    }
+  }
+  assert(best_mode_index >= 0 && best_mode_index < FILTER_SEARCH_SIZE);
+
+  *mi = best_mbmi;
+  this_rdc->rate = pf_rd_stats[best_mode_index].rate;
+  this_rdc->dist = pf_rd_stats[best_mode_index].dist;
+  this_rdc->sse = pf_rd_stats[best_mode_index].sse;
+  this_rdc->skip_txfm = (best_skip || best_early_term);
+  *this_early_term = best_early_term;
+  if (best_mode_index < FILTER_SEARCH_SIZE - 1) {
+    av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0, 0);
+  }
+}
+#endif  // !CONFIG_REALTIME_ONLY
 
 #define COLLECT_PICK_MODE_STAT 0
 
@@ -1618,6 +1781,7 @@
 static void compute_intra_yprediction(const AV1_COMMON *cm,
                                       PREDICTION_MODE mode, BLOCK_SIZE bsize,
                                       MACROBLOCK *x, MACROBLOCKD *xd) {
+  const SequenceHeader *seq_params = cm->seq_params;
   struct macroblockd_plane *const pd = &xd->plane[0];
   struct macroblock_plane *const p = &x->plane[0];
   uint8_t *const src_buf_base = p->src.buf;
@@ -1644,10 +1808,11 @@
     for (col = 0; col < max_blocks_wide; col += (1 << tx_size)) {
       p->src.buf = &src_buf_base[4 * (row * (int64_t)src_stride + col)];
       pd->dst.buf = &dst_buf_base[4 * (row * (int64_t)dst_stride + col)];
-      av1_predict_intra_block(cm, xd, block_size_wide[bsize],
-                              block_size_high[bsize], tx_size, mode, 0, 0,
-                              FILTER_INTRA_MODES, pd->dst.buf, dst_stride,
-                              pd->dst.buf, dst_stride, 0, 0, plane);
+      av1_predict_intra_block(
+          xd, seq_params->sb_size, seq_params->enable_intra_edge_filter,
+          block_size_wide[bsize], block_size_high[bsize], tx_size, mode, 0, 0,
+          FILTER_INTRA_MODES, pd->dst.buf, dst_stride, pd->dst.buf, dst_stride,
+          0, 0, plane);
     }
   }
   p->src.buf = src_buf_base;
@@ -1671,7 +1836,9 @@
   const MB_MODE_INFO *left_mi = xd->left_mbmi;
   const PREDICTION_MODE A = av1_above_block_mode(above_mi);
   const PREDICTION_MODE L = av1_left_block_mode(left_mi);
-  bmode_costs = x->mode_costs.y_mode_costs[A][L];
+  const int above_ctx = intra_mode_context[A];
+  const int left_ctx = intra_mode_context[L];
+  bmode_costs = x->mode_costs.y_mode_costs[above_ctx][left_ctx];
 
   av1_invalid_rd_stats(&best_rdc);
   av1_invalid_rd_stats(&this_rdc);
@@ -1734,10 +1901,11 @@
                                               int *force_skip_low_temp_var) {
   AV1_COMMON *const cm = &cpi->common;
   const struct segmentation *const seg = &cm->seg;
-  const int is_small_sb = (cm->seq_params.sb_size == BLOCK_64X64);
+  const int is_small_sb = (cm->seq_params->sb_size == BLOCK_64X64);
 
   // For SVC the usage of alt_ref is determined by the ref_frame_flags.
-  int use_alt_ref_frame = cpi->use_svc || cpi->sf.rt_sf.use_nonrd_altref_frame;
+  int use_alt_ref_frame =
+      cpi->ppi->use_svc || cpi->sf.rt_sf.use_nonrd_altref_frame;
   int use_golden_ref_frame = 1;
 
   use_ref_frame[LAST_FRAME] = 1;  // we never skip LAST
@@ -1745,7 +1913,6 @@
   if (cpi->rc.frames_since_golden == 0 && gf_temporal_ref) {
     use_golden_ref_frame = 0;
   }
-
   if (cpi->sf.rt_sf.short_circuit_low_temp_var &&
       x->nonrd_prune_ref_frame_search) {
     if (is_small_sb)
@@ -1832,7 +1999,7 @@
 
   int intra_cost_penalty = av1_get_intra_cost_penalty(
       quant_params->base_qindex, quant_params->y_dc_delta_q,
-      cm->seq_params.bit_depth);
+      cm->seq_params->bit_depth);
   int64_t inter_mode_thresh = RDCOST(x->rdmult, intra_cost_penalty, 0);
   int perform_intra_pred = cpi->sf.rt_sf.check_intra_pred_nonrd;
   // For spatial enhancemanent layer: turn off intra prediction if the
@@ -1851,8 +2018,8 @@
   // Adjust thresholds to make intra mode likely tested if the other
   // references (golden, alt) are skipped/not checked. For now always
   // adjust for svc mode.
-  if (cpi->use_svc || (cpi->sf.rt_sf.use_nonrd_altref_frame == 0 &&
-                       cpi->sf.rt_sf.nonrd_prune_ref_frame_search > 0)) {
+  if (cpi->ppi->use_svc || (cpi->sf.rt_sf.use_nonrd_altref_frame == 0 &&
+                            cpi->sf.rt_sf.nonrd_prune_ref_frame_search > 0)) {
     spatial_var_thresh = 150;
     motion_thresh = 0;
   }
@@ -1970,6 +2137,7 @@
       best_pickmode->best_mode = this_mode;
       best_pickmode->best_tx_size = mi->tx_size;
       best_pickmode->best_ref_frame = INTRA_FRAME;
+      best_pickmode->best_second_ref_frame = NONE;
       mi->uv_mode = this_mode;
       mi->mv[0].as_int = INVALID_MV;
       mi->mv[1].as_int = INVALID_MV;
@@ -1979,7 +2147,8 @@
 }
 
 static AOM_INLINE int is_filter_search_enabled(const AV1_COMP *cpi, int mi_row,
-                                               int mi_col, BLOCK_SIZE bsize) {
+                                               int mi_col, BLOCK_SIZE bsize,
+                                               int segment_id) {
   const AV1_COMMON *const cm = &cpi->common;
   int enable_filter_search = 0;
 
@@ -1991,6 +2160,8 @@
           (((mi_row + mi_col) >> bsl) +
            get_chessboard_index(cm->current_frame.frame_number)) &
           0x1;
+      if (cyclic_refresh_segment_id_boosted(segment_id))
+        enable_filter_search = 1;
     }
   }
   return enable_filter_search;
@@ -2059,10 +2230,114 @@
     if (ref_frame != LAST_FRAME && mode == NEARMV) return 1;
 
     if (more_prune && bsize >= BLOCK_32X32 && mode == NEARMV) return 1;
+
+    if (extra_prune > 2 && ref_frame != LAST_FRAME) {
+      return 1;
+    }
   }
   return 0;
 }
 
+void set_color_sensitivity(AV1_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
+                           BLOCK_SIZE bsize, int y_sad,
+                           unsigned int source_variance) {
+  const int factor = (bsize >= BLOCK_32X32) ? 2 : 3;
+  NOISE_LEVEL noise_level = kLow;
+  int norm_sad =
+      y_sad >> (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
+  // If the spatial source variance is high and the normalized y_sad
+  // is low, then y-channel is likely good for mode estimation, so keep
+  // color_sensitivity off. For low noise content for now, since there is
+  // some bdrate regression for noisy color clip.
+  if (cpi->noise_estimate.enabled)
+    noise_level = av1_noise_estimate_extract_level(&cpi->noise_estimate);
+  if (noise_level == kLow && source_variance > 1000 && norm_sad < 50) {
+    x->color_sensitivity[0] = 0;
+    x->color_sensitivity[1] = 0;
+    return;
+  }
+  for (int i = 1; i <= 2; ++i) {
+    if (x->color_sensitivity[i - 1] == 2) {
+      struct macroblock_plane *const p = &x->plane[i];
+      struct macroblockd_plane *const pd = &xd->plane[i];
+      const BLOCK_SIZE bs =
+          get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+      const int uv_sad = cpi->ppi->fn_ptr[bs].sdf(p->src.buf, p->src.stride,
+                                                  pd->dst.buf, pd->dst.stride);
+      const int norm_uv_sad =
+          uv_sad >> (b_width_log2_lookup[bs] + b_height_log2_lookup[bs]);
+      x->color_sensitivity[i - 1] =
+          uv_sad > (factor * (y_sad >> 3)) && norm_uv_sad > 40;
+    }
+  }
+}
+
+void setup_compound_prediction(AV1_COMP *cpi, MACROBLOCK *x,
+                               struct buf_2d yv12_mb[8][MAX_MB_PLANE],
+                               int *use_ref_frame_mask, int flag_comp,
+                               int *ref_mv_idx) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+  MV_REFERENCE_FRAME rf[2] = { LAST_FRAME, GOLDEN_FRAME };
+  MV_REFERENCE_FRAME ref_frame_comp;
+  if (flag_comp == 1) {
+    rf[1] = LAST2_FRAME;
+  } else if (flag_comp == 2) {
+    rf[1] = ALTREF_FRAME;
+  }
+  if (!use_ref_frame_mask[rf[1]]) {
+    // Need to setup pred_block, if it hasn't been done in find_predictors.
+    const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, rf[1]);
+    const int num_planes = av1_num_planes(cm);
+    if (yv12 != NULL) {
+      const struct scale_factors *const sf =
+          get_ref_scale_factors_const(cm, rf[1]);
+      av1_setup_pred_block(xd, yv12_mb[rf[1]], yv12, sf, sf, num_planes);
+    }
+  }
+  ref_frame_comp = av1_ref_frame_type(rf);
+  mbmi_ext->mode_context[ref_frame_comp] = 0;
+  mbmi_ext->ref_mv_count[ref_frame_comp] = UINT8_MAX;
+  av1_find_mv_refs(cm, xd, mbmi, ref_frame_comp, mbmi_ext->ref_mv_count,
+                   xd->ref_mv_stack, xd->weight, NULL, mbmi_ext->global_mvs,
+                   mbmi_ext->mode_context);
+  av1_copy_usable_ref_mv_stack_and_weight(xd, mbmi_ext, ref_frame_comp);
+  *ref_mv_idx = mbmi->ref_mv_idx + 1;
+}
+
+static void set_compound_mode(MACROBLOCK *x, int comp_index, int ref_frame,
+                              int ref_frame2, int ref_mv_idx,
+                              int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES],
+                              PREDICTION_MODE *this_mode) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mi = xd->mi[0];
+  *this_mode = GLOBAL_GLOBALMV;
+  mi->ref_frame[0] = ref_frame;
+  mi->ref_frame[1] = ref_frame2;
+  mi->compound_idx = 1;
+  mi->comp_group_idx = 0;
+  mi->interinter_comp.type = COMPOUND_AVERAGE;
+  MV_REFERENCE_FRAME ref_frame_comp = av1_ref_frame_type(mi->ref_frame);
+  if (comp_index % 3 == 0) {
+    frame_mv[*this_mode][ref_frame].as_int = 0;
+    frame_mv[*this_mode][ref_frame2].as_int = 0;
+  } else if (comp_index % 3 == 1) {
+    *this_mode = NEAREST_NEARESTMV;
+    frame_mv[*this_mode][ref_frame].as_int =
+        xd->ref_mv_stack[ref_frame_comp][0].this_mv.as_int;
+    frame_mv[*this_mode][ref_frame2].as_int =
+        xd->ref_mv_stack[ref_frame_comp][0].comp_mv.as_int;
+  } else if (comp_index % 3 == 2) {
+    *this_mode = NEAR_NEARMV;
+    frame_mv[*this_mode][ref_frame].as_int =
+        xd->ref_mv_stack[ref_frame_comp][ref_mv_idx].this_mv.as_int;
+    frame_mv[*this_mode][ref_frame2].as_int =
+        xd->ref_mv_stack[ref_frame_comp][ref_mv_idx].comp_mv.as_int;
+  }
+}
+
 void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
                                   MACROBLOCK *x, RD_STATS *rd_cost,
                                   BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
@@ -2071,23 +2346,23 @@
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mi = xd->mi[0];
   struct macroblockd_plane *const pd = &xd->plane[0];
-
+  const InterpFilter filter_ref = cm->features.interp_filter;
+  const InterpFilter default_interp_filter = EIGHTTAP_REGULAR;
   BEST_PICKMODE best_pickmode;
 #if COLLECT_PICK_MODE_STAT
   static mode_search_stat ms_stat;
 #endif
-  MV_REFERENCE_FRAME ref_frame;
+  MV_REFERENCE_FRAME ref_frame, ref_frame2;
   int_mv frame_mv[MB_MODE_COUNT][REF_FRAMES];
+  int_mv frame_mv_best[MB_MODE_COUNT][REF_FRAMES];
   uint8_t mode_checked[MB_MODE_COUNT][REF_FRAMES];
   struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE];
   RD_STATS this_rdc, best_rdc;
   const unsigned char segment_id = mi->segment_id;
   const int *const rd_threshes = cpi->rd.threshes[segment_id][bsize];
   const int *const rd_thresh_freq_fact = x->thresh_freq_fact[bsize];
-  const InterpFilter filter_ref = cm->features.interp_filter;
   int best_early_term = 0;
-  unsigned int ref_costs_single[REF_FRAMES],
-      ref_costs_comp[REF_FRAMES][REF_FRAMES];
+  unsigned int ref_costs_single[REF_FRAMES];
   int force_skip_low_temp_var = 0;
   int use_ref_frame_mask[REF_FRAMES] = { 0 };
   unsigned int sse_zeromv_norm = UINT_MAX;
@@ -2104,7 +2379,7 @@
   DECLARE_ALIGNED(16, uint8_t, pred_buf[3 * 128 * 128]);
   PRED_BUFFER *this_mode_pred = NULL;
   const int reuse_inter_pred = cpi->sf.rt_sf.reuse_inter_pred_nonrd &&
-                               cm->seq_params.bit_depth == AOM_BITS_8;
+                               cm->seq_params->bit_depth == AOM_BITS_8;
 
   const int bh = block_size_high[bsize];
   const int bw = block_size_wide[bsize];
@@ -2116,7 +2391,6 @@
 #if COLLECT_PICK_MODE_STAT
   aom_usec_timer_start(&ms_stat.timer2);
 #endif
-  const InterpFilter default_interp_filter = EIGHTTAP_REGULAR;
   int64_t thresh_sad_pred = INT64_MAX;
   const int mi_row = xd->mi_row;
   const int mi_col = xd->mi_col;
@@ -2124,27 +2398,25 @@
   int svc_mv_row = 0;
   int force_mv_inter_layer = 0;
   int use_modeled_non_rd_cost = 0;
+  int comp_pred = 0;
+  int num_comp_modes_ref = 0;
+  int tot_num_comp_modes = 9;
+  int ref_mv_idx = 0;
 #if CONFIG_AV1_TEMPORAL_DENOISING
   const int denoise_recheck_zeromv = 1;
   AV1_PICKMODE_CTX_DEN ctx_den;
   int64_t zero_last_cost_orig = INT64_MAX;
   int denoise_svc_pickmode = 1;
-  const int resize_pending =
-      (cpi->resize_pending_params.width && cpi->resize_pending_params.height &&
-       (cpi->common.width != cpi->resize_pending_params.width ||
-        cpi->common.height != cpi->resize_pending_params.height));
-
+  const int resize_pending = is_frame_resize_pending(cpi);
 #endif
-
+  x->color_sensitivity[0] = x->color_sensitivity_sb[0];
+  x->color_sensitivity[1] = x->color_sensitivity_sb[1];
   init_best_pickmode(&best_pickmode);
 
   const ModeCosts *mode_costs = &x->mode_costs;
 
   estimate_single_ref_frame_costs(cm, xd, mode_costs, segment_id,
                                   ref_costs_single);
-  if (cpi->sf.rt_sf.use_comp_ref_nonrd)
-    estimate_comp_ref_frame_costs(cm, xd, mode_costs, segment_id,
-                                  ref_costs_comp);
 
   memset(&mode_checked[0][0], 0, MB_MODE_COUNT * REF_FRAMES);
   if (reuse_inter_pred) {
@@ -2164,13 +2436,18 @@
   av1_invalid_rd_stats(&best_rdc);
   av1_invalid_rd_stats(&this_rdc);
   av1_invalid_rd_stats(rd_cost);
+  for (int i = 0; i < REF_FRAMES; ++i) {
+    x->warp_sample_info[i].num = -1;
+  }
+
   mi->bsize = bsize;
   mi->ref_frame[0] = NONE_FRAME;
   mi->ref_frame[1] = NONE_FRAME;
 
 #if CONFIG_AV1_TEMPORAL_DENOISING
   if (cpi->oxcf.noise_sensitivity > 0) {
-    // if (cpi->use_svc) denoise_svc_pickmode = av1_denoise_svc_non_key(cpi);
+    // if (cpi->ppi->use_svc) denoise_svc_pickmode =
+    // av1_denoise_svc_non_key(cpi);
     if (cpi->denoiser.denoising_level > kDenLowLow && denoise_svc_pickmode)
       av1_denoiser_reset_frame_stats(ctx);
   }
@@ -2183,7 +2460,7 @@
   // to source, so use subpel motion vector to compensate. The nonzero motion
   // is half pixel shifted to left and top, so (-4, -4). This has more effect
   // on higher resolutins, so condition it on that for now.
-  if (cpi->use_svc && svc->spatial_layer_id > 0 &&
+  if (cpi->ppi->use_svc && svc->spatial_layer_id > 0 &&
       svc->downsample_filter_phase[svc->spatial_layer_id - 1] == 8 &&
       cm->width * cm->height > 640 * 480) {
     svc_mv_col = -4;
@@ -2193,6 +2470,21 @@
   get_ref_frame_use_mask(cpi, x, mi, mi_row, mi_col, bsize, gf_temporal_ref,
                          use_ref_frame_mask, &force_skip_low_temp_var);
 
+  // Compound modes per reference pair (GOLDEN_LAST/LAST2_LAST/ALTREF_LAST):
+  // (0_0)/(NEAREST_NEAREST)/(NEAR_NEAR).
+  // For now to reduce slowdowm, use only (0,0) for blocks above 16x16
+  // for non-svc case or on enhancement layers for svc.
+  if (cpi->sf.rt_sf.use_comp_ref_nonrd && is_comp_ref_allowed(bsize)) {
+    if (cpi->ppi->use_svc && cpi->svc.temporal_layer_id == 0)
+      num_comp_modes_ref = 2;
+    else if (bsize > BLOCK_16X16)
+      num_comp_modes_ref = 1;
+    else
+      tot_num_comp_modes = 0;
+  } else {
+    tot_num_comp_modes = 0;
+  }
+
   for (MV_REFERENCE_FRAME ref_frame_iter = LAST_FRAME;
        ref_frame_iter <= ALTREF_FRAME; ++ref_frame_iter) {
     if (use_ref_frame_mask[ref_frame_iter]) {
@@ -2210,10 +2502,10 @@
   const int use_model_yrd_large =
       cpi->oxcf.rc_cfg.mode == AOM_CBR && large_block &&
       !cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id) &&
-      quant_params->base_qindex && cm->seq_params.bit_depth == 8;
+      quant_params->base_qindex && cm->seq_params->bit_depth == 8;
 
   const int enable_filter_search =
-      is_filter_search_enabled(cpi, mi_row, mi_col, bsize);
+      is_filter_search_enabled(cpi, mi_row, mi_col, bsize, segment_id);
 
   // TODO(marpan): Look into reducing these conditions. For now constrain
   // it to avoid significant bdrate loss.
@@ -2239,20 +2531,67 @@
              tx_mode_to_biggest_tx_size[txfm_params->tx_mode_search_type]),
       TX_16X16);
 
-  for (int idx = 0; idx < num_inter_modes; ++idx) {
+  for (int idx = 0; idx < num_inter_modes + tot_num_comp_modes; ++idx) {
     const struct segmentation *const seg = &cm->seg;
 
     int rate_mv = 0;
     int is_skippable;
     int this_early_term = 0;
     int skip_this_mv = 0;
+    comp_pred = 0;
     PREDICTION_MODE this_mode;
     MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
     RD_STATS nonskip_rdc;
     av1_invalid_rd_stats(&nonskip_rdc);
 
-    this_mode = ref_mode_set[idx].pred_mode;
-    ref_frame = ref_mode_set[idx].ref_frame;
+    if (idx >= num_inter_modes) {
+      int comp_index = idx - num_inter_modes;
+      if (comp_index % 3 == 0) {
+        int i = 0;
+        ref_mv_idx = 0;
+        // Only needs to be done once per reference pair.
+        if (comp_index == 3) i = 1;
+        if (comp_index == 6) i = 2;
+        if (cpi->sf.rt_sf.ref_frame_comp_nonrd[i])
+          setup_compound_prediction(cpi, x, yv12_mb, use_ref_frame_mask, i,
+                                    &ref_mv_idx);
+      }
+      // num_comp_modes_ref == 1 only do (0,0)
+      if (num_comp_modes_ref == 1 && comp_index % 3 != 0) continue;
+      // num_comp_modes_ref == 2 only do (0,0) and (NEAREST_NEAREST)
+      if (num_comp_modes_ref == 2 && comp_index % 3 == 2) continue;
+      ref_frame = LAST_FRAME;
+      ref_frame2 = GOLDEN_FRAME;
+      if (comp_index >= 0 && comp_index < 3) {
+        // comp_index = 0,1,2 for (0/NEAREST/NEAR) for GOLDEN_LAST.
+        if (cpi->sf.rt_sf.ref_frame_comp_nonrd[0] == 0 ||
+            !(cpi->ref_frame_flags & AOM_GOLD_FLAG))
+          continue;
+      } else if (comp_index >= 3 && comp_index < 6) {
+        // comp_index = 3,4,5 for (0/NEAREST/NEAR) for LAST2_LAST.
+        ref_frame2 = LAST2_FRAME;
+        if (cpi->sf.rt_sf.ref_frame_comp_nonrd[1] == 0 ||
+            !(cpi->ref_frame_flags & AOM_LAST2_FLAG))
+          continue;
+      } else if (comp_index >= 6 && comp_index < 9) {
+        // comp_index = 6,7,8 for (0/NEAREST/NEAR) for ALTREF_LAST.
+        ref_frame2 = ALTREF_FRAME;
+        if (cpi->sf.rt_sf.ref_frame_comp_nonrd[2] == 0 ||
+            !(cpi->ref_frame_flags & AOM_ALT_FLAG))
+          continue;
+      }
+      set_compound_mode(x, comp_index, ref_frame, ref_frame2, ref_mv_idx,
+                        frame_mv, &this_mode);
+      if (this_mode != GLOBAL_GLOBALMV &&
+          frame_mv[this_mode][ref_frame].as_int == 0 &&
+          frame_mv[this_mode][ref_frame2].as_int == 0)
+        continue;
+      comp_pred = 1;
+    } else {
+      this_mode = ref_mode_set[idx].pred_mode;
+      ref_frame = ref_mode_set[idx].ref_frame;
+      ref_frame2 = NONE_FRAME;
+    }
 
 #if COLLECT_PICK_MODE_STAT
     aom_usec_timer_start(&ms_stat.timer1);
@@ -2260,11 +2599,12 @@
 #endif
     mi->mode = this_mode;
     mi->ref_frame[0] = ref_frame;
+    mi->ref_frame[1] = ref_frame2;
 
     if (!use_ref_frame_mask[ref_frame]) continue;
 
     force_mv_inter_layer = 0;
-    if (cpi->use_svc && svc->spatial_layer_id > 0 &&
+    if (cpi->ppi->use_svc && svc->spatial_layer_id > 0 &&
         ((ref_frame == LAST_FRAME && svc->skip_mvsearch_last) ||
          (ref_frame == GOLDEN_FRAME && svc->skip_mvsearch_gf))) {
       // Only test mode if NEARESTMV/NEARMV is (svc_mv_col, svc_mv_row),
@@ -2306,22 +2646,29 @@
         if ((int64_t)(x->pred_mv_sad[ref_frame]) > thresh_sad_pred) continue;
       }
     }
-
-    if (skip_mode_by_threshold(
-            this_mode, ref_frame, frame_mv[this_mode][ref_frame],
-            cpi->rc.frames_since_golden, rd_threshes, rd_thresh_freq_fact,
-            best_rdc.rdcost, best_pickmode.best_mode_skip_txfm,
-            (cpi->sf.rt_sf.nonrd_agressive_skip ? 1 : 0)))
+    // Check for skipping NEARMV based on pred_mv_sad.
+    if (this_mode == NEARMV && x->pred_mv1_sad[ref_frame] != INT_MAX &&
+        x->pred_mv1_sad[ref_frame] > (x->pred_mv0_sad[ref_frame] << 1))
       continue;
 
+    if (!comp_pred) {
+      if (skip_mode_by_threshold(
+              this_mode, ref_frame, frame_mv[this_mode][ref_frame],
+              cpi->rc.frames_since_golden, rd_threshes, rd_thresh_freq_fact,
+              best_rdc.rdcost, best_pickmode.best_mode_skip_txfm,
+              (cpi->sf.rt_sf.nonrd_agressive_skip ? 1 : 0)))
+        continue;
+    }
+
     // Select prediction reference frames.
     for (int i = 0; i < MAX_MB_PLANE; i++) {
       xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
+      if (comp_pred) xd->plane[i].pre[1] = yv12_mb[ref_frame2][i];
     }
 
     mi->ref_frame[0] = ref_frame;
-    mi->ref_frame[1] = NONE_FRAME;
-    set_ref_ptrs(cm, xd, ref_frame, NONE_FRAME);
+    mi->ref_frame[1] = ref_frame2;
+    set_ref_ptrs(cm, xd, ref_frame, ref_frame2);
 
     if (this_mode == NEWMV && !force_mv_inter_layer) {
       if (search_new_mv(cpi, x, frame_mv, ref_frame, gf_temporal_ref, bsize,
@@ -2340,11 +2687,13 @@
       }
     }
 
-    if (skip_this_mv) continue;
+    if (skip_this_mv && !comp_pred) continue;
 
     mi->mode = this_mode;
     mi->mv[0].as_int = frame_mv[this_mode][ref_frame].as_int;
     mi->mv[1].as_int = 0;
+    if (comp_pred) mi->mv[1].as_int = frame_mv[this_mode][ref_frame2].as_int;
+
     if (reuse_inter_pred) {
       if (!this_mode_pred) {
         this_mode_pred = &tmp[3];
@@ -2357,12 +2706,43 @@
 #if COLLECT_PICK_MODE_STAT
     ms_stat.num_nonskipped_searches[bsize][this_mode]++;
 #endif
-    if (enable_filter_search && !force_mv_inter_layer &&
+
+    if (idx == 0) {
+      // Set color sensitivity on first tested mode only.
+      // Use y-sad already computed in find_predictors: take the sad with motion
+      // vector closest to 0; the uv-sad computed below in set_color_sensitivity
+      // is for zeromv.
+      int y_sad = x->pred_mv0_sad[LAST_FRAME];
+      if (x->pred_mv1_sad[LAST_FRAME] != INT_MAX &&
+          (abs(frame_mv[NEARMV][LAST_FRAME].as_mv.col) +
+           abs(frame_mv[NEARMV][LAST_FRAME].as_mv.row)) <
+              (abs(frame_mv[NEARESTMV][LAST_FRAME].as_mv.col) +
+               abs(frame_mv[NEARESTMV][LAST_FRAME].as_mv.row)))
+        y_sad = x->pred_mv1_sad[LAST_FRAME];
+      set_color_sensitivity(cpi, x, xd, bsize, y_sad, x->source_variance);
+    }
+    mi->motion_mode = SIMPLE_TRANSLATION;
+#if !CONFIG_REALTIME_ONLY
+    if (cpi->oxcf.motion_mode_cfg.allow_warped_motion) {
+      calc_num_proj_ref(cpi, x, mi);
+    }
+#endif
+
+    if (enable_filter_search && !force_mv_inter_layer && !comp_pred &&
         ((mi->mv[0].as_mv.row & 0x07) || (mi->mv[0].as_mv.col & 0x07)) &&
         (ref_frame == LAST_FRAME || !x->nonrd_prune_ref_frame_search)) {
       search_filter_ref(cpi, x, &this_rdc, mi_row, mi_col, tmp, bsize,
                         reuse_inter_pred, &this_mode_pred, &this_early_term,
                         use_model_yrd_large);
+#if !CONFIG_REALTIME_ONLY
+    } else if (cpi->oxcf.motion_mode_cfg.allow_warped_motion &&
+               this_mode == NEWMV) {
+      search_motion_mode(cpi, x, &this_rdc, mi_row, mi_col, bsize,
+                         &this_early_term, use_model_yrd_large, &rate_mv);
+      if (this_mode == NEWMV) {
+        frame_mv[this_mode][ref_frame] = mi->mv[0];
+      }
+#endif
     } else {
       mi->interp_filters =
           (filter_ref == SWITCHABLE)
@@ -2371,7 +2751,25 @@
       if (force_mv_inter_layer)
         mi->interp_filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
 
-      av1_enc_build_inter_predictor_y(xd, mi_row, mi_col);
+      // If it is sub-pel motion and best filter was not selected in
+      // search_filter_ref() for all blocks, then check top and left values and
+      // force smooth if both were selected to be smooth.
+      if (cpi->sf.interp_sf.cb_pred_filter_search &&
+          (mi->mv[0].as_mv.row & 0x07 || mi->mv[0].as_mv.col & 0x07)) {
+        if (xd->left_mbmi && xd->above_mbmi) {
+          if ((xd->left_mbmi->interp_filters.as_filters.x_filter ==
+                   EIGHTTAP_SMOOTH &&
+               xd->above_mbmi->interp_filters.as_filters.x_filter ==
+                   EIGHTTAP_SMOOTH))
+            mi->interp_filters = av1_broadcast_interp_filter(EIGHTTAP_SMOOTH);
+        }
+      }
+      if (!comp_pred)
+        av1_enc_build_inter_predictor_y(xd, mi_row, mi_col);
+      else
+        av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0,
+                                      0);
+
       if (use_model_yrd_large) {
         model_skip_for_sb_y_large(cpi, bsize, mi_row, mi_col, x, xd, &this_rdc,
                                   &this_early_term, use_modeled_non_rd_cost);
@@ -2453,7 +2851,7 @@
     this_rdc.rate += ref_costs_single[ref_frame];
 
     this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist);
-    if (cpi->oxcf.rc_cfg.mode == AOM_CBR) {
+    if (cpi->oxcf.rc_cfg.mode == AOM_CBR && !comp_pred) {
       newmv_diff_bias(xd, this_mode, &this_rdc, bsize,
                       frame_mv[this_mode][ref_frame].as_mv.row,
                       frame_mv[this_mode][ref_frame].as_mv.col, cpi->speed,
@@ -2481,13 +2879,24 @@
       best_rdc = this_rdc;
       best_early_term = this_early_term;
       best_pickmode.best_mode = this_mode;
+      best_pickmode.best_motion_mode = mi->motion_mode;
+      best_pickmode.wm_params = mi->wm_params;
+      best_pickmode.num_proj_ref = mi->num_proj_ref;
       best_pickmode.best_pred_filter = mi->interp_filters;
       best_pickmode.best_tx_size = mi->tx_size;
       best_pickmode.best_ref_frame = ref_frame;
+      best_pickmode.best_second_ref_frame = ref_frame2;
       best_pickmode.best_mode_skip_txfm = this_rdc.skip_txfm;
       best_pickmode.best_mode_initial_skip_flag =
           (nonskip_rdc.rate == INT_MAX && this_rdc.skip_txfm);
 
+      // This is needed for the compound modes.
+      frame_mv_best[this_mode][ref_frame].as_int =
+          frame_mv[this_mode][ref_frame].as_int;
+      if (ref_frame2 > NONE_FRAME)
+        frame_mv_best[this_mode][ref_frame2].as_int =
+            frame_mv[this_mode][ref_frame2].as_int;
+
       if (reuse_inter_pred) {
         free_pred_buffer(best_pickmode.best_pred);
         best_pickmode.best_pred = this_mode_pred;
@@ -2502,13 +2911,23 @@
   }
 
   mi->mode = best_pickmode.best_mode;
+  mi->motion_mode = best_pickmode.best_motion_mode;
+  mi->wm_params = best_pickmode.wm_params;
+  mi->num_proj_ref = best_pickmode.num_proj_ref;
   mi->interp_filters = best_pickmode.best_pred_filter;
   mi->tx_size = best_pickmode.best_tx_size;
   memset(mi->inter_tx_size, mi->tx_size, sizeof(mi->inter_tx_size));
   mi->ref_frame[0] = best_pickmode.best_ref_frame;
   mi->mv[0].as_int =
-      frame_mv[best_pickmode.best_mode][best_pickmode.best_ref_frame].as_int;
-
+      frame_mv_best[best_pickmode.best_mode][best_pickmode.best_ref_frame]
+          .as_int;
+  mi->mv[1].as_int = 0;
+  if (best_pickmode.best_second_ref_frame > INTRA_FRAME) {
+    mi->ref_frame[1] = best_pickmode.best_second_ref_frame;
+    mi->mv[1].as_int = frame_mv_best[best_pickmode.best_mode]
+                                    [best_pickmode.best_second_ref_frame]
+                                        .as_int;
+  }
   // Perform intra prediction search, if the best SAD is above a certain
   // threshold.
   mi->angle_delta[PLANE_TYPE_Y] = 0;
@@ -2523,7 +2942,13 @@
   pd->dst = orig_dst;
   mi->mode = best_pickmode.best_mode;
   mi->ref_frame[0] = best_pickmode.best_ref_frame;
+  mi->ref_frame[1] = best_pickmode.best_second_ref_frame;
   txfm_info->skip_txfm = best_rdc.skip_txfm;
+  if (has_second_ref(mi)) {
+    mi->comp_group_idx = 0;
+    mi->compound_idx = 1;
+    mi->interinter_comp.type = COMPOUND_AVERAGE;
+  }
 
   if (!is_inter_block(mi)) {
     mi->interp_filters = av1_broadcast_interp_filter(SWITCHABLE_FILTERS);
@@ -2555,7 +2980,7 @@
   }
 #endif
 
-  if (cpi->sf.inter_sf.adaptive_rd_thresh) {
+  if (cpi->sf.inter_sf.adaptive_rd_thresh && !has_second_ref(mi)) {
     THR_MODES best_mode_idx =
         mode_idx[best_pickmode.best_ref_frame][mode_offset(mi->mode)];
     if (best_pickmode.best_ref_frame == INTRA_FRAME) {
diff --git a/av1/encoder/optical_flow.c b/av1/encoder/optical_flow.c
index 82ae9c5..3139a29 100644
--- a/av1/encoder/optical_flow.c
+++ b/av1/encoder/optical_flow.c
@@ -12,13 +12,15 @@
 #include <limits.h>
 
 #include "config/aom_config.h"
+
+#include "aom_dsp/mathutils.h"
+#include "aom_mem/aom_mem.h"
+
 #include "av1/common/av1_common_int.h"
 #include "av1/encoder/encoder.h"
-#include "av1/encoder/mathutils.h"
 #include "av1/encoder/optical_flow.h"
 #include "av1/encoder/sparse_linear_solver.h"
 #include "av1/encoder/reconinter_enc.h"
-#include "aom_mem/aom_mem.h"
 
 #if CONFIG_OPTICAL_FLOW_API
 
@@ -232,6 +234,12 @@
   // with normalization, gradients may be double values
   double *fullpel_dx = aom_malloc((ye - ys) * (xe - xs) * sizeof(deriv_x));
   double *fullpel_dy = aom_malloc((ye - ys) * (xe - xs) * sizeof(deriv_y));
+  if (!fullpel_dx || !fullpel_dy) {
+    aom_free(fullpel_dx);
+    aom_free(fullpel_dy);
+    return;
+  }
+
   // TODO(any): This could be more efficient in the case that x_coord
   // and y_coord are integers.. but it may look more messy.
 
@@ -512,9 +520,11 @@
   double *i_x = (double *)aom_malloc(n * n * sizeof(*i_x));
   double *i_y = (double *)aom_malloc(n * n * sizeof(*i_y));
   double *i_t = (double *)aom_malloc(n * n * sizeof(*i_t));
+  double *weights = (double *)aom_malloc(n * n * sizeof(*weights));
+  if (!i_x || !i_y || !i_t || !weights) goto free_lk_buf;
+
   const int expand_multiplier = (int)pow(2, level);
   double sigma = 0.2 * n;
-  double *weights = (double *)aom_malloc(n * n * sizeof(*weights));
   // normalizing doesn't really affect anything since it's applied
   // to every component of M and b
   gaussian(sigma, n, 0, weights);
@@ -564,6 +574,7 @@
       mvs[mv_idx] = mv;
     }
   }
+free_lk_buf:
   aom_free(weights);
   aom_free(i_t);
   aom_free(i_x);
@@ -722,12 +733,16 @@
   double *mv_init_vec = aom_calloc(width * height * 2, sizeof(*mv_init_vec));
   double *temp_b = aom_calloc(width * height * 2, sizeof(*temp_b));
   double *b = aom_calloc(width * height * 2, sizeof(*b));
+  if (!row_pos || !col_pos || !values || !mv_vec || !mv_init_vec || !temp_b ||
+      !b) {
+    goto free_hs_solver_buf;
+  }
 
   // the location idx for neighboring pixels, k < 4 are the 4 direct neighbors
   const int check_locs_y[12] = { 0, 0, -1, 1, -1, -1, 1, 1, 0, 0, -2, 2 };
   const int check_locs_x[12] = { -1, 1, 0, 0, -1, 1, -1, 1, -2, 2, 0, 0 };
 
-  int h, w, checkh, checkw, k;
+  int h, w, checkh, checkw, k, ret;
   const int offset = height * width;
   SPARSE_MTX A;
   int c = 0;
@@ -817,9 +832,10 @@
       }
     }
   }
-  av1_init_sparse_mtx(row_pos, col_pos, values, c, 2 * width * height,
-                      2 * width * height, &A);
-  // substract init mv part from b
+  ret = av1_init_sparse_mtx(row_pos, col_pos, values, c, 2 * width * height,
+                            2 * width * height, &A);
+  if (ret < 0) goto free_hs_solver_buf;
+  // subtract init mv part from b
   av1_mtx_vect_multi_left(&A, mv_init_vec, temp_b, 2 * width * height);
   for (int i = 0; i < 2 * width * height; i++) {
     b[i] = -temp_b[i];
@@ -859,11 +875,14 @@
     }
   }
 
-  av1_init_sparse_mtx(row_pos, col_pos, values, c, 2 * width * height,
-                      2 * width * height, &A);
+  ret = av1_init_sparse_mtx(row_pos, col_pos, values, c, 2 * width * height,
+                            2 * width * height, &A);
+  if (ret < 0) goto free_hs_solver_buf;
 
   // solve for the mvs
-  av1_conjugate_gradient_sparse(&A, b, 2 * width * height, mv_vec);
+  ret = av1_conjugate_gradient_sparse(&A, b, 2 * width * height, mv_vec);
+  if (ret < 0) goto free_hs_solver_buf;
+
   // copy mvs
   for (w = 0; w < width; w++) {
     for (h = 0; h < height; h++) {
@@ -871,6 +890,7 @@
       mvs[h * mv_stride + w].row = mv_vec[w * height + h + offset];
     }
   }
+free_hs_solver_buf:
   aom_free(row_pos);
   aom_free(col_pos);
   aom_free(values);
@@ -882,22 +902,27 @@
 }
 
 // Calculate optical flow from from_frame to to_frame using the H-S method.
-void horn_schunck(const YV12_BUFFER_CONFIG *from_frame,
-                  const YV12_BUFFER_CONFIG *to_frame, const int level,
-                  const int mv_stride, const int mv_height, const int mv_width,
-                  const OPFL_PARAMS *opfl_params, LOCALMV *mvs) {
+static void horn_schunck(const YV12_BUFFER_CONFIG *from_frame,
+                         const YV12_BUFFER_CONFIG *to_frame, const int level,
+                         const int mv_stride, const int mv_height,
+                         const int mv_width, const OPFL_PARAMS *opfl_params,
+                         LOCALMV *mvs) {
   // mvs are always on level 0, here we define two new mv arrays that is of size
   // of this level.
   const int fw = from_frame->y_crop_width;
   const int fh = from_frame->y_crop_height;
   const int factor = (int)pow(2, level);
   int w, h, k, init_mv_stride;
-  LOCALMV *init_mvs;
+  LOCALMV *init_mvs = NULL, *refine_mvs = NULL;
+  double *ix = NULL, *iy = NULL, *it = NULL;
+  YV12_BUFFER_CONFIG temp_frame;
+  temp_frame.y_buffer = NULL;
   if (level == 0) {
     init_mvs = mvs;
     init_mv_stride = mv_stride;
   } else {
     init_mvs = aom_calloc(fw * fh, sizeof(*mvs));
+    if (!init_mvs) goto free_hs_buf;
     init_mv_stride = fw;
     for (h = 0; h < fh; h++) {
       for (w = 0; w < fw; w++) {
@@ -908,18 +933,20 @@
       }
     }
   }
-  LOCALMV *refine_mvs = aom_calloc(fw * fh, sizeof(*mvs));
+  refine_mvs = aom_calloc(fw * fh, sizeof(*mvs));
+  if (!refine_mvs) goto free_hs_buf;
   // temp frame for warping
-  YV12_BUFFER_CONFIG temp_frame;
   temp_frame.y_buffer =
       (uint8_t *)aom_calloc(fh * fw, sizeof(*temp_frame.y_buffer));
+  if (!temp_frame.y_buffer) goto free_hs_buf;
   temp_frame.y_crop_height = fh;
   temp_frame.y_crop_width = fw;
   temp_frame.y_stride = fw;
   // gradient buffers
-  double *ix = aom_calloc(fw * fh, sizeof(*ix));
-  double *iy = aom_calloc(fw * fh, sizeof(*iy));
-  double *it = aom_calloc(fw * fh, sizeof(*it));
+  ix = aom_calloc(fw * fh, sizeof(*ix));
+  iy = aom_calloc(fw * fh, sizeof(*iy));
+  it = aom_calloc(fw * fh, sizeof(*it));
+  if (!ix || !iy || !it) goto free_hs_buf;
   // For each warping step
   for (k = 0; k < opfl_params->warping_steps; k++) {
     // warp from_frame with init_mv
@@ -954,6 +981,7 @@
       }
     }
   }
+free_hs_buf:
   if (level != 0) aom_free(init_mvs);
   aom_free(refine_mvs);
   aom_free(temp_frame.y_buffer);
@@ -977,12 +1005,15 @@
        frame_height / pow(2.0, levels - 1) < 50) &&
       levels > 1)
     levels = levels - 1;
-  uint8_t *images1[MAX_PYRAMID_LEVELS];
-  uint8_t *images2[MAX_PYRAMID_LEVELS];
+  uint8_t *images1[MAX_PYRAMID_LEVELS] = { NULL };
+  uint8_t *images2[MAX_PYRAMID_LEVELS] = { NULL };
+  int *ref_corners = NULL;
+
   images1[0] = from_frame->y_buffer;
   images2[0] = to_frame->y_buffer;
   YV12_BUFFER_CONFIG *buffers1 = aom_malloc(levels * sizeof(*buffers1));
   YV12_BUFFER_CONFIG *buffers2 = aom_malloc(levels * sizeof(*buffers2));
+  if (!buffers1 || !buffers2) goto free_pyramid_buf;
   buffers1[0] = *from_frame;
   buffers2[0] = *to_frame;
   int fw = frame_width;
@@ -991,6 +1022,7 @@
     // TODO(bohanli): may need to extend buffers for better interpolation SIMD
     images1[i] = (uint8_t *)aom_calloc(fh / 2 * fw / 2, sizeof(*images1[i]));
     images2[i] = (uint8_t *)aom_calloc(fh / 2 * fw / 2, sizeof(*images2[i]));
+    if (!images1[i] || !images2[i]) goto free_pyramid_buf;
     int stride;
     if (i == 1)
       stride = from_frame->y_stride;
@@ -1012,11 +1044,11 @@
     buffers2[i] = b;
   }
   // Compute corners for specific frame
-  int *ref_corners = NULL;
   int num_ref_corners = 0;
   if (is_sparse(opfl_params)) {
     int maxcorners = from_frame->y_crop_width * from_frame->y_crop_height;
     ref_corners = aom_malloc(maxcorners * 2 * sizeof(*ref_corners));
+    if (!ref_corners) goto free_pyramid_buf;
     num_ref_corners = detect_corners(from_frame, to_frame, maxcorners,
                                      ref_corners, bit_depth);
   }
@@ -1034,6 +1066,7 @@
                    opfl_params, mvs);
     }
   }
+free_pyramid_buf:
   for (int i = 1; i < levels; i++) {
     aom_free(images1[i]);
     aom_free(images2[i]);
@@ -1082,6 +1115,7 @@
   // Initialize double mvs based on input parameter mvs array
   LOCALMV *localmvs =
       aom_malloc(frame_height * frame_width * sizeof(*localmvs));
+  if (!localmvs) return;
 
   filter_mvs(MV_FILTER_SMOOTH, frame_height, frame_width, localmvs, mvs);
 
diff --git a/av1/encoder/palette.c b/av1/encoder/palette.c
index fd579b7..a689ad2 100644
--- a/av1/encoder/palette.c
+++ b/av1/encoder/palette.c
@@ -217,13 +217,15 @@
 static AOM_INLINE void palette_rd_y(
     const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
     BLOCK_SIZE bsize, int dc_mode_cost, const int *data, int *centroids, int n,
-    uint16_t *color_cache, int n_cache, MB_MODE_INFO *best_mbmi,
-    uint8_t *best_palette_color_map, int64_t *best_rd, int64_t *best_model_rd,
+    uint16_t *color_cache, int n_cache, bool do_header_rd_based_gating,
+    MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, int64_t *best_rd,
     int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable,
     int *beat_best_rd, PICK_MODE_CONTEXT *ctx, uint8_t *blk_skip,
-    uint8_t *tx_type_map, int *beat_best_palette_rd) {
+    uint8_t *tx_type_map, int *beat_best_palette_rd,
+    bool *do_header_rd_based_breakout) {
+  if (do_header_rd_based_breakout != NULL) *do_header_rd_based_breakout = false;
   optimize_palette_colors(color_cache, n_cache, n, 1, centroids,
-                          cpi->common.seq_params.bit_depth);
+                          cpi->common.seq_params->bit_depth);
   const int num_unique_colors = av1_remove_duplicates(centroids, n);
   if (num_unique_colors < PALETTE_MIN_SIZE) {
     // Too few unique colors to create a palette. And DC_PRED will work
@@ -231,10 +233,10 @@
     return;
   }
   PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
-  if (cpi->common.seq_params.use_highbitdepth) {
+  if (cpi->common.seq_params->use_highbitdepth) {
     for (int i = 0; i < num_unique_colors; ++i) {
       pmi->palette_colors[i] = clip_pixel_highbd(
-          (int)centroids[i], cpi->common.seq_params.bit_depth);
+          (int)centroids[i], cpi->common.seq_params->bit_depth);
     }
   } else {
     for (int i = 0; i < num_unique_colors; ++i) {
@@ -251,17 +253,37 @@
                    1);
   extend_palette_color_map(color_map, cols, rows, block_width, block_height);
 
-  if (model_intra_yrd_and_prune(cpi, x, bsize, best_model_rd)) {
-    return;
+  RD_STATS tokenonly_rd_stats;
+  int this_rate;
+
+  if (do_header_rd_based_gating) {
+    assert(do_header_rd_based_breakout != NULL);
+    const int palette_mode_rate =
+        intra_mode_info_cost_y(cpi, x, mbmi, bsize, dc_mode_cost);
+    const int64_t header_rd = RDCOST(x->rdmult, palette_mode_rate, 0);
+    // Less aggressive pruning when prune_luma_palette_size_search_level == 1.
+    const int header_rd_shift =
+        (cpi->sf.intra_sf.prune_luma_palette_size_search_level == 1) ? 1 : 0;
+    // Terminate further palette_size search, if the header cost corresponding
+    // to lower palette_size is more than *best_rd << header_rd_shift. This
+    // logic is implemented with a right shift in the LHS to prevent a possible
+    // overflow with the left shift in RHS.
+    if ((header_rd >> header_rd_shift) > *best_rd) {
+      *do_header_rd_based_breakout = true;
+      return;
+    }
+    av1_pick_uniform_tx_size_type_yrd(cpi, x, &tokenonly_rd_stats, bsize,
+                                      *best_rd);
+    if (tokenonly_rd_stats.rate == INT_MAX) return;
+    this_rate = tokenonly_rd_stats.rate + palette_mode_rate;
+  } else {
+    av1_pick_uniform_tx_size_type_yrd(cpi, x, &tokenonly_rd_stats, bsize,
+                                      *best_rd);
+    if (tokenonly_rd_stats.rate == INT_MAX) return;
+    this_rate = tokenonly_rd_stats.rate +
+                intra_mode_info_cost_y(cpi, x, mbmi, bsize, dc_mode_cost);
   }
 
-  RD_STATS tokenonly_rd_stats;
-  av1_pick_uniform_tx_size_type_yrd(cpi, x, &tokenonly_rd_stats, bsize,
-                                    *best_rd);
-  if (tokenonly_rd_stats.rate == INT_MAX) return;
-  const int palette_mode_cost =
-      intra_mode_info_cost_y(cpi, x, mbmi, bsize, dc_mode_cost);
-  int this_rate = tokenonly_rd_stats.rate + palette_mode_cost;
   int64_t this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
   if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(mbmi->bsize)) {
     tokenonly_rd_stats.rate -= tx_size_cost(x, bsize, mbmi->tx_size);
@@ -302,9 +324,9 @@
 static AOM_INLINE int perform_top_color_palette_search(
     const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
     BLOCK_SIZE bsize, int dc_mode_cost, const int *data, int *top_colors,
-    int start_n, int end_n, int step_size, int *last_n_searched,
-    uint16_t *color_cache, int n_cache, MB_MODE_INFO *best_mbmi,
-    uint8_t *best_palette_color_map, int64_t *best_rd, int64_t *best_model_rd,
+    int start_n, int end_n, int step_size, bool do_header_rd_based_gating,
+    int *last_n_searched, uint16_t *color_cache, int n_cache,
+    MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, int64_t *best_rd,
     int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable,
     int *beat_best_rd, PICK_MODE_CONTEXT *ctx, uint8_t *best_blk_skip,
     uint8_t *tx_type_map) {
@@ -317,13 +339,20 @@
   assert(IMPLIES(step_size > 0, start_n < end_n));
   while (!is_iter_over(n, end_n, step_size)) {
     int beat_best_palette_rd = 0;
+    bool do_header_rd_based_breakout = false;
     memcpy(centroids, top_colors, n * sizeof(top_colors[0]));
     palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n,
-                 color_cache, n_cache, best_mbmi, best_palette_color_map,
-                 best_rd, best_model_rd, rate, rate_tokenonly, distortion,
-                 skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map,
-                 &beat_best_palette_rd);
+                 color_cache, n_cache, do_header_rd_based_gating, best_mbmi,
+                 best_palette_color_map, best_rd, rate, rate_tokenonly,
+                 distortion, skippable, beat_best_rd, ctx, best_blk_skip,
+                 tx_type_map, &beat_best_palette_rd,
+                 &do_header_rd_based_breakout);
     *last_n_searched = n;
+    if (do_header_rd_based_breakout) {
+      // Terminate palette_size search by setting last_n_searched to end_n.
+      *last_n_searched = end_n;
+      break;
+    }
     if (beat_best_palette_rd) {
       top_color_winner = n;
     } else if (cpi->sf.intra_sf.prune_palette_search_level == 2) {
@@ -343,9 +372,9 @@
     const AV1_COMP *const cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi,
     BLOCK_SIZE bsize, int dc_mode_cost, const int *data, int lower_bound,
     int upper_bound, int start_n, int end_n, int step_size,
-    int *last_n_searched, uint16_t *color_cache, int n_cache,
-    MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, int64_t *best_rd,
-    int64_t *best_model_rd, int *rate, int *rate_tokenonly, int64_t *distortion,
+    bool do_header_rd_based_gating, int *last_n_searched, uint16_t *color_cache,
+    int n_cache, MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map,
+    int64_t *best_rd, int *rate, int *rate_tokenonly, int64_t *distortion,
     int *skippable, int *beat_best_rd, PICK_MODE_CONTEXT *ctx,
     uint8_t *best_blk_skip, uint8_t *tx_type_map, uint8_t *color_map,
     int data_points) {
@@ -359,17 +388,24 @@
   assert(IMPLIES(step_size > 0, start_n < end_n));
   while (!is_iter_over(n, end_n, step_size)) {
     int beat_best_palette_rd = 0;
+    bool do_header_rd_based_breakout = false;
     for (int i = 0; i < n; ++i) {
       centroids[i] =
           lower_bound + (2 * i + 1) * (upper_bound - lower_bound) / n / 2;
     }
     av1_k_means(data, centroids, color_map, data_points, n, 1, max_itr);
     palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, n,
-                 color_cache, n_cache, best_mbmi, best_palette_color_map,
-                 best_rd, best_model_rd, rate, rate_tokenonly, distortion,
-                 skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map,
-                 &beat_best_palette_rd);
+                 color_cache, n_cache, do_header_rd_based_gating, best_mbmi,
+                 best_palette_color_map, best_rd, rate, rate_tokenonly,
+                 distortion, skippable, beat_best_rd, ctx, best_blk_skip,
+                 tx_type_map, &beat_best_palette_rd,
+                 &do_header_rd_based_breakout);
     *last_n_searched = n;
+    if (do_header_rd_based_breakout) {
+      // Terminate palette_size search by setting last_n_searched to end_n.
+      *last_n_searched = end_n;
+      break;
+    }
     if (beat_best_palette_rd) {
       top_color_winner = n;
     } else if (cpi->sf.intra_sf.prune_palette_search_level == 2) {
@@ -434,9 +470,9 @@
 void av1_rd_pick_palette_intra_sby(
     const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int dc_mode_cost,
     MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map, int64_t *best_rd,
-    int64_t *best_model_rd, int *rate, int *rate_tokenonly, int64_t *distortion,
-    int *skippable, int *beat_best_rd, PICK_MODE_CONTEXT *ctx,
-    uint8_t *best_blk_skip, uint8_t *tx_type_map) {
+    int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable,
+    int *beat_best_rd, PICK_MODE_CONTEXT *ctx, uint8_t *best_blk_skip,
+    uint8_t *tx_type_map) {
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
   assert(!is_inter_block(mbmi));
@@ -450,7 +486,7 @@
   int block_width, block_height, rows, cols;
   av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows,
                            &cols);
-  const SequenceHeader *const seq_params = &cpi->common.seq_params;
+  const SequenceHeader *const seq_params = cpi->common.seq_params;
   const int is_hbd = seq_params->use_highbitdepth;
   const int bit_depth = seq_params->bit_depth;
   int unused;
@@ -494,6 +530,19 @@
       count_buf[top_colors[i]] = 0;
     }
 
+    // The following are the approaches used for header rdcost based gating
+    // for early termination for different values of prune_palette_search_level.
+    // 0: Pruning based on header rdcost for ascending order palette_size
+    // search.
+    // 1: When colors > PALETTE_MIN_SIZE, enabled only for coarse palette_size
+    // search and for finer search do_header_rd_based_gating parameter is
+    // explicitly passed as 'false'.
+    // 2: Enabled only for ascending order palette_size search and for
+    // descending order search do_header_rd_based_gating parameter is explicitly
+    // passed as 'false'.
+    const bool do_header_rd_based_gating =
+        cpi->sf.intra_sf.prune_luma_palette_size_search_level != 0;
+
     // TODO(huisu@google.com): Try to avoid duplicate computation in cases
     // where the dominant colors and the k-means results are similar.
     if ((cpi->sf.intra_sf.prune_palette_search_level == 1) &&
@@ -527,12 +576,11 @@
       const int min_n = start_n_lookup_table[max_n];
       const int step_size = step_size_lookup_table[max_n];
       assert(min_n >= PALETTE_MIN_SIZE);
-
       // Perform top color coarse palette search to find the winner candidate
       const int top_color_winner = perform_top_color_palette_search(
           cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, min_n, max_n + 1,
-          step_size, &unused, color_cache, n_cache, best_mbmi,
-          best_palette_color_map, best_rd, best_model_rd, rate, rate_tokenonly,
+          step_size, do_header_rd_based_gating, &unused, color_cache, n_cache,
+          best_mbmi, best_palette_color_map, best_rd, rate, rate_tokenonly,
           distortion, skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map);
       // Evaluate neighbors for the winner color (if winner is found) in the
       // above coarse search for dominant colors
@@ -543,19 +591,20 @@
         // perform finer search for the winner candidate
         perform_top_color_palette_search(
             cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, stage2_min_n,
-            stage2_max_n + 1, stage2_step_size, &unused, color_cache, n_cache,
-            best_mbmi, best_palette_color_map, best_rd, best_model_rd, rate,
-            rate_tokenonly, distortion, skippable, beat_best_rd, ctx,
-            best_blk_skip, tx_type_map);
+            stage2_max_n + 1, stage2_step_size,
+            /*do_header_rd_based_gating=*/false, &unused, color_cache, n_cache,
+            best_mbmi, best_palette_color_map, best_rd, rate, rate_tokenonly,
+            distortion, skippable, beat_best_rd, ctx, best_blk_skip,
+            tx_type_map);
       }
       // K-means clustering.
       // Perform k-means coarse palette search to find the winner candidate
       const int k_means_winner = perform_k_means_palette_search(
           cpi, x, mbmi, bsize, dc_mode_cost, data, lower_bound, upper_bound,
-          min_n, max_n + 1, step_size, &unused, color_cache, n_cache, best_mbmi,
-          best_palette_color_map, best_rd, best_model_rd, rate, rate_tokenonly,
-          distortion, skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map,
-          color_map, rows * cols);
+          min_n, max_n + 1, step_size, do_header_rd_based_gating, &unused,
+          color_cache, n_cache, best_mbmi, best_palette_color_map, best_rd,
+          rate, rate_tokenonly, distortion, skippable, beat_best_rd, ctx,
+          best_blk_skip, tx_type_map, color_map, rows * cols);
       // Evaluate neighbors for the winner color (if winner is found) in the
       // above coarse search for k-means
       if (k_means_winner <= max_n) {
@@ -565,30 +614,29 @@
         // perform finer search for the winner candidate
         perform_k_means_palette_search(
             cpi, x, mbmi, bsize, dc_mode_cost, data, lower_bound, upper_bound,
-            start_n_stage2, end_n_stage2 + 1, step_size_stage2, &unused,
-            color_cache, n_cache, best_mbmi, best_palette_color_map, best_rd,
-            best_model_rd, rate, rate_tokenonly, distortion, skippable,
-            beat_best_rd, ctx, best_blk_skip, tx_type_map, color_map,
-            rows * cols);
+            start_n_stage2, end_n_stage2 + 1, step_size_stage2,
+            /*do_header_rd_based_gating=*/false, &unused, color_cache, n_cache,
+            best_mbmi, best_palette_color_map, best_rd, rate, rate_tokenonly,
+            distortion, skippable, beat_best_rd, ctx, best_blk_skip,
+            tx_type_map, color_map, rows * cols);
       }
     } else {
       const int max_n = AOMMIN(colors, PALETTE_MAX_SIZE),
                 min_n = PALETTE_MIN_SIZE;
-      // Perform top color palette search in descending order
-      int last_n_searched = max_n;
+      // Perform top color palette search in ascending order
+      int last_n_searched = min_n;
       perform_top_color_palette_search(
-          cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, max_n, min_n - 1,
-          -1, &last_n_searched, color_cache, n_cache, best_mbmi,
-          best_palette_color_map, best_rd, best_model_rd, rate, rate_tokenonly,
+          cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, min_n, max_n + 1,
+          1, do_header_rd_based_gating, &last_n_searched, color_cache, n_cache,
+          best_mbmi, best_palette_color_map, best_rd, rate, rate_tokenonly,
           distortion, skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map);
-
-      if (last_n_searched > min_n) {
-        // Search in ascending order until we get to the previous best
+      if (last_n_searched < max_n) {
+        // Search in descending order until we get to the previous best
         perform_top_color_palette_search(
-            cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, min_n,
-            last_n_searched, 1, &unused, color_cache, n_cache, best_mbmi,
-            best_palette_color_map, best_rd, best_model_rd, rate,
-            rate_tokenonly, distortion, skippable, beat_best_rd, ctx,
+            cpi, x, mbmi, bsize, dc_mode_cost, data, top_colors, max_n,
+            last_n_searched, -1, /*do_header_rd_based_gating=*/false, &unused,
+            color_cache, n_cache, best_mbmi, best_palette_color_map, best_rd,
+            rate, rate_tokenonly, distortion, skippable, beat_best_rd, ctx,
             best_blk_skip, tx_type_map);
       }
       // K-means clustering.
@@ -598,27 +646,28 @@
         centroids[0] = lower_bound;
         centroids[1] = upper_bound;
         palette_rd_y(cpi, x, mbmi, bsize, dc_mode_cost, data, centroids, colors,
-                     color_cache, n_cache, best_mbmi, best_palette_color_map,
-                     best_rd, best_model_rd, rate, rate_tokenonly, distortion,
-                     skippable, beat_best_rd, ctx, best_blk_skip, tx_type_map,
-                     NULL);
+                     color_cache, n_cache, /*do_header_rd_based_gating=*/false,
+                     best_mbmi, best_palette_color_map, best_rd, rate,
+                     rate_tokenonly, distortion, skippable, beat_best_rd, ctx,
+                     best_blk_skip, tx_type_map, NULL, NULL);
       } else {
-        // Perform k-means palette search in descending order
-        last_n_searched = max_n;
+        // Perform k-means palette search in ascending order
+        last_n_searched = min_n;
         perform_k_means_palette_search(
             cpi, x, mbmi, bsize, dc_mode_cost, data, lower_bound, upper_bound,
-            max_n, min_n - 1, -1, &last_n_searched, color_cache, n_cache,
-            best_mbmi, best_palette_color_map, best_rd, best_model_rd, rate,
-            rate_tokenonly, distortion, skippable, beat_best_rd, ctx,
+            min_n, max_n + 1, 1, do_header_rd_based_gating, &last_n_searched,
+            color_cache, n_cache, best_mbmi, best_palette_color_map, best_rd,
+            rate, rate_tokenonly, distortion, skippable, beat_best_rd, ctx,
             best_blk_skip, tx_type_map, color_map, rows * cols);
-        if (last_n_searched > min_n) {
-          // Search in ascending order until we get to the previous best
+        if (last_n_searched < max_n) {
+          // Search in descending order until we get to the previous best
           perform_k_means_palette_search(
               cpi, x, mbmi, bsize, dc_mode_cost, data, lower_bound, upper_bound,
-              min_n, last_n_searched, 1, &unused, color_cache, n_cache,
-              best_mbmi, best_palette_color_map, best_rd, best_model_rd, rate,
-              rate_tokenonly, distortion, skippable, beat_best_rd, ctx,
-              best_blk_skip, tx_type_map, color_map, rows * cols);
+              max_n, last_n_searched, -1, /*do_header_rd_based_gating=*/false,
+              &unused, color_cache, n_cache, best_mbmi, best_palette_color_map,
+              best_rd, rate, rate_tokenonly, distortion, skippable,
+              beat_best_rd, ctx, best_blk_skip, tx_type_map, color_map,
+              rows * cols);
         }
       }
     }
@@ -645,10 +694,10 @@
                            mbmi->bsize));
   PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
   const BLOCK_SIZE bsize = mbmi->bsize;
-  const SequenceHeader *const seq_params = &cpi->common.seq_params;
+  const SequenceHeader *const seq_params = cpi->common.seq_params;
   int this_rate;
   int64_t this_rd;
-  int colors_u, colors_v, colors;
+  int colors_u, colors_v;
   int colors_threshold_u = 0, colors_threshold_v = 0, colors_threshold = 0;
   const int src_stride = x->plane[1].src.stride;
   const uint8_t *const src_u = x->plane[1].src.buf;
@@ -679,7 +728,6 @@
   uint16_t color_cache[2 * PALETTE_MAX_SIZE];
   const int n_cache = av1_get_palette_cache(xd, 1, color_cache);
 
-  colors = colors_u > colors_v ? colors_u : colors_v;
   colors_threshold = colors_threshold_u > colors_threshold_v
                          ? colors_threshold_u
                          : colors_threshold_v;
@@ -729,15 +777,17 @@
       }
     }
 
-    for (n = colors > PALETTE_MAX_SIZE ? PALETTE_MAX_SIZE : colors; n >= 2;
-         --n) {
+    const int colors = colors_u > colors_v ? colors_u : colors_v;
+    const int max_colors =
+        colors > PALETTE_MAX_SIZE ? PALETTE_MAX_SIZE : colors;
+    for (n = PALETTE_MIN_SIZE; n <= max_colors; ++n) {
       for (i = 0; i < n; ++i) {
         centroids[i * 2] = lb_u + (2 * i + 1) * (ub_u - lb_u) / n / 2;
         centroids[i * 2 + 1] = lb_v + (2 * i + 1) * (ub_v - lb_v) / n / 2;
       }
       av1_k_means(data, centroids, color_map, rows * cols, n, 2, max_itr);
       optimize_palette_colors(color_cache, n_cache, n, 2, centroids,
-                              cpi->common.seq_params.bit_depth);
+                              cpi->common.seq_params->bit_depth);
       // Sort the U channel colors in ascending order.
       for (i = 0; i < 2 * (n - 1); i += 2) {
         int min_idx = i;
@@ -766,10 +816,23 @@
         }
       }
 
-      av1_txfm_uvrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd);
-      if (tokenonly_rd_stats.rate == INT_MAX) continue;
-      this_rate = tokenonly_rd_stats.rate +
-                  intra_mode_info_cost_uv(cpi, x, mbmi, bsize, dc_mode_cost);
+      if (cpi->sf.intra_sf.early_term_chroma_palette_size_search) {
+        const int palette_mode_rate =
+            intra_mode_info_cost_uv(cpi, x, mbmi, bsize, dc_mode_cost);
+        const int64_t header_rd = RDCOST(x->rdmult, palette_mode_rate, 0);
+        // Terminate further palette_size search, if header cost corresponding
+        // to lower palette_size is more than the best_rd.
+        if (header_rd >= *best_rd) break;
+        av1_txfm_uvrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd);
+        if (tokenonly_rd_stats.rate == INT_MAX) continue;
+        this_rate = tokenonly_rd_stats.rate + palette_mode_rate;
+      } else {
+        av1_txfm_uvrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd);
+        if (tokenonly_rd_stats.rate == INT_MAX) continue;
+        this_rate = tokenonly_rd_stats.rate +
+                    intra_mode_info_cost_uv(cpi, x, mbmi, bsize, dc_mode_cost);
+      }
+
       this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
       if (this_rd < *best_rd) {
         *best_rd = this_rd;
@@ -811,7 +874,7 @@
 
   for (r = 0; r < rows; ++r) {
     for (c = 0; c < cols; ++c) {
-      if (cpi->common.seq_params.use_highbitdepth) {
+      if (cpi->common.seq_params->use_highbitdepth) {
         data[(r * cols + c) * 2] = src_u16[r * src_stride + c];
         data[(r * cols + c) * 2 + 1] = src_v16[r * src_stride + c];
       } else {
diff --git a/av1/encoder/palette.h b/av1/encoder/palette.h
index 85af473..7d9a72f 100644
--- a/av1/encoder/palette.h
+++ b/av1/encoder/palette.h
@@ -185,10 +185,9 @@
 void av1_rd_pick_palette_intra_sby(
     const struct AV1_COMP *cpi, struct macroblock *x, BLOCK_SIZE bsize,
     int dc_mode_cost, MB_MODE_INFO *best_mbmi, uint8_t *best_palette_color_map,
-    int64_t *best_rd, int64_t *best_model_rd, int *rate, int *rate_tokenonly,
-    int64_t *distortion, int *skippable, int *beat_best_rd,
-    struct PICK_MODE_CONTEXT *ctx, uint8_t *best_blk_skip,
-    uint8_t *tx_type_map);
+    int64_t *best_rd, int *rate, int *rate_tokenonly, int64_t *distortion,
+    int *skippable, int *beat_best_rd, struct PICK_MODE_CONTEXT *ctx,
+    uint8_t *best_blk_skip, uint8_t *tx_type_map);
 
 /*!\brief Search for the best palette in the chroma plane.
  *
diff --git a/av1/encoder/partition_search.c b/av1/encoder/partition_search.c
index 2c1897d..1bc6b32 100644
--- a/av1/encoder/partition_search.c
+++ b/av1/encoder/partition_search.c
@@ -9,8 +9,6 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#include "aom_ports/system_state.h"
-
 #include "av1/common/av1_common_int.h"
 #include "av1/common/blockd.h"
 #include "av1/common/enums.h"
@@ -25,6 +23,7 @@
 #include "av1/encoder/encodemv.h"
 #include "av1/encoder/motion_search_facade.h"
 #include "av1/encoder/partition_search.h"
+#include "av1/encoder/partition_strategy.h"
 #include "av1/encoder/reconinter_enc.h"
 #include "av1/encoder/tokenize.h"
 #include "av1/encoder/var_based_part.h"
@@ -34,6 +33,9 @@
 #include "av1/encoder/tune_vmaf.h"
 #endif
 
+#define COLLECT_MOTION_SEARCH_FEATURE_SB 0
+#define ML_PARTITION_WHOLE_TREE_DECISION 0
+
 void av1_reset_part_sf(PARTITION_SPEED_FEATURES *part_sf) {
   part_sf->partition_search_type = SEARCH_PARTITION;
   part_sf->less_rectangular_check_level = 0;
@@ -42,7 +44,6 @@
   part_sf->default_max_partition_size = BLOCK_LARGEST;
   part_sf->default_min_partition_size = BLOCK_4X4;
   part_sf->adjust_var_based_rd_partitioning = 0;
-  part_sf->allow_partition_search_skip = 0;
   part_sf->max_intra_bsize = BLOCK_LARGEST;
   // This setting only takes effect when partition_search_type is set
   // to FIXED_PARTITION.
@@ -63,15 +64,132 @@
   part_sf->simple_motion_search_prune_rect = 0;
   part_sf->simple_motion_search_early_term_none = 0;
   part_sf->simple_motion_search_reduce_search_steps = 0;
-  part_sf->intra_cnn_split = 0;
+  part_sf->intra_cnn_based_part_prune_level = 0;
   part_sf->ext_partition_eval_thresh = BLOCK_8X8;
+  part_sf->rect_partition_eval_thresh = BLOCK_128X128;
   part_sf->prune_ext_part_using_split_info = 0;
   part_sf->prune_rectangular_split_based_on_qidx = 0;
   part_sf->early_term_after_none_split = 0;
   part_sf->ml_predict_breakout_level = 0;
   part_sf->prune_sub_8x8_partition_level = 0;
+  part_sf->simple_motion_search_rect_split = 0;
+  part_sf->reuse_prev_rd_results_for_part_ab = 0;
+  part_sf->reuse_best_prediction_for_part_ab = 0;
+  part_sf->use_best_rd_for_pruning = 0;
+  part_sf->skip_non_sq_part_based_on_none = 0;
 }
 
+#if !CONFIG_REALTIME_ONLY
+// If input |features| is NULL, write tpl stats to file for each super block.
+// Otherwise, store tpl stats to |features|.
+// The tpl stats is computed in the unit of tpl_bsize_1d (16x16).
+// When writing to text file:
+// The first row contains super block position, super block size,
+// tpl unit length, number of units in the super block.
+// The second row contains the intra prediction cost for each unit.
+// The third row contains the inter prediction cost for each unit.
+// The fourth row contains the motion compensated dependency cost for each unit.
+static void collect_tpl_stats_sb(const AV1_COMP *const cpi,
+                                 const BLOCK_SIZE bsize, const int mi_row,
+                                 const int mi_col,
+                                 aom_partition_features_t *features) {
+  const AV1_COMMON *const cm = &cpi->common;
+  GF_GROUP *gf_group = &cpi->ppi->gf_group;
+  if (gf_group->update_type[cpi->gf_frame_index] == INTNL_OVERLAY_UPDATE ||
+      gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE) {
+    return;
+  }
+
+  TplParams *const tpl_data = &cpi->ppi->tpl_data;
+  TplDepFrame *tpl_frame = &tpl_data->tpl_frame[cpi->gf_frame_index];
+  TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
+  // If tpl stats is not established, early return
+  if (!tpl_data->ready || gf_group->max_layer_depth_allowed == 0) {
+    features->sb_features.tpl_features.available = 0;
+    return;
+  }
+
+  const int tpl_stride = tpl_frame->stride;
+  const int step = 1 << tpl_data->tpl_stats_block_mis_log2;
+  const int mi_width =
+      AOMMIN(mi_size_wide[bsize], cm->mi_params.mi_cols - mi_col);
+  const int mi_height =
+      AOMMIN(mi_size_high[bsize], cm->mi_params.mi_rows - mi_row);
+  const int col_steps = (mi_width / step) + ((mi_width % step) > 0);
+  const int row_steps = (mi_height / step) + ((mi_height % step) > 0);
+  const int num_blocks = col_steps * row_steps;
+
+  if (features == NULL) {
+    char filename[256];
+    snprintf(filename, sizeof(filename), "%s/tpl_feature_sb%d",
+             cpi->oxcf.partition_info_path, cpi->sb_counter);
+    FILE *pfile = fopen(filename, "w");
+    fprintf(pfile, "%d,%d,%d,%d,%d\n", mi_row, mi_col, bsize,
+            tpl_data->tpl_bsize_1d, num_blocks);
+    int count = 0;
+    for (int row = 0; row < mi_height; row += step) {
+      for (int col = 0; col < mi_width; col += step) {
+        TplDepStats *this_stats =
+            &tpl_stats[av1_tpl_ptr_pos(mi_row + row, mi_col + col, tpl_stride,
+                                       tpl_data->tpl_stats_block_mis_log2)];
+        fprintf(pfile, "%.0f", (double)this_stats->intra_cost);
+        if (count < num_blocks - 1) fprintf(pfile, ",");
+        ++count;
+      }
+    }
+    fprintf(pfile, "\n");
+    count = 0;
+    for (int row = 0; row < mi_height; row += step) {
+      for (int col = 0; col < mi_width; col += step) {
+        TplDepStats *this_stats =
+            &tpl_stats[av1_tpl_ptr_pos(mi_row + row, mi_col + col, tpl_stride,
+                                       tpl_data->tpl_stats_block_mis_log2)];
+        fprintf(pfile, "%.0f", (double)this_stats->inter_cost);
+        if (count < num_blocks - 1) fprintf(pfile, ",");
+        ++count;
+      }
+    }
+    fprintf(pfile, "\n");
+    count = 0;
+    for (int row = 0; row < mi_height; row += step) {
+      for (int col = 0; col < mi_width; col += step) {
+        TplDepStats *this_stats =
+            &tpl_stats[av1_tpl_ptr_pos(mi_row + row, mi_col + col, tpl_stride,
+                                       tpl_data->tpl_stats_block_mis_log2)];
+        const int64_t mc_dep_delta =
+            RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
+                   this_stats->mc_dep_dist);
+        fprintf(pfile, "%.0f", (double)mc_dep_delta);
+        if (count < num_blocks - 1) fprintf(pfile, ",");
+        ++count;
+      }
+    }
+    fclose(pfile);
+  } else {
+    features->sb_features.tpl_features.available = 1;
+    features->sb_features.tpl_features.tpl_unit_length = tpl_data->tpl_bsize_1d;
+    features->sb_features.tpl_features.num_units = num_blocks;
+    int count = 0;
+    for (int row = 0; row < mi_height; row += step) {
+      for (int col = 0; col < mi_width; col += step) {
+        TplDepStats *this_stats =
+            &tpl_stats[av1_tpl_ptr_pos(mi_row + row, mi_col + col, tpl_stride,
+                                       tpl_data->tpl_stats_block_mis_log2)];
+        const int64_t mc_dep_delta =
+            RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
+                   this_stats->mc_dep_dist);
+        features->sb_features.tpl_features.intra_cost[count] =
+            this_stats->intra_cost;
+        features->sb_features.tpl_features.inter_cost[count] =
+            this_stats->inter_cost;
+        features->sb_features.tpl_features.mc_dep_cost[count] = mc_dep_delta;
+        ++count;
+      }
+    }
+  }
+}
+#endif  // !CONFIG_REALTIME_ONLY
+
 static void update_txfm_count(MACROBLOCK *x, MACROBLOCKD *xd,
                               FRAME_COUNTS *counts, TX_SIZE tx_size, int depth,
                               int blk_row, int blk_col,
@@ -322,7 +440,7 @@
                            xd->block_ref_scale_factors[ref], num_planes);
     }
     const int start_plane = (cpi->sf.rt_sf.reuse_inter_pred_nonrd &&
-                             cm->seq_params.bit_depth == AOM_BITS_8)
+                             cm->seq_params->bit_depth == AOM_BITS_8)
                                 ? 1
                                 : 0;
     av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
@@ -434,10 +552,10 @@
     cfl_store_block(xd, mbmi->bsize, mbmi->tx_size);
   }
   if (!dry_run) {
-    if (cpi->oxcf.pass == 0 && cpi->svc.temporal_layer_id == 0 &&
+    if (cpi->oxcf.pass == AOM_RC_ONE_PASS && cpi->svc.temporal_layer_id == 0 &&
         cpi->sf.rt_sf.use_temporal_noise_estimate &&
-        (!cpi->use_svc ||
-         (cpi->use_svc &&
+        (!cpi->ppi->use_svc ||
+         (cpi->ppi->use_svc &&
           !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame &&
           cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)))
       update_zeromv_cnt(cpi, mbmi, mi_row, mi_col, bsize);
@@ -468,12 +586,14 @@
     }
   }
 
+#if !CONFIG_REALTIME_ONLY
   const AV1_COMMON *const cm = &cpi->common;
   if (cm->delta_q_info.delta_q_present_flag &&
       !cpi->sf.rt_sf.use_nonrd_pick_mode) {
     x->rdmult =
         av1_get_hier_tpl_rdmult(cpi, x, bsize, mi_row, mi_col, x->rdmult);
   }
+#endif  // !CONFIG_REALTIME_ONLY
 
   if (cpi->oxcf.tune_cfg.tuning == AOM_TUNE_SSIM) {
     av1_set_ssim_rdmult(cpi, &x->errorperbit, bsize, mi_row, mi_col,
@@ -593,6 +713,37 @@
     av1_nonrd_pick_intra_mode(cpi, x, rd_cost, bsize, ctx);
 }
 
+// For real time row-mt enabled multi-threaded encoding with cost update
+// frequency set to COST_UPD_TILE/COST_UPD_OFF, tile ctxt is not updated at
+// superblock level. Thus, it is not required for the encoding of top-right
+// superblock be complete for updating tile ctxt. However, when encoding a block
+// whose right edge is also the superblock edge, intra and inter mode evaluation
+// (ref mv list population) require the encoding of the top-right superblock to
+// be complete. So, here, we delay the waiting of threads until the need for the
+// data from the top-right superblock region.
+static AOM_INLINE void wait_for_top_right_sb(
+    AV1EncRowMultiThreadInfo *enc_row_mt, AV1EncRowMultiThreadSync *row_mt_sync,
+    TileInfo *tile_info, BLOCK_SIZE sb_size, int sb_mi_size_log2,
+    BLOCK_SIZE bsize, int mi_row, int mi_col) {
+  const int sb_size_in_mi = mi_size_wide[sb_size];
+  const int bw_in_mi = mi_size_wide[bsize];
+  const int blk_row_in_sb = mi_row & (sb_size_in_mi - 1);
+  const int blk_col_in_sb = mi_col & (sb_size_in_mi - 1);
+  const int top_right_block_in_sb =
+      (blk_row_in_sb == 0) && (blk_col_in_sb + bw_in_mi >= sb_size_in_mi);
+
+  // Don't wait if the block is not the top-right block in the superblock.
+  if (!top_right_block_in_sb) return;
+
+  // Wait for the top-right superblock to finish encoding.
+  const int sb_row_in_tile =
+      (mi_row - tile_info->mi_row_start) >> sb_mi_size_log2;
+  const int sb_col_in_tile =
+      (mi_col - tile_info->mi_col_start) >> sb_mi_size_log2;
+
+  (*(enc_row_mt->sync_read_ptr))(row_mt_sync, sb_row_in_tile, sb_col_in_tile);
+}
+
 /*!\brief Interface for AV1 mode search for an individual coding block
  *
  * \ingroup partition_search
@@ -631,7 +782,7 @@
                           RD_STATS *rd_cost, PARTITION_TYPE partition,
                           BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
                           RD_STATS best_rd) {
-  if (best_rd.rdcost < 0) {
+  if (cpi->sf.part_sf.use_best_rd_for_pruning && best_rd.rdcost < 0) {
     ctx->rd_stats.rdcost = INT64_MAX;
     ctx->rd_stats.skip_txfm = 0;
     av1_invalid_rd_stats(rd_cost);
@@ -640,7 +791,8 @@
 
   av1_set_offsets(cpi, &tile_data->tile_info, x, mi_row, mi_col, bsize);
 
-  if (ctx->rd_mode_is_ready) {
+  if (cpi->sf.part_sf.reuse_prev_rd_results_for_part_ab &&
+      ctx->rd_mode_is_ready) {
     assert(ctx->mic.bsize == bsize);
     assert(ctx->mic.partition == partition);
     rd_cost->rate = ctx->rd_stats.rate;
@@ -660,12 +812,16 @@
 
   int i;
 
+  // This is only needed for real time row-mt enabled multi-threaded encoding
+  // with cost update frequency set to COST_UPD_TILE/COST_UPD_OFF.
+  wait_for_top_right_sb(&cpi->mt_info.enc_row_mt, &tile_data->row_mt_sync,
+                        &tile_data->tile_info, cm->seq_params->sb_size,
+                        cm->seq_params->mib_size_log2, bsize, mi_row, mi_col);
+
 #if CONFIG_COLLECT_COMPONENT_TIMING
   start_timing(cpi, rd_pick_sb_modes_time);
 #endif
 
-  aom_clear_system_state();
-
   mbmi = xd->mi[0];
   mbmi->bsize = bsize;
   mbmi->partition = partition;
@@ -713,6 +869,13 @@
   av1_set_error_per_bit(&x->errorperbit, x->rdmult);
   av1_rd_cost_update(x->rdmult, &best_rd);
 
+  // If set best_rd.rdcost to INT64_MAX, the encoder will not use any previous
+  // rdcost information for the following mode search.
+  // Disabling the feature could get some coding gain, with encoder slowdown.
+  if (!cpi->sf.part_sf.use_best_rd_for_pruning) {
+    av1_invalid_rd_stats(&best_rd);
+  }
+
   // Find best coding mode & reconstruct the MB so it is available
   // as a predictor for MBs that follow in the SB
   if (frame_is_intra_only(cm)) {
@@ -791,11 +954,11 @@
 #if CONFIG_ENTROPY_STATS
   // delta quant applies to both intra and inter
   const int super_block_upper_left =
-      ((xd->mi_row & (cm->seq_params.mib_size - 1)) == 0) &&
-      ((xd->mi_col & (cm->seq_params.mib_size - 1)) == 0);
+      ((xd->mi_row & (cm->seq_params->mib_size - 1)) == 0) &&
+      ((xd->mi_col & (cm->seq_params->mib_size - 1)) == 0);
   const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
   if (delta_q_info->delta_q_present_flag &&
-      (bsize != cm->seq_params.sb_size || !mbmi->skip_txfm) &&
+      (bsize != cm->seq_params->sb_size || !mbmi->skip_txfm) &&
       super_block_upper_left) {
     const int dq = (mbmi->current_qindex - xd->current_base_qindex) /
                    delta_q_info->delta_q_res;
@@ -994,7 +1157,7 @@
         }
       }
 
-      if (cm->seq_params.enable_interintra_compound &&
+      if (cm->seq_params->enable_interintra_compound &&
           is_interintra_allowed(mbmi)) {
         const int bsize_group = size_group_lookup[bsize];
         if (mbmi->ref_frame[1] == INTRA_FRAME) {
@@ -1055,7 +1218,7 @@
                mbmi->motion_mode == SIMPLE_TRANSLATION);
 
         const int masked_compound_used = is_any_masked_compound_used(bsize) &&
-                                         cm->seq_params.enable_masked_compound;
+                                         cm->seq_params->enable_masked_compound;
         if (masked_compound_used) {
           const int comp_group_idx_ctx = get_comp_group_idx_context(xd);
 #if CONFIG_ENTROPY_STATS
@@ -1100,7 +1263,7 @@
   if (inter_block && cm->features.interp_filter == SWITCHABLE &&
       mbmi->motion_mode != WARPED_CAUSAL &&
       !is_nontrans_global_motion(xd, mbmi)) {
-    update_filter_type_cdf(xd, mbmi, cm->seq_params.enable_dual_filter);
+    update_filter_type_cdf(xd, mbmi, cm->seq_params->enable_dual_filter);
   }
   if (inter_block &&
       !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
@@ -1207,8 +1370,8 @@
   TileInfo *const tile = &tile_data->tile_info;
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *xd = &x->e_mbd;
-  const int subsampling_x = cm->seq_params.subsampling_x;
-  const int subsampling_y = cm->seq_params.subsampling_y;
+  const int subsampling_x = cm->seq_params->subsampling_x;
+  const int subsampling_y = cm->seq_params->subsampling_y;
 
   av1_set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize);
   const int origin_mult = x->rdmult;
@@ -1221,9 +1384,9 @@
     set_cb_offsets(x->mbmi_ext_frame->cb_offset, x->cb_offset[PLANE_TYPE_Y],
                    x->cb_offset[PLANE_TYPE_UV]);
     assert(x->cb_offset[PLANE_TYPE_Y] <
-           (1 << num_pels_log2_lookup[cpi->common.seq_params.sb_size]));
+           (1 << num_pels_log2_lookup[cpi->common.seq_params->sb_size]));
     assert(x->cb_offset[PLANE_TYPE_UV] <
-           ((1 << num_pels_log2_lookup[cpi->common.seq_params.sb_size]) >>
+           ((1 << num_pels_log2_lookup[cpi->common.seq_params->sb_size]) >>
             (subsampling_x + subsampling_y)));
   }
 
@@ -1231,7 +1394,7 @@
 
   if (!dry_run) {
     update_cb_offsets(x, bsize, subsampling_x, subsampling_y);
-    if (bsize == cpi->common.seq_params.sb_size && mbmi->skip_txfm == 1 &&
+    if (bsize == cpi->common.seq_params->sb_size && mbmi->skip_txfm == 1 &&
         cm->delta_q_info.delta_lf_present_flag) {
       const int frame_lf_count =
           av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
@@ -1249,11 +1412,11 @@
 
     // delta quant applies to both intra and inter
     const int super_block_upper_left =
-        ((mi_row & (cm->seq_params.mib_size - 1)) == 0) &&
-        ((mi_col & (cm->seq_params.mib_size - 1)) == 0);
+        ((mi_row & (cm->seq_params->mib_size - 1)) == 0) &&
+        ((mi_col & (cm->seq_params->mib_size - 1)) == 0);
     const DeltaQInfo *const delta_q_info = &cm->delta_q_info;
     if (delta_q_info->delta_q_present_flag &&
-        (bsize != cm->seq_params.sb_size || !mbmi->skip_txfm) &&
+        (bsize != cm->seq_params->sb_size || !mbmi->skip_txfm) &&
         super_block_upper_left) {
       xd->current_base_qindex = mbmi->current_qindex;
       if (delta_q_info->delta_lf_present_flag) {
@@ -1489,6 +1652,27 @@
   update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition);
 }
 
+static AOM_INLINE int is_adjust_var_based_part_enabled(
+    AV1_COMMON *const cm, const PARTITION_SPEED_FEATURES *const part_sf,
+    BLOCK_SIZE bsize) {
+  if (part_sf->partition_search_type != VAR_BASED_PARTITION) return 0;
+  if (part_sf->adjust_var_based_rd_partitioning == 0 ||
+      part_sf->adjust_var_based_rd_partitioning > 3)
+    return 0;
+
+  if (part_sf->adjust_var_based_rd_partitioning == 1) {
+    return bsize <= BLOCK_32X32;
+  } else {
+    if (bsize <= BLOCK_32X32) return 1;
+    const int is_larger_qindex = cm->quant_params.base_qindex > 190;
+    if (part_sf->adjust_var_based_rd_partitioning == 2) {
+      const int is_360p_or_larger = AOMMIN(cm->width, cm->height) >= 360;
+      return is_360p_or_larger && is_larger_qindex && bsize == BLOCK_64X64;
+    }
+  }
+  return 0;
+}
+
 /*!\brief AV1 block partition search (partition estimation and partial search).
 *
 * \ingroup partition_search
@@ -1547,6 +1731,7 @@
   BLOCK_SIZE sub_subsize = BLOCK_4X4;
   int splits_below = 0;
   BLOCK_SIZE bs_type = mib[0]->bsize;
+  x->try_merge_partition = 0;
 
   if (pc_tree->none == NULL) {
     pc_tree->none = av1_alloc_pmc(cpi, bsize, &td->shared_coeff_buf);
@@ -1579,12 +1764,7 @@
   const int orig_rdmult = x->rdmult;
   setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL);
 
-  if (cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION &&
-      ((cpi->sf.part_sf.adjust_var_based_rd_partitioning == 2 &&
-        bsize <= BLOCK_32X32) ||
-       (cpi->sf.part_sf.adjust_var_based_rd_partitioning == 1 &&
-        cm->quant_params.base_qindex > 190 && bsize <= BLOCK_32X32 &&
-        !frame_is_intra_only(cm)))) {
+  if (is_adjust_var_based_part_enabled(cm, &cpi->sf.part_sf, bsize)) {
     // Check if any of the sub blocks are further split.
     if (partition == PARTITION_SPLIT && subsize > BLOCK_8X8) {
       sub_subsize = get_partition_subsize(subsize, PARTITION_SPLIT);
@@ -1604,6 +1784,7 @@
         mi_row + hbs < mi_params->mi_rows &&
         mi_col + hbs < mi_params->mi_cols) {
       pc_tree->partitioning = PARTITION_NONE;
+      x->try_merge_partition = 1;
       pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc,
                     PARTITION_NONE, bsize, ctx_none, invalid_rdc);
 
@@ -1683,11 +1864,18 @@
       }
       break;
     case PARTITION_SPLIT:
-      if (cpi->sf.part_sf.adjust_var_based_rd_partitioning == 1 &&
-          none_rdc.rate < INT_MAX && none_rdc.skip_txfm == 1) {
-        av1_invalid_rd_stats(&last_part_rdc);
-        break;
+      if (none_rdc.rate < INT_MAX && none_rdc.skip_txfm == 1) {
+        const MB_MODE_INFO *mbmi = xd->mi[0];
+        // Try to skip split partition evaluation based on none partition
+        // characteristics.
+        if (cpi->sf.part_sf.adjust_var_based_rd_partitioning == 1 ||
+            (cpi->sf.part_sf.adjust_var_based_rd_partitioning == 2 &&
+             is_inter_block(mbmi) && mbmi->mode != NEWMV)) {
+          av1_invalid_rd_stats(&last_part_rdc);
+          break;
+        }
       }
+
       last_part_rdc.rate = 0;
       last_part_rdc.dist = 0;
       last_part_rdc.rdcost = 0;
@@ -1731,7 +1919,7 @@
   }
 
   if ((cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION &&
-       cpi->sf.part_sf.adjust_var_based_rd_partitioning > 2) &&
+       cpi->sf.part_sf.adjust_var_based_rd_partitioning > 3) &&
       partition != PARTITION_SPLIT && bsize > BLOCK_8X8 &&
       (mi_row + bs < mi_params->mi_rows ||
        mi_row + hbs == mi_params->mi_rows) &&
@@ -1800,11 +1988,14 @@
 
   // We must have chosen a partitioning and encoding or we'll fail later on.
   // No other opportunities for success.
-  if (bsize == cm->seq_params.sb_size)
+  if (bsize == cm->seq_params->sb_size)
     assert(chosen_rdc.rate < INT_MAX && chosen_rdc.dist < INT64_MAX);
 
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, encode_sb_time);
+#endif
   if (do_recon) {
-    if (bsize == cm->seq_params.sb_size) {
+    if (bsize == cm->seq_params->sb_size) {
       // NOTE: To get estimate for rate due to the tokens, use:
       // int rate_coeffs = 0;
       // encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_COSTCOEFFS,
@@ -1817,6 +2008,9 @@
                 pc_tree, NULL);
     }
   }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, encode_sb_time);
+#endif
 
   *rate = chosen_rdc.rate;
   *dist = chosen_rdc.dist;
@@ -1828,6 +2022,7 @@
                            int mi_col, RUN_TYPE dry_run, BLOCK_SIZE bsize,
                            PARTITION_TYPE partition,
                            PICK_MODE_CONTEXT *const ctx, int *rate) {
+  const AV1_COMMON *const cm = &cpi->common;
   TileInfo *const tile = &tile_data->tile_info;
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *xd = &x->e_mbd;
@@ -1836,25 +2031,61 @@
   setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL);
   MB_MODE_INFO *mbmi = xd->mi[0];
   mbmi->partition = partition;
-  // Nonrd pickmode does not currently support second/combined reference.
-  assert(!has_second_ref(mbmi));
   av1_update_state(cpi, td, ctx, mi_row, mi_col, bsize, dry_run);
-  const int subsampling_x = cpi->common.seq_params.subsampling_x;
-  const int subsampling_y = cpi->common.seq_params.subsampling_y;
+  const int subsampling_x = cpi->common.seq_params->subsampling_x;
+  const int subsampling_y = cpi->common.seq_params->subsampling_y;
   if (!dry_run) {
     set_cb_offsets(x->mbmi_ext_frame->cb_offset, x->cb_offset[PLANE_TYPE_Y],
                    x->cb_offset[PLANE_TYPE_UV]);
     assert(x->cb_offset[PLANE_TYPE_Y] <
-           (1 << num_pels_log2_lookup[cpi->common.seq_params.sb_size]));
+           (1 << num_pels_log2_lookup[cpi->common.seq_params->sb_size]));
     assert(x->cb_offset[PLANE_TYPE_UV] <
-           ((1 << num_pels_log2_lookup[cpi->common.seq_params.sb_size]) >>
+           ((1 << num_pels_log2_lookup[cpi->common.seq_params->sb_size]) >>
             (subsampling_x + subsampling_y)));
   }
   encode_superblock(cpi, tile_data, td, tp, dry_run, bsize, rate);
   if (!dry_run) {
     update_cb_offsets(x, bsize, subsampling_x, subsampling_y);
+    if (has_second_ref(mbmi)) {
+      if (mbmi->compound_idx == 0 ||
+          mbmi->interinter_comp.type == COMPOUND_AVERAGE)
+        mbmi->comp_group_idx = 0;
+      else
+        mbmi->comp_group_idx = 1;
+      mbmi->compound_idx = 1;
+    }
+    RD_COUNTS *const rdc = &td->rd_counts;
+    if (mbmi->skip_mode) {
+      assert(!frame_is_intra_only(cm));
+      rdc->skip_mode_used_flag = 1;
+      if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT &&
+          has_second_ref(mbmi)) {
+        rdc->compound_ref_used_flag = 1;
+      }
+      set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+    } else {
+      const int seg_ref_active =
+          segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME);
+      if (!seg_ref_active) {
+        // If the segment reference feature is enabled we have only a single
+        // reference frame allowed for the segment so exclude it from
+        // the reference frame counts used to work out probabilities.
+        if (is_inter_block(mbmi)) {
+          av1_collect_neighbors_ref_counts(xd);
+          if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT &&
+              has_second_ref(mbmi)) {
+            // This flag is also updated for 4x4 blocks
+            rdc->compound_ref_used_flag = 1;
+          }
+          set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+        }
+      }
+    }
     if (tile_data->allow_update_cdf) update_stats(&cpi->common, td);
   }
+  if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && mbmi->skip_txfm &&
+      !cpi->cyclic_refresh->skip_over4x4)
+    av1_cyclic_reset_segment_skip(cpi, x, mi_row, mi_col, bsize);
   // TODO(Ravi/Remya): Move this copy function to a better logical place
   // This function will copy the best mode information from block
   // level (x->mbmi_ext) to frame level (cpi->mbmi_ext_info.frame_base). This
@@ -1865,29 +2096,6 @@
   x->rdmult = origin_mult;
 }
 
-static AOM_INLINE void wait_for_top_right_sb(
-    AV1EncRowMultiThreadInfo *enc_row_mt, AV1EncRowMultiThreadSync *row_mt_sync,
-    TileInfo *tile_info, BLOCK_SIZE sb_size, int sb_mi_size_log2,
-    BLOCK_SIZE bsize, int mi_row, int mi_col) {
-  const int sb_size_in_mi = mi_size_wide[sb_size];
-  const int bw_in_mi = mi_size_wide[bsize];
-  const int blk_row_in_sb = mi_row & (sb_size_in_mi - 1);
-  const int blk_col_in_sb = mi_col & (sb_size_in_mi - 1);
-  const int top_right_block_in_sb =
-      (blk_row_in_sb == 0) && (blk_col_in_sb + bw_in_mi >= sb_size_in_mi);
-
-  // Don't wait if the block is the not the top-right block in the superblock.
-  if (!top_right_block_in_sb) return;
-
-  // Wait for the top-right superblock to finish encoding.
-  const int sb_row_in_tile =
-      (mi_row - tile_info->mi_row_start) >> sb_mi_size_log2;
-  const int sb_col_in_tile =
-      (mi_col - tile_info->mi_col_start) >> sb_mi_size_log2;
-
-  (*(enc_row_mt->sync_read_ptr))(row_mt_sync, sb_row_in_tile, sb_col_in_tile);
-}
-
 /*!\brief Top level function to pick block mode for non-RD optimized case
  *
  * \ingroup partition_search
@@ -1935,14 +2143,15 @@
   TxfmSearchInfo *txfm_info = &x->txfm_search_info;
   int i;
 
+  // This is only needed for real time row-mt enabled multi-threaded encoding
+  // with cost update frequency set to COST_UPD_TILE/COST_UPD_OFF.
   wait_for_top_right_sb(&cpi->mt_info.enc_row_mt, &tile_data->row_mt_sync,
-                        &tile_data->tile_info, cm->seq_params.sb_size,
-                        cm->seq_params.mib_size_log2, bsize, mi_row, mi_col);
+                        &tile_data->tile_info, cm->seq_params->sb_size,
+                        cm->seq_params->mib_size_log2, bsize, mi_row, mi_col);
 
 #if CONFIG_COLLECT_COMPONENT_TIMING
   start_timing(cpi, rd_pick_sb_modes_time);
 #endif
-  aom_clear_system_state();
   // Sets up the tx_type_map buffer in MACROBLOCKD.
   xd->tx_type_map = txfm_info->tx_type_map_;
   xd->tx_type_map_stride = mi_size_wide[bsize];
@@ -1994,6 +2203,22 @@
     end_timing(cpi, av1_rd_pick_inter_mode_sb_time);
 #endif
   }
+  if (cpi->sf.rt_sf.skip_cdef_sb) {
+    // Find the corresponding 64x64 block. It'll be the 128x128 block if that's
+    // the block size.
+    const int mi_row_sb = mi_row - mi_row % MI_SIZE_64X64;
+    const int mi_col_sb = mi_col - mi_col % MI_SIZE_64X64;
+    MB_MODE_INFO **mi_sb =
+        cm->mi_params.mi_grid_base +
+        get_mi_grid_idx(&cm->mi_params, mi_row_sb, mi_col_sb);
+    // Do not skip if intra or new mv is picked, or color sensitivity is set.
+    mi_sb[0]->skip_cdef_curr_sb =
+        mi_sb[0]->skip_cdef_curr_sb &&
+        !(x->color_sensitivity[0] || x->color_sensitivity[1]) &&
+        !(mbmi->mode < INTRA_MODES || mbmi->mode == NEWMV);
+    // Store in the pickmode context.
+    ctx->mic.skip_cdef_curr_sb = mi_sb[0]->skip_cdef_curr_sb;
+  }
   x->rdmult = orig_rdmult;
   ctx->rd_stats.rate = rd_cost->rate;
   ctx->rd_stats.dist = rd_cost->dist;
@@ -2075,7 +2300,7 @@
   switch (partition) {
     case PARTITION_NONE:
       pc_tree->none = av1_alloc_pmc(cpi, bsize, &td->shared_coeff_buf);
-      if (cpi->sf.rt_sf.nonrd_check_partition_split && do_slipt_check(bsize) &&
+      if (cpi->sf.rt_sf.nonrd_check_partition_split && do_split_check(bsize) &&
           !frame_is_intra_only(cm)) {
         RD_STATS split_rdc, none_rdc, block_rdc;
         RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
@@ -2404,7 +2629,7 @@
     BLOCK_SIZE bsize, FRAME_UPDATE_TYPE frame_update_type, int frame_number,
     const RD_STATS *best_rdc, const char *filename) {
   FILE *f = fopen(filename, "a");
-  fprintf(f, "%d,%d,%d,%d,%d,%d,%ld,%ld,", bsize, frame_number,
+  fprintf(f, "%d,%d,%d,%d,%d,%d,%" PRId64 ",%" PRId64 ",", bsize, frame_number,
           frame_update_type, mi_row, mi_col, best_rdc->rate, best_rdc->dist,
           best_rdc->rdcost);
   for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
@@ -2414,13 +2639,13 @@
     fprintf(f, "%d,", part_timing_stats->partition_attempts[idx]);
   }
   for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
-    fprintf(f, "%ld,", part_timing_stats->partition_times[idx]);
+    fprintf(f, "%" PRId64 ",", part_timing_stats->partition_times[idx]);
   }
   for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
     if (part_timing_stats->partition_rdcost[idx] == INT64_MAX) {
       fprintf(f, "%d,", -1);
     } else {
-      fprintf(f, "%ld,", part_timing_stats->partition_rdcost[idx]);
+      fprintf(f, "%" PRId64 ",", part_timing_stats->partition_rdcost[idx]);
     }
   }
   fprintf(f, "\n");
@@ -2439,7 +2664,7 @@
     fprintf(f, "%d,", part_timing_stats->partition_attempts[idx]);
   }
   for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
-    fprintf(f, "%ld,", part_timing_stats->partition_times[idx]);
+    fprintf(f, "%" PRId64 ",", part_timing_stats->partition_times[idx]);
   }
   fprintf(f, "\n");
   fclose(f);
@@ -2534,21 +2759,20 @@
   part_search_state->terminate_partition_search = 0;
   part_search_state->do_square_split = blk_params->bsize_at_least_8x8;
   part_search_state->do_rectangular_split =
-      cpi->oxcf.part_cfg.enable_rect_partitions;
+      cpi->oxcf.part_cfg.enable_rect_partitions &&
+      blk_params->bsize_at_least_8x8;
   av1_zero(part_search_state->prune_rect_part);
 
   // Initialize allowed partition types for the partition block.
   part_search_state->partition_none_allowed =
-      blk_params->has_rows && blk_params->has_cols;
+      av1_blk_has_rows_and_cols(blk_params);
   part_search_state->partition_rect_allowed[HORZ] =
-      blk_params->has_cols && blk_params->bsize_at_least_8x8 &&
-      cpi->oxcf.part_cfg.enable_rect_partitions &&
+      part_search_state->do_rectangular_split && blk_params->has_cols &&
       get_plane_block_size(get_partition_subsize(bsize, PARTITION_HORZ),
                            part_search_state->ss_x,
                            part_search_state->ss_y) != BLOCK_INVALID;
   part_search_state->partition_rect_allowed[VERT] =
-      blk_params->has_rows && blk_params->bsize_at_least_8x8 &&
-      cpi->oxcf.part_cfg.enable_rect_partitions &&
+      part_search_state->do_rectangular_split && blk_params->has_rows &&
       get_plane_block_size(get_partition_subsize(bsize, PARTITION_VERT),
                            part_search_state->ss_x,
                            part_search_state->ss_y) != BLOCK_INVALID;
@@ -2607,7 +2831,7 @@
       blk_params.bsize_at_least_8x8 &&
       (blk_params.width > blk_params.min_partition_size_1d);
   part_search_state->partition_none_allowed =
-      blk_params.has_rows && blk_params.has_cols &&
+      av1_blk_has_rows_and_cols(&blk_params) &&
       (blk_params.width >= blk_params.min_partition_size_1d);
   part_search_state->partition_rect_allowed[HORZ] =
       blk_params.has_cols && is_rect_part_allowed &&
@@ -2659,15 +2883,16 @@
 
 // Checks if HORZ / VERT partition search is allowed.
 static AOM_INLINE int is_rect_part_allowed(
-    const AV1_COMP *cpi, PartitionSearchState *part_search_state,
-    active_edge_info *active_edge, RECT_PART_TYPE rect_part, const int mi_pos) {
-  PartitionBlkParams blk_params = part_search_state->part_blk_params;
+    const AV1_COMP *cpi, const PartitionSearchState *part_search_state,
+    const active_edge_info *active_edge, RECT_PART_TYPE rect_part,
+    const int mi_pos) {
+  const PartitionBlkParams *blk_params = &part_search_state->part_blk_params;
   const int is_part_allowed =
       (!part_search_state->terminate_partition_search &&
        part_search_state->partition_rect_allowed[rect_part] &&
        !part_search_state->prune_rect_part[rect_part] &&
        (part_search_state->do_rectangular_split ||
-        active_edge[rect_part](cpi, mi_pos, blk_params.mi_step)));
+        active_edge[rect_part](cpi, mi_pos, blk_params->mi_step)));
   return is_part_allowed;
 }
 
@@ -2842,18 +3067,6 @@
   av1_restore_context(x, x_ctx, mi_row, mi_col, bsize, av1_num_planes(cm));
 }
 
-// Check if AB partitions search is allowed.
-static AOM_INLINE int is_ab_part_allowed(
-    PartitionSearchState *part_search_state,
-    const int ab_partitions_allowed[NUM_AB_PARTS], const int ab_part_type) {
-  const int is_horz_ab = (ab_part_type >> 1);
-  const int is_part_allowed =
-      (!part_search_state->terminate_partition_search &&
-       part_search_state->partition_rect_allowed[is_horz_ab] &&
-       ab_partitions_allowed[ab_part_type]);
-  return is_part_allowed;
-}
-
 // Set mode search context.
 static AOM_INLINE void set_mode_search_ctx(
     PC_TREE *pc_tree, const int is_ctx_ready[NUM_AB_PARTS][2],
@@ -2934,16 +3147,15 @@
   const int mi_col = blk_params.mi_col;
   const int bsize = blk_params.bsize;
 
-  int ab_partitions_allowed[NUM_AB_PARTS] = { 1, 1, 1, 1 };
+  if (part_search_state->terminate_partition_search) {
+    return;
+  }
+
+  int ab_partitions_allowed[NUM_AB_PARTS];
   // Prune AB partitions
-  av1_prune_ab_partitions(
-      cpi, x, pc_tree, bsize, pb_source_variance, best_rdc->rdcost,
-      part_search_state->rect_part_rd, part_search_state->split_rd,
-      rect_part_win_info, ext_partition_allowed,
-      part_search_state->partition_rect_allowed[HORZ],
-      part_search_state->partition_rect_allowed[VERT],
-      &ab_partitions_allowed[HORZ_A], &ab_partitions_allowed[HORZ_B],
-      &ab_partitions_allowed[VERT_A], &ab_partitions_allowed[VERT_B]);
+  av1_prune_ab_partitions(cpi, x, pc_tree, pb_source_variance, best_rdc->rdcost,
+                          rect_part_win_info, ext_partition_allowed,
+                          part_search_state, ab_partitions_allowed);
 
   // Flags to indicate whether the mode search is done.
   const int is_ctx_ready[NUM_AB_PARTS][2] = {
@@ -3000,9 +3212,9 @@
     const PARTITION_TYPE part_type = ab_part_type + PARTITION_HORZ_A;
 
     // Check if the AB partition search is to be performed.
-    if (!is_ab_part_allowed(part_search_state, ab_partitions_allowed,
-                            ab_part_type))
+    if (!ab_partitions_allowed[ab_part_type]) {
       continue;
+    }
 
     blk_params.subsize = get_partition_subsize(bsize, part_type);
     for (int i = 0; i < SUB_PARTITIONS_AB; i++) {
@@ -3013,18 +3225,20 @@
       cur_part_ctxs[ab_part_type][i]->rd_mode_is_ready = 0;
     }
 
-    // We can copy directly the mode search results if we have already searched
-    // the current block and the contexts match.
-    if (is_ctx_ready[ab_part_type][0]) {
-      av1_copy_tree_context(cur_part_ctxs[ab_part_type][0],
-                            mode_srch_ctx[ab_part_type][0][0]);
-      cur_part_ctxs[ab_part_type][0]->mic.partition = part_type;
-      cur_part_ctxs[ab_part_type][0]->rd_mode_is_ready = 1;
-      if (is_ctx_ready[ab_part_type][1]) {
-        av1_copy_tree_context(cur_part_ctxs[ab_part_type][1],
-                              mode_srch_ctx[ab_part_type][1][0]);
-        cur_part_ctxs[ab_part_type][1]->mic.partition = part_type;
-        cur_part_ctxs[ab_part_type][1]->rd_mode_is_ready = 1;
+    if (cpi->sf.part_sf.reuse_prev_rd_results_for_part_ab) {
+      // We can copy directly the mode search results if we have already
+      // searched the current block and the contexts match.
+      if (is_ctx_ready[ab_part_type][0]) {
+        av1_copy_tree_context(cur_part_ctxs[ab_part_type][0],
+                              mode_srch_ctx[ab_part_type][0][0]);
+        cur_part_ctxs[ab_part_type][0]->mic.partition = part_type;
+        cur_part_ctxs[ab_part_type][0]->rd_mode_is_ready = 1;
+        if (is_ctx_ready[ab_part_type][1]) {
+          av1_copy_tree_context(cur_part_ctxs[ab_part_type][1],
+                                mode_srch_ctx[ab_part_type][1][0]);
+          cur_part_ctxs[ab_part_type][1]->mic.partition = part_type;
+          cur_part_ctxs[ab_part_type][1]->rd_mode_is_ready = 1;
+        }
       }
     }
 
@@ -3176,8 +3390,6 @@
     return;
   }
 
-  const int mi_row = blk_params.mi_row;
-  const int mi_col = blk_params.mi_col;
   const int bsize = blk_params.bsize;
   PARTITION_TYPE cur_part[NUM_PART4_TYPES] = { PARTITION_HORZ_4,
                                                PARTITION_VERT_4 };
@@ -3216,11 +3428,9 @@
   if (cpi->sf.part_sf.ml_prune_partition && partition4_allowed &&
       part_search_state->partition_rect_allowed[HORZ] &&
       part_search_state->partition_rect_allowed[VERT]) {
-    av1_ml_prune_4_partition(
-        cpi, x, bsize, pc_tree->partitioning, best_rdc->rdcost,
-        part_search_state->rect_part_rd, part_search_state->split_rd,
-        &part4_search_allowed[HORZ4], &part4_search_allowed[VERT4],
-        pb_source_variance, mi_row, mi_col);
+    av1_ml_prune_4_partition(cpi, x, pc_tree->partitioning, best_rdc->rdcost,
+                             part_search_state, part4_search_allowed,
+                             pb_source_variance);
   }
 
   // Pruning: pruning out 4-way partitions based on the number of horz/vert wins
@@ -3229,21 +3439,6 @@
                                      part4_search_allowed);
 }
 
-// Set PARTITION_NONE allowed flag.
-static AOM_INLINE void set_part_none_allowed_flag(
-    AV1_COMP *const cpi, PartitionSearchState *part_search_state) {
-  PartitionBlkParams blk_params = part_search_state->part_blk_params;
-  if ((blk_params.width <= blk_params.min_partition_size_1d) &&
-      blk_params.has_rows && blk_params.has_cols)
-    part_search_state->partition_none_allowed = 1;
-  assert(part_search_state->terminate_partition_search == 0);
-
-  // Set PARTITION_NONE for screen content.
-  if (cpi->use_screen_content_tools)
-    part_search_state->partition_none_allowed =
-        blk_params.has_rows && blk_params.has_cols;
-}
-
 // Set params needed for PARTITION_NONE search.
 static void set_none_partition_params(const AV1_COMP *const cpi, ThreadData *td,
                                       MACROBLOCK *x, PC_TREE *pc_tree,
@@ -3282,8 +3477,7 @@
                                         unsigned int *pb_source_variance) {
   const AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
-  PartitionBlkParams blk_params = part_search_state->part_blk_params;
-  const CommonModeInfoParams *const mi_params = &cm->mi_params;
+  const PartitionBlkParams blk_params = part_search_state->part_blk_params;
   RD_STATS *this_rdc = &part_search_state->this_rdc;
   const BLOCK_SIZE bsize = blk_params.bsize;
   assert(bsize < BLOCK_SIZES_ALL);
@@ -3296,11 +3490,8 @@
         bsize <= cpi->sf.part_sf.use_square_partition_only_threshold &&
         bsize > BLOCK_4X4 && cpi->sf.part_sf.ml_predict_breakout_level >= 1;
     if (use_ml_based_breakout) {
-      if (av1_ml_predict_breakout(cpi, bsize, x, this_rdc, *pb_source_variance,
-                                  xd->bd)) {
-        part_search_state->do_square_split = 0;
-        part_search_state->do_rectangular_split = 0;
-      }
+      av1_ml_predict_breakout(cpi, x, this_rdc, *pb_source_variance, xd->bd,
+                              part_search_state);
     }
 
     // Adjust dist breakout threshold according to the partition size.
@@ -3328,15 +3519,13 @@
   // decision on early terminating at PARTITION_NONE.
   if (cpi->sf.part_sf.simple_motion_search_early_term_none && cm->show_frame &&
       !frame_is_intra_only(cm) && bsize >= BLOCK_16X16 &&
-      blk_params.mi_row_edge < mi_params->mi_rows &&
-      blk_params.mi_col_edge < mi_params->mi_cols &&
-      this_rdc->rdcost < INT64_MAX && this_rdc->rdcost >= 0 &&
-      this_rdc->rate < INT_MAX && this_rdc->rate >= 0 &&
+      av1_blk_has_rows_and_cols(&blk_params) && this_rdc->rdcost < INT64_MAX &&
+      this_rdc->rdcost >= 0 && this_rdc->rate < INT_MAX &&
+      this_rdc->rate >= 0 &&
       (part_search_state->do_square_split ||
        part_search_state->do_rectangular_split)) {
-    av1_simple_motion_search_early_term_none(
-        cpi, x, sms_tree, blk_params.mi_row, blk_params.mi_col, bsize, this_rdc,
-        &part_search_state->terminate_partition_search);
+    av1_simple_motion_search_early_term_none(cpi, x, sms_tree, this_rdc,
+                                             part_search_state);
   }
 }
 
@@ -3362,9 +3551,8 @@
       (part_search_state->partition_rect_allowed[HORZ] ||
        part_search_state->partition_rect_allowed[VERT])) {
     av1_ml_early_term_after_split(
-        cpi, x, sms_tree, bsize, best_rdc->rdcost, part_none_rd, part_split_rd,
-        part_search_state->split_rd, mi_row, mi_col,
-        &part_search_state->terminate_partition_search);
+        cpi, x, sms_tree, best_rdc->rdcost, part_none_rd, part_split_rd,
+        part_search_state->split_rd, part_search_state);
   }
 
   // Use the rd costs of PARTITION_NONE and subblocks from PARTITION_SPLIT
@@ -3378,10 +3566,9 @@
       !part_search_state->terminate_partition_search) {
     av1_setup_src_planes(x, cpi->source, mi_row, mi_col, av1_num_planes(cm),
                          bsize);
-    av1_ml_prune_rect_partition(
-        cpi, x, bsize, best_rdc->rdcost, part_search_state->none_rd,
-        part_search_state->split_rd, &part_search_state->prune_rect_part[HORZ],
-        &part_search_state->prune_rect_part[VERT]);
+    av1_ml_prune_rect_partition(cpi, x, best_rdc->rdcost,
+                                part_search_state->none_rd,
+                                part_search_state->split_rd, part_search_state);
   }
 }
 
@@ -3400,8 +3587,6 @@
   const BLOCK_SIZE bsize = blk_params.bsize;
   assert(bsize < BLOCK_SIZES_ALL);
 
-  // Set PARTITION_NONE allowed flag.
-  set_part_none_allowed_flag(cpi, part_search_state);
   if (!part_search_state->partition_none_allowed) return;
 
   int pt_cost = 0;
@@ -3452,7 +3637,7 @@
     if (cpi->sf.inter_sf.prune_ref_frame_for_rect_partitions) {
       const int ref_type = av1_ref_frame_type(pc_tree->none->mic.ref_frame);
       av1_update_picked_ref_frames_mask(
-          x, ref_type, bsize, cm->seq_params.mib_size, mi_row, mi_col);
+          x, ref_type, bsize, cm->seq_params->mib_size, mi_row, mi_col);
     }
 
     // Calculate the total cost and update the best partition.
@@ -3603,21 +3788,26 @@
   av1_restore_context(x, x_ctx, mi_row, mi_col, bsize, av1_num_planes(cm));
 }
 
+// The max number of nodes in the partition tree.
+// The number of leaf nodes is (128x128) / (4x4) = 1024.
+// The number of All possible parent nodes is 1 + 2 + ... + 512 = 1023.
+#define NUM_NODES 2048
+
 static void write_partition_tree(AV1_COMP *const cpi,
                                  const PC_TREE *const pc_tree,
                                  const BLOCK_SIZE bsize, const int mi_row,
                                  const int mi_col) {
   (void)mi_row;
   (void)mi_col;
-  char filename[128];
-  snprintf(filename, sizeof(filename), "partition_tree_sb%d_c%d",
+  const char *path = cpi->oxcf.partition_info_path;
+  char filename[256];
+  snprintf(filename, sizeof(filename), "%s/partition_tree_sb%d_c%d", path,
            cpi->sb_counter, 0);
-  ++cpi->sb_counter;
   FILE *pfile = fopen(filename, "w");
   fprintf(pfile, "%d", bsize);
 
   // Write partition type with BFS order.
-  const PC_TREE *tree_node_queue[1024] = { NULL };
+  const PC_TREE *tree_node_queue[NUM_NODES] = { NULL };
   int q_idx = 0;
   int depth = 0;
   int last_idx = 1;
@@ -3673,14 +3863,15 @@
                                         const int mi_col) {
   (void)mi_row;
   (void)mi_col;
-  char filename[128];
-  snprintf(filename, sizeof(filename), "verify_partition_tree_sb%d_c%d",
-           cpi->sb_counter - 1, config_id);
+  const char *path = cpi->oxcf.partition_info_path;
+  char filename[256];
+  snprintf(filename, sizeof(filename), "%s/verify_partition_tree_sb%d_c%d",
+           path, cpi->sb_counter, config_id);
   FILE *pfile = fopen(filename, "w");
   fprintf(pfile, "%d", bsize);
 
   // Write partition type with BFS order.
-  const PC_TREE *tree_node_queue[1024] = { NULL };
+  const PC_TREE *tree_node_queue[NUM_NODES] = { NULL };
   int q_idx = 0;
   int depth = 0;
   int last_idx = 1;
@@ -3733,8 +3924,9 @@
 
 static int read_partition_tree(AV1_COMP *const cpi, PC_TREE *const pc_tree,
                                const int config_id) {
-  char filename[128];
-  snprintf(filename, sizeof(filename), "partition_tree_sb%d_c%d",
+  const char *path = cpi->oxcf.partition_info_path;
+  char filename[256];
+  snprintf(filename, sizeof(filename), "%s/partition_tree_sb%d_c%d", path,
            cpi->sb_counter, config_id);
   FILE *pfile = fopen(filename, "r");
   if (pfile == NULL) {
@@ -3746,10 +3938,11 @@
   int num_nodes;
   int num_configs;
   fscanf(pfile, "%d,%d,%d", &read_bsize, &num_nodes, &num_configs);
-  assert(read_bsize == cpi->common.seq_params.sb_size);
+  assert(read_bsize == cpi->common.seq_params->sb_size);
   BLOCK_SIZE bsize = (BLOCK_SIZE)read_bsize;
+  assert(bsize == pc_tree->block_size);
 
-  PC_TREE *tree_node_queue[1024] = { NULL };
+  PC_TREE *tree_node_queue[NUM_NODES] = { NULL };
   int last_idx = 1;
   int q_idx = 0;
   tree_node_queue[q_idx] = pc_tree;
@@ -3759,7 +3952,10 @@
     assert(partitioning >= PARTITION_NONE &&
            partitioning < EXT_PARTITION_TYPES);
     PC_TREE *node = tree_node_queue[q_idx];
-    if (node != NULL) node->partitioning = partitioning;
+    if (node != NULL) {
+      node->partitioning = partitioning;
+      bsize = node->block_size;
+    }
     if (partitioning == PARTITION_SPLIT) {
       const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
       for (int i = 0; i < 4; ++i) {
@@ -3770,7 +3966,6 @@
           ++last_idx;
         }
       }
-      bsize = subsize;
     }
     --num_nodes;
     ++q_idx;
@@ -3797,6 +3992,11 @@
   PartitionSearchState part_search_state;
   init_partition_search_state_params(x, cpi, &part_search_state, mi_row, mi_col,
                                      bsize);
+  // Override partition costs at the edges of the frame in the same
+  // way as in read_partition (see decodeframe.c).
+  PartitionBlkParams blk_params = part_search_state.part_blk_params;
+  if (!av1_blk_has_rows_and_cols(&blk_params))
+    set_partition_cost_for_edge_blk(cm, &part_search_state);
 
   av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
 
@@ -3889,14 +4089,14 @@
       best_rdc.rate = sum_subblock_rate;
       best_rdc.rate += part_search_state.partition_cost[PARTITION_SPLIT];
       best_rdc.dist = sum_subblock_dist;
-      av1_rd_cost_update(x->rdmult, &best_rdc);
+      best_rdc.rdcost = RDCOST(x->rdmult, best_rdc.rate, best_rdc.dist);
       break;
     default: assert(0 && "invalid partition type."); exit(0);
   }
   // Note: it is necessary to restore context information.
   av1_restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
 
-  if (bsize != cm->seq_params.sb_size) {
+  if (bsize != cm->seq_params->sb_size) {
     encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
               pc_tree, NULL);
   }
@@ -3905,11 +4105,272 @@
   return best_rdc;
 }
 
+static void prepare_sb_features_before_search(
+    AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, int mi_row,
+    int mi_col, const BLOCK_SIZE bsize, aom_partition_features_t *features) {
+  av1_collect_motion_search_features_sb(cpi, td, tile_data, mi_row, mi_col,
+                                        bsize, features);
+  collect_tpl_stats_sb(cpi, bsize, mi_row, mi_col, features);
+}
+
+static void update_partition_stats(const RD_STATS *const this_rdcost,
+                                   aom_partition_stats_t *stats) {
+  stats->rate = this_rdcost->rate;
+  stats->dist = this_rdcost->dist;
+  stats->rdcost = this_rdcost->rdcost;
+}
+
+static void build_pc_tree_from_part_decision(
+    const aom_partition_decision_t *partition_decision,
+    const BLOCK_SIZE this_bsize, PC_TREE *pc_tree) {
+  BLOCK_SIZE bsize = this_bsize;
+  int num_nodes = partition_decision->num_nodes;
+  PC_TREE *tree_node_queue[NUM_NODES] = { NULL };
+  int last_idx = 1;
+  int q_idx = 0;
+  tree_node_queue[q_idx] = pc_tree;
+  while (num_nodes > 0) {
+    const int partitioning = partition_decision->partition_decision[q_idx];
+    assert(partitioning >= PARTITION_NONE &&
+           partitioning < EXT_PARTITION_TYPES);
+    PC_TREE *node = tree_node_queue[q_idx];
+    if (node != NULL) node->partitioning = partitioning;
+    if (partitioning == PARTITION_SPLIT) {
+      const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+      for (int i = 0; i < 4; ++i) {
+        if (node != NULL) {  // Suppress warning
+          node->split[i] = av1_alloc_pc_tree_node(subsize);
+          node->split[i]->index = i;
+          tree_node_queue[last_idx] = node->split[i];
+          ++last_idx;
+        }
+      }
+      bsize = subsize;
+    }
+    --num_nodes;
+    ++q_idx;
+  }
+}
+
+// The ML model needs to provide the whole decision tree for the superblock.
+static bool ml_partition_search_whole_tree(AV1_COMP *const cpi, ThreadData *td,
+                                           TileDataEnc *tile_data,
+                                           TokenExtra **tp,
+                                           SIMPLE_MOTION_DATA_TREE *sms_root,
+                                           int mi_row, int mi_col,
+                                           const BLOCK_SIZE bsize) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &td->mb;
+  ExtPartController *const ext_part_controller = &cpi->ext_part_controller;
+  aom_partition_features_t features;
+  prepare_sb_features_before_search(cpi, td, tile_data, mi_row, mi_col, bsize,
+                                    &features);
+  features.mi_row = mi_row;
+  features.mi_col = mi_col;
+  features.frame_width = cpi->frame_info.frame_width;
+  features.frame_height = cpi->frame_info.frame_height;
+  features.block_size = bsize;
+  av1_ext_part_send_features(ext_part_controller, &features);
+  PC_TREE *pc_tree;
+
+  // rd mode search (dry run) for a valid partition decision from the ml model.
+  aom_partition_decision_t partition_decision;
+  do {
+    const bool valid_decision = av1_ext_part_get_partition_decision(
+        ext_part_controller, &partition_decision);
+    if (!valid_decision) return false;
+
+    // First, let's take the easy approach.
+    // We require that the ml model has to provide partition decisions for the
+    // whole superblock.
+    pc_tree = av1_alloc_pc_tree_node(bsize);
+    build_pc_tree_from_part_decision(&partition_decision, bsize, pc_tree);
+
+    const RD_STATS this_rdcost = rd_search_for_fixed_partition(
+        cpi, td, tile_data, tp, sms_root, mi_row, mi_col, bsize, pc_tree);
+    aom_partition_stats_t stats;
+    update_partition_stats(&this_rdcost, &stats);
+    av1_ext_part_send_partition_stats(ext_part_controller, &stats);
+    if (!partition_decision.is_final_decision) {
+      av1_free_pc_tree_recursive(pc_tree, av1_num_planes(cm), 0, 0);
+    }
+  } while (!partition_decision.is_final_decision);
+
+  // Encode with the selected mode and partition.
+  set_cb_offsets(x->cb_offset, 0, 0);
+  encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
+            pc_tree, NULL);
+
+  av1_free_pc_tree_recursive(pc_tree, av1_num_planes(cm), 0, 0);
+
+  return true;
+}
+
+static bool recursive_partition(AV1_COMP *const cpi, ThreadData *td,
+                                TileDataEnc *tile_data, TokenExtra **tp,
+                                SIMPLE_MOTION_DATA_TREE *sms_root,
+                                PC_TREE *pc_tree, int mi_row, int mi_col,
+                                const BLOCK_SIZE bsize, RD_STATS *this_rdcost) {
+  const AV1_COMMON *const cm = &cpi->common;
+  ExtPartController *const ext_part_controller = &cpi->ext_part_controller;
+  MACROBLOCK *const x = &td->mb;
+  if (mi_row >= cm->mi_params.mi_rows || mi_col >= cm->mi_params.mi_cols) {
+    return false;
+  }
+  aom_partition_decision_t partition_decision;
+  do {
+    aom_partition_features_t features;
+    features.mi_row = mi_row;
+    features.mi_col = mi_col;
+    features.frame_width = cpi->frame_info.frame_width;
+    features.frame_height = cpi->frame_info.frame_height;
+    features.block_size = bsize;
+    av1_ext_part_send_features(ext_part_controller, &features);
+    const bool valid_decision = av1_ext_part_get_partition_decision(
+        ext_part_controller, &partition_decision);
+    if (!valid_decision) return false;
+    pc_tree->partitioning = partition_decision.current_decision;
+    PartitionSearchState part_search_state;
+
+    // Initialization of state variables used in partition search.
+    // TODO(chengchen): check if there is hidden conditions that don't allow
+    // all possible partition types.
+    init_partition_search_state_params(x, cpi, &part_search_state, mi_row,
+                                       mi_col, bsize);
+    // Override partition costs at the edges of the frame in the same
+    // way as in read_partition (see decodeframe.c).
+    PartitionBlkParams blk_params = part_search_state.part_blk_params;
+    if (!av1_blk_has_rows_and_cols(&blk_params))
+      set_partition_cost_for_edge_blk(cm, &part_search_state);
+
+    av1_init_rd_stats(this_rdcost);
+    if (partition_decision.current_decision == PARTITION_SPLIT) {
+      assert(block_size_wide[bsize] >= 8 && block_size_high[bsize] >= 8);
+      const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+      RD_STATS split_rdc[SUB_PARTITIONS_SPLIT];
+      for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
+        av1_init_rd_stats(&split_rdc[i]);
+        if (pc_tree->split[i] == NULL)
+          pc_tree->split[i] = av1_alloc_pc_tree_node(subsize);
+        pc_tree->split[i]->index = i;
+      }
+      const int orig_rdmult = x->rdmult;
+      setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL);
+      (void)orig_rdmult;
+      // TODO(chengchen): check boundary conditions
+      // top-left
+      recursive_partition(cpi, td, tile_data, tp, sms_root, pc_tree->split[0],
+                          mi_row, mi_col, subsize, &split_rdc[0]);
+      // top-right
+      recursive_partition(cpi, td, tile_data, tp, sms_root, pc_tree->split[1],
+                          mi_row, mi_col + mi_size_wide[subsize], subsize,
+                          &split_rdc[1]);
+      // bottom-left
+      recursive_partition(cpi, td, tile_data, tp, sms_root, pc_tree->split[2],
+                          mi_row + mi_size_high[subsize], mi_col, subsize,
+                          &split_rdc[2]);
+      // bottom_right
+      recursive_partition(cpi, td, tile_data, tp, sms_root, pc_tree->split[3],
+                          mi_row + mi_size_high[subsize],
+                          mi_col + mi_size_wide[subsize], subsize,
+                          &split_rdc[3]);
+      this_rdcost->rate += part_search_state.partition_cost[PARTITION_SPLIT];
+      // problem is here, the rdmult is different from the rdmult in sub block.
+      for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
+        this_rdcost->rate += split_rdc[i].rate;
+        this_rdcost->dist += split_rdc[i].dist;
+        av1_rd_cost_update(x->rdmult, this_rdcost);
+      }
+      x->rdmult = orig_rdmult;
+    } else {
+      *this_rdcost = rd_search_for_fixed_partition(
+          cpi, td, tile_data, tp, sms_root, mi_row, mi_col, bsize, pc_tree);
+    }
+
+    aom_partition_stats_t stats;
+    update_partition_stats(this_rdcost, &stats);
+    av1_ext_part_send_partition_stats(ext_part_controller, &stats);
+    if (!partition_decision.is_final_decision) {
+      if (partition_decision.current_decision == PARTITION_SPLIT) {
+        for (int i = 0; i < 4; ++i) {
+          if (pc_tree->split[i] != NULL) {
+            av1_free_pc_tree_recursive(pc_tree->split[i], av1_num_planes(cm), 0,
+                                       0);
+            pc_tree->split[i] = NULL;
+          }
+        }
+      }
+    }
+  } while (!partition_decision.is_final_decision);
+
+  return true;
+}
+
+// The ML model only needs to make decisions for the current block each time.
+static bool ml_partition_search_partial(AV1_COMP *const cpi, ThreadData *td,
+                                        TileDataEnc *tile_data, TokenExtra **tp,
+                                        SIMPLE_MOTION_DATA_TREE *sms_root,
+                                        int mi_row, int mi_col,
+                                        const BLOCK_SIZE bsize) {
+  AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &td->mb;
+  ExtPartController *const ext_part_controller = &cpi->ext_part_controller;
+  aom_partition_features_t features;
+  prepare_sb_features_before_search(cpi, td, tile_data, mi_row, mi_col, bsize,
+                                    &features);
+  features.mi_row = mi_row;
+  features.mi_col = mi_col;
+  features.frame_width = cpi->frame_info.frame_width;
+  features.frame_height = cpi->frame_info.frame_height;
+  features.block_size = bsize;
+  av1_ext_part_send_features(ext_part_controller, &features);
+  PC_TREE *pc_tree;
+  pc_tree = av1_alloc_pc_tree_node(bsize);
+
+  RD_STATS rdcost;
+  const bool valid_partition =
+      recursive_partition(cpi, td, tile_data, tp, sms_root, pc_tree, mi_row,
+                          mi_col, bsize, &rdcost);
+  if (!valid_partition) {
+    return false;
+  }
+
+  // Encode with the selected mode and partition.
+  set_cb_offsets(x->cb_offset, 0, 0);
+  encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
+            pc_tree, NULL);
+
+  av1_free_pc_tree_recursive(pc_tree, av1_num_planes(cm), 0, 0);
+
+  return true;
+}
+
 bool av1_rd_partition_search(AV1_COMP *const cpi, ThreadData *td,
                              TileDataEnc *tile_data, TokenExtra **tp,
                              SIMPLE_MOTION_DATA_TREE *sms_root, int mi_row,
                              int mi_col, const BLOCK_SIZE bsize,
                              RD_STATS *best_rd_cost) {
+  if (cpi->ext_part_controller.ready) {
+    bool valid_search = true;
+    const aom_ext_part_decision_mode_t decision_mode =
+        av1_get_ext_part_decision_mode(&cpi->ext_part_controller);
+    if (decision_mode == AOM_EXT_PART_WHOLE_TREE) {
+      valid_search = ml_partition_search_whole_tree(
+          cpi, td, tile_data, tp, sms_root, mi_row, mi_col, bsize);
+    } else if (decision_mode == AOM_EXT_PART_RECURSIVE) {
+      valid_search = ml_partition_search_partial(
+          cpi, td, tile_data, tp, sms_root, mi_row, mi_col, bsize);
+    } else {
+      assert(0 && "Unknown decision mode.");
+      return false;
+    }
+    if (!valid_search) {
+      assert(0 && "Invalid search from ML model, partition search failed.");
+      exit(0);
+    }
+    return true;
+  }
+
   AV1_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &td->mb;
   int best_idx = 0;
@@ -3960,6 +4421,53 @@
   return true;
 }
 
+DECLARE_ALIGNED(16, static const uint8_t, all_zeros[MAX_SB_SIZE]) = { 0 };
+DECLARE_ALIGNED(16, static const uint16_t,
+                highbd_all_zeros[MAX_SB_SIZE]) = { 0 };
+static void log_sub_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs,
+                              double *var_min, double *var_max) {
+  // This functions returns a the minimum and maximum log variances for 4x4
+  // sub blocks in the current block.
+
+  MACROBLOCKD *xd = &x->e_mbd;
+  double var;
+  unsigned int sse;
+  int i, j;
+
+  int right_overflow =
+      (xd->mb_to_right_edge < 0) ? ((-xd->mb_to_right_edge) >> 3) : 0;
+  int bottom_overflow =
+      (xd->mb_to_bottom_edge < 0) ? ((-xd->mb_to_bottom_edge) >> 3) : 0;
+
+  const int bw = MI_SIZE * mi_size_wide[bs] - right_overflow;
+  const int bh = MI_SIZE * mi_size_high[bs] - bottom_overflow;
+
+  // Initialize min to a large value and max to 0 at
+  *var_min = 10.0;
+  *var_max = 0.0;
+
+  for (i = 0; i < bh; i += 4) {
+    for (j = 0; j < bw; j += 4) {
+      if (is_cur_buf_hbd(xd)) {
+        var =
+            log(1.0 + cpi->ppi->fn_ptr[BLOCK_4X4].vf(
+                          x->plane[0].src.buf + i * x->plane[0].src.stride + j,
+                          x->plane[0].src.stride,
+                          CONVERT_TO_BYTEPTR(highbd_all_zeros), 0, &sse) /
+                          16);
+      } else {
+        var =
+            log(1.0 + cpi->ppi->fn_ptr[BLOCK_4X4].vf(
+                          x->plane[0].src.buf + i * x->plane[0].src.stride + j,
+                          x->plane[0].src.stride, all_zeros, 0, &sse) /
+                          16);
+      }
+      *var_min = AOMMIN(*var_min, var);
+      *var_max = AOMMAX(*var_max, var);
+    }
+  }
+}
+
 /*!\brief AV1 block partition search (full search).
 *
 * \ingroup partition_search
@@ -4014,6 +4522,7 @@
   RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
   const TokenExtra *const tp_orig = *tp;
   PartitionSearchState part_search_state;
+
   // Initialization of state variables used in partition search.
   init_partition_search_state_params(x, cpi, &part_search_state, mi_row, mi_col,
                                      bsize);
@@ -4024,7 +4533,7 @@
     av1_invalid_rd_stats(rd_cost);
     return part_search_state.found_best_partition;
   }
-  if (bsize == cm->seq_params.sb_size) x->must_find_valid_partition = 0;
+  if (bsize == cm->seq_params->sb_size) x->must_find_valid_partition = 0;
 
   // Override skipping rectangular partition operations for edge blocks.
   if (none_rd) *none_rd = 0;
@@ -4040,7 +4549,7 @@
 
   // Override partition costs at the edges of the frame in the same
   // way as in read_partition (see decodeframe.c).
-  if (!(blk_params.has_rows && blk_params.has_cols))
+  if (!av1_blk_has_rows_and_cols(&blk_params))
     set_partition_cost_for_edge_blk(cm, &part_search_state);
 
   // Disable rectangular partitions for inner blocks when the current block is
@@ -4069,6 +4578,18 @@
   const int orig_rdmult = x->rdmult;
   setup_block_rdmult(cpi, x, mi_row, mi_col, bsize, NO_AQ, NULL);
 
+  // Apply simple motion search for the entire super block with fixed block
+  // size, e.g., 16x16, to collect features and write to files for the
+  // external ML model.
+  // TODO(chengchen): reduce motion search. This function is similar to
+  // av1_get_max_min_partition_features().
+  if (COLLECT_MOTION_SEARCH_FEATURE_SB && !frame_is_intra_only(cm) &&
+      bsize == cm->seq_params->sb_size) {
+    av1_collect_motion_search_features_sb(cpi, td, tile_data, mi_row, mi_col,
+                                          bsize, /*features=*/NULL);
+    collect_tpl_stats_sb(cpi, bsize, mi_row, mi_col, /*features=*/NULL);
+  }
+
   // Update rd cost of the bound using the current multiplier.
   av1_rd_cost_update(x->rdmult, &best_rdc);
 
@@ -4085,24 +4606,13 @@
 #if CONFIG_COLLECT_COMPONENT_TIMING
   start_timing(cpi, av1_prune_partitions_time);
 #endif
-  int *partition_horz_allowed = &part_search_state.partition_rect_allowed[HORZ];
-  int *partition_vert_allowed = &part_search_state.partition_rect_allowed[VERT];
-  int *prune_horz = &part_search_state.prune_rect_part[HORZ];
-  int *prune_vert = &part_search_state.prune_rect_part[VERT];
   // Pruning: before searching any partition type, using source and simple
   // motion search results to prune out unlikely partitions.
-  av1_prune_partitions_before_search(
-      cpi, x, mi_row, mi_col, bsize, sms_tree,
-      &part_search_state.partition_none_allowed, partition_horz_allowed,
-      partition_vert_allowed, &part_search_state.do_rectangular_split,
-      &part_search_state.do_square_split, prune_horz, prune_vert);
+  av1_prune_partitions_before_search(cpi, x, sms_tree, &part_search_state);
 
   // Pruning: eliminating partition types leading to coding block sizes outside
   // the min and max bsize limitations set from the encoder.
-  av1_prune_partitions_by_max_min_bsize(
-      &x->sb_enc, bsize, blk_params.has_rows && blk_params.has_cols,
-      &part_search_state.partition_none_allowed, partition_horz_allowed,
-      partition_vert_allowed, &part_search_state.do_square_split);
+  av1_prune_partitions_by_max_min_bsize(&x->sb_enc, &part_search_state);
 #if CONFIG_COLLECT_COMPONENT_TIMING
   end_timing(cpi, av1_prune_partitions_time);
 #endif
@@ -4126,11 +4636,37 @@
 #if CONFIG_COLLECT_COMPONENT_TIMING
   start_timing(cpi, none_partition_search_time);
 #endif
+
+  // Further pruning, or in some cases reverse pruning, when allintra is set.
+  // This code helps visual quality (and in some cases metrics quality) where
+  // the current block comprises at least one very low variance sub-block and
+  // at least one where the variance is much higher.
+  //
+  // The idea is that in such cases there is danger of ringing and other visual
+  // artifacts from a high variance feature such as an edge into a very low
+  // variance region.
+  //
+  // The approach taken is to force break down / split to a smaller block size
+  // to try and separate out the low variance and well predicted blocks from the
+  // more complex ones and to prevent propagation of ringing over a large
+  // region.
+  if ((cpi->oxcf.mode == ALLINTRA) && (bsize >= BLOCK_16X16)) {
+    double var_min, var_max;
+    log_sub_block_var(cpi, x, bsize, &var_min, &var_max);
+
+    if ((var_min < 0.5) && ((var_max - var_min) > 3.0)) {
+      part_search_state.partition_none_allowed = 0;
+      part_search_state.terminate_partition_search = 0;
+      part_search_state.do_square_split = 1;
+    }
+  }
+
   // PARTITION_NONE search stage.
   int64_t part_none_rd = INT64_MAX;
   none_partition_search(cpi, td, tile_data, x, pc_tree, sms_tree, &x_ctx,
                         &part_search_state, &best_rdc, &pb_source_variance,
                         none_rd, &part_none_rd);
+
 #if CONFIG_COLLECT_COMPONENT_TIMING
   end_timing(cpi, none_partition_search_time);
 #endif
@@ -4149,10 +4685,21 @@
   // when NONE and SPLIT partition rd_costs are INT64_MAX.
   if (cpi->sf.part_sf.early_term_after_none_split &&
       part_none_rd == INT64_MAX && part_split_rd == INT64_MAX &&
-      !x->must_find_valid_partition && (bsize != cm->seq_params.sb_size)) {
+      !x->must_find_valid_partition && (bsize != cm->seq_params->sb_size)) {
     part_search_state.terminate_partition_search = 1;
   }
 
+  // Do not evaluate non-square partitions if NONE partition did not choose a
+  // newmv mode and is skippable.
+  if ((cpi->sf.part_sf.skip_non_sq_part_based_on_none >= 2) &&
+      (pc_tree->none != NULL)) {
+    if (x->qindex <= 200 && is_inter_mode(pc_tree->none->mic.mode) &&
+        !have_newmv_in_inter_mode(pc_tree->none->mic.mode) &&
+        pc_tree->none->skippable && !x->must_find_valid_partition &&
+        bsize >= BLOCK_16X16)
+      part_search_state.do_rectangular_split = 0;
+  }
+
   // Prune partitions based on PARTITION_NONE and PARTITION_SPLIT.
   prune_partitions_after_split(cpi, x, sms_tree, &part_search_state, &best_rdc,
                                part_none_rd, part_split_rd);
@@ -4181,10 +4728,18 @@
   assert(IMPLIES(!cpi->oxcf.part_cfg.enable_rect_partitions,
                  !part_search_state.do_rectangular_split));
 
-  const int ext_partition_allowed =
+  int ext_partition_allowed =
       part_search_state.do_rectangular_split &&
       bsize > cpi->sf.part_sf.ext_partition_eval_thresh &&
-      blk_params.has_rows && blk_params.has_cols;
+      av1_blk_has_rows_and_cols(&blk_params);
+
+  // Do not evaluate extended partitions if NONE partition is skippable.
+  if ((cpi->sf.part_sf.skip_non_sq_part_based_on_none >= 1) &&
+      (pc_tree->none != NULL)) {
+    if (pc_tree->none->skippable && !x->must_find_valid_partition &&
+        bsize >= BLOCK_16X16)
+      ext_partition_allowed = 0;
+  }
 #if CONFIG_COLLECT_COMPONENT_TIMING
   start_timing(cpi, ab_partitions_search_time);
 #endif
@@ -4211,9 +4766,7 @@
   assert(IMPLIES(!cpi->oxcf.part_cfg.enable_rect_partitions,
                  !part4_search_allowed[HORZ4]));
   if (!part_search_state.terminate_partition_search &&
-      part4_search_allowed[HORZ4] && blk_params.has_rows &&
-      (part_search_state.do_rectangular_split ||
-       av1_active_h_edge(cpi, mi_row, blk_params.mi_step))) {
+      part4_search_allowed[HORZ4]) {
     const int inc_step[NUM_PART4_TYPES] = { mi_size_high[blk_params.bsize] / 4,
                                             0 };
     // Evaluation of Horz4 partition type.
@@ -4226,9 +4779,7 @@
   assert(IMPLIES(!cpi->oxcf.part_cfg.enable_rect_partitions,
                  !part4_search_allowed[VERT4]));
   if (!part_search_state.terminate_partition_search &&
-      part4_search_allowed[VERT4] && blk_params.has_cols &&
-      (part_search_state.do_rectangular_split ||
-       av1_active_v_edge(cpi, mi_row, blk_params.mi_step))) {
+      part4_search_allowed[VERT4] && blk_params.has_cols) {
     const int inc_step[NUM_PART4_TYPES] = { 0, mi_size_wide[blk_params.bsize] /
                                                    4 };
     // Evaluation of Vert4 partition type.
@@ -4240,7 +4791,7 @@
   end_timing(cpi, rd_pick_4partition_time);
 #endif
 
-  if (bsize == cm->seq_params.sb_size &&
+  if (bsize == cm->seq_params->sb_size &&
       !part_search_state.found_best_partition) {
     // Did not find a valid partition, go back and search again, with less
     // constraint on which partition types to search.
@@ -4267,7 +4818,7 @@
   // prediction block.
   print_partition_timing_stats_with_rdcost(
       part_timing_stats, mi_row, mi_col, bsize,
-      cpi->gf_group.update_type[cpi->gf_frame_index],
+      cpi->ppi->gf_group.update_type[cpi->gf_frame_index],
       cm->current_frame.frame_number, &best_rdc, "part_timing.csv");
   /*
   print_partition_timing_stats(part_timing_stats, cm->show_frame,
@@ -4289,13 +4840,16 @@
   // If a valid partition is found and reconstruction is required for future
   // sub-blocks in the same group.
   if (part_search_state.found_best_partition && pc_tree->index != 3) {
-    if (bsize == cm->seq_params.sb_size) {
+    if (bsize == cm->seq_params->sb_size) {
       // Encode the superblock.
       const int emit_output = multi_pass_mode != SB_DRY_PASS;
       const RUN_TYPE run_type = emit_output ? OUTPUT_ENABLED : DRY_RUN_NORMAL;
 
       // Write partition tree to file. Not used by default.
-      if (0) write_partition_tree(cpi, pc_tree, bsize, mi_row, mi_col);
+      if (COLLECT_MOTION_SEARCH_FEATURE_SB) {
+        write_partition_tree(cpi, pc_tree, bsize, mi_row, mi_col);
+        ++cpi->sb_counter;
+      }
 
       set_cb_offsets(x->cb_offset, 0, 0);
       encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, run_type, bsize,
@@ -4318,7 +4872,7 @@
   if (pc_tree_dealloc == 0)
     av1_free_pc_tree_recursive(pc_tree, num_planes, 1, 1);
 
-  if (bsize == cm->seq_params.sb_size) {
+  if (bsize == cm->seq_params->sb_size) {
     assert(best_rdc.rate < INT_MAX);
     assert(best_rdc.dist < INT64_MAX);
   } else {
@@ -4331,12 +4885,14 @@
 }
 #endif  // !CONFIG_REALTIME_ONLY
 
+#undef COLLECT_MOTION_SEARCH_FEATURE_SB
+
 #if CONFIG_RT_ML_PARTITIONING
 #define FEATURES 6
 #define LABELS 2
-static int ml_predict_var_paritioning(AV1_COMP *cpi, MACROBLOCK *x,
-                                      BLOCK_SIZE bsize, int mi_row,
-                                      int mi_col) {
+static int ml_predict_var_partitioning(AV1_COMP *cpi, MACROBLOCK *x,
+                                       BLOCK_SIZE bsize, int mi_row,
+                                       int mi_col) {
   AV1_COMMON *const cm = &cpi->common;
   const NN_CONFIG *nn_config = NULL;
   const float *means = NULL;
@@ -4363,13 +4919,11 @@
 
   if (!nn_config) return -1;
 
-  aom_clear_system_state();
-
   {
     const float thresh = cpi->oxcf.speed <= 5 ? 1.25f : 0.0f;
     float features[FEATURES] = { 0.0f };
     const int dc_q = av1_dc_quant_QTX(cm->quant_params.base_qindex, 0,
-                                      cm->seq_params.bit_depth);
+                                      cm->seq_params->bit_depth);
     int feature_idx = 0;
     float score[LABELS];
 
@@ -4391,7 +4945,7 @@
       int i;
       // Variance of whole block.
       const unsigned int var =
-          cpi->fn_ptr[bsize].vf(src, src_stride, pred, pred_stride, &sse);
+          cpi->ppi->fn_ptr[bsize].vf(src, src_stride, pred, pred_stride, &sse);
       const float factor = (var == 0) ? 1.0f : (1.0f / (float)var);
 
       features[feature_idx] = (logf((float)var + 1.0f) - means[feature_idx]) /
@@ -4404,8 +4958,8 @@
         const int pred_offset = y_idx * pred_stride + x_idx;
         // Variance of quarter block.
         const unsigned int sub_var =
-            cpi->fn_ptr[subsize].vf(src + src_offset, src_stride,
-                                    pred + pred_offset, pred_stride, &sse);
+            cpi->ppi->fn_ptr[subsize].vf(src + src_offset, src_stride,
+                                         pred + pred_offset, pred_stride, &sse);
         const float var_ratio = (var == 0) ? 1.0f : factor * (float)sub_var;
         features[feature_idx] =
             (var_ratio - means[feature_idx]) / sqrtf(vars[feature_idx]);
@@ -4445,11 +4999,9 @@
 
   FILE *f = fopen(fname, "a");
 
-  aom_clear_system_state();
-
   {
     const int dc_q = av1_dc_quant_QTX(cm->quant_params.base_qindex, 0,
-                                      cm->seq_params.bit_depth);
+                                      cm->seq_params->bit_depth);
     int feature_idx = 0;
 
     features[feature_idx++] = logf((float)(dc_q * dc_q) / 256.0f + 1.0f);
@@ -4597,7 +5149,7 @@
   int partition_none_allowed = !force_horz_split && !force_vert_split;
 
   assert(mi_size_wide[bsize] == mi_size_high[bsize]);  // Square partition only
-  assert(cm->seq_params.sb_size == BLOCK_64X64);       // Small SB so far
+  assert(cm->seq_params->sb_size == BLOCK_64X64);      // Small SB so far
 
   (void)*tp_orig;
 
@@ -4606,7 +5158,7 @@
 #ifndef _COLLECT_GROUND_TRUTH_
   if (partition_none_allowed && do_split) {
     const int ml_predicted_partition =
-        ml_predict_var_paritioning(cpi, x, bsize, mi_row, mi_col);
+        ml_predict_var_partitioning(cpi, x, bsize, mi_row, mi_col);
     if (ml_predicted_partition == PARTITION_NONE) do_split = 0;
     if (ml_predicted_partition == PARTITION_SPLIT) partition_none_allowed = 0;
   }
@@ -4704,7 +5256,7 @@
   fill_mode_info_sb(cpi, x, mi_row, mi_col, bsize, pc_tree);
 
   if (do_recon) {
-    if (bsize == cm->seq_params.sb_size) {
+    if (bsize == cm->seq_params->sb_size) {
       // NOTE: To get estimate for rate due to the tokens, use:
       // int rate_coeffs = 0;
       // encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_COSTCOEFFS,
diff --git a/av1/encoder/partition_search.h b/av1/encoder/partition_search.h
index 2d686dc..8a67176 100644
--- a/av1/encoder/partition_search.h
+++ b/av1/encoder/partition_search.h
@@ -64,12 +64,14 @@
 static AOM_INLINE void update_cb_offsets(MACROBLOCK *x, const BLOCK_SIZE bsize,
                                          const int subsampling_x,
                                          const int subsampling_y) {
-  const BLOCK_SIZE plane_bsize =
-      get_plane_block_size(bsize, subsampling_x, subsampling_y);
   x->cb_offset[PLANE_TYPE_Y] += block_size_wide[bsize] * block_size_high[bsize];
-  if (x->e_mbd.is_chroma_ref)
+  if (x->e_mbd.is_chroma_ref) {
+    const BLOCK_SIZE plane_bsize =
+        get_plane_block_size(bsize, subsampling_x, subsampling_y);
+    assert(plane_bsize != BLOCK_INVALID);
     x->cb_offset[PLANE_TYPE_UV] +=
         block_size_wide[plane_bsize] * block_size_high[plane_bsize];
+  }
 }
 
 #endif  // AOM_AV1_ENCODER_PARTITION_SEARCH_H_
diff --git a/av1/encoder/partition_strategy.c b/av1/encoder/partition_strategy.c
index 0f0da7e..269bbfd 100644
--- a/av1/encoder/partition_strategy.c
+++ b/av1/encoder/partition_strategy.c
@@ -11,10 +11,9 @@
 
 #include <float.h>
 
+#include "av1/encoder/encodeframe_utils.h"
 #include "config/aom_dsp_rtcd.h"
 
-#include "aom_ports/system_state.h"
-
 #include "av1/common/enums.h"
 #include "av1/common/reconinter.h"
 
@@ -27,6 +26,7 @@
 
 #include "av1/encoder/motion_search_facade.h"
 #include "av1/encoder/partition_strategy.h"
+#include "av1/encoder/partition_search.h"
 #include "av1/encoder/rdopt.h"
 
 #if !CONFIG_REALTIME_ONLY
@@ -35,6 +35,48 @@
     int mi_row, int mi_col, BLOCK_SIZE bsize, float *features,
     int features_to_get);
 
+static bool ext_ml_model_decision_before_none(
+    AV1_COMP *cpi, const float features_from_motion[FEATURE_SIZE_SMS_SPLIT],
+    int *partition_none_allowed, int *partition_horz_allowed,
+    int *partition_vert_allowed, int *do_rectangular_split,
+    int *do_square_split);
+
+static bool ext_ml_model_decision_before_none_part2(
+    AV1_COMP *cpi,
+    const float features_from_motion[FEATURE_SIZE_SMS_PRUNE_PART],
+    int *prune_horz, int *prune_vert);
+
+static bool ext_ml_model_decision_after_none(
+    ExtPartController *const ext_part_controller, const int is_intra_frame,
+    const float *const features_after_none, int *do_square_split,
+    int *do_rectangular_split);
+
+static bool ext_ml_model_decision_after_none_part2(
+    AV1_COMP *const cpi, const float *const features_terminate,
+    int *terminate_partition_search);
+
+static bool ext_ml_model_decision_after_split(
+    AV1_COMP *const cpi, const float *const features_terminate,
+    int *terminate_partition_search);
+
+static bool ext_ml_model_decision_after_split_part2(
+    ExtPartController *const ext_part_controller, const int is_intra_frame,
+    const float *const features_prune, int *prune_rect_part_horz,
+    int *prune_rect_part_vert);
+
+static bool ext_ml_model_decision_after_rect(
+    ExtPartController *const ext_part_controller, const int is_intra_frame,
+    const float *const features_after_rect, int *horza_partition_allowed,
+    int *horzb_partition_allowed, int *verta_partition_allowed,
+    int *vertb_partition_allowed);
+
+static bool ext_ml_model_decision_after_part_ab(
+    AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, int part_ctx,
+    int64_t best_rd, int64_t rect_part_rd[NUM_RECT_PARTS][SUB_PARTITIONS_RECT],
+    int64_t split_rd[SUB_PARTITIONS_SPLIT], int *const partition_horz4_allowed,
+    int *const partition_vert4_allowed, unsigned int pb_source_variance,
+    int mi_row, int mi_col);
+
 static INLINE int convert_bsize_to_idx(BLOCK_SIZE bsize) {
   switch (bsize) {
     case BLOCK_128X128: return 0;
@@ -45,9 +87,45 @@
     default: assert(0 && "Invalid bsize"); return -1;
   }
 }
-#endif
 
-#if !CONFIG_REALTIME_ONLY
+static char *get_feature_file_name(int id) {
+  static char *feature_file_names[] = {
+    "feature_before_partition_none",
+    "feature_before_partition_none_prune_rect",
+    "feature_after_partition_none_prune",
+    "feature_after_partition_none_terminate",
+    "feature_after_partition_split_terminate",
+    "feature_after_partition_split_prune_rect",
+    "feature_after_partition_rect",
+    "feature_after_partition_ab",
+  };
+
+  return feature_file_names[id];
+}
+
+static void write_features_to_file(const char *const path,
+                                   const bool is_test_mode,
+                                   const float *features,
+                                   const int feature_size, const int id,
+                                   const int bsize, const int mi_row,
+                                   const int mi_col) {
+  if (!WRITE_FEATURE_TO_FILE && !is_test_mode) return;
+
+  char filename[256];
+  snprintf(filename, sizeof(filename), "%s/%s", path,
+           get_feature_file_name(id));
+  FILE *pfile = fopen(filename, "a");
+  if (!is_test_mode) {
+    fprintf(pfile, "%d,%d,%d,%d,%d\n", id, bsize, mi_row, mi_col, feature_size);
+  }
+  for (int i = 0; i < feature_size; ++i) {
+    fprintf(pfile, "%.6f", features[i]);
+    if (i < feature_size - 1) fprintf(pfile, ",");
+  }
+  fprintf(pfile, "\n");
+  fclose(pfile);
+}
+
 // TODO(chiyotsai@google.com): This is very much a work in progress. We still
 // need to the following:
 //   -- add support for hdres
@@ -55,14 +133,14 @@
 //   -- use reconstructed pixels instead of source pixels for padding
 //   -- use chroma pixels in addition to luma pixels
 void av1_intra_mode_cnn_partition(const AV1_COMMON *const cm, MACROBLOCK *x,
-                                  int bsize, int quad_tree_idx,
-                                  int *partition_none_allowed,
-                                  int *partition_horz_allowed,
-                                  int *partition_vert_allowed,
-                                  int *do_rectangular_split,
-                                  int *do_square_split) {
-  assert(cm->seq_params.sb_size >= BLOCK_64X64 &&
+                                  int quad_tree_idx,
+                                  int intra_cnn_based_part_prune_level,
+                                  PartitionSearchState *part_state) {
+  assert(cm->seq_params->sb_size >= BLOCK_64X64 &&
          "Invalid sb_size for intra_cnn!");
+  const PartitionBlkParams *blk_params = &part_state->part_blk_params;
+  const BLOCK_SIZE bsize = blk_params->bsize;
+
   const int bsize_idx = convert_bsize_to_idx(bsize);
 
   if (bsize == BLOCK_128X128) {
@@ -73,7 +151,6 @@
 
   // Precompute the CNN part and cache the result in MACROBLOCK
   if (bsize == BLOCK_64X64 && !part_info->cnn_output_valid) {
-    aom_clear_system_state();
     const CNN_CONFIG *cnn_config = &av1_intra_mode_cnn_partition_cnn_config;
 
     // Prepare the output
@@ -148,7 +225,6 @@
 
   const NN_CONFIG *dnn_config = dnn_configs[bsize_idx];
 
-  aom_clear_system_state();
   float dnn_features[100];
   float logits[4] = { 0.0f };
 
@@ -218,7 +294,6 @@
 
   // Make decision
   av1_nn_predict(dnn_features, dnn_config, 1, logits);
-  aom_clear_system_state();
 
   const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
   const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480;
@@ -241,25 +316,28 @@
   }
 
   if (logits[0] > split_only_thresh) {
-    *partition_none_allowed = 0;
-    *partition_horz_allowed = 0;
-    *partition_vert_allowed = 0;
-    *do_rectangular_split = 0;
+    // As screen contents tend to choose larger partitions, do not prune
+    // PARTITION_NONE when intra_cnn_based_part_prune_level=1.
+    if (intra_cnn_based_part_prune_level != 1) {
+      part_state->partition_none_allowed = 0;
+    }
+    part_state->do_square_split = 1;
+    av1_disable_rect_partitions(part_state);
   }
 
   if (logits[0] < no_split_thresh) {
-    *do_square_split = 0;
+    av1_disable_square_split_partition(part_state);
   }
 }
 
-void av1_simple_motion_search_based_split(
-    AV1_COMP *const cpi, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_tree,
-    int mi_row, int mi_col, BLOCK_SIZE bsize, int *partition_none_allowed,
-    int *partition_horz_allowed, int *partition_vert_allowed,
-    int *do_rectangular_split, int *do_square_split) {
-  aom_clear_system_state();
-
+void av1_simple_motion_search_based_split(AV1_COMP *const cpi, MACROBLOCK *x,
+                                          SIMPLE_MOTION_DATA_TREE *sms_tree,
+                                          PartitionSearchState *part_state) {
   const AV1_COMMON *const cm = &cpi->common;
+  const PartitionBlkParams *blk_params = &part_state->part_blk_params;
+  const int mi_row = blk_params->mi_row, mi_col = blk_params->mi_col;
+  const BLOCK_SIZE bsize = blk_params->bsize;
+
   const int bsize_idx = convert_bsize_to_idx(bsize);
   const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
   const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480;
@@ -275,6 +353,10 @@
       av1_simple_motion_search_split_nn_config[bsize_idx];
   const int agg = cpi->sf.part_sf.simple_motion_search_prune_agg;
 
+  if (agg < 0) {
+    return;
+  }
+
   const float split_only_thresh =
       av1_simple_motion_search_split_thresh[agg][res_idx][bsize_idx];
   const float no_split_thresh =
@@ -284,6 +366,22 @@
   simple_motion_search_prune_part_features(cpi, x, sms_tree, mi_row, mi_col,
                                            bsize, features,
                                            FEATURE_SMS_SPLIT_MODEL_FLAG);
+
+  // Write features to file
+  write_features_to_file(cpi->oxcf.partition_info_path,
+                         cpi->ext_part_controller.test_mode, features,
+                         FEATURE_SIZE_SMS_SPLIT, 0, bsize, mi_row, mi_col);
+
+  // Note: it is intended to not normalize the features here, to keep it
+  // consistent for all features collected and passed to the external model.
+  if (ext_ml_model_decision_before_none(
+          cpi, features, &part_state->partition_none_allowed,
+          &part_state->partition_rect_allowed[HORZ],
+          &part_state->partition_rect_allowed[VERT],
+          &part_state->do_rectangular_split, &part_state->do_square_split)) {
+    return;
+  }
+
   for (int idx = 0; idx < FEATURE_SIZE_SMS_SPLIT; idx++) {
     features[idx] = (features[idx] - ml_mean[idx]) / ml_std[idx];
   }
@@ -291,18 +389,14 @@
   float score = 0.0f;
 
   av1_nn_predict(features, nn_config, 1, &score);
-  aom_clear_system_state();
 
   if (score > split_only_thresh) {
-    *partition_none_allowed = 0;
-    *partition_horz_allowed = 0;
-    *partition_vert_allowed = 0;
-    *do_rectangular_split = 0;
+    av1_set_square_split_only(part_state);
   }
 
   if (cpi->sf.part_sf.simple_motion_search_split >= 2 &&
       score < no_split_thresh) {
-    *do_square_split = 0;
+    av1_disable_square_split_partition(part_state);
   }
 
   // If the score is very low, prune rectangular split since it is unlikely to
@@ -313,7 +407,9 @@
         scale * av1_simple_motion_search_no_split_thresh
                     [cpi->sf.part_sf.simple_motion_search_rect_split][res_idx]
                     [bsize_idx];
-    if (score < rect_split_thresh) *do_rectangular_split = 0;
+    if (score < rect_split_thresh) {
+      part_state->do_rectangular_split = 0;
+    }
   }
 }
 
@@ -356,7 +452,7 @@
       int_mv best_mv =
           av1_simple_motion_search(cpi, x, mi_row, mi_col, bsize, ref,
                                    start_mvs[ref], num_planes, use_subpixel);
-      curr_var = cpi->fn_ptr[bsize].vf(
+      curr_var = cpi->ppi->fn_ptr[bsize].vf(
           x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].dst.buf,
           xd->plane[0].dst.stride, &curr_sse);
       if (curr_sse < *best_sse) {
@@ -402,6 +498,7 @@
   const int w_mi = mi_size_wide[bsize];
   const int h_mi = mi_size_high[bsize];
   assert(mi_size_wide[bsize] == mi_size_high[bsize]);
+  assert(bsize >= BLOCK_8X8);
   assert(cpi->ref_frame_flags & av1_ref_frame_flag_list[LAST_FRAME] ||
          cpi->ref_frame_flags & av1_ref_frame_flag_list[ALTREF_FRAME]);
 
@@ -468,7 +565,6 @@
 
   if (!features) return;
 
-  aom_clear_system_state();
   int f_idx = 0;
   if (features_to_get & FEATURE_SMS_NONE_FLAG) {
     for (int sub_idx = 0; sub_idx < 2; sub_idx++) {
@@ -489,7 +585,6 @@
       features[f_idx++] = logf(1.0f + sms_tree->sms_rect_feat[sub_idx]);
     }
   }
-  aom_clear_system_state();
 
   const MACROBLOCKD *xd = &x->e_mbd;
   set_offsets_for_motion_search(cpi, x, mi_row, mi_col, bsize);
@@ -511,12 +606,14 @@
   features[f_idx++] = (float)mi_size_high_log2[left_bsize];
 }
 
-void av1_simple_motion_search_prune_rect(
-    AV1_COMP *const cpi, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_tree,
-    int mi_row, int mi_col, BLOCK_SIZE bsize, int partition_horz_allowed,
-    int partition_vert_allowed, int *prune_horz, int *prune_vert) {
-  aom_clear_system_state();
+void av1_simple_motion_search_prune_rect(AV1_COMP *const cpi, MACROBLOCK *x,
+                                         SIMPLE_MOTION_DATA_TREE *sms_tree,
+                                         PartitionSearchState *part_state) {
   const AV1_COMMON *const cm = &cpi->common;
+  const PartitionBlkParams *blk_params = &part_state->part_blk_params;
+  const int mi_row = blk_params->mi_row, mi_col = blk_params->mi_col;
+  const BLOCK_SIZE bsize = blk_params->bsize;
+
   const int bsize_idx = convert_bsize_to_idx(bsize);
   const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
   const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480;
@@ -530,6 +627,11 @@
               *ml_std = av1_simple_motion_search_prune_rect_std[bsize_idx];
 
   const int agg = cpi->sf.part_sf.simple_motion_search_prune_agg;
+
+  if (agg < 0) {
+    return;
+  }
+
   const float prune_thresh =
       av1_simple_motion_search_prune_rect_thresh[agg][res_idx][bsize_idx];
 
@@ -543,6 +645,26 @@
   simple_motion_search_prune_part_features(cpi, x, sms_tree, mi_row, mi_col,
                                            bsize, features,
                                            FEATURE_SMS_PRUNE_PART_FLAG);
+
+  // Note: it is intended to not normalize the features here, to keep it
+  // consistent for all features collected and passed to the external model.
+  if (cpi->sf.part_sf.simple_motion_search_prune_rect &&
+      !frame_is_intra_only(cm) &&
+      (part_state->partition_rect_allowed[HORZ] ||
+       part_state->partition_rect_allowed[VERT]) &&
+      bsize >= BLOCK_8X8 && !av1_superres_scaled(cm)) {
+    // Write features to file
+    write_features_to_file(
+        cpi->oxcf.partition_info_path, cpi->ext_part_controller.test_mode,
+        features, FEATURE_SIZE_SMS_PRUNE_PART, 1, bsize, mi_row, mi_col);
+
+    if (ext_ml_model_decision_before_none_part2(
+            cpi, features, &part_state->prune_rect_part[HORZ],
+            &part_state->prune_rect_part[VERT])) {
+      return;
+    }
+  }
+
   for (int f_idx = 0; f_idx < FEATURE_SIZE_SMS_PRUNE_PART; f_idx++) {
     features[f_idx] = (features[f_idx] - ml_mean[f_idx]) / ml_std[f_idx];
   }
@@ -555,17 +677,15 @@
                               : EXT_PARTITION_TYPES;
 
   av1_nn_predict(features, nn_config, 1, scores);
-  aom_clear_system_state();
 
   av1_nn_softmax(scores, probs, num_classes);
 
   // Determine if we should prune rectangular partitions.
-  if (cpi->sf.part_sf.simple_motion_search_prune_rect &&
-      !frame_is_intra_only(cm) &&
-      (partition_horz_allowed || partition_vert_allowed) &&
-      bsize >= BLOCK_8X8 && !av1_superres_scaled(cm)) {
-    *prune_horz = probs[PARTITION_HORZ] <= prune_thresh;
-    *prune_vert = probs[PARTITION_VERT] <= prune_thresh;
+  if (probs[PARTITION_HORZ] <= prune_thresh) {
+    part_state->prune_rect_part[HORZ] = 1;
+  }
+  if (probs[PARTITION_VERT] <= prune_thresh) {
+    part_state->prune_rect_part[VERT] = 1;
   }
 }
 
@@ -577,10 +697,11 @@
 //  - blk_row + blk_height/2 < total_rows and blk_col + blk_width/2 < total_cols
 void av1_simple_motion_search_early_term_none(
     AV1_COMP *const cpi, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_tree,
-    int mi_row, int mi_col, BLOCK_SIZE bsize, const RD_STATS *none_rdc,
-    int *early_terminate) {
-  // TODO(chiyotsai@google.com): There are other features we can extract from
-  // PARTITION_NONE. Play with this later.
+    const RD_STATS *none_rdc, PartitionSearchState *part_state) {
+  const PartitionBlkParams *blk_params = &part_state->part_blk_params;
+  const int mi_row = blk_params->mi_row, mi_col = blk_params->mi_col;
+  const BLOCK_SIZE bsize = blk_params->bsize;
+
   float features[FEATURE_SIZE_SMS_TERM_NONE] = { 0.0f };
   simple_motion_search_prune_part_features(cpi, x, sms_tree, mi_row, mi_col,
                                            bsize, features,
@@ -617,6 +738,16 @@
     assert(0 && "Unexpected block size in simple_motion_term_none");
   }
 
+  // Write features to file
+  write_features_to_file(cpi->oxcf.partition_info_path,
+                         cpi->ext_part_controller.test_mode, features,
+                         FEATURE_SIZE_SMS_TERM_NONE, 3, bsize, mi_row, mi_col);
+
+  if (ext_ml_model_decision_after_none_part2(
+          cpi, features, &part_state->terminate_partition_search)) {
+    return;
+  }
+
   if (ml_model) {
     float score = 0.0f;
     for (f_idx = 0; f_idx < FEATURE_SIZE_SMS_TERM_NONE; f_idx++) {
@@ -626,7 +757,7 @@
     score += ml_model[FEATURE_SIZE_SMS_TERM_NONE];
 
     if (score >= 0.0f) {
-      *early_terminate = 1;
+      part_state->terminate_partition_search = 1;
     }
   }
 }
@@ -636,7 +767,7 @@
                                         float *features) {
   AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
-  const BLOCK_SIZE sb_size = cm->seq_params.sb_size;
+  const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
 
   // Currently this only allows 128X128 SB size. May extend it to 64X64 SB size.
   assert(sb_size == BLOCK_128X128);
@@ -644,7 +775,6 @@
   int f_idx = 0;
 
   const int dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd) >> (xd->bd - 8);
-  aom_clear_system_state();
   const float log_q_sq = logf(1.0f + (float)(dc_q * dc_q) / 256.0f);
 
   // Perform full-pixel single motion search in Y plane of 16x16 mbs in the sb
@@ -679,7 +809,6 @@
       int_mv best_mv = av1_simple_motion_sse_var(
           cpi, x, this_mi_row, this_mi_col, mb_size, start_mv, 0, &sse, &var);
 
-      aom_clear_system_state();
       const float mv_row = (float)(best_mv.as_mv.row / 8);
       const float mv_col = (float)(best_mv.as_mv.col / 8);
       const float log_sse = logf(1.0f + (float)sse);
@@ -701,7 +830,6 @@
       if (log_sse < min_log_sse) min_log_sse = log_sse;
       if (log_sse > max_log_sse) max_log_sse = log_sse;
     }
-  aom_clear_system_state();
   const int blks = mb_rows * mb_cols;
   const float avg_mv_row = sum_mv_row / (float)blks;
   const float var_mv_row =
@@ -751,7 +879,6 @@
   assert(cpi->sf.part_sf.auto_max_partition_based_on_simple_motion !=
          NOT_IN_USE);
 
-  aom_clear_system_state();
   av1_nn_predict(features, nn_config, 1, scores);
 
   int result = MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1;
@@ -782,7 +909,7 @@
     }
   } else if (cpi->sf.part_sf.auto_max_partition_based_on_simple_motion ==
              ADAPT_PRED) {
-    const BLOCK_SIZE sb_size = cpi->common.seq_params.sb_size;
+    const BLOCK_SIZE sb_size = cpi->common.seq_params->sb_size;
     const MACROBLOCKD *const xd = &x->e_mbd;
     // TODO(debargha): x->source_variance is unavailable at this point,
     // so compute. The redundant recomputation later can be removed.
@@ -849,12 +976,16 @@
 #define FEATURES 31
 void av1_ml_early_term_after_split(AV1_COMP *const cpi, MACROBLOCK *const x,
                                    SIMPLE_MOTION_DATA_TREE *const sms_tree,
-                                   BLOCK_SIZE bsize, int64_t best_rd,
-                                   int64_t part_none_rd, int64_t part_split_rd,
-                                   int64_t *split_block_rd, int mi_row,
-                                   int mi_col,
-                                   int *const terminate_partition_search) {
-  if (best_rd <= 0 || best_rd == INT64_MAX || *terminate_partition_search)
+                                   int64_t best_rd, int64_t part_none_rd,
+                                   int64_t part_split_rd,
+                                   int64_t *split_block_rd,
+                                   PartitionSearchState *part_state) {
+  const PartitionBlkParams *blk_params = &part_state->part_blk_params;
+  const int mi_row = blk_params->mi_row, mi_col = blk_params->mi_col;
+  const BLOCK_SIZE bsize = blk_params->bsize;
+
+  if (best_rd <= 0 || best_rd == INT64_MAX ||
+      part_state->terminate_partition_search)
     return;
 
   const AV1_COMMON *const cm = &cpi->common;
@@ -895,8 +1026,6 @@
   int f_idx = 0;
   float features[FEATURES] = { 0.0f };
 
-  aom_clear_system_state();
-
   features[f_idx++] = logf(1.0f + (float)dc_q / 4.0f);
   features[f_idx++] = logf(1.0f + (float)best_rd / bs / bs / 1024.0f);
 
@@ -930,18 +1059,33 @@
 
   assert(f_idx == FEATURES);
 
+  // Write features to file
+  write_features_to_file(cpi->oxcf.partition_info_path,
+                         cpi->ext_part_controller.test_mode, features, FEATURES,
+                         4, bsize, mi_row, mi_col);
+
+  if (ext_ml_model_decision_after_split(
+          cpi, features, &part_state->terminate_partition_search)) {
+    return;
+  }
+
   float score = 0.0f;
   av1_nn_predict(features, nn_config, 1, &score);
   // Score is indicator of confidence that we should NOT terminate.
-  if (score < thresh) *terminate_partition_search = 1;
+  if (score < thresh) {
+    part_state->terminate_partition_search = 1;
+  }
 }
 #undef FEATURES
 
-void av1_ml_prune_rect_partition(const AV1_COMP *const cpi,
-                                 const MACROBLOCK *const x, BLOCK_SIZE bsize,
+void av1_ml_prune_rect_partition(AV1_COMP *const cpi, const MACROBLOCK *const x,
                                  int64_t best_rd, int64_t none_rd,
-                                 int64_t *split_rd, int *const dst_prune_horz,
-                                 int *const dst_prune_vert) {
+                                 const int64_t *split_rd,
+                                 PartitionSearchState *part_state) {
+  const PartitionBlkParams *blk_params = &part_state->part_blk_params;
+  const int mi_row = blk_params->mi_row, mi_col = blk_params->mi_col;
+  const BLOCK_SIZE bsize = blk_params->bsize;
+
   if (bsize < BLOCK_8X8 || best_rd >= 1000000000) return;
   best_rd = AOMMAX(best_rd, 1);
   const NN_CONFIG *nn_config = NULL;
@@ -971,7 +1115,6 @@
     default: assert(0 && "Unexpected bsize.");
   }
   if (!nn_config) return;
-  aom_clear_system_state();
 
   // 1. Compute input features
   float features[9];
@@ -1017,27 +1160,41 @@
   for (int i = 0; i < SUB_PARTITIONS_SPLIT; i++)
     features[5 + i] = (float)split_variance[i] / (float)whole_block_variance;
 
+  // Write features to file
+  write_features_to_file(cpi->oxcf.partition_info_path,
+                         cpi->ext_part_controller.test_mode, features,
+                         /*feature_size=*/9, 5, bsize, mi_row, mi_col);
+
+  if (ext_ml_model_decision_after_split_part2(
+          &cpi->ext_part_controller, frame_is_intra_only(&cpi->common),
+          features, &part_state->prune_rect_part[HORZ],
+          &part_state->prune_rect_part[VERT])) {
+    return;
+  }
+
   // 2. Do the prediction and prune 0-2 partitions based on their probabilities
   float raw_scores[3] = { 0.0f };
   av1_nn_predict(features, nn_config, 1, raw_scores);
-  aom_clear_system_state();
   float probs[3] = { 0.0f };
   av1_nn_softmax(raw_scores, probs, 3);
 
   // probs[0] is the probability of the fact that both rectangular partitions
   // are worse than current best_rd
-  if (probs[1] <= cur_thresh) (*dst_prune_horz) = 1;
-  if (probs[2] <= cur_thresh) (*dst_prune_vert) = 1;
+  if (probs[1] <= cur_thresh) part_state->prune_rect_part[HORZ] = 1;
+  if (probs[2] <= cur_thresh) part_state->prune_rect_part[VERT] = 1;
 }
 
 // Use a ML model to predict if horz_a, horz_b, vert_a, and vert_b should be
 // considered.
-void av1_ml_prune_ab_partition(
-    BLOCK_SIZE bsize, int part_ctx, int var_ctx, int64_t best_rd,
-    int64_t horz_rd[SUB_PARTITIONS_RECT], int64_t vert_rd[SUB_PARTITIONS_RECT],
-    int64_t split_rd[SUB_PARTITIONS_SPLIT], int *const horza_partition_allowed,
-    int *const horzb_partition_allowed, int *const verta_partition_allowed,
-    int *const vertb_partition_allowed) {
+void av1_ml_prune_ab_partition(AV1_COMP *const cpi, int part_ctx, int var_ctx,
+                               int64_t best_rd,
+                               PartitionSearchState *part_state,
+                               int *ab_partitions_allowed) {
+  const PartitionBlkParams blk_params = part_state->part_blk_params;
+  const int mi_row = blk_params.mi_row;
+  const int mi_col = blk_params.mi_col;
+  const int bsize = blk_params.bsize;
+
   if (bsize < BLOCK_8X8 || best_rd >= 1000000000) return;
   const NN_CONFIG *nn_config = NULL;
   switch (bsize) {
@@ -1050,8 +1207,6 @@
   }
   if (!nn_config) return;
 
-  aom_clear_system_state();
-
   // Generate features.
   float features[10];
   int feature_index = 0;
@@ -1061,16 +1216,19 @@
   int sub_block_rdcost[8] = { 0 };
   int rd_index = 0;
   for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
+    const int64_t *horz_rd = part_state->rect_part_rd[HORZ];
     if (horz_rd[i] > 0 && horz_rd[i] < 1000000000)
       sub_block_rdcost[rd_index] = (int)horz_rd[i];
     ++rd_index;
   }
   for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
+    const int64_t *vert_rd = part_state->rect_part_rd[VERT];
     if (vert_rd[i] > 0 && vert_rd[i] < 1000000000)
       sub_block_rdcost[rd_index] = (int)vert_rd[i];
     ++rd_index;
   }
   for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
+    const int64_t *split_rd = part_state->split_rd;
     if (split_rd[i] > 0 && split_rd[i] < 1000000000)
       sub_block_rdcost[rd_index] = (int)split_rd[i];
     ++rd_index;
@@ -1084,10 +1242,24 @@
   }
   assert(feature_index == 10);
 
+  // Write features to file
+  if (!frame_is_intra_only(&cpi->common)) {
+    write_features_to_file(cpi->oxcf.partition_info_path,
+                           cpi->ext_part_controller.test_mode, features,
+                           /*feature_size=*/10, 6, bsize, mi_row, mi_col);
+  }
+
+  if (ext_ml_model_decision_after_rect(
+          &cpi->ext_part_controller, frame_is_intra_only(&cpi->common),
+          features, &ab_partitions_allowed[HORZ_A],
+          &ab_partitions_allowed[HORZ_B], &ab_partitions_allowed[VERT_A],
+          &ab_partitions_allowed[VERT_B])) {
+    return;
+  }
+
   // Calculate scores using the NN model.
   float score[16] = { 0.0f };
   av1_nn_predict(features, nn_config, 1, score);
-  aom_clear_system_state();
   int int_score[16];
   int max_score = -1000;
   for (int i = 0; i < 16; ++i) {
@@ -1102,16 +1274,13 @@
     case BLOCK_32X32: thresh -= 100; break;
     default: break;
   }
-  *horza_partition_allowed = 0;
-  *horzb_partition_allowed = 0;
-  *verta_partition_allowed = 0;
-  *vertb_partition_allowed = 0;
+  av1_zero_array(ab_partitions_allowed, NUM_AB_PARTS);
   for (int i = 0; i < 16; ++i) {
     if (int_score[i] >= thresh) {
-      if ((i >> 0) & 1) *horza_partition_allowed = 1;
-      if ((i >> 1) & 1) *horzb_partition_allowed = 1;
-      if ((i >> 2) & 1) *verta_partition_allowed = 1;
-      if ((i >> 3) & 1) *vertb_partition_allowed = 1;
+      if ((i >> 0) & 1) ab_partitions_allowed[HORZ_A] = 1;
+      if ((i >> 1) & 1) ab_partitions_allowed[HORZ_B] = 1;
+      if ((i >> 2) & 1) ab_partitions_allowed[VERT_A] = 1;
+      if ((i >> 3) & 1) ab_partitions_allowed[VERT_B] = 1;
     }
   }
 }
@@ -1119,16 +1288,27 @@
 #define FEATURES 18
 #define LABELS 4
 // Use a ML model to predict if horz4 and vert4 should be considered.
-void av1_ml_prune_4_partition(
-    const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize,
-    int part_ctx, int64_t best_rd,
-    int64_t rect_part_rd[NUM_RECT_PARTS][SUB_PARTITIONS_RECT],
-    int64_t split_rd[SUB_PARTITIONS_SPLIT], int *const partition_horz4_allowed,
-    int *const partition_vert4_allowed, unsigned int pb_source_variance,
-    int mi_row, int mi_col) {
+void av1_ml_prune_4_partition(AV1_COMP *const cpi, MACROBLOCK *const x,
+                              int part_ctx, int64_t best_rd,
+                              PartitionSearchState *part_state,
+                              int *part4_allowed,
+                              unsigned int pb_source_variance) {
+  const PartitionBlkParams blk_params = part_state->part_blk_params;
+  const int mi_row = blk_params.mi_row;
+  const int mi_col = blk_params.mi_col;
+  const int bsize = blk_params.bsize;
+
+  int64_t(*rect_part_rd)[SUB_PARTITIONS_RECT] = part_state->rect_part_rd;
+  int64_t *split_rd = part_state->split_rd;
+  if (ext_ml_model_decision_after_part_ab(
+          cpi, x, bsize, part_ctx, best_rd, rect_part_rd, split_rd,
+          &part4_allowed[HORZ4], &part4_allowed[VERT4], pb_source_variance,
+          mi_row, mi_col))
+    return;
+
   if (best_rd >= 1000000000) return;
-  int64_t *horz_rd = rect_part_rd[HORZ];
-  int64_t *vert_rd = rect_part_rd[VERT];
+  int64_t *horz_rd = rect_part_rd[HORZ4];
+  int64_t *vert_rd = rect_part_rd[VERT4];
   const NN_CONFIG *nn_config = NULL;
   switch (bsize) {
     case BLOCK_16X16: nn_config = &av1_4_partition_nnconfig_16; break;
@@ -1138,8 +1318,6 @@
   }
   if (!nn_config) return;
 
-  aom_clear_system_state();
-
   // Generate features.
   float features[FEATURES];
   int feature_index = 0;
@@ -1225,10 +1403,16 @@
   }
   assert(feature_index == FEATURES);
 
+  // Write features to file
+  if (!frame_is_intra_only(&cpi->common)) {
+    write_features_to_file(cpi->oxcf.partition_info_path,
+                           cpi->ext_part_controller.test_mode, features,
+                           FEATURES, 7, bsize, mi_row, mi_col);
+  }
+
   // Calculate scores using the NN model.
   float score[LABELS] = { 0.0f };
   av1_nn_predict(features, nn_config, 1, score);
-  aom_clear_system_state();
   int int_score[LABELS];
   int max_score = -1000;
   for (int i = 0; i < LABELS; ++i) {
@@ -1244,12 +1428,11 @@
     case BLOCK_64X64: thresh -= 200; break;
     default: break;
   }
-  *partition_horz4_allowed = 0;
-  *partition_vert4_allowed = 0;
+  av1_zero_array(part4_allowed, NUM_PART4_TYPES);
   for (int i = 0; i < LABELS; ++i) {
     if (int_score[i] >= thresh) {
-      if ((i >> 0) & 1) *partition_horz4_allowed = 1;
-      if ((i >> 1) & 1) *partition_vert4_allowed = 1;
+      if ((i >> 0) & 1) part4_allowed[HORZ4] = 1;
+      if ((i >> 1) & 1) part4_allowed[VERT4] = 1;
     }
   }
 }
@@ -1257,10 +1440,14 @@
 #undef LABELS
 
 #define FEATURES 4
-int av1_ml_predict_breakout(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
-                            const MACROBLOCK *const x,
-                            const RD_STATS *const rd_stats,
-                            unsigned int pb_source_variance, int bit_depth) {
+void av1_ml_predict_breakout(AV1_COMP *const cpi, const MACROBLOCK *const x,
+                             const RD_STATS *const rd_stats,
+                             unsigned int pb_source_variance, int bit_depth,
+                             PartitionSearchState *part_state) {
+  const PartitionBlkParams *blk_params = &part_state->part_blk_params;
+  const int mi_row = blk_params->mi_row, mi_col = blk_params->mi_col;
+  const BLOCK_SIZE bsize = blk_params->bsize;
+
   const NN_CONFIG *nn_config = NULL;
   int thresh = 0;
   switch (bsize) {
@@ -1286,7 +1473,7 @@
       break;
     default: assert(0 && "Unexpected bsize.");
   }
-  if (!nn_config || thresh < 0) return 0;
+  if (!nn_config || thresh < 0) return;
 
   const float ml_predict_breakout_thresh_scale[3] = { 1.15f, 1.05f, 1.0f };
   thresh = (int)((float)thresh *
@@ -1296,7 +1483,6 @@
   // Generate feature values.
   float features[FEATURES];
   int feature_index = 0;
-  aom_clear_system_state();
 
   const int num_pels_log2 = num_pels_log2_lookup[bsize];
   float rate_f = (float)AOMMIN(rd_stats->rate, INT_MAX);
@@ -1314,27 +1500,53 @@
   features[feature_index++] = (float)(dc_q * dc_q) / 256.0f;
   assert(feature_index == FEATURES);
 
+  // Write features to file
+  write_features_to_file(cpi->oxcf.partition_info_path,
+                         cpi->ext_part_controller.test_mode, features, FEATURES,
+                         2, bsize, mi_row, mi_col);
+
+  if (ext_ml_model_decision_after_none(&cpi->ext_part_controller,
+                                       frame_is_intra_only(&cpi->common),
+                                       features, &part_state->do_square_split,
+                                       &part_state->do_rectangular_split)) {
+    return;
+  }
+
   // Calculate score using the NN model.
   float score = 0.0f;
   av1_nn_predict(features, nn_config, 1, &score);
-  aom_clear_system_state();
 
   // Make decision.
-  return (int)(score * 100) >= thresh;
+  if ((int)(score * 100) >= thresh) {
+    part_state->do_square_split = 0;
+    part_state->do_rectangular_split = 0;
+  }
 }
 #undef FEATURES
 
-void av1_prune_partitions_before_search(
-    AV1_COMP *const cpi, MACROBLOCK *const x, int mi_row, int mi_col,
-    BLOCK_SIZE bsize, SIMPLE_MOTION_DATA_TREE *const sms_tree,
-    int *partition_none_allowed, int *partition_horz_allowed,
-    int *partition_vert_allowed, int *do_rectangular_split,
-    int *do_square_split, int *prune_horz, int *prune_vert) {
+void av1_prune_partitions_before_search(AV1_COMP *const cpi,
+                                        MACROBLOCK *const x,
+                                        SIMPLE_MOTION_DATA_TREE *const sms_tree,
+                                        PartitionSearchState *part_state) {
   const AV1_COMMON *const cm = &cpi->common;
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
 
+  const PartitionBlkParams *blk_params = &part_state->part_blk_params;
+  const BLOCK_SIZE bsize = blk_params->bsize;
+
+  // Prune rectangular partitions for larger blocks.
+  if (bsize > cpi->sf.part_sf.rect_partition_eval_thresh) {
+    part_state->do_rectangular_split = 0;
+    part_state->partition_rect_allowed[HORZ] = 0;
+    part_state->partition_rect_allowed[VERT] = 0;
+  }
+
   // Prune rectangular, AB and 4-way partition based on q index and block size
-  if (cpi->sf.part_sf.prune_rectangular_split_based_on_qidx) {
+  if (cpi->sf.part_sf.prune_rectangular_split_based_on_qidx == 1) {
+    if (bsize == BLOCK_8X8 && x->qindex < 35)
+      av1_disable_rect_partitions(part_state);
+
+  } else if (cpi->sf.part_sf.prune_rectangular_split_based_on_qidx == 2) {
     // Enumeration difference between two square partitions
     const int sqr_bsize_step = BLOCK_32X32 - BLOCK_16X16;
     int max_bsize =
@@ -1348,9 +1560,7 @@
     // qidx 86 to 170: prune bsize below BLOCK_16X16
     // qidx 171 to 255: prune bsize below BLOCK_8X8
     if (bsize < max_prune_bsize) {
-      *do_rectangular_split = 0;
-      *partition_horz_allowed = 0;
-      *partition_vert_allowed = 0;
+      av1_disable_rect_partitions(part_state);
     }
   }
 
@@ -1369,63 +1579,60 @@
       }
     }
     if (prune_sub_8x8) {
-      *partition_horz_allowed = 0;
-      *partition_vert_allowed = 0;
-      *do_square_split = 0;
+      av1_disable_all_splits(part_state);
     }
   }
 
   // A CNN-based speed feature pruning out either split or all non-split
   // partition in INTRA frame coding.
-  const int try_intra_cnn_split =
-      !cpi->use_screen_content_tools && frame_is_intra_only(cm) &&
-      cpi->sf.part_sf.intra_cnn_split &&
-      cm->seq_params.sb_size >= BLOCK_64X64 && bsize <= BLOCK_64X64 &&
-      bsize >= BLOCK_8X8 &&
-      mi_row + mi_size_high[bsize] <= mi_params->mi_rows &&
-      mi_col + mi_size_wide[bsize] <= mi_params->mi_cols;
+  const int try_intra_cnn_based_part_prune =
+      frame_is_intra_only(cm) &&
+      cpi->sf.part_sf.intra_cnn_based_part_prune_level &&
+      cm->seq_params->sb_size >= BLOCK_64X64 && bsize <= BLOCK_64X64 &&
+      blk_params->bsize_at_least_8x8 &&
+      av1_is_whole_blk_in_frame(blk_params, mi_params);
 
-  if (try_intra_cnn_split) {
+  if (try_intra_cnn_based_part_prune) {
     av1_intra_mode_cnn_partition(
-        &cpi->common, x, bsize, x->part_search_info.quad_tree_idx,
-        partition_none_allowed, partition_horz_allowed, partition_vert_allowed,
-        do_rectangular_split, do_square_split);
+        &cpi->common, x, x->part_search_info.quad_tree_idx,
+        cpi->sf.part_sf.intra_cnn_based_part_prune_level, part_state);
   }
 
   // Use simple motion search to prune out split or non-split partitions. This
   // must be done prior to PARTITION_SPLIT to propagate the initial mvs to a
   // smaller blocksize.
   const int try_split_only =
-      !cpi->use_screen_content_tools &&
-      cpi->sf.part_sf.simple_motion_search_split && *do_square_split &&
-      bsize >= BLOCK_8X8 &&
-      mi_row + mi_size_high[bsize] <= mi_params->mi_rows &&
-      mi_col + mi_size_wide[bsize] <= mi_params->mi_cols &&
+      cpi->sf.part_sf.simple_motion_search_split &&
+      part_state->do_square_split && blk_params->bsize_at_least_8x8 &&
+      av1_is_whole_blk_in_frame(blk_params, mi_params) &&
       !frame_is_intra_only(cm) && !av1_superres_scaled(cm);
 
   if (try_split_only) {
-    av1_simple_motion_search_based_split(
-        cpi, x, sms_tree, mi_row, mi_col, bsize, partition_none_allowed,
-        partition_horz_allowed, partition_vert_allowed, do_rectangular_split,
-        do_square_split);
+    av1_simple_motion_search_based_split(cpi, x, sms_tree, part_state);
   }
 
   // Use simple motion search to prune out rectangular partition in some
   // direction. The results are stored in prune_horz and prune_vert in order to
   // bypass future related pruning checks if a pruning decision has been made.
-  const int try_prune_rect =
-      !cpi->use_screen_content_tools &&
-      cpi->sf.part_sf.simple_motion_search_prune_rect &&
-      !frame_is_intra_only(cm) && *do_rectangular_split &&
-      (*do_square_split || *partition_none_allowed ||
-       (*prune_horz && *prune_vert)) &&
-      (*partition_horz_allowed || *partition_vert_allowed) &&
-      bsize >= BLOCK_8X8;
+
+  // We want to search at least one partition mode, so don't prune if NONE and
+  // SPLIT are disabled.
+  const int non_rect_part_allowed =
+      part_state->do_square_split || part_state->partition_none_allowed;
+  // Only run the model if the partitions are not already pruned.
+  const int rect_part_allowed = part_state->do_rectangular_split &&
+                                ((part_state->partition_rect_allowed[HORZ] &&
+                                  !part_state->prune_rect_part[HORZ]) ||
+                                 (part_state->partition_rect_allowed[VERT] &&
+                                  !part_state->prune_rect_part[VERT]));
+
+  const int try_prune_rect = cpi->sf.part_sf.simple_motion_search_prune_rect &&
+                             !frame_is_intra_only(cm) &&
+                             non_rect_part_allowed && rect_part_allowed &&
+                             !av1_superres_scaled(cm);
 
   if (try_prune_rect) {
-    av1_simple_motion_search_prune_rect(
-        cpi, x, sms_tree, mi_row, mi_col, bsize, *partition_horz_allowed,
-        *partition_vert_allowed, prune_horz, prune_vert);
+    av1_simple_motion_search_prune_rect(cpi, x, sms_tree, part_state);
   }
 }
 
@@ -1435,13 +1642,13 @@
 }
 #endif  // NDEBUG
 
-void av1_prune_partitions_by_max_min_bsize(
-    SuperBlockEnc *sb_enc, BLOCK_SIZE bsize, int is_not_edge_block,
-    int *partition_none_allowed, int *partition_horz_allowed,
-    int *partition_vert_allowed, int *do_square_split) {
+void av1_prune_partitions_by_max_min_bsize(SuperBlockEnc *sb_enc,
+                                           PartitionSearchState *part_state) {
   assert(is_bsize_square(sb_enc->max_partition_size));
   assert(is_bsize_square(sb_enc->min_partition_size));
   assert(sb_enc->min_partition_size <= sb_enc->max_partition_size);
+  const PartitionBlkParams *blk_params = &part_state->part_blk_params;
+  const BLOCK_SIZE bsize = blk_params->bsize;
   assert(is_bsize_square(bsize));
   const int max_partition_size_1d = block_size_wide[sb_enc->max_partition_size];
   const int min_partition_size_1d = block_size_wide[sb_enc->min_partition_size];
@@ -1451,19 +1658,18 @@
   const int is_gt_max_sq_part = bsize_1d > max_partition_size_1d;
   if (is_gt_max_sq_part) {
     // If current block size is larger than max, only allow split.
-    *partition_none_allowed = 0;
-    *partition_horz_allowed = 0;
-    *partition_vert_allowed = 0;
-    *do_square_split = 1;
+    av1_set_square_split_only(part_state);
   } else if (is_le_min_sq_part) {
     // If current block size is less or equal to min, only allow none if valid
     // block large enough; only allow split otherwise.
-    *partition_horz_allowed = 0;
-    *partition_vert_allowed = 0;
+    av1_disable_rect_partitions(part_state);
+
     // only disable square split when current block is not at the picture
     // boundary. otherwise, inherit the square split flag from previous logic
-    if (is_not_edge_block) *do_square_split = 0;
-    *partition_none_allowed = !(*do_square_split);
+    if (av1_blk_has_rows_and_cols(blk_params)) {
+      part_state->do_square_split = 0;
+    }
+    part_state->partition_none_allowed = !(part_state->do_square_split);
   }
 }
 
@@ -1501,24 +1707,25 @@
   return 1;
 }
 
-void av1_prune_ab_partitions(
-    const AV1_COMP *cpi, const MACROBLOCK *x, const PC_TREE *pc_tree,
-    BLOCK_SIZE bsize, int pb_source_variance, int64_t best_rdcost,
-    int64_t rect_part_rd[NUM_RECT_PARTS][SUB_PARTITIONS_RECT],
-    int64_t split_rd[SUB_PARTITIONS_SPLIT],
-    const RD_RECT_PART_WIN_INFO *rect_part_win_info, int ext_partition_allowed,
-    int partition_horz_allowed, int partition_vert_allowed,
-    int *horza_partition_allowed, int *horzb_partition_allowed,
-    int *verta_partition_allowed, int *vertb_partition_allowed) {
-  int64_t *horz_rd = rect_part_rd[HORZ];
-  int64_t *vert_rd = rect_part_rd[VERT];
+void av1_prune_ab_partitions(AV1_COMP *cpi, const MACROBLOCK *x,
+                             const PC_TREE *pc_tree, int pb_source_variance,
+                             int64_t best_rdcost,
+                             const RD_RECT_PART_WIN_INFO *rect_part_win_info,
+                             bool ext_partition_allowed,
+                             PartitionSearchState *part_state,
+                             int *ab_partitions_allowed) {
+  int64_t *horz_rd = part_state->rect_part_rd[HORZ];
+  int64_t *vert_rd = part_state->rect_part_rd[VERT];
+  int64_t *split_rd = part_state->split_rd;
   const PartitionCfg *const part_cfg = &cpi->oxcf.part_cfg;
   // The standard AB partitions are allowed initially if ext-partition-types are
   // allowed.
-  int horzab_partition_allowed =
-      ext_partition_allowed & part_cfg->enable_ab_partitions;
-  int vertab_partition_allowed =
-      ext_partition_allowed & part_cfg->enable_ab_partitions;
+  int horzab_partition_allowed = ext_partition_allowed &&
+                                 part_cfg->enable_ab_partitions &&
+                                 part_state->partition_rect_allowed[HORZ];
+  int vertab_partition_allowed = ext_partition_allowed &&
+                                 part_cfg->enable_ab_partitions &&
+                                 part_state->partition_rect_allowed[VERT];
 
   // Pruning: pruning out AB partitions on one main direction based on the
   // current best partition and source variance.
@@ -1553,20 +1760,20 @@
   // Pruning: pruning out horz_a or horz_b if the combined rdcost of its
   // subblocks estimated from previous partitions is much higher than the best
   // rd so far.
-  *horza_partition_allowed = horzab_partition_allowed;
-  *horzb_partition_allowed = horzab_partition_allowed;
+  ab_partitions_allowed[HORZ_A] = horzab_partition_allowed;
+  ab_partitions_allowed[HORZ_B] = horzab_partition_allowed;
   if (cpi->sf.part_sf.prune_ext_partition_types_search_level) {
     const int64_t horz_a_rd = horz_rd[1] + split_rd[0] + split_rd[1];
     const int64_t horz_b_rd = horz_rd[0] + split_rd[2] + split_rd[3];
     switch (cpi->sf.part_sf.prune_ext_partition_types_search_level) {
       case 1:
-        *horza_partition_allowed &= (horz_a_rd / 16 * 14 < best_rdcost);
-        *horzb_partition_allowed &= (horz_b_rd / 16 * 14 < best_rdcost);
+        ab_partitions_allowed[HORZ_A] &= (horz_a_rd / 16 * 14 < best_rdcost);
+        ab_partitions_allowed[HORZ_B] &= (horz_b_rd / 16 * 14 < best_rdcost);
         break;
       case 2:
       default:
-        *horza_partition_allowed &= (horz_a_rd / 16 * 15 < best_rdcost);
-        *horzb_partition_allowed &= (horz_b_rd / 16 * 15 < best_rdcost);
+        ab_partitions_allowed[HORZ_A] &= (horz_a_rd / 16 * 15 < best_rdcost);
+        ab_partitions_allowed[HORZ_B] &= (horz_b_rd / 16 * 15 < best_rdcost);
         break;
     }
   }
@@ -1574,20 +1781,20 @@
   // Pruning: pruning out vert_a or vert_b if the combined rdcost of its
   // subblocks estimated from previous partitions is much higher than the best
   // rd so far.
-  *verta_partition_allowed = vertab_partition_allowed;
-  *vertb_partition_allowed = vertab_partition_allowed;
+  ab_partitions_allowed[VERT_A] = vertab_partition_allowed;
+  ab_partitions_allowed[VERT_B] = vertab_partition_allowed;
   if (cpi->sf.part_sf.prune_ext_partition_types_search_level) {
     const int64_t vert_a_rd = vert_rd[1] + split_rd[0] + split_rd[2];
     const int64_t vert_b_rd = vert_rd[0] + split_rd[1] + split_rd[3];
     switch (cpi->sf.part_sf.prune_ext_partition_types_search_level) {
       case 1:
-        *verta_partition_allowed &= (vert_a_rd / 16 * 14 < best_rdcost);
-        *vertb_partition_allowed &= (vert_b_rd / 16 * 14 < best_rdcost);
+        ab_partitions_allowed[VERT_A] &= (vert_a_rd / 16 * 14 < best_rdcost);
+        ab_partitions_allowed[VERT_B] &= (vert_b_rd / 16 * 14 < best_rdcost);
         break;
       case 2:
       default:
-        *verta_partition_allowed &= (vert_a_rd / 16 * 15 < best_rdcost);
-        *vertb_partition_allowed &= (vert_b_rd / 16 * 15 < best_rdcost);
+        ab_partitions_allowed[VERT_A] &= (vert_a_rd / 16 * 15 < best_rdcost);
+        ab_partitions_allowed[VERT_B] &= (vert_b_rd / 16 * 15 < best_rdcost);
         break;
     }
   }
@@ -1595,45 +1802,615 @@
   // Pruning: pruning out some ab partitions using a DNN taking rd costs of
   // sub-blocks from previous basic partition types.
   if (cpi->sf.part_sf.ml_prune_partition && ext_partition_allowed &&
-      partition_horz_allowed && partition_vert_allowed) {
+      part_state->partition_rect_allowed[HORZ] &&
+      part_state->partition_rect_allowed[VERT]) {
     // TODO(huisu@google.com): x->source_variance may not be the current
     // block's variance. The correct one to use is pb_source_variance. Need to
     // re-train the model to fix it.
-    av1_ml_prune_ab_partition(bsize, pc_tree->partitioning,
+    av1_ml_prune_ab_partition(cpi, pc_tree->partitioning,
                               get_unsigned_bits(x->source_variance),
-                              best_rdcost, horz_rd, vert_rd, split_rd,
-                              horza_partition_allowed, horzb_partition_allowed,
-                              verta_partition_allowed, vertb_partition_allowed);
+                              best_rdcost, part_state, ab_partitions_allowed);
   }
 
-  // Disable ab partitions if they are disabled by the encoder parameter.
-  *horza_partition_allowed &= part_cfg->enable_ab_partitions;
-  *horzb_partition_allowed &= part_cfg->enable_ab_partitions;
-  *verta_partition_allowed &= part_cfg->enable_ab_partitions;
-  *vertb_partition_allowed &= part_cfg->enable_ab_partitions;
-
   // Pruning: pruning AB partitions based on the number of horz/vert wins
   // in the current block and sub-blocks in PARTITION_SPLIT.
   if (cpi->sf.part_sf.prune_ext_part_using_split_info >= 2 &&
-      *horza_partition_allowed) {
-    *horza_partition_allowed &= evaluate_ab_partition_based_on_split(
+      ab_partitions_allowed[HORZ_A]) {
+    ab_partitions_allowed[HORZ_A] &= evaluate_ab_partition_based_on_split(
         pc_tree, PARTITION_HORZ, rect_part_win_info, x->qindex, 0, 1);
   }
   if (cpi->sf.part_sf.prune_ext_part_using_split_info >= 2 &&
-      *horzb_partition_allowed) {
-    *horzb_partition_allowed &= evaluate_ab_partition_based_on_split(
+      ab_partitions_allowed[HORZ_B]) {
+    ab_partitions_allowed[HORZ_B] &= evaluate_ab_partition_based_on_split(
         pc_tree, PARTITION_HORZ, rect_part_win_info, x->qindex, 2, 3);
   }
   if (cpi->sf.part_sf.prune_ext_part_using_split_info >= 2 &&
-      *verta_partition_allowed) {
-    *verta_partition_allowed &= evaluate_ab_partition_based_on_split(
+      ab_partitions_allowed[VERT_A]) {
+    ab_partitions_allowed[VERT_A] &= evaluate_ab_partition_based_on_split(
         pc_tree, PARTITION_VERT, rect_part_win_info, x->qindex, 0, 2);
   }
   if (cpi->sf.part_sf.prune_ext_part_using_split_info >= 2 &&
-      *vertb_partition_allowed) {
-    *vertb_partition_allowed &= evaluate_ab_partition_based_on_split(
+      ab_partitions_allowed[VERT_B]) {
+    ab_partitions_allowed[VERT_B] &= evaluate_ab_partition_based_on_split(
         pc_tree, PARTITION_VERT, rect_part_win_info, x->qindex, 1, 3);
   }
 }
 
+// Prepare features for the external model. Specifically, features after
+// ab partition is searched.
+static void prepare_features_after_part_ab(
+    const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize,
+    int part_ctx, int64_t best_rd,
+    int64_t rect_part_rd[NUM_RECT_PARTS][SUB_PARTITIONS_RECT],
+    int64_t split_rd[SUB_PARTITIONS_SPLIT], unsigned int pb_source_variance,
+    int mi_row, int mi_col, aom_partition_features_t *const features) {
+  int64_t *horz_rd = rect_part_rd[HORZ];
+  int64_t *vert_rd = rect_part_rd[VERT];
+
+  // Generate features.
+  int feature_index = 0;
+  features->after_part_ab.f[feature_index++] = (float)part_ctx;
+  features->after_part_ab.f[feature_index++] =
+      (float)get_unsigned_bits(pb_source_variance);
+
+  const int rdcost = (int)AOMMIN(INT_MAX, best_rd);
+  int sub_block_rdcost[8] = { 0 };
+  int rd_index = 0;
+  for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
+    if (horz_rd[i] > 0 && horz_rd[i] < 1000000000)
+      sub_block_rdcost[rd_index] = (int)horz_rd[i];
+    ++rd_index;
+  }
+  for (int i = 0; i < SUB_PARTITIONS_RECT; ++i) {
+    if (vert_rd[i] > 0 && vert_rd[i] < 1000000000)
+      sub_block_rdcost[rd_index] = (int)vert_rd[i];
+    ++rd_index;
+  }
+  for (int i = 0; i < SUB_PARTITIONS_SPLIT; ++i) {
+    if (split_rd[i] > 0 && split_rd[i] < 1000000000)
+      sub_block_rdcost[rd_index] = (int)split_rd[i];
+    ++rd_index;
+  }
+  for (int i = 0; i < 8; ++i) {
+    // Ratio between the sub-block RD and the whole-block RD.
+    float rd_ratio = 1.0f;
+    if (sub_block_rdcost[i] > 0 && sub_block_rdcost[i] < rdcost)
+      rd_ratio = (float)sub_block_rdcost[i] / (float)rdcost;
+    features->after_part_ab.f[feature_index++] = rd_ratio;
+  }
+
+  // Get variance of the 1:4 and 4:1 sub-blocks.
+  unsigned int horz_4_source_var[SUB_PARTITIONS_PART4] = { 0 };
+  unsigned int vert_4_source_var[SUB_PARTITIONS_PART4] = { 0 };
+  {
+    BLOCK_SIZE horz_4_bs = get_partition_subsize(bsize, PARTITION_HORZ_4);
+    BLOCK_SIZE vert_4_bs = get_partition_subsize(bsize, PARTITION_VERT_4);
+    av1_setup_src_planes(x, cpi->source, mi_row, mi_col,
+                         av1_num_planes(&cpi->common), bsize);
+    const int src_stride = x->plane[0].src.stride;
+    uint8_t *src = x->plane[0].src.buf;
+    const MACROBLOCKD *const xd = &x->e_mbd;
+
+    struct buf_2d horz_4_src, vert_4_src;
+    horz_4_src.stride = src_stride;
+    vert_4_src.stride = src_stride;
+
+    for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) {
+      horz_4_src.buf = src + i * block_size_high[horz_4_bs] * src_stride;
+      vert_4_src.buf = src + i * block_size_wide[vert_4_bs];
+
+      if (is_cur_buf_hbd(xd)) {
+        horz_4_source_var[i] = av1_high_get_sby_perpixel_variance(
+            cpi, &horz_4_src, horz_4_bs, xd->bd);
+        vert_4_source_var[i] = av1_high_get_sby_perpixel_variance(
+            cpi, &vert_4_src, vert_4_bs, xd->bd);
+      } else {
+        horz_4_source_var[i] =
+            av1_get_sby_perpixel_variance(cpi, &horz_4_src, horz_4_bs);
+        vert_4_source_var[i] =
+            av1_get_sby_perpixel_variance(cpi, &vert_4_src, vert_4_bs);
+      }
+    }
+  }
+
+  const float denom = (float)(pb_source_variance + 1);
+  const float low_b = 0.1f;
+  const float high_b = 10.0f;
+  for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) {
+    // Ratio between the 4:1 sub-block variance and the whole-block variance.
+    float var_ratio = (float)(horz_4_source_var[i] + 1) / denom;
+    if (var_ratio < low_b) var_ratio = low_b;
+    if (var_ratio > high_b) var_ratio = high_b;
+    features->after_part_ab.f[feature_index++] = var_ratio;
+  }
+  for (int i = 0; i < SUB_PARTITIONS_PART4; ++i) {
+    // Ratio between the 1:4 sub-block variance and the whole-block variance.
+    float var_ratio = (float)(vert_4_source_var[i] + 1) / denom;
+    if (var_ratio < low_b) var_ratio = low_b;
+    if (var_ratio > high_b) var_ratio = high_b;
+    features->after_part_ab.f[feature_index++] = var_ratio;
+  }
+  assert(feature_index == 18);
+}
+
+// If the external partition model is used, we let it determine partition
+// decisions before partition none. Specifically, these parameters:
+// partition_none_allowed
+// partition_horz_allowed
+// partition_vert_allowed
+// do_rectangular_split
+// do_square_split
+static bool ext_ml_model_decision_before_none(
+    AV1_COMP *cpi, const float features_from_motion[FEATURE_SIZE_SMS_SPLIT],
+    int *partition_none_allowed, int *partition_horz_allowed,
+    int *partition_vert_allowed, int *do_rectangular_split,
+    int *do_square_split) {
+  ExtPartController *const ext_part_controller = &cpi->ext_part_controller;
+  if (!ext_part_controller->ready) return false;
+
+  // Setup features.
+  aom_partition_features_t features;
+  features.id = AOM_EXT_PART_FEATURE_BEFORE_NONE;
+  for (int i = 0; i < FEATURE_SIZE_SMS_SPLIT; ++i) {
+    features.before_part_none.f[i] = features_from_motion[i];
+  }
+
+  // Send necessary features to the external model.
+  av1_ext_part_send_features(ext_part_controller, &features);
+
+  // Get partition decisions from the external model.
+  aom_partition_decision_t decision;
+  const bool valid_decision =
+      av1_ext_part_get_partition_decision(ext_part_controller, &decision);
+  if (!valid_decision) return false;
+
+  // Populate decisions
+  *partition_none_allowed = decision.partition_none_allowed;
+  *partition_horz_allowed = decision.partition_rect_allowed[HORZ];
+  *partition_vert_allowed = decision.partition_rect_allowed[VERT];
+  *do_rectangular_split = decision.do_rectangular_split;
+  *do_square_split = decision.do_square_split;
+
+  return true;
+}
+
+// If the external partition model is used, we let it determine partition
+// decisions before partition none. Specifically, these parameters:
+// prune_horz
+// prune_vert
+static bool ext_ml_model_decision_before_none_part2(
+    AV1_COMP *cpi,
+    const float features_from_motion[FEATURE_SIZE_SMS_PRUNE_PART],
+    int *prune_horz, int *prune_vert) {
+  ExtPartController *const ext_part_controller = &cpi->ext_part_controller;
+  if (!ext_part_controller->ready) return false;
+
+  // Setup features.
+  aom_partition_features_t features;
+  features.id = AOM_EXT_PART_FEATURE_BEFORE_NONE_PART2;
+  for (int i = 0; i < FEATURE_SIZE_SMS_PRUNE_PART; ++i) {
+    features.before_part_none.f_part2[i] = features_from_motion[i];
+  }
+
+  // Send necessary features to the external model.
+  av1_ext_part_send_features(ext_part_controller, &features);
+
+  // Get partition decisions from the external model.
+  aom_partition_decision_t decision;
+  const bool valid_decision =
+      av1_ext_part_get_partition_decision(ext_part_controller, &decision);
+  if (!valid_decision) return false;
+
+  // Populate decisions
+  *prune_horz = decision.prune_rect_part[HORZ];
+  *prune_vert = decision.prune_rect_part[VERT];
+
+  return true;
+}
+
+// If the external partition model is used, we let it determine partition
+// decisions after none partition. Specifically, these parameters:
+// do_square_split
+// do_rectangular_split
+bool ext_ml_model_decision_after_none(
+    ExtPartController *const ext_part_controller, const int is_intra_frame,
+    const float *const features_after_none, int *do_square_split,
+    int *do_rectangular_split) {
+  if (!ext_part_controller->ready || is_intra_frame) return false;
+
+  // Setup features.
+  aom_partition_features_t features;
+  features.id = AOM_EXT_PART_FEATURE_AFTER_NONE;
+  for (int i = 0; i < 4; ++i) {
+    features.after_part_none.f[i] = features_after_none[i];
+  }
+
+  // Send necessary features to the external model.
+  av1_ext_part_send_features(ext_part_controller, &features);
+
+  // Get partition decisions from the external model.
+  aom_partition_decision_t decision;
+  const bool valid_decision =
+      av1_ext_part_get_partition_decision(ext_part_controller, &decision);
+  if (!valid_decision) return false;
+
+  // Populate decisions
+  *do_square_split = decision.do_square_split;
+  *do_rectangular_split = decision.do_rectangular_split;
+
+  return true;
+}
+
+// If the external partition model is used, we let it determine partition
+// decisions after none partition. Specifically, these parameters:
+// terminate_partition_search
+bool ext_ml_model_decision_after_none_part2(
+    AV1_COMP *const cpi, const float *const features_terminate,
+    int *terminate_partition_search) {
+  AV1_COMMON *const cm = &cpi->common;
+  ExtPartController *const ext_part_controller = &cpi->ext_part_controller;
+  if (!ext_part_controller->ready || frame_is_intra_only(cm)) return false;
+
+  // Setup features.
+  aom_partition_features_t features;
+  features.id = AOM_EXT_PART_FEATURE_AFTER_NONE_PART2;
+  for (int i = 0; i < FEATURE_SIZE_SMS_TERM_NONE; ++i) {
+    features.after_part_none.f_terminate[i] = features_terminate[i];
+  }
+
+  // Send necessary features to the external model.
+  av1_ext_part_send_features(ext_part_controller, &features);
+
+  // Get partition decisions from the external model.
+  aom_partition_decision_t decision;
+  const bool valid_decision =
+      av1_ext_part_get_partition_decision(ext_part_controller, &decision);
+  if (!valid_decision) return false;
+
+  // Populate decisions
+  *terminate_partition_search = decision.terminate_partition_search;
+
+  return true;
+}
+
+// If the external partition model is used, we let it determine partition
+// decisions after split partition. Specifically, these parameters:
+// terminate_partition_search
+bool ext_ml_model_decision_after_split(AV1_COMP *const cpi,
+                                       const float *const features_terminate,
+                                       int *terminate_partition_search) {
+  const AV1_COMMON *const cm = &cpi->common;
+  ExtPartController *const ext_part_controller = &cpi->ext_part_controller;
+  if (frame_is_intra_only(cm) || !cpi->ext_part_controller.ready) {
+    return false;
+  }
+
+  // Setup features.
+  aom_partition_features_t features;
+  features.id = AOM_EXT_PART_FEATURE_AFTER_SPLIT;
+  for (int i = 0; i < 31; ++i) {
+    features.after_part_split.f_terminate[i] = features_terminate[i];
+  }
+
+  // Send necessary features to the external model.
+  av1_ext_part_send_features(ext_part_controller, &features);
+
+  // Get partition decisions from the external model.
+  aom_partition_decision_t decision;
+  const bool valid_decision =
+      av1_ext_part_get_partition_decision(ext_part_controller, &decision);
+  if (!valid_decision) return false;
+
+  // Populate decisions
+  *terminate_partition_search = decision.terminate_partition_search;
+
+  return true;
+}
+
+// If the external partition model is used, we let it determine partition
+// decisions after split partition. Specifically, these parameters:
+// prune_rect_part[HORZ]
+// prune_rect_part[VERT]
+bool ext_ml_model_decision_after_split_part2(
+    ExtPartController *const ext_part_controller, const int is_intra_frame,
+    const float *const features_prune, int *prune_rect_part_horz,
+    int *prune_rect_part_vert) {
+  if (is_intra_frame || !ext_part_controller->ready) {
+    return false;
+  }
+
+  // Setup features.
+  aom_partition_features_t features;
+  features.id = AOM_EXT_PART_FEATURE_AFTER_SPLIT_PART2;
+  for (int i = 0; i < 9; ++i) {
+    features.after_part_split.f_prune_rect[i] = features_prune[i];
+  }
+
+  // Send necessary features to the external model.
+  av1_ext_part_send_features(ext_part_controller, &features);
+
+  // Get partition decisions from the external model.
+  aom_partition_decision_t decision;
+  const bool valid_decision =
+      av1_ext_part_get_partition_decision(ext_part_controller, &decision);
+  if (!valid_decision) return false;
+
+  // Populate decisions
+  *prune_rect_part_horz = decision.prune_rect_part[0];
+  *prune_rect_part_vert = decision.prune_rect_part[1];
+
+  return true;
+}
+
+// If the external partition model is used, we let it determine partition
+// decisions after rectangular partition. Specifically, these parameters:
+// horza_partition_allowed
+// horzb_partition_allowed
+// verta_partition_allowed
+// vertb_partition_allowed
+static bool ext_ml_model_decision_after_rect(
+    ExtPartController *const ext_part_controller, const int is_intra_frame,
+    const float *const features_after_rect, int *horza_partition_allowed,
+    int *horzb_partition_allowed, int *verta_partition_allowed,
+    int *vertb_partition_allowed) {
+  if (is_intra_frame || !ext_part_controller->ready) return false;
+
+  // Setup features.
+  aom_partition_features_t features;
+  features.id = AOM_EXT_PART_FEATURE_AFTER_RECT;
+  for (int i = 0; i < 10; ++i) {
+    features.after_part_rect.f[i] = features_after_rect[i];
+  }
+
+  // Send necessary features to the external model.
+  av1_ext_part_send_features(ext_part_controller, &features);
+
+  // Get partition decisions from the external model.
+  aom_partition_decision_t decision;
+  const bool valid_decision =
+      av1_ext_part_get_partition_decision(ext_part_controller, &decision);
+  if (!valid_decision) return false;
+
+  // Populate decisions
+  *horza_partition_allowed = decision.horza_partition_allowed;
+  *horzb_partition_allowed = decision.horzb_partition_allowed;
+  *verta_partition_allowed = decision.verta_partition_allowed;
+  *vertb_partition_allowed = decision.vertb_partition_allowed;
+
+  return true;
+}
+
+// If the external partition model is used, we let it determine partition
+// decisions after AB partition. Specifically, these parameters:
+// partition_vert4_allowed
+// partition_horz4_allowed
+static bool ext_ml_model_decision_after_part_ab(
+    AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, int part_ctx,
+    int64_t best_rd, int64_t rect_part_rd[NUM_RECT_PARTS][SUB_PARTITIONS_RECT],
+    int64_t split_rd[SUB_PARTITIONS_SPLIT], int *const partition_horz4_allowed,
+    int *const partition_vert4_allowed, unsigned int pb_source_variance,
+    int mi_row, int mi_col) {
+  const AV1_COMMON *const cm = &cpi->common;
+  ExtPartController *const ext_part_controller = &cpi->ext_part_controller;
+
+  if (!frame_is_intra_only(cm) && ext_part_controller->ready) {
+    // Setup features.
+    aom_partition_features_t features;
+    features.id = AOM_EXT_PART_FEATURE_AFTER_AB;
+    prepare_features_after_part_ab(cpi, x, bsize, part_ctx, best_rd,
+                                   rect_part_rd, split_rd, pb_source_variance,
+                                   mi_row, mi_col, &features);
+
+    // Send necessary features to the external model.
+    av1_ext_part_send_features(ext_part_controller, &features);
+
+    // Get partition decisions from the external model.
+    aom_partition_decision_t decision;
+    const bool valid_decision =
+        av1_ext_part_get_partition_decision(ext_part_controller, &decision);
+    if (!valid_decision) return false;
+
+    // Populate decisions
+    *partition_horz4_allowed = decision.partition_horz4_allowed;
+    *partition_vert4_allowed = decision.partition_vert4_allowed;
+
+    return true;
+  }
+
+  return false;
+}
+
+// This function resembles "av1_setup_sms_tree()" in context_tree.c
+// with function signature change.
+static SIMPLE_MOTION_DATA_TREE *setup_sms_tree(
+    AV1_COMP *const cpi, SIMPLE_MOTION_DATA_TREE *sms_tree) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int stat_generation_stage = is_stat_generation_stage(cpi);
+  const int is_sb_size_128 = cm->seq_params->sb_size == BLOCK_128X128;
+  const int tree_nodes =
+      av1_get_pc_tree_nodes(is_sb_size_128, stat_generation_stage);
+  int sms_tree_index = 0;
+  SIMPLE_MOTION_DATA_TREE *this_sms;
+  int square_index = 1;
+  int nodes;
+
+  aom_free(sms_tree);
+  CHECK_MEM_ERROR(cm, sms_tree, aom_calloc(tree_nodes, sizeof(*sms_tree)));
+  this_sms = &sms_tree[0];
+
+  if (!stat_generation_stage) {
+    const int leaf_factor = is_sb_size_128 ? 4 : 1;
+    const int leaf_nodes = 256 * leaf_factor;
+
+    // Sets up all the leaf nodes in the tree.
+    for (sms_tree_index = 0; sms_tree_index < leaf_nodes; ++sms_tree_index) {
+      SIMPLE_MOTION_DATA_TREE *const tree = &sms_tree[sms_tree_index];
+      tree->block_size = square[0];
+    }
+
+    // Each node has 4 leaf nodes, fill each block_size level of the tree
+    // from leafs to the root.
+    for (nodes = leaf_nodes >> 2; nodes > 0; nodes >>= 2) {
+      for (int i = 0; i < nodes; ++i) {
+        SIMPLE_MOTION_DATA_TREE *const tree = &sms_tree[sms_tree_index];
+        tree->block_size = square[square_index];
+        for (int j = 0; j < 4; j++) tree->split[j] = this_sms++;
+        ++sms_tree_index;
+      }
+      ++square_index;
+    }
+  } else {
+    // Allocation for firstpass/LAP stage
+    // TODO(Mufaddal): refactor square_index to use a common block_size macro
+    // from firstpass.c
+    SIMPLE_MOTION_DATA_TREE *const tree = &sms_tree[sms_tree_index];
+    square_index = 2;
+    tree->block_size = square[square_index];
+  }
+
+  // Set up the root node for the largest superblock size
+  return &sms_tree[tree_nodes - 1];
+}
+
+static void write_motion_feature_to_file(
+    const char *const path, const int sb_counter, const unsigned int *block_sse,
+    const unsigned int *block_var, const int num_blocks, const BLOCK_SIZE bsize,
+    const BLOCK_SIZE fixed_block_size, const int mi_row, const int mi_col) {
+  char filename[256];
+  snprintf(filename, sizeof(filename), "%s/motion_search_feature_sb%d", path,
+           sb_counter);
+  FILE *pfile = fopen(filename, "w");
+  fprintf(pfile, "%d,%d,%d,%d,%d\n", mi_row, mi_col, bsize,
+          block_size_wide[fixed_block_size], num_blocks);
+  for (int i = 0; i < num_blocks; ++i) {
+    fprintf(pfile, "%d", block_sse[i]);
+    if (i < num_blocks - 1) fprintf(pfile, ",");
+  }
+  fprintf(pfile, "\n");
+  for (int i = 0; i < num_blocks; ++i) {
+    fprintf(pfile, "%d", block_var[i]);
+    if (i < num_blocks - 1) fprintf(pfile, ",");
+  }
+  fprintf(pfile, "\n");
+  fclose(pfile);
+}
+
+void av1_collect_motion_search_features_sb(AV1_COMP *const cpi, ThreadData *td,
+                                           TileDataEnc *tile_data,
+                                           const int mi_row, const int mi_col,
+                                           const BLOCK_SIZE bsize,
+                                           aom_partition_features_t *features) {
+  const AV1_COMMON *const cm = &cpi->common;
+  if (frame_is_intra_only(cm)) return;
+
+  MACROBLOCK *const x = &td->mb;
+  const BLOCK_SIZE fixed_block_size = BLOCK_16X16;
+  const int col_step = mi_size_wide[fixed_block_size];
+  const int row_step = mi_size_high[fixed_block_size];
+  SIMPLE_MOTION_DATA_TREE *sms_tree = NULL;
+  SIMPLE_MOTION_DATA_TREE *sms_root = setup_sms_tree(cpi, sms_tree);
+  TileInfo *const tile_info = &tile_data->tile_info;
+  av1_set_offsets_without_segment_id(cpi, tile_info, x, mi_row, mi_col, bsize);
+  av1_init_simple_motion_search_mvs_for_sb(cpi, NULL, x, sms_root, mi_row,
+                                           mi_col);
+  av1_reset_simple_motion_tree_partition(sms_root, bsize);
+  const int ref_list[] = { cpi->rc.is_src_frame_alt_ref ? ALTREF_FRAME
+                                                        : LAST_FRAME };
+  const int mi_width =
+      AOMMIN(mi_size_wide[bsize], cm->mi_params.mi_cols - mi_col);
+  const int mi_height =
+      AOMMIN(mi_size_high[bsize], cm->mi_params.mi_rows - mi_row);
+  const int col_steps = (mi_width / col_step) + ((mi_width % col_step) > 0);
+  const int row_steps = (mi_height / row_step) + ((mi_height % row_step) > 0);
+  const int num_blocks = col_steps * row_steps;
+  unsigned int *block_sse = aom_calloc(num_blocks, sizeof(*block_sse));
+  unsigned int *block_var = aom_calloc(num_blocks, sizeof(*block_var));
+  int idx = 0;
+
+  for (int row = mi_row;
+       row < AOMMIN(mi_row + mi_size_high[bsize], cm->mi_params.mi_rows);
+       row += row_step) {
+    for (int col = mi_col;
+         col < AOMMIN(mi_col + mi_size_wide[bsize], cm->mi_params.mi_cols);
+         col += col_step) {
+      simple_motion_search_get_best_ref(
+          cpi, x, sms_root, row, col, fixed_block_size, ref_list,
+          /*num_refs=*/1, /*use_subpixel=*/1,
+          /*save_mv=*/1, &block_sse[idx], &block_var[idx]);
+      ++idx;
+    }
+  }
+  if (features == NULL) {
+    write_motion_feature_to_file(cpi->oxcf.partition_info_path, cpi->sb_counter,
+                                 block_sse, block_var, idx, bsize,
+                                 fixed_block_size, mi_row, mi_col);
+  } else {
+    features->sb_features.motion_features.unit_length =
+        block_size_wide[fixed_block_size];
+    features->sb_features.motion_features.num_units = idx;
+    for (int i = 0; i < idx; ++i) {
+      features->sb_features.motion_features.block_sse[i] = block_sse[i];
+      features->sb_features.motion_features.block_var[i] = block_var[i];
+    }
+  }
+
+  aom_free(block_sse);
+  aom_free(block_var);
+  aom_free(sms_tree);
+  if (sms_tree != NULL) {
+    aom_free(sms_tree);
+    sms_tree = NULL;
+  }
+}
+
 #endif  // !CONFIG_REALTIME_ONLY
+
+static INLINE void init_simple_motion_search_mvs(
+    SIMPLE_MOTION_DATA_TREE *sms_tree, const FULLPEL_MV *start_mvs) {
+  memcpy(sms_tree->start_mvs, start_mvs, sizeof(sms_tree->start_mvs));
+  av1_zero(sms_tree->sms_none_feat);
+  av1_zero(sms_tree->sms_rect_feat);
+  av1_zero(sms_tree->sms_none_valid);
+  av1_zero(sms_tree->sms_rect_valid);
+
+  if (sms_tree->block_size >= BLOCK_8X8) {
+    init_simple_motion_search_mvs(sms_tree->split[0], start_mvs);
+    init_simple_motion_search_mvs(sms_tree->split[1], start_mvs);
+    init_simple_motion_search_mvs(sms_tree->split[2], start_mvs);
+    init_simple_motion_search_mvs(sms_tree->split[3], start_mvs);
+  }
+}
+
+void av1_init_simple_motion_search_mvs_for_sb(const AV1_COMP *cpi,
+                                              const TileInfo *tile_info,
+                                              MACROBLOCK *x,
+                                              SIMPLE_MOTION_DATA_TREE *sms_root,
+                                              int mi_row, int mi_col) {
+  // Use the NEARESTMV of the sb as the start mv
+  const AV1_COMMON *cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  FULLPEL_MV ref_mvs[REF_FRAMES];
+  const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+  av1_zero(ref_mvs);
+  // If tile_info is NULL, assume that the offsets have already been set.
+  if (tile_info) {
+    av1_set_offsets_without_segment_id(cpi, tile_info, x, mi_row, mi_col,
+                                       sb_size);
+  }
+
+  MB_MODE_INFO_EXT mbmi_ext;
+  const int ref_frame =
+      cpi->rc.is_src_frame_alt_ref ? ALTREF_FRAME : LAST_FRAME;
+  av1_find_mv_refs(cm, xd, xd->mi[0], ref_frame, mbmi_ext.ref_mv_count,
+                   xd->ref_mv_stack, xd->weight, NULL, mbmi_ext.global_mvs,
+                   mbmi_ext.mode_context);
+  if (mbmi_ext.ref_mv_count[ref_frame] > 0) {
+    ref_mvs[ref_frame] =
+        get_fullmv_from_mv(&xd->ref_mv_stack[ref_frame][0].this_mv.as_mv);
+  } else {
+    ref_mvs[ref_frame] =
+        get_fullmv_from_mv(&mbmi_ext.global_mvs[ref_frame].as_mv);
+  }
+
+  init_simple_motion_search_mvs(sms_root, ref_mvs);
+}
diff --git a/av1/encoder/partition_strategy.h b/av1/encoder/partition_strategy.h
index ee890bb..f7daf37 100644
--- a/av1/encoder/partition_strategy.h
+++ b/av1/encoder/partition_strategy.h
@@ -13,82 +13,28 @@
 #define AOM_AV1_ENCODER_PARTITION_STRATEGY_H_
 
 #include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encodeframe_utils.h"
 #include "av1/encoder/encodemb.h"
 #include "av1/encoder/encoder.h"
 
-#define FEATURE_SIZE_SMS_SPLIT_FAST 6
-#define FEATURE_SIZE_SMS_SPLIT 17
-#define FEATURE_SIZE_SMS_PRUNE_PART 25
-#define FEATURE_SIZE_SMS_TERM_NONE 28
-#define FEATURE_SIZE_FP_SMS_TERM_NONE 20
-#define FEATURE_SIZE_MAX_MIN_PART_PRED 13
-#define MAX_NUM_CLASSES_MAX_MIN_PART_PRED 4
-
-#define FEATURE_SMS_NONE_FLAG 1
-#define FEATURE_SMS_SPLIT_FLAG (1 << 1)
-#define FEATURE_SMS_RECT_FLAG (1 << 2)
-
-#define FEATURE_SMS_PRUNE_PART_FLAG \
-  (FEATURE_SMS_NONE_FLAG | FEATURE_SMS_SPLIT_FLAG | FEATURE_SMS_RECT_FLAG)
-#define FEATURE_SMS_SPLIT_MODEL_FLAG \
-  (FEATURE_SMS_NONE_FLAG | FEATURE_SMS_SPLIT_FLAG)
-
-// Number of sub-partitions in rectangular partition types.
-#define SUB_PARTITIONS_RECT 2
-
-// Number of sub-partitions in split partition type.
-#define SUB_PARTITIONS_SPLIT 4
-
-// Number of sub-partitions in AB partition types.
-#define SUB_PARTITIONS_AB 3
-
-// Number of sub-partitions in 4-way partition types.
-#define SUB_PARTITIONS_PART4 4
-
-// 4part parition types.
-enum { HORZ4 = 0, VERT4, NUM_PART4_TYPES } UENUM1BYTE(PART4_TYPES);
-
-// AB parition types.
-enum {
-  HORZ_A = 0,
-  HORZ_B,
-  VERT_A,
-  VERT_B,
-  NUM_AB_PARTS
-} UENUM1BYTE(AB_PART_TYPE);
-
-// Rectangular parition types.
-enum { HORZ = 0, VERT, NUM_RECT_PARTS } UENUM1BYTE(RECT_PART_TYPE);
-
-// Structure to keep win flags for HORZ and VERT partition evaluations.
-typedef struct {
-  int rect_part_win[NUM_RECT_PARTS];
-} RD_RECT_PART_WIN_INFO;
-
 void av1_intra_mode_cnn_partition(const AV1_COMMON *const cm, MACROBLOCK *x,
-                                  int bsize, int label_idx,
-                                  int *partition_none_allowed,
-                                  int *partition_horz_allowed,
-                                  int *partition_vert_allowed,
-                                  int *do_rectangular_split,
-                                  int *do_square_split);
+                                  int label_idx,
+                                  int intra_cnn_based_part_prune_level,
+                                  PartitionSearchState *part_state);
 
 // Performs a simple_motion_search with a single reference frame and extract
 // the variance of residues. Then use the features to determine whether we want
 // to go straight to splitting without trying PARTITION_NONE
-void av1_simple_motion_search_based_split(
-    AV1_COMP *const cpi, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_tree,
-    int mi_row, int mi_col, BLOCK_SIZE bsize, int *partition_none_allowed,
-    int *partition_horz_allowed, int *partition_vert_allowed,
-    int *do_rectangular_split, int *do_square_split);
+void av1_simple_motion_search_based_split(AV1_COMP *const cpi, MACROBLOCK *x,
+                                          SIMPLE_MOTION_DATA_TREE *sms_tree,
+                                          PartitionSearchState *part_state);
 
 // Performs a simple_motion_search with two reference frames and extract
 // the variance of residues. Then use the features to determine whether we want
 // to prune some partitions.
-void av1_simple_motion_search_prune_rect(
-    AV1_COMP *const cpi, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_tree,
-    int mi_row, int mi_col, BLOCK_SIZE bsize, int partition_horz_allowed,
-    int partition_vert_allowed, int *prune_horz, int *prune_vert);
+void av1_simple_motion_search_prune_rect(AV1_COMP *const cpi, MACROBLOCK *x,
+                                         SIMPLE_MOTION_DATA_TREE *sms_tree,
+                                         PartitionSearchState *part_state);
 
 #if !CONFIG_REALTIME_ONLY
 // Early terminates PARTITION_NONE using simple_motion_search features and the
@@ -97,10 +43,11 @@
 //  - The frame is not intra only
 //  - The current bsize is > BLOCK_8X8
 //  - blk_row + blk_height/2 < total_rows and blk_col + blk_width/2 < total_cols
-void av1_simple_motion_search_early_term_none(
-    AV1_COMP *const cpi, MACROBLOCK *x, SIMPLE_MOTION_DATA_TREE *sms_tree,
-    int mi_row, int mi_col, BLOCK_SIZE bsize, const RD_STATS *none_rdc,
-    int *early_terminate);
+void av1_simple_motion_search_early_term_none(AV1_COMP *const cpi,
+                                              MACROBLOCK *x,
+                                              SIMPLE_MOTION_DATA_TREE *sms_tree,
+                                              const RD_STATS *none_rdc,
+                                              PartitionSearchState *part_state);
 
 // Get the features for selecting the max and min partition size. Currently this
 // performs simple_motion_search on 16X16 subblocks of the current superblock,
@@ -117,11 +64,10 @@
 // Attempts an early termination after PARTITION_SPLIT.
 void av1_ml_early_term_after_split(AV1_COMP *const cpi, MACROBLOCK *const x,
                                    SIMPLE_MOTION_DATA_TREE *const sms_tree,
-                                   BLOCK_SIZE bsize, int64_t best_rd,
-                                   int64_t part_none_rd, int64_t part_split_rd,
-                                   int64_t *split_block_rd, int mi_row,
-                                   int mi_col,
-                                   int *const terminate_partition_search);
+                                   int64_t best_rd, int64_t part_none_rd,
+                                   int64_t part_split_rd,
+                                   int64_t *split_block_rd,
+                                   PartitionSearchState *part_state);
 
 // Use the rdcost ratio and source var ratio to prune PARTITION_HORZ and
 // PARTITION_VERT.
@@ -129,45 +75,38 @@
 // no information about rectangular partitions. Preliminary experiments suggest
 // that we can get better performance by adding in q_index and rectangular
 // sse/var from SMS. We should retrain and tune this model later.
-void av1_ml_prune_rect_partition(const AV1_COMP *const cpi,
-                                 const MACROBLOCK *const x, BLOCK_SIZE bsize,
+void av1_ml_prune_rect_partition(AV1_COMP *const cpi, const MACROBLOCK *const x,
                                  int64_t best_rd, int64_t none_rd,
-                                 int64_t *split_rd, int *const dst_prune_horz,
-                                 int *const dst_prune_vert);
+                                 const int64_t *split_rd,
+                                 PartitionSearchState *part_state);
 
 // Use a ML model to predict if horz_a, horz_b, vert_a, and vert_b should be
 // considered.
-void av1_ml_prune_ab_partition(
-    BLOCK_SIZE bsize, int part_ctx, int var_ctx, int64_t best_rd,
-    int64_t horz_rd[SUB_PARTITIONS_RECT], int64_t vert_rd[SUB_PARTITIONS_RECT],
-    int64_t split_rd[SUB_PARTITIONS_SPLIT], int *const horza_partition_allowed,
-    int *const horzb_partition_allowed, int *const verta_partition_allowed,
-    int *const vertb_partition_allowed);
+void av1_ml_prune_ab_partition(AV1_COMP *const cpi, int part_ctx, int var_ctx,
+                               int64_t best_rd,
+                               PartitionSearchState *part_state,
+                               int *ab_partitions_allowed);
 
 // Use a ML model to predict if horz4 and vert4 should be considered.
-void av1_ml_prune_4_partition(
-    const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize,
-    int part_ctx, int64_t best_rd,
-    int64_t rect_part_rd[NUM_RECT_PARTS][SUB_PARTITIONS_RECT],
-    int64_t split_rd[SUB_PARTITIONS_SPLIT], int *const partition_horz4_allowed,
-    int *const partition_vert4_allowed, unsigned int pb_source_variance,
-    int mi_row, int mi_col);
+void av1_ml_prune_4_partition(AV1_COMP *const cpi, MACROBLOCK *const x,
+                              int part_ctx, int64_t best_rd,
+                              PartitionSearchState *part_state,
+                              int *part4_allowed,
+                              unsigned int pb_source_variance);
 
 // ML-based partition search breakout after PARTITION_NONE.
-int av1_ml_predict_breakout(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
-                            const MACROBLOCK *const x,
-                            const RD_STATS *const rd_stats,
-                            unsigned int pb_source_variance, int bit_depth);
+void av1_ml_predict_breakout(AV1_COMP *const cpi, const MACROBLOCK *const x,
+                             const RD_STATS *const rd_stats,
+                             unsigned int pb_source_variance, int bit_depth,
+                             PartitionSearchState *part_state);
 
 // The first round of partition pruning determined before any partition
 // has been tested. The decisions will be updated and passed back
 // to the partition search function.
-void av1_prune_partitions_before_search(
-    AV1_COMP *const cpi, MACROBLOCK *const x, int mi_row, int mi_col,
-    BLOCK_SIZE bsize, SIMPLE_MOTION_DATA_TREE *const sms_tree,
-    int *partition_none_allowed, int *partition_horz_allowed,
-    int *partition_vert_allowed, int *do_rectangular_split,
-    int *do_square_split, int *prune_horz, int *prune_vert);
+void av1_prune_partitions_before_search(AV1_COMP *const cpi,
+                                        MACROBLOCK *const x,
+                                        SIMPLE_MOTION_DATA_TREE *const sms_tree,
+                                        PartitionSearchState *part_state);
 
 // Prune out partitions that lead to coding block sizes outside the min and max
 // bsizes set by the encoder. Max and min square partition levels are defined as
@@ -175,22 +114,24 @@
 // reach. To implement this: only PARTITION_NONE is allowed if the current node
 // equals max_partition_size, only PARTITION_SPLIT is allowed if the current
 // node exceeds max_partition_size.
-void av1_prune_partitions_by_max_min_bsize(
-    SuperBlockEnc *sb_enc, BLOCK_SIZE bsize, int is_not_edge_block,
-    int *partition_none_allowed, int *partition_horz_allowed,
-    int *partition_vert_allowed, int *do_square_split);
+void av1_prune_partitions_by_max_min_bsize(SuperBlockEnc *sb_enc,
+                                           PartitionSearchState *part_state);
 
 // Prune out AB partitions based on rd decisions made from testing the
 // basic partitions.
-void av1_prune_ab_partitions(
-    const AV1_COMP *cpi, const MACROBLOCK *x, const PC_TREE *pc_tree,
-    BLOCK_SIZE bsize, int pb_source_variance, int64_t best_rdcost,
-    int64_t rect_part_rd[NUM_RECT_PARTS][SUB_PARTITIONS_RECT],
-    int64_t split_rd[SUB_PARTITIONS_SPLIT],
-    const RD_RECT_PART_WIN_INFO *rect_part_win_info, int ext_partition_allowed,
-    int partition_horz_allowed, int partition_vert_allowed,
-    int *horza_partition_allowed, int *horzb_partition_allowed,
-    int *verta_partition_allowed, int *vertb_partition_allowed);
+void av1_prune_ab_partitions(AV1_COMP *cpi, const MACROBLOCK *x,
+                             const PC_TREE *pc_tree, int pb_source_variance,
+                             int64_t best_rdcost,
+                             const RD_RECT_PART_WIN_INFO *rect_part_win_info,
+                             bool ext_partition_allowed,
+                             PartitionSearchState *part_state,
+                             int *ab_partitions_allowed);
+
+void av1_collect_motion_search_features_sb(AV1_COMP *const cpi, ThreadData *td,
+                                           TileDataEnc *tile_data,
+                                           const int mi_row, const int mi_col,
+                                           const BLOCK_SIZE bsize,
+                                           aom_partition_features_t *features);
 #endif  // !CONFIG_REALTIME_ONLY
 
 // A simplified version of set_offsets meant to be used for
@@ -236,21 +177,11 @@
   av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize);
 }
 
-static INLINE void init_simple_motion_search_mvs(
-    SIMPLE_MOTION_DATA_TREE *sms_tree) {
-  av1_zero(sms_tree->start_mvs);
-  av1_zero(sms_tree->sms_none_feat);
-  av1_zero(sms_tree->sms_rect_feat);
-  av1_zero(sms_tree->sms_none_valid);
-  av1_zero(sms_tree->sms_rect_valid);
-
-  if (sms_tree->block_size >= BLOCK_8X8) {
-    init_simple_motion_search_mvs(sms_tree->split[0]);
-    init_simple_motion_search_mvs(sms_tree->split[1]);
-    init_simple_motion_search_mvs(sms_tree->split[2]);
-    init_simple_motion_search_mvs(sms_tree->split[3]);
-  }
-}
+void av1_init_simple_motion_search_mvs_for_sb(const AV1_COMP *cpi,
+                                              const TileInfo *tile_info,
+                                              MACROBLOCK *x,
+                                              SIMPLE_MOTION_DATA_TREE *sms_root,
+                                              int mi_row, int mi_col);
 
 static INLINE int is_full_sb(const CommonModeInfoParams *const mi_params,
                              int mi_row, int mi_col, BLOCK_SIZE sb_size) {
@@ -261,22 +192,66 @@
          (mi_col + sb_mi_wide) <= mi_params->mi_cols;
 }
 
+#if !CONFIG_REALTIME_ONLY
 // Do not use this criteria for screen content videos.
 // Since screen content videos could often find good predictors and the largest
 // block size is likely to be used.
 static INLINE int use_auto_max_partition(const AV1_COMP *const cpi,
                                          BLOCK_SIZE sb_size, int mi_row,
                                          int mi_col) {
-  assert(IMPLIES(cpi->gf_group.size > 0,
-                 cpi->gf_frame_index < cpi->gf_group.size));
+  assert(IMPLIES(cpi->ppi->gf_group.size > 0,
+                 cpi->gf_frame_index < cpi->ppi->gf_group.size));
   const AV1_COMMON *const cm = &cpi->common;
   return !frame_is_intra_only(cm) && !cpi->use_screen_content_tools &&
          cpi->sf.part_sf.auto_max_partition_based_on_simple_motion !=
              NOT_IN_USE &&
          sb_size == BLOCK_128X128 &&
          is_full_sb(&cm->mi_params, mi_row, mi_col, sb_size) &&
-         cpi->gf_group.update_type[cpi->gf_frame_index] != OVERLAY_UPDATE &&
-         cpi->gf_group.update_type[cpi->gf_frame_index] != INTNL_OVERLAY_UPDATE;
+         cpi->ppi->gf_group.update_type[cpi->gf_frame_index] !=
+             OVERLAY_UPDATE &&
+         cpi->ppi->gf_group.update_type[cpi->gf_frame_index] !=
+             INTNL_OVERLAY_UPDATE;
 }
 
+static BLOCK_SIZE dim_to_size(int dim) {
+  switch (dim) {
+    case 4: return BLOCK_4X4;
+    case 8: return BLOCK_8X8;
+    case 16: return BLOCK_16X16;
+    case 32: return BLOCK_32X32;
+    case 64: return BLOCK_64X64;
+    case 128: return BLOCK_128X128;
+    default: assert(0); return 0;
+  }
+}
+
+static AOM_INLINE void set_max_min_partition_size(SuperBlockEnc *sb_enc,
+                                                  AV1_COMP *cpi, MACROBLOCK *x,
+                                                  const SPEED_FEATURES *sf,
+                                                  BLOCK_SIZE sb_size,
+                                                  int mi_row, int mi_col) {
+  const AV1_COMMON *cm = &cpi->common;
+
+  sb_enc->max_partition_size =
+      AOMMIN(sf->part_sf.default_max_partition_size,
+             dim_to_size(cpi->oxcf.part_cfg.max_partition_size));
+  sb_enc->min_partition_size =
+      AOMMAX(sf->part_sf.default_min_partition_size,
+             dim_to_size(cpi->oxcf.part_cfg.min_partition_size));
+  sb_enc->max_partition_size =
+      AOMMIN(sb_enc->max_partition_size, cm->seq_params->sb_size);
+  sb_enc->min_partition_size =
+      AOMMIN(sb_enc->min_partition_size, cm->seq_params->sb_size);
+
+  if (use_auto_max_partition(cpi, sb_size, mi_row, mi_col)) {
+    float features[FEATURE_SIZE_MAX_MIN_PART_PRED] = { 0.0f };
+
+    av1_get_max_min_partition_features(cpi, x, mi_row, mi_col, features);
+    sb_enc->max_partition_size =
+        AOMMAX(AOMMIN(av1_predict_max_partition(cpi, x, features),
+                      sb_enc->max_partition_size),
+               sb_enc->min_partition_size);
+  }
+}
+#endif  // !CONFIG_REALTIME_ONLY
 #endif  // AOM_AV1_ENCODER_PARTITION_STRATEGY_H_
diff --git a/av1/encoder/pass2_strategy.c b/av1/encoder/pass2_strategy.c
index 4cd2ea3..134c6ce 100644
--- a/av1/encoder/pass2_strategy.c
+++ b/av1/encoder/pass2_strategy.c
@@ -19,14 +19,13 @@
 
 #include <stdint.h>
 
+#include "av1/encoder/thirdpass.h"
 #include "config/aom_config.h"
 #include "config/aom_scale_rtcd.h"
 
 #include "aom/aom_codec.h"
 #include "aom/aom_encoder.h"
 
-#include "aom_ports/system_state.h"
-
 #include "av1/common/av1_common_int.h"
 
 #include "av1/encoder/encoder.h"
@@ -37,12 +36,12 @@
 #include "av1/encoder/rc_utils.h"
 #include "av1/encoder/temporal_filter.h"
 #include "av1/encoder/tpl_model.h"
-#include "av1/encoder/use_flat_gop_model_params.h"
 #include "av1/encoder/encode_strategy.h"
 
 #define DEFAULT_KF_BOOST 2300
 #define DEFAULT_GF_BOOST 2000
 #define GROUP_ADAPTIVE_MAXQ 1
+
 static void init_gf_stats(GF_GROUP_STATS *gf_stats);
 
 // Calculate an active area of the image that discounts formatting
@@ -61,20 +60,21 @@
 // Calculate a modified Error used in distributing bits between easier and
 // harder frames.
 #define ACT_AREA_CORRECTION 0.5
-static double calculate_modified_err(const FRAME_INFO *frame_info,
-                                     const TWO_PASS *twopass,
-                                     const AV1EncoderConfig *oxcf,
-                                     const FIRSTPASS_STATS *this_frame) {
-  const FIRSTPASS_STATS *const stats = twopass->stats_buf_ctx->total_stats;
-  if (stats == NULL) {
+static double calculate_modified_err_new(const FRAME_INFO *frame_info,
+                                         const FIRSTPASS_STATS *total_stats,
+                                         const FIRSTPASS_STATS *this_stats,
+                                         int vbrbias, double modified_error_min,
+                                         double modified_error_max) {
+  if (total_stats == NULL) {
     return 0;
   }
-  const double av_weight = stats->weight / stats->count;
-  const double av_err = (stats->coded_error * av_weight) / stats->count;
+  const double av_weight = total_stats->weight / total_stats->count;
+  const double av_err =
+      (total_stats->coded_error * av_weight) / total_stats->count;
   double modified_error =
-      av_err * pow(this_frame->coded_error * this_frame->weight /
+      av_err * pow(this_stats->coded_error * this_stats->weight /
                        DOUBLE_DIVIDE_CHECK(av_err),
-                   oxcf->rc_cfg.vbrbias / 100.0);
+                   vbrbias / 100.0);
 
   // Correction for active area. Frames with a reduced active area
   // (eg due to formatting bars) have a higher error per mb for the
@@ -82,73 +82,62 @@
   // 0.5N blocks of complexity 2X is a little easier than coding N
   // blocks of complexity X.
   modified_error *=
-      pow(calculate_active_area(frame_info, this_frame), ACT_AREA_CORRECTION);
+      pow(calculate_active_area(frame_info, this_stats), ACT_AREA_CORRECTION);
 
-  return fclamp(modified_error, twopass->modified_error_min,
-                twopass->modified_error_max);
+  return fclamp(modified_error, modified_error_min, modified_error_max);
+}
+
+static double calculate_modified_err(const FRAME_INFO *frame_info,
+                                     const TWO_PASS *twopass,
+                                     const AV1EncoderConfig *oxcf,
+                                     const FIRSTPASS_STATS *this_frame) {
+  const FIRSTPASS_STATS *total_stats = twopass->stats_buf_ctx->total_stats;
+  return calculate_modified_err_new(
+      frame_info, total_stats, this_frame, oxcf->rc_cfg.vbrbias,
+      twopass->modified_error_min, twopass->modified_error_max);
 }
 
 // Resets the first pass file to the given position using a relative seek from
 // the current position.
-static void reset_fpf_position(TWO_PASS *p, const FIRSTPASS_STATS *position) {
-  p->stats_in = position;
+static void reset_fpf_position(TWO_PASS_FRAME *p_frame,
+                               const FIRSTPASS_STATS *position) {
+  p_frame->stats_in = position;
 }
 
-static int input_stats(TWO_PASS *p, FIRSTPASS_STATS *fps) {
-  if (p->stats_in >= p->stats_buf_ctx->stats_in_end) return EOF;
+static int input_stats(TWO_PASS *p, TWO_PASS_FRAME *p_frame,
+                       FIRSTPASS_STATS *fps) {
+  if (p_frame->stats_in >= p->stats_buf_ctx->stats_in_end) return EOF;
 
-  *fps = *p->stats_in;
-  ++p->stats_in;
+  *fps = *p_frame->stats_in;
+  ++p_frame->stats_in;
   return 1;
 }
 
-static int input_stats_lap(TWO_PASS *p, FIRSTPASS_STATS *fps) {
-  if (p->stats_in >= p->stats_buf_ctx->stats_in_end) return EOF;
+static int input_stats_lap(TWO_PASS *p, TWO_PASS_FRAME *p_frame,
+                           FIRSTPASS_STATS *fps) {
+  if (p_frame->stats_in >= p->stats_buf_ctx->stats_in_end) return EOF;
 
-  *fps = *p->stats_in;
+  *fps = *p_frame->stats_in;
   /* Move old stats[0] out to accommodate for next frame stats  */
   memmove(p->frame_stats_arr[0], p->frame_stats_arr[1],
-          (p->stats_buf_ctx->stats_in_end - p->stats_in - 1) *
+          (p->stats_buf_ctx->stats_in_end - p_frame->stats_in - 1) *
               sizeof(FIRSTPASS_STATS));
   p->stats_buf_ctx->stats_in_end--;
   return 1;
 }
 
 // Read frame stats at an offset from the current position.
-static const FIRSTPASS_STATS *read_frame_stats(const TWO_PASS *p, int offset) {
-  if ((offset >= 0 && p->stats_in + offset >= p->stats_buf_ctx->stats_in_end) ||
-      (offset < 0 && p->stats_in + offset < p->stats_buf_ctx->stats_in_start)) {
+static const FIRSTPASS_STATS *read_frame_stats(const TWO_PASS *p,
+                                               const TWO_PASS_FRAME *p_frame,
+                                               int offset) {
+  if ((offset >= 0 &&
+       p_frame->stats_in + offset >= p->stats_buf_ctx->stats_in_end) ||
+      (offset < 0 &&
+       p_frame->stats_in + offset < p->stats_buf_ctx->stats_in_start)) {
     return NULL;
   }
 
-  return &p->stats_in[offset];
-}
-
-static void subtract_stats(FIRSTPASS_STATS *section,
-                           const FIRSTPASS_STATS *frame) {
-  section->frame -= frame->frame;
-  section->weight -= frame->weight;
-  section->intra_error -= frame->intra_error;
-  section->frame_avg_wavelet_energy -= frame->frame_avg_wavelet_energy;
-  section->coded_error -= frame->coded_error;
-  section->sr_coded_error -= frame->sr_coded_error;
-  section->pcnt_inter -= frame->pcnt_inter;
-  section->pcnt_motion -= frame->pcnt_motion;
-  section->pcnt_second_ref -= frame->pcnt_second_ref;
-  section->pcnt_neutral -= frame->pcnt_neutral;
-  section->intra_skip_pct -= frame->intra_skip_pct;
-  section->inactive_zone_rows -= frame->inactive_zone_rows;
-  section->inactive_zone_cols -= frame->inactive_zone_cols;
-  section->MVr -= frame->MVr;
-  section->mvr_abs -= frame->mvr_abs;
-  section->MVc -= frame->MVc;
-  section->mvc_abs -= frame->mvc_abs;
-  section->MVrv -= frame->MVrv;
-  section->MVcv -= frame->MVcv;
-  section->mv_in_out_count -= frame->mv_in_out_count;
-  section->new_mv_count -= frame->new_mv_count;
-  section->count -= frame->count;
-  section->duration -= frame->duration;
+  return &p_frame->stats_in[offset];
 }
 
 // This function returns the maximum target rate per frame.
@@ -182,9 +171,8 @@
 
 // Based on history adjust expectations of bits per macroblock.
 static void twopass_update_bpm_factor(AV1_COMP *cpi, int rate_err_tol) {
-  TWO_PASS *twopass = &cpi->twopass;
-  const RATE_CONTROL *const rc = &cpi->rc;
-  int err_estimate = rc->rate_error_estimate;
+  TWO_PASS *twopass = &cpi->ppi->twopass;
+  const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
 
   // Based on recent history adjust expectations of bits per macroblock.
   double damp_fac = AOMMAX(5.0, rate_err_tol / 10.0);
@@ -192,16 +180,17 @@
   const double adj_limit = AOMMAX(0.20, (double)(100 - rate_err_tol) / 200.0);
   const double min_fac = 1.0 - adj_limit;
   const double max_fac = 1.0 + adj_limit;
+  int err_estimate = p_rc->rate_error_estimate;
 
-  if (rc->vbr_bits_off_target && rc->total_actual_bits > 0) {
-    if (cpi->lap_enabled) {
+  if (p_rc->vbr_bits_off_target && p_rc->total_actual_bits > 0) {
+    if (cpi->ppi->lap_enabled) {
       rate_err_factor =
           (double)twopass->rolling_arf_group_actual_bits /
           DOUBLE_DIVIDE_CHECK((double)twopass->rolling_arf_group_target_bits);
     } else {
       rate_err_factor =
-          1.0 - ((double)(rc->vbr_bits_off_target) /
-                 AOMMAX(rc->total_actual_bits, cpi->twopass.bits_left));
+          1.0 - ((double)(p_rc->vbr_bits_off_target) /
+                 AOMMAX(p_rc->total_actual_bits, cpi->ppi->twopass.bits_left));
     }
 
     rate_err_factor = AOMMAX(min_fac, AOMMIN(max_fac, rate_err_factor));
@@ -209,7 +198,7 @@
     // Adjustment is damped if this is 1 pass with look ahead processing
     // (as there are only ever a few frames of data) and for all but the first
     // GOP in normal two pass.
-    if ((twopass->bpm_factor != 1.0) || cpi->lap_enabled) {
+    if ((twopass->bpm_factor != 1.0) || cpi->ppi->lap_enabled) {
       rate_err_factor = 1.0 + ((rate_err_factor - 1.0) / damp_fac);
     }
   }
@@ -282,7 +271,7 @@
   const RATE_CONTROL *const rc = &cpi->rc;
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
   const RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
-  inactive_zone = fclamp(inactive_zone, 0.0, 1.0);
+  inactive_zone = fclamp(inactive_zone, 0.0, 0.9999);
 
   if (av_target_bandwidth <= 0) {
     return rc->worst_quality;  // Highest value allowed
@@ -291,7 +280,7 @@
                             ? cpi->initial_mbs
                             : cpi->common.mi_params.MBs;
     const int active_mbs = AOMMAX(1, num_mbs - (int)(num_mbs * inactive_zone));
-    const double av_err_per_mb = av_frame_err / active_mbs;
+    const double av_err_per_mb = av_frame_err / (1.0 - inactive_zone);
     const int target_norm_bits_per_mb =
         (int)((uint64_t)av_target_bandwidth << BPER_MB_NORMBITS) / active_mbs;
     int rate_err_tol = AOMMIN(rc_cfg->under_shoot_pct, rc_cfg->over_shoot_pct);
@@ -302,9 +291,9 @@
     // Try and pick a max Q that will be high enough to encode the
     // content at the given rate.
     int q = find_qindex_by_rate_with_correction(
-        target_norm_bits_per_mb, cpi->common.seq_params.bit_depth,
-        av_err_per_mb, cpi->twopass.bpm_factor, rate_err_tol, rc->best_quality,
-        rc->worst_quality);
+        target_norm_bits_per_mb, cpi->common.seq_params->bit_depth,
+        av_err_per_mb, cpi->ppi->twopass.bpm_factor, rate_err_tol,
+        rc->best_quality, rc->worst_quality);
 
     // Restriction on active max q for constrained quality mode.
     if (rc_cfg->mode == AOM_CQ) q = AOMMAX(q, rc_cfg->cq_level);
@@ -312,57 +301,63 @@
   }
 }
 
-#define SR_DIFF_PART 0.0015
-#define MOTION_AMP_PART 0.003
 #define INTRA_PART 0.005
 #define DEFAULT_DECAY_LIMIT 0.75
-#define LOW_SR_DIFF_TRHESH 0.1
-#define SR_DIFF_MAX 128.0
+#define LOW_SR_DIFF_TRHESH 0.01
 #define NCOUNT_FRAME_II_THRESH 5.0
+#define LOW_CODED_ERR_PER_MB 0.01
 
-static double get_sr_decay_rate(const FRAME_INFO *frame_info,
-                                const FIRSTPASS_STATS *frame) {
-  const int num_mbs = frame_info->num_mbs;
-  double sr_diff = (frame->sr_coded_error - frame->coded_error) / num_mbs;
+/* This function considers how the quality of prediction may be deteriorating
+ * with distance. It compares the coded error for the last frame and the
+ * second reference frame (usually two frames old) and also applies a factor
+ * based on the extent of INTRA coding.
+ *
+ * The decay factor is then used to reduce the contribution of frames further
+ * from the alt-ref or golden frame, to the bitframe boost calculation for that
+ * alt-ref or golden frame.
+ */
+static double get_sr_decay_rate(const FIRSTPASS_STATS *frame) {
+  double sr_diff = (frame->sr_coded_error - frame->coded_error);
   double sr_decay = 1.0;
   double modified_pct_inter;
   double modified_pcnt_intra;
-  const double motion_amplitude_factor =
-      frame->pcnt_motion * ((frame->mvc_abs + frame->mvr_abs) / 2);
 
   modified_pct_inter = frame->pcnt_inter;
-  if ((frame->intra_error / DOUBLE_DIVIDE_CHECK(frame->coded_error)) <
-      (double)NCOUNT_FRAME_II_THRESH) {
+  if ((frame->coded_error > LOW_CODED_ERR_PER_MB) &&
+      ((frame->intra_error / DOUBLE_DIVIDE_CHECK(frame->coded_error)) <
+       (double)NCOUNT_FRAME_II_THRESH)) {
     modified_pct_inter = frame->pcnt_inter - frame->pcnt_neutral;
   }
   modified_pcnt_intra = 100 * (1.0 - modified_pct_inter);
 
   if ((sr_diff > LOW_SR_DIFF_TRHESH)) {
-    sr_diff = AOMMIN(sr_diff, SR_DIFF_MAX);
-    sr_decay = 1.0 - (SR_DIFF_PART * sr_diff) -
-               (MOTION_AMP_PART * motion_amplitude_factor) -
-               (INTRA_PART * modified_pcnt_intra);
+    double sr_diff_part = ((sr_diff * 0.25) / frame->intra_error);
+    sr_decay = 1.0 - sr_diff_part - (INTRA_PART * modified_pcnt_intra);
   }
-  return AOMMAX(sr_decay, AOMMIN(DEFAULT_DECAY_LIMIT, modified_pct_inter));
+  return AOMMAX(sr_decay, DEFAULT_DECAY_LIMIT);
 }
 
 // This function gives an estimate of how badly we believe the prediction
 // quality is decaying from frame to frame.
-static double get_zero_motion_factor(const FRAME_INFO *frame_info,
-                                     const FIRSTPASS_STATS *frame) {
+static double get_zero_motion_factor(const FIRSTPASS_STATS *frame) {
   const double zero_motion_pct = frame->pcnt_inter - frame->pcnt_motion;
-  double sr_decay = get_sr_decay_rate(frame_info, frame);
+  double sr_decay = get_sr_decay_rate(frame);
   return AOMMIN(sr_decay, zero_motion_pct);
 }
 
-#define ZM_POWER_FACTOR 0.75
+#define DEFAULT_ZM_FACTOR 0.5
+static double get_prediction_decay_rate(const FIRSTPASS_STATS *frame_stats) {
+  const double sr_decay_rate = get_sr_decay_rate(frame_stats);
+  double zero_motion_factor =
+      DEFAULT_ZM_FACTOR * (frame_stats->pcnt_inter - frame_stats->pcnt_motion);
 
-static double get_prediction_decay_rate(const FRAME_INFO *frame_info,
-                                        const FIRSTPASS_STATS *next_frame) {
-  const double sr_decay_rate = get_sr_decay_rate(frame_info, next_frame);
-  const double zero_motion_factor =
-      (0.95 * pow((next_frame->pcnt_inter - next_frame->pcnt_motion),
-                  ZM_POWER_FACTOR));
+  // Clamp value to range 0.0 to 1.0
+  // This should happen anyway if input values are sensibly clamped but checked
+  // here just in case.
+  if (zero_motion_factor > 1.0)
+    zero_motion_factor = 1.0;
+  else if (zero_motion_factor < 0.0)
+    zero_motion_factor = 0.0;
 
   return AOMMAX(zero_motion_factor,
                 (sr_decay_rate + ((1.0 - sr_decay_rate) * zero_motion_factor)));
@@ -371,7 +366,8 @@
 // Function to test for a condition where a complex transition is followed
 // by a static section. For example in slide shows where there is a fade
 // between slides. This is to help with more optimal kf and gf positioning.
-static int detect_transition_to_still(TWO_PASS *const twopass,
+static int detect_transition_to_still(const FIRSTPASS_INFO *firstpass_info,
+                                      int next_stats_index,
                                       const int min_gf_interval,
                                       const int frame_interval,
                                       const int still_interval,
@@ -382,16 +378,19 @@
   // instead of a clean scene cut.
   if (frame_interval > min_gf_interval && loop_decay_rate >= 0.999 &&
       last_decay_rate < 0.9) {
-    int j;
-    // Look ahead a few frames to see if static condition persists...
-    for (j = 0; j < still_interval; ++j) {
-      const FIRSTPASS_STATS *stats = &twopass->stats_in[j];
-      if (stats >= twopass->stats_buf_ctx->stats_in_end) break;
-
-      if (stats->pcnt_inter - stats->pcnt_motion < 0.999) break;
+    int stats_left =
+        av1_firstpass_info_future_count(firstpass_info, next_stats_index);
+    if (stats_left >= still_interval) {
+      int j;
+      // Look ahead a few frames to see if static condition persists...
+      for (j = 0; j < still_interval; ++j) {
+        const FIRSTPASS_STATS *stats =
+            av1_firstpass_info_peek(firstpass_info, next_stats_index + j);
+        if (stats->pcnt_inter - stats->pcnt_motion < 0.999) break;
+      }
+      // Only if it does do we signal a transition to still.
+      return j == still_interval;
     }
-    // Only if it does do we signal a transition to still.
-    return j == still_interval;
   }
   return 0;
 }
@@ -399,8 +398,10 @@
 // This function detects a flash through the high relative pcnt_second_ref
 // score in the frame following a flash frame. The offset passed in should
 // reflect this.
-static int detect_flash(const TWO_PASS *twopass, const int offset) {
-  const FIRSTPASS_STATS *const next_frame = read_frame_stats(twopass, offset);
+static int detect_flash(const TWO_PASS *twopass,
+                        const TWO_PASS_FRAME *twopass_frame, const int offset) {
+  const FIRSTPASS_STATS *const next_frame =
+      read_frame_stats(twopass, twopass_frame, offset);
 
   // What we are looking for here is a situation where there is a
   // brief break in prediction (such as a flash) but subsequent frames
@@ -414,7 +415,8 @@
 
 // Update the motion related elements to the GF arf boost calculation.
 static void accumulate_frame_motion_stats(const FIRSTPASS_STATS *stats,
-                                          GF_GROUP_STATS *gf_stats) {
+                                          GF_GROUP_STATS *gf_stats, double f_w,
+                                          double f_h) {
   const double pct = stats->pcnt_motion;
 
   // Accumulate Motion In/Out of frame stats.
@@ -431,9 +433,11 @@
         fabs(stats->mvc_abs) / DOUBLE_DIVIDE_CHECK(fabs(stats->MVc));
 
     gf_stats->mv_ratio_accumulator +=
-        pct * (mvr_ratio < stats->mvr_abs ? mvr_ratio : stats->mvr_abs);
+        pct *
+        (mvr_ratio < stats->mvr_abs * f_h ? mvr_ratio : stats->mvr_abs * f_h);
     gf_stats->mv_ratio_accumulator +=
-        pct * (mvc_ratio < stats->mvc_abs ? mvc_ratio : stats->mvc_abs);
+        pct *
+        (mvc_ratio < stats->mvc_abs * f_w ? mvc_ratio : stats->mvc_abs * f_w);
   }
 }
 
@@ -449,17 +453,15 @@
 }
 
 static void accumulate_next_frame_stats(const FIRSTPASS_STATS *stats,
-                                        const FRAME_INFO *frame_info,
                                         const int flash_detected,
                                         const int frames_since_key,
                                         const int cur_idx,
-                                        GF_GROUP_STATS *gf_stats) {
-  accumulate_frame_motion_stats(stats, gf_stats);
+                                        GF_GROUP_STATS *gf_stats, int f_w,
+                                        int f_h) {
+  accumulate_frame_motion_stats(stats, gf_stats, f_w, f_h);
   // sum up the metric values of current gf group
   gf_stats->avg_sr_coded_error += stats->sr_coded_error;
-  gf_stats->avg_tr_coded_error += stats->tr_coded_error;
   gf_stats->avg_pcnt_second_ref += stats->pcnt_second_ref;
-  gf_stats->avg_pcnt_third_ref += stats->pcnt_third_ref;
   gf_stats->avg_new_mv_count += stats->new_mv_count;
   gf_stats->avg_wavelet_energy += stats->frame_avg_wavelet_energy;
   if (fabs(stats->raw_error_stdev) > 0.000001) {
@@ -470,36 +472,23 @@
   // Accumulate the effect of prediction quality decay
   if (!flash_detected) {
     gf_stats->last_loop_decay_rate = gf_stats->loop_decay_rate;
-    gf_stats->loop_decay_rate = get_prediction_decay_rate(frame_info, stats);
+    gf_stats->loop_decay_rate = get_prediction_decay_rate(stats);
 
     gf_stats->decay_accumulator =
         gf_stats->decay_accumulator * gf_stats->loop_decay_rate;
 
     // Monitor for static sections.
     if ((frames_since_key + cur_idx - 1) > 1) {
-      gf_stats->zero_motion_accumulator =
-          AOMMIN(gf_stats->zero_motion_accumulator,
-                 get_zero_motion_factor(frame_info, stats));
+      gf_stats->zero_motion_accumulator = AOMMIN(
+          gf_stats->zero_motion_accumulator, get_zero_motion_factor(stats));
     }
   }
 }
 
-static void average_gf_stats(const int total_frame,
-                             const FIRSTPASS_STATS *last_stat,
-                             GF_GROUP_STATS *gf_stats) {
+static void average_gf_stats(const int total_frame, GF_GROUP_STATS *gf_stats) {
   if (total_frame) {
     gf_stats->avg_sr_coded_error /= total_frame;
-    gf_stats->avg_tr_coded_error /= total_frame;
     gf_stats->avg_pcnt_second_ref /= total_frame;
-    if (total_frame - 1) {
-      gf_stats->avg_pcnt_third_ref_nolast =
-          (gf_stats->avg_pcnt_third_ref - last_stat->pcnt_third_ref) /
-          (total_frame - 1);
-    } else {
-      gf_stats->avg_pcnt_third_ref_nolast =
-          gf_stats->avg_pcnt_third_ref / total_frame;
-    }
-    gf_stats->avg_pcnt_third_ref /= total_frame;
     gf_stats->avg_new_mv_count /= total_frame;
     gf_stats->avg_wavelet_energy /= total_frame;
   }
@@ -508,36 +497,6 @@
     gf_stats->avg_raw_err_stdev /= gf_stats->non_zero_stdev_count;
 }
 
-static void get_features_from_gf_stats(const GF_GROUP_STATS *gf_stats,
-                                       const GF_FRAME_STATS *first_frame,
-                                       const GF_FRAME_STATS *last_frame,
-                                       const int num_mbs,
-                                       const int constrained_gf_group,
-                                       const int kf_zeromotion_pct,
-                                       const int num_frames, float *features) {
-  *features++ = (float)gf_stats->abs_mv_in_out_accumulator;
-  *features++ = (float)(gf_stats->avg_new_mv_count / num_mbs);
-  *features++ = (float)gf_stats->avg_pcnt_second_ref;
-  *features++ = (float)gf_stats->avg_pcnt_third_ref;
-  *features++ = (float)gf_stats->avg_pcnt_third_ref_nolast;
-  *features++ = (float)(gf_stats->avg_sr_coded_error / num_mbs);
-  *features++ = (float)(gf_stats->avg_tr_coded_error / num_mbs);
-  *features++ = (float)(gf_stats->avg_wavelet_energy / num_mbs);
-  *features++ = (float)(constrained_gf_group);
-  *features++ = (float)gf_stats->decay_accumulator;
-  *features++ = (float)(first_frame->frame_coded_error / num_mbs);
-  *features++ = (float)(first_frame->frame_sr_coded_error / num_mbs);
-  *features++ = (float)(first_frame->frame_tr_coded_error / num_mbs);
-  *features++ = (float)(first_frame->frame_err / num_mbs);
-  *features++ = (float)(kf_zeromotion_pct);
-  *features++ = (float)(last_frame->frame_coded_error / num_mbs);
-  *features++ = (float)(last_frame->frame_sr_coded_error / num_mbs);
-  *features++ = (float)(last_frame->frame_tr_coded_error / num_mbs);
-  *features++ = (float)num_frames;
-  *features++ = (float)gf_stats->mv_ratio_accumulator;
-  *features++ = (float)gf_stats->non_zero_stdev_count;
-}
-
 #define BOOST_FACTOR 12.5
 static double baseline_err_per_mb(const FRAME_INFO *frame_info) {
   unsigned int screen_area = frame_info->frame_height * frame_info->frame_width;
@@ -551,22 +510,18 @@
   }
 }
 
-static double calc_frame_boost(const RATE_CONTROL *rc,
+static double calc_frame_boost(const PRIMARY_RATE_CONTROL *p_rc,
                                const FRAME_INFO *frame_info,
                                const FIRSTPASS_STATS *this_frame,
                                double this_frame_mv_in_out, double max_boost) {
   double frame_boost;
-  const double lq = av1_convert_qindex_to_q(rc->avg_frame_qindex[INTER_FRAME],
+  const double lq = av1_convert_qindex_to_q(p_rc->avg_frame_qindex[INTER_FRAME],
                                             frame_info->bit_depth);
   const double boost_q_correction = AOMMIN((0.5 + (lq * 0.015)), 1.5);
   const double active_area = calculate_active_area(frame_info, this_frame);
-  int num_mbs = frame_info->num_mbs;
-
-  // Correct for any inactive region in the image
-  num_mbs = (int)AOMMAX(1, num_mbs * active_area);
 
   // Underlying boost factor is based on inter error ratio.
-  frame_boost = AOMMAX(baseline_err_per_mb(frame_info) * num_mbs,
+  frame_boost = AOMMAX(baseline_err_per_mb(frame_info) * active_area,
                        this_frame->intra_error * active_area) /
                 DOUBLE_DIVIDE_CHECK(this_frame->coded_error);
   frame_boost = frame_boost * BOOST_FACTOR * boost_q_correction;
@@ -583,22 +538,18 @@
   return AOMMIN(frame_boost, max_boost * boost_q_correction);
 }
 
-static double calc_kf_frame_boost(const RATE_CONTROL *rc,
+static double calc_kf_frame_boost(const PRIMARY_RATE_CONTROL *p_rc,
                                   const FRAME_INFO *frame_info,
                                   const FIRSTPASS_STATS *this_frame,
                                   double *sr_accumulator, double max_boost) {
   double frame_boost;
-  const double lq = av1_convert_qindex_to_q(rc->avg_frame_qindex[INTER_FRAME],
+  const double lq = av1_convert_qindex_to_q(p_rc->avg_frame_qindex[INTER_FRAME],
                                             frame_info->bit_depth);
   const double boost_q_correction = AOMMIN((0.50 + (lq * 0.015)), 2.00);
   const double active_area = calculate_active_area(frame_info, this_frame);
-  int num_mbs = frame_info->num_mbs;
-
-  // Correct for any inactive region in the image
-  num_mbs = (int)AOMMAX(1, num_mbs * active_area);
 
   // Underlying boost factor is based on inter error ratio.
-  frame_boost = AOMMAX(baseline_err_per_mb(frame_info) * num_mbs,
+  frame_boost = AOMMAX(baseline_err_per_mb(frame_info) * active_area,
                        this_frame->intra_error * active_area) /
                 DOUBLE_DIVIDE_CHECK(
                     (this_frame->coded_error + *sr_accumulator) * active_area);
@@ -618,8 +569,8 @@
   return AOMMIN(frame_boost, max_boost * boost_q_correction);
 }
 
-static int get_projected_gfu_boost(const RATE_CONTROL *rc, int gfu_boost,
-                                   int frames_to_project,
+static int get_projected_gfu_boost(const PRIMARY_RATE_CONTROL *p_rc,
+                                   int gfu_boost, int frames_to_project,
                                    int num_stats_used_for_gfu_boost) {
   /*
    * If frames_to_project is equal to num_stats_used_for_gfu_boost,
@@ -629,7 +580,7 @@
    */
   if (num_stats_used_for_gfu_boost >= frames_to_project) return gfu_boost;
 
-  double min_boost_factor = sqrt(rc->baseline_gf_interval);
+  double min_boost_factor = sqrt(p_rc->baseline_gf_interval);
   // Get the current tpl factor (number of frames = frames_to_project).
   double tpl_factor = av1_get_gfu_boost_projection_factor(
       min_boost_factor, MAX_GFUBOOST_FACTOR, frames_to_project);
@@ -642,11 +593,14 @@
 }
 
 #define GF_MAX_BOOST 90.0
+#define GF_MIN_BOOST 50
 #define MIN_DECAY_FACTOR 0.01
-int av1_calc_arf_boost(const TWO_PASS *twopass, const RATE_CONTROL *rc,
-                       FRAME_INFO *frame_info, int offset, int f_frames,
-                       int b_frames, int *num_fpstats_used,
-                       int *num_fpstats_required) {
+int av1_calc_arf_boost(const TWO_PASS *twopass,
+                       const TWO_PASS_FRAME *twopass_frame,
+                       const PRIMARY_RATE_CONTROL *p_rc, FRAME_INFO *frame_info,
+                       int offset, int f_frames, int b_frames,
+                       int *num_fpstats_used, int *num_fpstats_required,
+                       int project_gfu_boost) {
   int i;
   GF_GROUP_STATS gf_stats;
   init_gf_stats(&gf_stats);
@@ -657,21 +611,23 @@
 
   // Search forward from the proposed arf/next gf position.
   for (i = 0; i < f_frames; ++i) {
-    const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, i + offset);
+    const FIRSTPASS_STATS *this_frame =
+        read_frame_stats(twopass, twopass_frame, i + offset);
     if (this_frame == NULL) break;
 
     // Update the motion related elements to the boost calculation.
-    accumulate_frame_motion_stats(this_frame, &gf_stats);
+    accumulate_frame_motion_stats(this_frame, &gf_stats,
+                                  frame_info->frame_width,
+                                  frame_info->frame_height);
 
     // We want to discount the flash frame itself and the recovery
     // frame that follows as both will have poor scores.
-    flash_detected = detect_flash(twopass, i + offset) ||
-                     detect_flash(twopass, i + offset + 1);
+    flash_detected = detect_flash(twopass, twopass_frame, i + offset) ||
+                     detect_flash(twopass, twopass_frame, i + offset + 1);
 
     // Accumulate the effect of prediction quality decay.
     if (!flash_detected) {
-      gf_stats.decay_accumulator *=
-          get_prediction_decay_rate(frame_info, this_frame);
+      gf_stats.decay_accumulator *= get_prediction_decay_rate(this_frame);
       gf_stats.decay_accumulator = gf_stats.decay_accumulator < MIN_DECAY_FACTOR
                                        ? MIN_DECAY_FACTOR
                                        : gf_stats.decay_accumulator;
@@ -679,7 +635,7 @@
 
     boost_score +=
         gf_stats.decay_accumulator *
-        calc_frame_boost(rc, frame_info, this_frame,
+        calc_frame_boost(p_rc, frame_info, this_frame,
                          gf_stats.this_frame_mv_in_out, GF_MAX_BOOST);
     if (num_fpstats_used) (*num_fpstats_used)++;
   }
@@ -691,21 +647,23 @@
   init_gf_stats(&gf_stats);
   // Search backward towards last gf position.
   for (i = -1; i >= -b_frames; --i) {
-    const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, i + offset);
+    const FIRSTPASS_STATS *this_frame =
+        read_frame_stats(twopass, twopass_frame, i + offset);
     if (this_frame == NULL) break;
 
     // Update the motion related elements to the boost calculation.
-    accumulate_frame_motion_stats(this_frame, &gf_stats);
+    accumulate_frame_motion_stats(this_frame, &gf_stats,
+                                  frame_info->frame_width,
+                                  frame_info->frame_height);
 
     // We want to discount the the flash frame itself and the recovery
     // frame that follows as both will have poor scores.
-    flash_detected = detect_flash(twopass, i + offset) ||
-                     detect_flash(twopass, i + offset + 1);
+    flash_detected = detect_flash(twopass, twopass_frame, i + offset) ||
+                     detect_flash(twopass, twopass_frame, i + offset + 1);
 
     // Cumulative effect of prediction quality decay.
     if (!flash_detected) {
-      gf_stats.decay_accumulator *=
-          get_prediction_decay_rate(frame_info, this_frame);
+      gf_stats.decay_accumulator *= get_prediction_decay_rate(this_frame);
       gf_stats.decay_accumulator = gf_stats.decay_accumulator < MIN_DECAY_FACTOR
                                        ? MIN_DECAY_FACTOR
                                        : gf_stats.decay_accumulator;
@@ -713,22 +671,22 @@
 
     boost_score +=
         gf_stats.decay_accumulator *
-        calc_frame_boost(rc, frame_info, this_frame,
+        calc_frame_boost(p_rc, frame_info, this_frame,
                          gf_stats.this_frame_mv_in_out, GF_MAX_BOOST);
     if (num_fpstats_used) (*num_fpstats_used)++;
   }
   arf_boost += (int)boost_score;
 
-  if (num_fpstats_required) {
+  if (project_gfu_boost) {
+    assert(num_fpstats_required != NULL);
+    assert(num_fpstats_used != NULL);
     *num_fpstats_required = f_frames + b_frames;
-    if (num_fpstats_used) {
-      arf_boost = get_projected_gfu_boost(rc, arf_boost, *num_fpstats_required,
-                                          *num_fpstats_used);
-    }
+    arf_boost = get_projected_gfu_boost(p_rc, arf_boost, *num_fpstats_required,
+                                        *num_fpstats_used);
   }
 
-  if (arf_boost < ((b_frames + f_frames) * 50))
-    arf_boost = ((b_frames + f_frames) * 50);
+  if (arf_boost < ((b_frames + f_frames) * GF_MIN_BOOST))
+    arf_boost = ((b_frames + f_frames) * GF_MIN_BOOST);
 
   return arf_boost;
 }
@@ -767,7 +725,8 @@
 static int64_t calculate_total_gf_group_bits(AV1_COMP *cpi,
                                              double gf_group_err) {
   const RATE_CONTROL *const rc = &cpi->rc;
-  const TWO_PASS *const twopass = &cpi->twopass;
+  const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  const TWO_PASS *const twopass = &cpi->ppi->twopass;
   const int max_bits = frame_max_bits(rc, &cpi->oxcf);
   int64_t total_group_bits;
 
@@ -787,8 +746,8 @@
                                : total_group_bits;
 
   // Clip based on user supplied data rate variability limit.
-  if (total_group_bits > (int64_t)max_bits * rc->baseline_gf_interval)
-    total_group_bits = (int64_t)max_bits * rc->baseline_gf_interval;
+  if (total_group_bits > (int64_t)max_bits * p_rc->baseline_gf_interval)
+    total_group_bits = (int64_t)max_bits * p_rc->baseline_gf_interval;
 
   return total_group_bits;
 }
@@ -821,7 +780,6 @@
 // inverse of calculate_boost_bits().
 static int calculate_boost_factor(int frame_count, int bits,
                                   int64_t total_group_bits) {
-  aom_clear_system_state();
   return (int)(100.0 * frame_count * bits / (total_group_bits - bits));
 }
 
@@ -834,7 +792,8 @@
                                               int64_t group_bits,
                                               int frame_type) {
   const AV1_COMMON *const cm = &cpi->common;
-  const SequenceHeader *const seq_params = &cm->seq_params;
+  const SequenceHeader *const seq_params = cm->seq_params;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
   const int temporal_layer_id = cm->temporal_layer_id;
   const int spatial_layer_id = cm->spatial_layer_id;
   for (int index = 0; index < seq_params->operating_points_cnt_minus_1 + 1;
@@ -845,7 +804,7 @@
     }
 
     const AV1_LEVEL target_level =
-        cpi->level_params.target_seq_level_idx[index];
+        cpi->ppi->level_params.target_seq_level_idx[index];
     if (target_level >= SEQ_LEVELS) continue;
 
     assert(is_valid_seq_level_idx(target_level));
@@ -859,18 +818,20 @@
       const int level_enforced_max_kf_bits = target_bits_per_frame * 8;
       if (bits_assigned > level_enforced_max_kf_bits) {
         const int frames = rc->frames_to_key - 1;
-        rc->kf_boost = calculate_boost_factor(
+        p_rc->kf_boost = calculate_boost_factor(
             frames, level_enforced_max_kf_bits, group_bits);
-        bits_assigned = calculate_boost_bits(frames, rc->kf_boost, group_bits);
+        bits_assigned =
+            calculate_boost_bits(frames, p_rc->kf_boost, group_bits);
       }
     } else if (frame_type == 1) {
       // Maximum bits for arf is 4 times the target_bits_per_frame.
       const int level_enforced_max_arf_bits = target_bits_per_frame * 4;
       if (bits_assigned > level_enforced_max_arf_bits) {
-        rc->gfu_boost = calculate_boost_factor(
-            rc->baseline_gf_interval, level_enforced_max_arf_bits, group_bits);
-        bits_assigned = calculate_boost_bits(rc->baseline_gf_interval,
-                                             rc->gfu_boost, group_bits);
+        p_rc->gfu_boost =
+            calculate_boost_factor(p_rc->baseline_gf_interval,
+                                   level_enforced_max_arf_bits, group_bits);
+        bits_assigned = calculate_boost_bits(p_rc->baseline_gf_interval,
+                                             p_rc->gfu_boost, group_bits);
       }
     } else {
       assert(0);
@@ -883,7 +844,9 @@
 // Allocate bits to each frame in a GF / ARF group
 double layer_fraction[MAX_ARF_LAYERS + 1] = { 1.0,  0.70, 0.55, 0.60,
                                               0.60, 1.0,  1.0 };
-static void allocate_gf_group_bits(GF_GROUP *gf_group, RATE_CONTROL *const rc,
+static void allocate_gf_group_bits(GF_GROUP *gf_group,
+                                   PRIMARY_RATE_CONTROL *const p_rc,
+                                   RATE_CONTROL *const rc,
                                    int64_t gf_group_bits, int gf_arf_bits,
                                    int key_frame, int use_arf) {
   int64_t total_group_bits = gf_group_bits;
@@ -900,7 +863,7 @@
   if (use_arf) total_group_bits -= gf_arf_bits;
 
   int num_frames =
-      AOMMAX(1, rc->baseline_gf_interval - (rc->frames_since_key == 0));
+      AOMMAX(1, p_rc->baseline_gf_interval - (rc->frames_since_key == 0));
   base_frame_bits = (int)(total_group_bits / num_frames);
 
   // Check the number of frames in each layer in case we have a
@@ -968,7 +931,7 @@
                                 int active_min_gf_interval,
                                 GF_GROUP_STATS *gf_stats) {
   RATE_CONTROL *const rc = &cpi->rc;
-  TWO_PASS *const twopass = &cpi->twopass;
+  TWO_PASS *const twopass = &cpi->ppi->twopass;
   InitialDimensions *const initial_dimensions = &cpi->initial_dimensions;
   // Motion breakout threshold for loop below depends on image size.
   const double mv_ratio_accumulator_thresh =
@@ -977,9 +940,15 @@
   if (!flash_detected) {
     // Break clause to detect very still sections after motion. For example,
     // a static image after a fade or other transition.
-    if (detect_transition_to_still(
-            twopass, rc->min_gf_interval, frame_index - cur_start, 5,
-            gf_stats->loop_decay_rate, gf_stats->last_loop_decay_rate)) {
+
+    // TODO(angiebird): This is a temporary change, we will avoid using
+    // twopass_frame.stats_in in the follow-up CL
+    int index = (int)(cpi->twopass_frame.stats_in -
+                      twopass->stats_buf_ctx->stats_in_start);
+    if (detect_transition_to_still(&twopass->firstpass_info, index,
+                                   rc->min_gf_interval, frame_index - cur_start,
+                                   5, gf_stats->loop_decay_rate,
+                                   gf_stats->last_loop_decay_rate)) {
       return 1;
     }
   }
@@ -998,13 +967,77 @@
   // so we can continue for more frames.
   if (((frame_index - cur_start) >= active_max_gf_interval + 1) &&
       !is_almost_static(gf_stats->zero_motion_accumulator,
-                        twopass->kf_zeromotion_pct, cpi->lap_enabled)) {
+                        twopass->kf_zeromotion_pct, cpi->ppi->lap_enabled)) {
     return 1;
   }
   return 0;
 }
 
-#define MIN_FWD_KF_INTERVAL 8
+static int is_shorter_gf_interval_better(AV1_COMP *cpi,
+                                         EncodeFrameParams *frame_params,
+                                         const EncodeFrameInput *frame_input) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  int gop_length_decision_method = cpi->sf.tpl_sf.gop_length_decision_method;
+  int shorten_gf_interval;
+
+  av1_tpl_preload_rc_estimate(cpi, frame_params);
+
+  if (gop_length_decision_method == 2) {
+    // GF group length is decided based on GF boost and tpl stats of ARFs from
+    // base layer, (base+1) layer.
+    shorten_gf_interval =
+        (p_rc->gfu_boost <
+         p_rc->num_stats_used_for_gfu_boost * GF_MIN_BOOST * 1.4) &&
+        !av1_tpl_setup_stats(cpi, 3, frame_params, frame_input);
+  } else {
+    int do_complete_tpl = 1;
+    GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+    int is_temporal_filter_enabled =
+        (rc->frames_since_key > 0 && gf_group->arf_index > -1);
+
+    if (is_temporal_filter_enabled) {
+      int arf_src_index = gf_group->arf_src_offset[gf_group->arf_index];
+      FRAME_UPDATE_TYPE arf_update_type =
+          gf_group->update_type[gf_group->arf_index];
+      int is_forward_keyframe = 0;
+      av1_temporal_filter(cpi, arf_src_index, arf_update_type,
+                          is_forward_keyframe, NULL, &cpi->ppi->alt_ref_buffer);
+      aom_extend_frame_borders(&cpi->ppi->alt_ref_buffer,
+                               av1_num_planes(&cpi->common));
+    }
+
+    if (gop_length_decision_method == 1) {
+      // Check if tpl stats of ARFs from base layer, (base+1) layer,
+      // (base+2) layer can decide the GF group length.
+      int gop_length_eval =
+          av1_tpl_setup_stats(cpi, 2, frame_params, frame_input);
+
+      if (gop_length_eval != 2) {
+        do_complete_tpl = 0;
+        shorten_gf_interval = !gop_length_eval;
+      }
+    }
+
+    if (do_complete_tpl) {
+      // Decide GF group length based on complete tpl stats.
+      shorten_gf_interval =
+          !av1_tpl_setup_stats(cpi, 1, frame_params, frame_input);
+      // Tpl stats is reused when the ARF is temporally filtered and GF
+      // interval is not shortened.
+      if (is_temporal_filter_enabled && !shorten_gf_interval) {
+        cpi->skip_tpl_setup_stats = 1;
+#if CONFIG_BITRATE_ACCURACY
+        av1_vbr_rc_update_q_index_list(&cpi->vbr_rc_info, &cpi->ppi->tpl_data,
+                                       gf_group, cpi->gf_frame_index,
+                                       cpi->common.seq_params->bit_depth);
+#endif  // CONFIG_BITRATE_ACCURACY
+      }
+    }
+  }
+  return shorten_gf_interval;
+}
+
 #define MIN_SHRINK_LEN 6  // the minimum length of gf if we are shrinking
 #define SMOOTH_FILT_LEN 7
 #define HALF_FILT_LEN (SMOOTH_FILT_LEN / 2)
@@ -1015,17 +1048,16 @@
                                               0.242, 0.061, 0.006 };
 
 // Smooth filter intra_error and coded_error in firstpass stats.
-// If ignore[i]==1, the ith element should not be used in the filtering.
-static void smooth_filter_stats(const FIRSTPASS_STATS *stats, const int *ignore,
-                                int start_idx, int last_idx,
-                                double *filt_intra_err,
+// If stats[i].is_flash==1, the ith element should not be used in the filtering.
+static void smooth_filter_stats(const FIRSTPASS_STATS *stats, int start_idx,
+                                int last_idx, double *filt_intra_err,
                                 double *filt_coded_err) {
   int i, j;
   for (i = start_idx; i <= last_idx; i++) {
     double total_wt = 0;
     for (j = -HALF_FILT_LEN; j <= HALF_FILT_LEN; j++) {
       int idx = AOMMIN(AOMMAX(i + j, start_idx), last_idx);
-      if (ignore[idx]) continue;
+      if (stats[idx].is_flash) continue;
 
       filt_intra_err[i] +=
           smooth_filt[j + HALF_FILT_LEN] * stats[idx].intra_error;
@@ -1042,7 +1074,7 @@
     for (j = -HALF_FILT_LEN; j <= HALF_FILT_LEN; j++) {
       int idx = AOMMIN(AOMMAX(i + j, start_idx), last_idx);
       // Coded error involves idx and idx - 1.
-      if (ignore[idx] || (idx > 0 && ignore[idx - 1])) continue;
+      if (stats[idx].is_flash || (idx > 0 && stats[idx - 1].is_flash)) continue;
 
       filt_coded_err[i] +=
           smooth_filt[j + HALF_FILT_LEN] * stats[idx].coded_error;
@@ -1071,7 +1103,7 @@
 }
 
 static int find_next_scenecut(const FIRSTPASS_STATS *const stats_start,
-                              int first, int last, int *ignore) {
+                              int first, int last) {
   // Identify unstable areas caused by scenecuts.
   // Find the max and 2nd max coded error, and the average of the rest frames.
   // If there is only one frame that yields a huge coded error, it is likely a
@@ -1082,14 +1114,16 @@
   if (last - first == 0) return -1;
 
   for (int i = first; i <= last; i++) {
-    if (ignore[i] || (i > 0 && ignore[i - 1])) continue;
+    if (stats_start[i].is_flash || (i > 0 && stats_start[i - 1].is_flash))
+      continue;
     double temp_intra = AOMMAX(stats_start[i].intra_error, 0.01);
     this_ratio = stats_start[i].coded_error / temp_intra;
     // find the avg ratio in the preceding neighborhood
     max_prev_ratio = 0;
     max_prev_coded = 0;
     for (int j = AOMMAX(first, i - HALF_WIN); j < i; j++) {
-      if (ignore[j] || (j > 0 && ignore[j - 1])) continue;
+      if (stats_start[j].is_flash || (j > 0 && stats_start[j - 1].is_flash))
+        continue;
       temp_intra = AOMMAX(stats_start[j].intra_error, 0.01);
       double temp_ratio = stats_start[j].coded_error / temp_intra;
       if (temp_ratio > max_prev_ratio) {
@@ -1103,7 +1137,8 @@
     max_next_ratio = 0;
     max_next_coded = 0;
     for (int j = i + 1; j <= AOMMIN(i + HALF_WIN, last); j++) {
-      if (ignore[j] || (j > 0 && ignore[j - 1])) continue;
+      if (stats_start[j].is_flash || (j > 0 && stats_start[j - 1].is_flash))
+        continue;
       temp_intra = AOMMAX(stats_start[j].intra_error, 0.01);
       double temp_ratio = stats_start[j].coded_error / temp_intra;
       if (temp_ratio > max_next_ratio) {
@@ -1136,19 +1171,6 @@
   return -1;
 }
 
-static void mark_flashes(const FIRSTPASS_STATS *stats, int start_idx,
-                         int last_idx, int *is_flash) {
-  int i;
-  for (i = start_idx; i < last_idx; i++) {
-    if (stats[i + 1].pcnt_second_ref > stats[i + 1].pcnt_inter &&
-        stats[i + 1].pcnt_second_ref >= 0.5) {
-      // this is a new flash frame
-      is_flash[i] = 1;
-      continue;
-    }
-  }
-}
-
 // Remove the region with index next_region.
 // parameter merge: 0: merge with previous; 1: merge with next; 2:
 // merge with both, take type from previous if possible
@@ -1221,113 +1243,9 @@
   *cur_region_idx = k;
 }
 
-// Estimate the noise variance of each frame from the first pass stats
-// TODO(bohanli): maybe handling of flashes should be done when using the stats,
-// instead of generating them.
-static void estimate_noise(const FIRSTPASS_STATS *stats, const int *is_flash,
-                           int start, int last, double *noise_arr) {
-  double C1, C2, C3, noise;
-  int count = 0;
-  for (int i = AOMMAX(start, 2); i <= last; i++) {
-    noise_arr[i] = 0.0;
-    if (is_flash[i] || is_flash[i - 1] || is_flash[i - 2]) continue;
-
-    C1 = stats[i - 1].intra_error *
-         (stats[i].intra_error - stats[i].coded_error);
-    C2 = stats[i - 2].intra_error *
-         (stats[i - 1].intra_error - stats[i - 1].coded_error);
-    C3 = stats[i - 2].intra_error *
-         (stats[i].intra_error - stats[i].sr_coded_error);
-    if (C1 <= 0 || C2 <= 0 || C3 <= 0) continue;
-    C1 = sqrt(C1);
-    C2 = sqrt(C2);
-    C3 = sqrt(C3);
-
-    noise = stats[i - 1].intra_error - C1 * C2 / C3;
-    noise = AOMMAX(noise, 0.01);
-    noise_arr[i] = noise;
-    count++;
-  }
-
-  // Copy noise from the neighbor if the noise value is not trustworthy
-  for (int i = AOMMAX(start, 2); i <= last; i++) {
-    if (is_flash[i] || is_flash[i - 1] || is_flash[i - 2]) continue;
-    if (noise_arr[i] < 1) {
-      int found = 0;
-      for (int c = i + 1; c <= last; c++) {
-        if (is_flash[c] || is_flash[c - 1] || is_flash[c - 2] ||
-            noise_arr[c] < 1)
-          continue;
-        found = 1;
-        noise_arr[i] = noise_arr[c];
-        break;
-      }
-      if (found) continue;
-      for (int c = i - 1; c >= start + 2; c--) {
-        if (is_flash[c] || is_flash[c - 1] || is_flash[c - 2] ||
-            noise_arr[c] < 1)
-          continue;
-        found = 1;
-        noise_arr[i] = noise_arr[c];
-        break;
-      }
-      if (found) continue;
-      noise_arr[i] = 0;
-    }
-  }
-
-  // copy the noise if this is a flash
-  for (int i = AOMMAX(start, 2); i <= last; i++) {
-    if (is_flash[i] || is_flash[i - 1] || is_flash[i - 2]) {
-      int found = 0;
-      for (int c = i + 1; c <= last; c++) {
-        if (is_flash[c] || is_flash[c - 1] || is_flash[c - 2]) continue;
-        found = 1;
-        noise_arr[i] = noise_arr[c];
-        break;
-      }
-      if (found) continue;
-      for (int c = i - 1; c >= start + 2; c--) {
-        if (is_flash[c] || is_flash[c - 1] || is_flash[c - 2]) continue;
-        found = 1;
-        noise_arr[i] = noise_arr[c];
-        break;
-      }
-      if (found) continue;
-      noise_arr[i] = 0;
-    }
-  }
-
-  // if we are at the first 2 frames, copy the noise
-  for (int i = start; i < AOMMAX(start, 2); i++) {
-    noise_arr[i] = noise_arr[2];
-  }
-}
-
-// Estimate correlation coefficient of each frame with its previous frame.
-static void estimate_coeff(const FIRSTPASS_STATS *stats, int start, int last,
-                           double *noise, double *coeff) {
-  for (int i = start; i <= last; i++) {
-    const double C =
-        sqrt(AOMMAX(stats[i - 1].intra_error *
-                        (stats[i].intra_error - stats[i].coded_error),
-                    0.001));
-    const double cor_coeff =
-        C / AOMMAX(stats[i - 1].intra_error - noise[i], 0.001);
-
-    coeff[i] =
-        cor_coeff * sqrt(AOMMAX(stats[i - 1].intra_error - noise[i], 0.001) /
-                         AOMMAX(stats[i].intra_error - noise[i], 0.001));
-    // clip correlation coefficient.
-    coeff[i] = AOMMIN(AOMMAX(coeff[i], 0), 1);
-  }
-}
-
 // Get the average of stats inside a region.
-// Before calling this function, the region's noise variance and correlation
-// coefficients are needed.
 static void analyze_region(const FIRSTPASS_STATS *stats, int k,
-                           REGIONS *regions, double *coeff, double *noise) {
+                           REGIONS *regions) {
   int i;
   regions[k].avg_cor_coeff = 0;
   regions[k].avg_sr_fr_ratio = 0;
@@ -1353,25 +1271,26 @@
         stats[i].coded_error / (double)(regions[k].last - regions[k].start + 1);
 
     regions[k].avg_cor_coeff +=
-        coeff[i] / (double)(regions[k].last - regions[k].start + 1);
+        AOMMAX(stats[i].cor_coeff, 0.001) /
+        (double)(regions[k].last - regions[k].start + 1);
     regions[k].avg_noise_var +=
-        noise[i] / (double)(regions[k].last - regions[k].start + 1);
+        AOMMAX(stats[i].noise_var, 0.001) /
+        (double)(regions[k].last - regions[k].start + 1);
   }
 }
 
 // Calculate the regions stats of every region.
 static void get_region_stats(const FIRSTPASS_STATS *stats, REGIONS *regions,
-                             double *coeff, double *noise, int num_regions) {
+                             int num_regions) {
   for (int k = 0; k < num_regions; k++) {
-    analyze_region(stats, k, regions, coeff, noise);
+    analyze_region(stats, k, regions);
   }
 }
 
 // Find tentative stable regions
 static int find_stable_regions(const FIRSTPASS_STATS *stats,
-                               const double *grad_coded, const int *ignore,
-                               int this_start, int this_last,
-                               REGIONS *regions) {
+                               const double *grad_coded, int this_start,
+                               int this_last, REGIONS *regions) {
   int i, j, k = 0;
   regions[k].start = this_start;
   for (i = this_start; i <= this_last; i++) {
@@ -1381,7 +1300,7 @@
     int count = 0;
     for (j = -HALF_WIN; j <= HALF_WIN; j++) {
       int idx = AOMMIN(AOMMAX(i + j, this_start), this_last);
-      if (ignore[idx] || (idx > 0 && ignore[idx - 1])) continue;
+      if (stats[idx].is_flash || (idx > 0 && stats[idx - 1].is_flash)) continue;
       mean_intra += stats[idx].intra_error;
       var_intra += stats[idx].intra_error * stats[idx].intra_error;
       mean_coded += stats[idx].coded_error;
@@ -1455,14 +1374,13 @@
 }
 
 static void adjust_unstable_region_bounds(const FIRSTPASS_STATS *stats,
-                                          REGIONS *regions, double *coeff,
-                                          double *noise, int *num_regions) {
+                                          REGIONS *regions, int *num_regions) {
   int i, j, k;
   // Remove regions that are too short. Likely noise.
   remove_short_regions(regions, num_regions, STABLE_REGION, HALF_WIN);
   remove_short_regions(regions, num_regions, HIGH_VAR_REGION, HALF_WIN);
 
-  get_region_stats(stats, regions, coeff, noise, *num_regions);
+  get_region_stats(stats, regions, *num_regions);
 
   // Adjust region boundaries. The thresholds are empirically obtained, but
   // overall the performance is not very sensitive to small changes to them.
@@ -1488,7 +1406,7 @@
           const int intra_close =
               fabs(stats[j].intra_error - avg_intra_err) / avg_intra_err < 0.1;
           const int coded_small = stats[j].coded_error / avg_intra_err < 0.1;
-          const int coeff_close = coeff[j] > 0.995;
+          const int coeff_close = stats[j].cor_coeff > 0.995;
           if (!coeff_close || !coded_small) count_coded--;
           if (intra_close && count_coded >= 0 && count_grad >= 0) {
             // this frame probably belongs to the previous stable region
@@ -1521,7 +1439,7 @@
               fabs(stats[j].intra_error - avg_intra_err) / avg_intra_err < 0.1;
           const int coded_small =
               stats[j + 1].coded_error / avg_intra_err < 0.1;
-          const int coeff_close = coeff[j] > 0.995;
+          const int coeff_close = stats[j].cor_coeff > 0.995;
           if (!coeff_close || !coded_small) count_coded--;
           if (intra_close && count_coded >= 0 && count_grad >= 0) {
             // this frame probably belongs to the next stable region
@@ -1537,7 +1455,7 @@
 
   cleanup_regions(regions, num_regions);
   remove_short_regions(regions, num_regions, HIGH_VAR_REGION, HALF_WIN);
-  get_region_stats(stats, regions, coeff, noise, *num_regions);
+  get_region_stats(stats, regions, *num_regions);
 
   // If a stable regions has higher error than neighboring high var regions,
   // or if the stable region has a lower average correlation,
@@ -1554,7 +1472,7 @@
            regions[k].avg_cor_coeff < regions[k + 1].avg_cor_coeff * 0.999)))) {
       // merge current region with the previous and next regions
       remove_region(2, regions, num_regions, &k);
-      analyze_region(stats, k - 1, regions, coeff, noise);
+      analyze_region(stats, k - 1, regions);
     } else if (regions[k].type == HIGH_VAR_REGION &&
                (regions[k].last - regions[k].start + 1) < 2 * WINDOW_SIZE &&
                ((k > 0 &&  // previous regions
@@ -1569,7 +1487,7 @@
                       regions[k + 1].avg_cor_coeff * 1.001)))) {
       // merge current region with the previous and next regions
       remove_region(2, regions, num_regions, &k);
-      analyze_region(stats, k - 1, regions, coeff, noise);
+      analyze_region(stats, k - 1, regions);
     } else {
       k++;
     }
@@ -1581,9 +1499,7 @@
 
 // Identify blending regions.
 static void find_blending_regions(const FIRSTPASS_STATS *stats,
-                                  const int *is_flash, REGIONS *regions,
-                                  int *num_regions, double *coeff,
-                                  double *noise) {
+                                  REGIONS *regions, int *num_regions) {
   int i, k = 0;
   // Blending regions will have large content change, therefore will have a
   // large consistent change in intra error.
@@ -1598,7 +1514,8 @@
     int start = 0, last;
     for (i = regions[k].start; i <= regions[k].last; i++) {
       // First mark the regions that have a consistent large change of intra error.
-      if (is_flash[i] || (i > 0 && is_flash[i - 1])) continue;
+      if (k == 0 && i == regions[k].start) continue;
+      if (stats[i].is_flash || (i > 0 && stats[i - 1].is_flash)) continue;
       double grad = stats[i].intra_error - stats[i - 1].intra_error;
       int large_change = fabs(grad) / AOMMAX(stats[i].intra_error, 0.01) > 0.05;
       int this_dir = 0;
@@ -1613,7 +1530,11 @@
         insert_region(start, last, BLENDING_REGION, regions, num_regions, &k);
       }
       dir = this_dir;
-      start = i;
+      if (k == 0 && i == regions[k].start + 1) {
+        start = i - 1;
+      } else {
+        start = i;
+      }
     }
     if (dir != 0) {
       last = regions[k].last;
@@ -1624,14 +1545,14 @@
 
   // If the blending region has very low correlation, mark it as high variance
   // since we probably cannot benefit from it anyways.
-  get_region_stats(stats, regions, coeff, noise, *num_regions);
+  get_region_stats(stats, regions, *num_regions);
   for (k = 0; k < *num_regions; k++) {
     if (regions[k].type != BLENDING_REGION) continue;
     if (regions[k].last == regions[k].start || regions[k].avg_cor_coeff < 0.6 ||
         count_stable == 0)
       regions[k].type = HIGH_VAR_REGION;
   }
-  get_region_stats(stats, regions, coeff, noise, *num_regions);
+  get_region_stats(stats, regions, *num_regions);
 
   // It is possible for blending to result in a "dip" in intra error (first
   // decrease then increase). Therefore we need to find the dip and combine the
@@ -1660,7 +1581,7 @@
           if (regions[k].avg_sr_fr_ratio > ratio_thres) {
             regions[k].type = BLENDING_REGION;
             remove_region(2, regions, num_regions, &k);
-            analyze_region(stats, k - 1, regions, coeff, noise);
+            analyze_region(stats, k - 1, regions);
             continue;
           }
         }
@@ -1718,7 +1639,7 @@
 
       if (to_merge) {
         remove_region(0, regions, num_regions, &k);
-        analyze_region(stats, k - 1, regions, coeff, noise);
+        analyze_region(stats, k - 1, regions);
         continue;
       } else {
         // These are possibly two separate blending regions. Mark the boundary
@@ -1726,9 +1647,9 @@
         int prev_k = k - 1;
         insert_region(regions[prev_k].last, regions[prev_k].last,
                       HIGH_VAR_REGION, regions, num_regions, &prev_k);
-        analyze_region(stats, prev_k, regions, coeff, noise);
+        analyze_region(stats, prev_k, regions);
         k = prev_k + 1;
-        analyze_region(stats, k, regions, coeff, noise);
+        analyze_region(stats, k, regions);
       }
     }
     k++;
@@ -1784,56 +1705,42 @@
 // pointing to.
 static void identify_regions(const FIRSTPASS_STATS *const stats_start,
                              int total_frames, int offset, REGIONS *regions,
-                             int *total_regions, double *cor_coeff,
-                             double *noise_var) {
+                             int *total_regions) {
   int k;
   if (total_frames <= 1) return;
 
-  double *coeff = cor_coeff + offset;
-  double *noise = noise_var + offset;
-
   // store the initial decisions
   REGIONS temp_regions[MAX_FIRSTPASS_ANALYSIS_FRAMES];
   av1_zero_array(temp_regions, MAX_FIRSTPASS_ANALYSIS_FRAMES);
-  int is_flash[MAX_FIRSTPASS_ANALYSIS_FRAMES] = { 0 };
   // buffers for filtered stats
   double filt_intra_err[MAX_FIRSTPASS_ANALYSIS_FRAMES] = { 0 };
   double filt_coded_err[MAX_FIRSTPASS_ANALYSIS_FRAMES] = { 0 };
   double grad_coded[MAX_FIRSTPASS_ANALYSIS_FRAMES] = { 0 };
 
-  int cur_region = 0, this_start = 0, this_last = total_frames - 1;
+  int cur_region = 0, this_start = 0, this_last;
 
-  // find possible flash frames
-  mark_flashes(stats_start, 0, total_frames - 1, is_flash);
-
-  // first get the obvious scenecuts
   int next_scenecut = -1;
-
-  estimate_noise(stats_start, is_flash, this_start, this_last, noise);
-  estimate_coeff(stats_start, this_start, this_last, noise, coeff);
-
   do {
+    // first get the obvious scenecuts
     next_scenecut =
-        find_next_scenecut(stats_start, this_start, total_frames - 1, is_flash);
+        find_next_scenecut(stats_start, this_start, total_frames - 1);
     this_last = (next_scenecut >= 0) ? (next_scenecut - 1) : total_frames - 1;
 
     // low-pass filter the needed stats
-    smooth_filter_stats(stats_start, is_flash, this_start, this_last,
-                        filt_intra_err, filt_coded_err);
+    smooth_filter_stats(stats_start, this_start, this_last, filt_intra_err,
+                        filt_coded_err);
     get_gradient(filt_coded_err, this_start, this_last, grad_coded);
 
     // find tentative stable regions and unstable regions
-    int num_regions = find_stable_regions(stats_start, grad_coded, is_flash,
-                                          this_start, this_last, temp_regions);
+    int num_regions = find_stable_regions(stats_start, grad_coded, this_start,
+                                          this_last, temp_regions);
 
-    adjust_unstable_region_bounds(stats_start, temp_regions, coeff, noise,
-                                  &num_regions);
+    adjust_unstable_region_bounds(stats_start, temp_regions, &num_regions);
 
-    get_region_stats(stats_start, temp_regions, coeff, noise, num_regions);
+    get_region_stats(stats_start, temp_regions, num_regions);
 
     // Try to identify blending regions in the unstable regions
-    find_blending_regions(stats_start, is_flash, temp_regions, &num_regions,
-                          coeff, noise);
+    find_blending_regions(stats_start, temp_regions, &num_regions);
     cleanup_blendings(temp_regions, &num_regions);
 
     // The flash points should all be considered high variance points
@@ -1846,7 +1753,7 @@
       int start = temp_regions[k].start;
       int last = temp_regions[k].last;
       for (int i = start; i <= last; i++) {
-        if (is_flash[i]) {
+        if (stats_start[i].is_flash) {
           insert_region(i, i, HIGH_VAR_REGION, temp_regions, &num_regions, &k);
         }
       }
@@ -1877,20 +1784,21 @@
   } while (next_scenecut >= 0);
 
   *total_regions = cur_region;
-  get_region_stats(stats_start, regions, coeff, noise, *total_regions);
+  get_region_stats(stats_start, regions, *total_regions);
 
   for (k = 0; k < *total_regions; k++) {
     // If scenecuts are very minor, mark them as high variance.
     if (regions[k].type != SCENECUT_REGION ||
         regions[k].avg_cor_coeff *
-                (1 - noise[regions[k].start] / regions[k].avg_intra_err) <
+                (1 - stats_start[regions[k].start].noise_var /
+                         regions[k].avg_intra_err) <
             0.8) {
       continue;
     }
     regions[k].type = HIGH_VAR_REGION;
   }
   cleanup_regions(regions, total_regions);
-  get_region_stats(stats_start, regions, coeff, noise, *total_regions);
+  get_region_stats(stats_start, regions, *total_regions);
 
   for (k = 0; k < *total_regions; k++) {
     regions[k].start += offset;
@@ -1917,28 +1825,31 @@
  * \param[in]    max_gop_length   Maximum length of the GF group
  * \param[in]    max_intervals    Maximum number of intervals to decide
  *
- * \return Nothing is returned. Instead, cpi->rc.gf_intervals is
+ * \return Nothing is returned. Instead, cpi->ppi->p_rc.gf_intervals is
  * changed to store the decided GF group lengths.
  */
 static void calculate_gf_length(AV1_COMP *cpi, int max_gop_length,
                                 int max_intervals) {
   RATE_CONTROL *const rc = &cpi->rc;
-  TWO_PASS *const twopass = &cpi->twopass;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  TWO_PASS *const twopass = &cpi->ppi->twopass;
   FIRSTPASS_STATS next_frame;
-  const FIRSTPASS_STATS *const start_pos = twopass->stats_in;
-  FRAME_INFO *frame_info = &cpi->frame_info;
+  const FIRSTPASS_STATS *const start_pos = cpi->twopass_frame.stats_in;
+  const FIRSTPASS_STATS *const stats = start_pos - (rc->frames_since_key == 0);
+
+  const int f_w = cpi->common.width;
+  const int f_h = cpi->common.height;
   int i;
 
   int flash_detected;
 
-  aom_clear_system_state();
   av1_zero(next_frame);
 
   if (has_no_stats_stage(cpi)) {
     for (i = 0; i < MAX_NUM_GF_INTERVALS; i++) {
-      rc->gf_intervals[i] = AOMMIN(rc->max_gf_interval, max_gop_length);
+      p_rc->gf_intervals[i] = AOMMIN(rc->max_gf_interval, max_gop_length);
     }
-    rc->cur_gf_index = 0;
+    p_rc->cur_gf_index = 0;
     rc->intervals_till_gf_calculate_due = MAX_NUM_GF_INTERVALS;
     return;
   }
@@ -1950,33 +1861,33 @@
   const int min_shrink_int = AOMMAX(MIN_SHRINK_LEN, active_min_gf_interval);
 
   i = (rc->frames_since_key == 0);
-  max_intervals = cpi->lap_enabled ? 1 : max_intervals;
+  max_intervals = cpi->ppi->lap_enabled ? 1 : max_intervals;
   int count_cuts = 1;
   // If cpi->gf_state.arf_gf_boost_lst is 0, we are starting with a KF or GF.
-  int cur_start = -1 + !cpi->gf_state.arf_gf_boost_lst, cur_last;
+  int cur_start = -1 + !cpi->ppi->gf_state.arf_gf_boost_lst, cur_last;
   int cut_pos[MAX_NUM_GF_INTERVALS + 1] = { -1 };
   int cut_here;
   GF_GROUP_STATS gf_stats;
   init_gf_stats(&gf_stats);
   while (count_cuts < max_intervals + 1) {
     // reaches next key frame, break here
-    if (i >= rc->frames_to_key + rc->next_is_fwd_key) {
+    if (i >= rc->frames_to_key) {
       cut_here = 2;
     } else if (i - cur_start >= rc->static_scene_max_gf_interval) {
       // reached maximum len, but nothing special yet (almost static)
       // let's look at the next interval
       cut_here = 1;
-    } else if (EOF == input_stats(twopass, &next_frame)) {
+    } else if (EOF == input_stats(twopass, &cpi->twopass_frame, &next_frame)) {
       // reaches last frame, break
       cut_here = 2;
     } else {
       // Test for the case where there is a brief flash but the prediction
       // quality back to an earlier frame is then restored.
-      flash_detected = detect_flash(twopass, 0);
+      flash_detected = detect_flash(twopass, &cpi->twopass_frame, 0);
       // TODO(bohanli): remove redundant accumulations here, or unify
       // this and the ones in define_gf_group
-      accumulate_next_frame_stats(&next_frame, frame_info, flash_detected,
-                                  rc->frames_since_key, i, &gf_stats);
+      accumulate_next_frame_stats(&next_frame, flash_detected,
+                                  rc->frames_since_key, i, &gf_stats, f_w, f_h);
 
       cut_here = detect_gf_cut(cpi, i, cur_start, flash_detected,
                                active_max_gf_interval, active_min_gf_interval,
@@ -1987,16 +1898,9 @@
       int ori_last = cur_last;
       // The region frame idx does not start from the same frame as cur_start
       // and cur_last. Need to offset them.
-      int offset = rc->frames_since_key - rc->regions_offset;
-      REGIONS *regions = rc->regions;
-      int num_regions = rc->num_regions;
-      if (cpi->oxcf.kf_cfg.fwd_kf_enabled && rc->next_is_fwd_key) {
-        const int frames_left = rc->frames_to_key - i;
-        const int min_int = AOMMIN(MIN_FWD_KF_INTERVAL, active_min_gf_interval);
-        if (frames_left < min_int && frames_left > 0) {
-          cur_last = rc->frames_to_key - min_int - 1;
-        }
-      }
+      int offset = rc->frames_since_key - p_rc->regions_offset;
+      REGIONS *regions = p_rc->regions;
+      int num_regions = p_rc->num_regions;
 
       int scenecut_idx = -1;
       // only try shrinking if interval smaller than active_max_gf_interval
@@ -2029,7 +1933,7 @@
           // the next gop start from the scenecut with GF
           int is_minor_sc =
               (regions[scenecut_idx].avg_cor_coeff *
-                   (1 - rc->noise_var[regions[scenecut_idx].start] /
+                   (1 - stats[regions[scenecut_idx].start - offset].noise_var /
                             regions[scenecut_idx].avg_intra_err) >
                0.6);
           cur_last = regions[scenecut_idx].last - offset - !is_minor_sc;
@@ -2051,14 +1955,17 @@
             double base_score = 0.0;
             // Accumulate base_score over the first min_shrink_int frames.
             for (int j = cur_start + 1; j < cur_start + min_shrink_int; j++) {
-              base_score = (base_score + 1.0) * rc->cor_coeff[j + offset];
+              if (stats + j >= twopass->stats_buf_ctx->stats_in_end) break;
+              base_score = (base_score + 1.0) * stats[j].cor_coeff;
             }
             int met_blending = 0;   // Whether we have met blending areas before
             int last_blending = 0;  // Whether the previous frame is blending
             for (int j = cur_start + min_shrink_int; j <= cur_last; j++) {
-              base_score = (base_score + 1.0) * rc->cor_coeff[j + offset];
+              if (stats + j >= twopass->stats_buf_ctx->stats_in_end) break;
+              base_score = (base_score + 1.0) * stats[j].cor_coeff;
               int this_reg =
                   find_regions_index(regions, num_regions, j + offset);
+              if (this_reg < 0) continue;
               // A GOP should include at most 1 blending region.
               if (regions[this_reg].type == BLENDING_REGION) {
                 last_blending = 1;
@@ -2080,20 +1987,23 @@
               // following frames
               int count_f = 0;
               for (int n = j + 1; n <= j + 3 && n <= last_frame; n++) {
-                temp_accu_coeff *= rc->cor_coeff[n + offset];
+                if (stats + n >= twopass->stats_buf_ctx->stats_in_end) break;
+                temp_accu_coeff *= stats[n].cor_coeff;
                 this_score +=
-                    temp_accu_coeff * (1 - rc->noise_var[n + offset] /
-                                               regions[this_reg].avg_intra_err);
+                    temp_accu_coeff *
+                    (1 - stats[n].noise_var /
+                             AOMMAX(regions[this_reg].avg_intra_err, 0.001));
                 count_f++;
               }
               // preceding frames
               temp_accu_coeff = 1.0;
-              for (int n = j; n > j - 3 * 2 + count_f && n >= first_frame;
-                   n--) {
-                temp_accu_coeff *= rc->cor_coeff[n + offset];
+              for (int n = j; n > j - 3 * 2 + count_f && n > first_frame; n--) {
+                if (stats + n < twopass->stats_buf_ctx->stats_in_start) break;
+                temp_accu_coeff *= stats[n].cor_coeff;
                 this_score +=
-                    temp_accu_coeff * (1 - rc->noise_var[n + offset] /
-                                               regions[this_reg].avg_intra_err);
+                    temp_accu_coeff *
+                    (1 - stats[n].noise_var /
+                             AOMMAX(regions[this_reg].avg_intra_err, 0.001));
               }
 
               if (this_score > best_score) {
@@ -2129,7 +2039,7 @@
       count_cuts++;
 
       // reset pointers to the shrinked location
-      twopass->stats_in = start_pos + cur_last;
+      cpi->twopass_frame.stats_in = start_pos + cur_last;
       cur_start = cur_last;
       int cur_region_idx =
           find_regions_index(regions, num_regions, cur_start + 1 + offset);
@@ -2149,10 +2059,10 @@
   // save intervals
   rc->intervals_till_gf_calculate_due = count_cuts - 1;
   for (int n = 1; n < count_cuts; n++) {
-    rc->gf_intervals[n - 1] = cut_pos[n] - cut_pos[n - 1];
+    p_rc->gf_intervals[n - 1] = cut_pos[n] - cut_pos[n - 1];
   }
-  rc->cur_gf_index = 0;
-  twopass->stats_in = start_pos;
+  p_rc->cur_gf_index = 0;
+  cpi->twopass_frame.stats_in = start_pos;
 }
 
 static void correct_frames_to_key(AV1_COMP *cpi) {
@@ -2160,12 +2070,14 @@
       (int)av1_lookahead_depth(cpi->ppi->lookahead, cpi->compressor_stage);
   if (lookahead_size <
       av1_lookahead_pop_sz(cpi->ppi->lookahead, cpi->compressor_stage)) {
-    assert(IMPLIES(cpi->oxcf.pass != 0 && cpi->frames_left > 0,
-                   lookahead_size == cpi->frames_left));
+    assert(
+        IMPLIES(cpi->oxcf.pass != AOM_RC_ONE_PASS && cpi->ppi->frames_left > 0,
+                lookahead_size == cpi->ppi->frames_left));
     cpi->rc.frames_to_key = AOMMIN(cpi->rc.frames_to_key, lookahead_size);
-  } else if (cpi->frames_left > 0) {
+  } else if (cpi->ppi->frames_left > 0) {
     // Correct frames to key based on limit
-    cpi->rc.frames_to_key = AOMMIN(cpi->rc.frames_to_key, cpi->frames_left);
+    cpi->rc.frames_to_key =
+        AOMMIN(cpi->rc.frames_to_key, cpi->ppi->frames_left);
   }
 }
 
@@ -2179,11 +2091,12 @@
  *
  * \param[in]    cpi             Top-level encoder structure
  *
- * \return Nothing is returned. Instead, cpi->gf_group is changed.
+ * \return Nothing is returned. Instead, cpi->ppi->gf_group is changed.
  */
 static void define_gf_group_pass0(AV1_COMP *cpi) {
   RATE_CONTROL *const rc = &cpi->rc;
-  GF_GROUP *const gf_group = &cpi->gf_group;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  GF_GROUP *const gf_group = &cpi->ppi->gf_group;
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
   const GFConfig *const gf_cfg = &oxcf->gf_cfg;
   int target;
@@ -2191,28 +2104,28 @@
   if (oxcf->q_cfg.aq_mode == CYCLIC_REFRESH_AQ) {
     av1_cyclic_refresh_set_golden_update(cpi);
   } else {
-    rc->baseline_gf_interval = rc->gf_intervals[rc->cur_gf_index];
+    p_rc->baseline_gf_interval = p_rc->gf_intervals[p_rc->cur_gf_index];
     rc->intervals_till_gf_calculate_due--;
-    rc->cur_gf_index++;
+    p_rc->cur_gf_index++;
   }
 
   // correct frames_to_key when lookahead queue is flushing
   correct_frames_to_key(cpi);
 
-  if (rc->baseline_gf_interval > rc->frames_to_key)
-    rc->baseline_gf_interval = rc->frames_to_key;
+  if (p_rc->baseline_gf_interval > rc->frames_to_key)
+    p_rc->baseline_gf_interval = rc->frames_to_key;
 
-  rc->gfu_boost = DEFAULT_GF_BOOST;
-  rc->constrained_gf_group =
-      (rc->baseline_gf_interval >= rc->frames_to_key) ? 1 : 0;
+  p_rc->gfu_boost = DEFAULT_GF_BOOST;
+  p_rc->constrained_gf_group =
+      (p_rc->baseline_gf_interval >= rc->frames_to_key) ? 1 : 0;
 
   gf_group->max_layer_depth_allowed = oxcf->gf_cfg.gf_max_pyr_height;
 
   // Rare case when the look-ahead is less than the target GOP length, can't
   // generate ARF frame.
-  if (rc->baseline_gf_interval > gf_cfg->lag_in_frames ||
+  if (p_rc->baseline_gf_interval > gf_cfg->lag_in_frames ||
       !is_altref_enabled(gf_cfg->lag_in_frames, gf_cfg->enable_auto_arf) ||
-      rc->baseline_gf_interval < rc->min_gf_interval)
+      p_rc->baseline_gf_interval < rc->min_gf_interval)
     gf_group->max_layer_depth_allowed = 0;
 
   // Set up the structure of this Group-Of-Pictures (same as GF_GROUP)
@@ -2239,42 +2152,9 @@
   }
 }
 
-static INLINE void set_baseline_gf_interval(AV1_COMP *cpi, int arf_position,
-                                            int active_max_gf_interval,
-                                            int use_alt_ref,
-                                            int is_final_pass) {
-  RATE_CONTROL *const rc = &cpi->rc;
-  TWO_PASS *const twopass = &cpi->twopass;
-  // Set the interval until the next gf.
-  // If forward keyframes are enabled, ensure the final gf group obeys the
-  // MIN_FWD_KF_INTERVAL.
-  const int is_last_kf =
-      (twopass->stats_in - arf_position + rc->frames_to_key) >=
-      twopass->stats_buf_ctx->stats_in_end;
-
-  if (cpi->oxcf.kf_cfg.fwd_kf_enabled && use_alt_ref && !is_last_kf &&
-      cpi->rc.next_is_fwd_key) {
-    if (arf_position == rc->frames_to_key + 1) {
-      rc->baseline_gf_interval = arf_position;
-      // if the last gf group will be smaller than MIN_FWD_KF_INTERVAL
-    } else if (rc->frames_to_key + 1 - arf_position <
-               AOMMAX(MIN_FWD_KF_INTERVAL, rc->min_gf_interval)) {
-      // if possible, merge the last two gf groups
-      if (rc->frames_to_key + 1 <= active_max_gf_interval) {
-        rc->baseline_gf_interval = rc->frames_to_key + 1;
-        if (is_final_pass) rc->intervals_till_gf_calculate_due = 0;
-        // if merging the last two gf groups creates a group that is too long,
-        // split them and force the last gf group to be the MIN_FWD_KF_INTERVAL
-      } else {
-        rc->baseline_gf_interval = rc->frames_to_key + 1 - MIN_FWD_KF_INTERVAL;
-        if (is_final_pass) rc->intervals_till_gf_calculate_due = 0;
-      }
-    } else {
-      rc->baseline_gf_interval = arf_position;
-    }
-  } else {
-    rc->baseline_gf_interval = arf_position;
-  }
+static INLINE void set_baseline_gf_interval(PRIMARY_RATE_CONTROL *p_rc,
+                                            int arf_position) {
+  p_rc->baseline_gf_interval = arf_position;
 }
 
 // initialize GF_GROUP_STATS
@@ -2294,10 +2174,7 @@
   gf_stats->abs_mv_in_out_accumulator = 0.0;
 
   gf_stats->avg_sr_coded_error = 0.0;
-  gf_stats->avg_tr_coded_error = 0.0;
   gf_stats->avg_pcnt_second_ref = 0.0;
-  gf_stats->avg_pcnt_third_ref = 0.0;
-  gf_stats->avg_pcnt_third_ref_nolast = 0.0;
   gf_stats->avg_new_mv_count = 0.0;
   gf_stats->avg_wavelet_energy = 0.0;
   gf_stats->avg_raw_err_stdev = 0.0;
@@ -2313,27 +2190,27 @@
  * parameters regarding bit-allocation and quality setup.
  *
  * \param[in]    cpi             Top-level encoder structure
- * \param[in]    this_frame      First pass statistics structure
  * \param[in]    frame_params    Structure with frame parameters
- * \param[in]    max_gop_length  Maximum length of the GF group
  * \param[in]    is_final_pass   Whether this is the final pass for the
  *                               GF group, or a trial (non-zero)
  *
- * \return Nothing is returned. Instead, cpi->gf_group is changed.
+ * \return Nothing is returned. Instead, cpi->ppi->gf_group is changed.
  */
-static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame,
-                            EncodeFrameParams *frame_params, int max_gop_length,
+static void define_gf_group(AV1_COMP *cpi, EncodeFrameParams *frame_params,
                             int is_final_pass) {
   AV1_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
-  TWO_PASS *const twopass = &cpi->twopass;
+  TWO_PASS *const twopass = &cpi->ppi->twopass;
   FIRSTPASS_STATS next_frame;
-  const FIRSTPASS_STATS *const start_pos = twopass->stats_in;
-  GF_GROUP *gf_group = &cpi->gf_group;
+  const FIRSTPASS_STATS *const start_pos = cpi->twopass_frame.stats_in;
+  GF_GROUP *gf_group = &cpi->ppi->gf_group;
   FRAME_INFO *frame_info = &cpi->frame_info;
   const GFConfig *const gf_cfg = &oxcf->gf_cfg;
   const RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
+  const int f_w = cm->width;
+  const int f_h = cm->height;
   int i;
   int flash_detected;
   int64_t gf_group_bits;
@@ -2344,11 +2221,10 @@
   // Reset the GF group data structures unless this is a key
   // frame in which case it will already have been done.
   if (!is_intra_only) {
-    av1_zero(cpi->gf_group);
+    av1_zero(cpi->ppi->gf_group);
     cpi->gf_frame_index = 0;
   }
 
-  aom_clear_system_state();
   av1_zero(next_frame);
 
   if (has_no_stats_stage(cpi)) {
@@ -2357,96 +2233,62 @@
   }
 
   // correct frames_to_key when lookahead queue is emptying
-  if (cpi->lap_enabled) {
+  if (cpi->ppi->lap_enabled) {
     correct_frames_to_key(cpi);
   }
 
   GF_GROUP_STATS gf_stats;
   init_gf_stats(&gf_stats);
-  GF_FRAME_STATS first_frame_stats, last_frame_stats;
 
   const int can_disable_arf = !gf_cfg->gf_min_pyr_height;
 
-  // Load stats for the current frame.
-  double mod_frame_err =
-      calculate_modified_err(frame_info, twopass, oxcf, this_frame);
-
-  // Note the error of the frame at the start of the group. This will be
-  // the GF frame error if we code a normal gf.
-  first_frame_stats.frame_err = mod_frame_err;
-  first_frame_stats.frame_coded_error = this_frame->coded_error;
-  first_frame_stats.frame_sr_coded_error = this_frame->sr_coded_error;
-  first_frame_stats.frame_tr_coded_error = this_frame->tr_coded_error;
-
   // If this is a key frame or the overlay from a previous arf then
   // the error score / cost of this frame has already been accounted for.
-
-  // TODO(urvang): Try logic to vary min and max interval based on q.
   const int active_min_gf_interval = rc->min_gf_interval;
-  const int active_max_gf_interval =
-      AOMMIN(rc->max_gf_interval, max_gop_length);
 
   i = is_intra_only;
-  // get the determined gf group length from rc->gf_intervals
-  while (i < rc->gf_intervals[rc->cur_gf_index]) {
+  // get the determined gf group length from p_rc->gf_intervals
+  while (i < p_rc->gf_intervals[p_rc->cur_gf_index]) {
     // read in the next frame
-    if (EOF == input_stats(twopass, &next_frame)) break;
+    if (EOF == input_stats(twopass, &cpi->twopass_frame, &next_frame)) break;
     // Accumulate error score of frames in this gf group.
-    mod_frame_err =
+    double mod_frame_err =
         calculate_modified_err(frame_info, twopass, oxcf, &next_frame);
     // accumulate stats for this frame
     accumulate_this_frame_stats(&next_frame, mod_frame_err, &gf_stats);
-
-    if (i == 0) {
-      first_frame_stats.frame_err = mod_frame_err;
-      first_frame_stats.frame_coded_error = next_frame.coded_error;
-      first_frame_stats.frame_sr_coded_error = next_frame.sr_coded_error;
-      first_frame_stats.frame_tr_coded_error = next_frame.tr_coded_error;
-    }
-
     ++i;
   }
 
-  reset_fpf_position(twopass, start_pos);
+  reset_fpf_position(&cpi->twopass_frame, start_pos);
 
   i = is_intra_only;
-  input_stats(twopass, &next_frame);
-  while (i < rc->gf_intervals[rc->cur_gf_index]) {
+  input_stats(twopass, &cpi->twopass_frame, &next_frame);
+  while (i < p_rc->gf_intervals[p_rc->cur_gf_index]) {
     // read in the next frame
-    if (EOF == input_stats(twopass, &next_frame)) break;
+    if (EOF == input_stats(twopass, &cpi->twopass_frame, &next_frame)) break;
 
     // Test for the case where there is a brief flash but the prediction
     // quality back to an earlier frame is then restored.
-    flash_detected = detect_flash(twopass, 0);
+    flash_detected = detect_flash(twopass, &cpi->twopass_frame, 0);
 
     // accumulate stats for next frame
-    accumulate_next_frame_stats(&next_frame, frame_info, flash_detected,
-                                rc->frames_since_key, i, &gf_stats);
+    accumulate_next_frame_stats(&next_frame, flash_detected,
+                                rc->frames_since_key, i, &gf_stats, f_w, f_h);
 
     ++i;
   }
 
-  i = rc->gf_intervals[rc->cur_gf_index];
-
-  // save the errs for the last frame
-  last_frame_stats.frame_coded_error = next_frame.coded_error;
-  last_frame_stats.frame_sr_coded_error = next_frame.sr_coded_error;
-  last_frame_stats.frame_tr_coded_error = next_frame.tr_coded_error;
+  i = p_rc->gf_intervals[p_rc->cur_gf_index];
 
   if (is_final_pass) {
     rc->intervals_till_gf_calculate_due--;
-    rc->cur_gf_index++;
+    p_rc->cur_gf_index++;
   }
 
   // Was the group length constrained by the requirement for a new KF?
-  rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0;
+  p_rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0;
 
-  const int num_mbs = (oxcf->resize_cfg.resize_mode != RESIZE_NONE)
-                          ? cpi->initial_mbs
-                          : cm->mi_params.MBs;
-  assert(num_mbs > 0);
-
-  average_gf_stats(i, &next_frame, &gf_stats);
+  average_gf_stats(i, &gf_stats);
 
   // Disable internal ARFs for "still" gf groups.
   //   zero_motion_accumulator: minimum percentage of (0,0) motion;
@@ -2456,7 +2298,7 @@
   const int can_disable_internal_arfs = gf_cfg->gf_min_pyr_height <= 1;
   if (can_disable_internal_arfs &&
       gf_stats.zero_motion_accumulator > MIN_ZERO_MOTION &&
-      gf_stats.avg_sr_coded_error / num_mbs < MAX_SR_CODED_ERROR &&
+      gf_stats.avg_sr_coded_error < MAX_SR_CODED_ERROR &&
       gf_stats.avg_raw_err_stdev < MAX_RAW_ERR_VAR) {
     cpi->ppi->internal_altref_allowed = 0;
   }
@@ -2465,25 +2307,12 @@
   if (can_disable_arf) {
     use_alt_ref =
         !is_almost_static(gf_stats.zero_motion_accumulator,
-                          twopass->kf_zeromotion_pct, cpi->lap_enabled) &&
-        rc->use_arf_in_this_kf_group && (i < gf_cfg->lag_in_frames) &&
+                          twopass->kf_zeromotion_pct, cpi->ppi->lap_enabled) &&
+        p_rc->use_arf_in_this_kf_group && (i < gf_cfg->lag_in_frames) &&
         (i >= MIN_GF_INTERVAL);
-
-    // TODO(urvang): Improve and use model for VBR, CQ etc as well.
-    if (use_alt_ref && rc_cfg->mode == AOM_Q && rc_cfg->cq_level <= 200) {
-      aom_clear_system_state();
-      float features[21];
-      get_features_from_gf_stats(
-          &gf_stats, &first_frame_stats, &last_frame_stats, num_mbs,
-          rc->constrained_gf_group, twopass->kf_zeromotion_pct, i, features);
-      // Infer using ML model.
-      float score;
-      av1_nn_predict(features, &av1_use_flat_gop_nn_config, 1, &score);
-      use_alt_ref = (score <= 0.0);
-    }
   } else {
-    use_alt_ref =
-        rc->use_arf_in_this_kf_group && (i < gf_cfg->lag_in_frames) && (i > 2);
+    use_alt_ref = p_rc->use_arf_in_this_kf_group &&
+                  (i < gf_cfg->lag_in_frames) && (i > 2);
   }
 
 #define REDUCE_GF_LENGTH_THRESH 4
@@ -2528,56 +2357,54 @@
   int ext_len = i - is_intra_only;
   if (use_alt_ref) {
     gf_group->max_layer_depth_allowed = gf_cfg->gf_max_pyr_height;
-    set_baseline_gf_interval(cpi, i, active_max_gf_interval, use_alt_ref,
-                             is_final_pass);
+    set_baseline_gf_interval(&cpi->ppi->p_rc, i);
 
     const int forward_frames = (rc->frames_to_key - i >= ext_len)
                                    ? ext_len
                                    : AOMMAX(0, rc->frames_to_key - i);
 
     // Calculate the boost for alt ref.
-    rc->gfu_boost = av1_calc_arf_boost(
-        twopass, rc, frame_info, alt_offset, forward_frames, ext_len,
-        cpi->lap_enabled ? &rc->num_stats_used_for_gfu_boost : NULL,
-        cpi->lap_enabled ? &rc->num_stats_required_for_gfu_boost : NULL);
+    p_rc->gfu_boost = av1_calc_arf_boost(
+        twopass, &cpi->twopass_frame, p_rc, frame_info, alt_offset,
+        forward_frames, ext_len, &p_rc->num_stats_used_for_gfu_boost,
+        &p_rc->num_stats_required_for_gfu_boost, cpi->ppi->lap_enabled);
   } else {
-    reset_fpf_position(twopass, start_pos);
+    reset_fpf_position(&cpi->twopass_frame, start_pos);
     gf_group->max_layer_depth_allowed = 0;
-    set_baseline_gf_interval(cpi, i, active_max_gf_interval, use_alt_ref,
-                             is_final_pass);
+    set_baseline_gf_interval(&cpi->ppi->p_rc, i);
 
-    rc->gfu_boost = AOMMIN(
+    p_rc->gfu_boost = AOMMIN(
         MAX_GF_BOOST,
         av1_calc_arf_boost(
-            twopass, rc, frame_info, alt_offset, ext_len, 0,
-            cpi->lap_enabled ? &rc->num_stats_used_for_gfu_boost : NULL,
-            cpi->lap_enabled ? &rc->num_stats_required_for_gfu_boost : NULL));
+            twopass, &cpi->twopass_frame, p_rc, frame_info, alt_offset, ext_len,
+            0, &p_rc->num_stats_used_for_gfu_boost,
+            &p_rc->num_stats_required_for_gfu_boost, cpi->ppi->lap_enabled));
   }
 
 #define LAST_ALR_BOOST_FACTOR 0.2f
-  rc->arf_boost_factor = 1.0;
+  p_rc->arf_boost_factor = 1.0;
   if (use_alt_ref && !is_lossless_requested(rc_cfg)) {
     // Reduce the boost of altref in the last gf group
     if (rc->frames_to_key - ext_len == REDUCE_GF_LENGTH_BY ||
         rc->frames_to_key - ext_len == 0) {
-      rc->arf_boost_factor = LAST_ALR_BOOST_FACTOR;
+      p_rc->arf_boost_factor = LAST_ALR_BOOST_FACTOR;
     }
   }
 
-  rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+  rc->frames_till_gf_update_due = p_rc->baseline_gf_interval;
 
   // Reset the file position.
-  reset_fpf_position(twopass, start_pos);
+  reset_fpf_position(&cpi->twopass_frame, start_pos);
 
-  if (cpi->lap_enabled) {
+  if (cpi->ppi->lap_enabled) {
     // Since we don't have enough stats to know the actual error of the
     // gf group, we assume error of each frame to be equal to 1 and set
     // the error of the group as baseline_gf_interval.
-    gf_stats.gf_group_err = rc->baseline_gf_interval;
+    gf_stats.gf_group_err = p_rc->baseline_gf_interval;
   }
   // Calculate the bits to be allocated to the gf/arf group as a whole
   gf_group_bits = calculate_total_gf_group_bits(cpi, gf_stats.gf_group_err);
-  rc->gf_group_bits = gf_group_bits;
+  p_rc->gf_group_bits = gf_group_bits;
 
 #if GROUP_ADAPTIVE_MAXQ
   // Calculate an estimate of the maxq needed for the group.
@@ -2585,17 +2412,17 @@
   // where there could be significant overshoot than for easier
   // sections where we do not wish to risk creating an overshoot
   // of the allocated bit budget.
-  if ((rc_cfg->mode != AOM_Q) && (rc->baseline_gf_interval > 1) &&
+  if ((rc_cfg->mode != AOM_Q) && (p_rc->baseline_gf_interval > 1) &&
       is_final_pass) {
     const int vbr_group_bits_per_frame =
-        (int)(gf_group_bits / rc->baseline_gf_interval);
+        (int)(gf_group_bits / p_rc->baseline_gf_interval);
     const double group_av_err =
-        gf_stats.gf_group_raw_error / rc->baseline_gf_interval;
+        gf_stats.gf_group_raw_error / p_rc->baseline_gf_interval;
     const double group_av_skip_pct =
-        gf_stats.gf_group_skip_pct / rc->baseline_gf_interval;
+        gf_stats.gf_group_skip_pct / p_rc->baseline_gf_interval;
     const double group_av_inactive_zone =
         ((gf_stats.gf_group_inactive_zone_rows * 2) /
-         (rc->baseline_gf_interval * (double)cm->mi_params.mb_rows));
+         (p_rc->baseline_gf_interval * (double)cm->mi_params.mb_rows));
 
     int tmp_q;
     tmp_q = get_twopass_worst_quality(
@@ -2606,20 +2433,19 @@
 #endif
 
   // Adjust KF group bits and error remaining.
-  if (is_final_pass)
-    twopass->kf_group_error_left -= (int64_t)gf_stats.gf_group_err;
+  if (is_final_pass) twopass->kf_group_error_left -= gf_stats.gf_group_err;
 
   // Set up the structure of this Group-Of-Pictures (same as GF_GROUP)
   av1_gop_setup_structure(cpi);
 
   // Reset the file position.
-  reset_fpf_position(twopass, start_pos);
+  reset_fpf_position(&cpi->twopass_frame, start_pos);
 
   // Calculate a section intra ratio used in setting max loop filter.
   if (rc->frames_since_key != 0) {
     twopass->section_intra_rating = calculate_section_intra_ratio(
         start_pos, twopass->stats_buf_ctx->stats_in_end,
-        rc->baseline_gf_interval);
+        p_rc->baseline_gf_interval);
   }
 
   av1_gop_bit_allocation(cpi, rc, gf_group, rc->frames_since_key == 0,
@@ -2633,12 +2459,17 @@
 
   // TODO(jingning): Generalize this condition.
   if (is_final_pass) {
-    cpi->gf_state.arf_gf_boost_lst = use_alt_ref;
+    cpi->ppi->gf_state.arf_gf_boost_lst = use_alt_ref;
 
     // Reset rolling actual and target bits counters for ARF groups.
     twopass->rolling_arf_group_target_bits = 1;
     twopass->rolling_arf_group_actual_bits = 1;
   }
+#if CONFIG_BITRATE_ACCURACY
+  if (is_final_pass) {
+    vbr_rc_set_gop_bit_budget(&cpi->vbr_rc_info, p_rc->baseline_gf_interval);
+  }
+#endif
 }
 
 // #define FIXED_ARF_BITS
@@ -2648,12 +2479,13 @@
 void av1_gop_bit_allocation(const AV1_COMP *cpi, RATE_CONTROL *const rc,
                             GF_GROUP *gf_group, int is_key_frame, int use_arf,
                             int64_t gf_group_bits) {
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
   // Calculate the extra bits to be used for boosted frame(s)
 #ifdef FIXED_ARF_BITS
   int gf_arf_bits = (int)(ARF_BITS_FRACTION * gf_group_bits);
 #else
   int gf_arf_bits = calculate_boost_bits(
-      rc->baseline_gf_interval - (rc->frames_since_key == 0), rc->gfu_boost,
+      p_rc->baseline_gf_interval - (rc->frames_since_key == 0), p_rc->gfu_boost,
       gf_group_bits);
 #endif
 
@@ -2661,8 +2493,8 @@
                                                    gf_group_bits, 1);
 
   // Allocate bits to each of the frames in the GF group.
-  allocate_gf_group_bits(gf_group, rc, gf_group_bits, gf_arf_bits, is_key_frame,
-                         use_arf);
+  allocate_gf_group_bits(gf_group, p_rc, rc, gf_group_bits, gf_arf_bits,
+                         is_key_frame, use_arf);
 }
 
 // Minimum % intra coding observed in first pass (1.0 = 100%)
@@ -2723,92 +2555,100 @@
              second_ref_usage_thresh_max_delta;
 }
 
-static int test_candidate_kf(TWO_PASS *twopass,
-                             const FIRSTPASS_STATS *last_frame,
-                             const FIRSTPASS_STATS *this_frame,
-                             const FIRSTPASS_STATS *next_frame,
-                             int frame_count_so_far, enum aom_rc_mode rc_mode,
-                             int scenecut_mode) {
+static int test_candidate_kf(const FIRSTPASS_INFO *firstpass_info,
+                             int this_stats_index, int frame_count_so_far,
+                             enum aom_rc_mode rc_mode, int scenecut_mode,
+                             int num_mbs) {
+  const FIRSTPASS_STATS *last_stats =
+      av1_firstpass_info_peek(firstpass_info, this_stats_index - 1);
+  const FIRSTPASS_STATS *this_stats =
+      av1_firstpass_info_peek(firstpass_info, this_stats_index);
+  const FIRSTPASS_STATS *next_stats =
+      av1_firstpass_info_peek(firstpass_info, this_stats_index + 1);
+  if (last_stats == NULL || this_stats == NULL || next_stats == NULL) {
+    return 0;
+  }
+
   int is_viable_kf = 0;
-  double pcnt_intra = 1.0 - this_frame->pcnt_inter;
+  double pcnt_intra = 1.0 - this_stats->pcnt_inter;
   double modified_pcnt_inter =
-      this_frame->pcnt_inter - this_frame->pcnt_neutral;
+      this_stats->pcnt_inter - this_stats->pcnt_neutral;
   const double second_ref_usage_thresh =
       get_second_ref_usage_thresh(frame_count_so_far);
-  int total_frames_to_test = SCENE_CUT_KEY_TEST_INTERVAL;
+  int frames_to_test_after_candidate_key = SCENE_CUT_KEY_TEST_INTERVAL;
   int count_for_tolerable_prediction = 3;
-  int num_future_frames = 0;
-  FIRSTPASS_STATS curr_frame;
+
+  // We do "-1" because the candidate key is not counted.
+  int stats_after_this_stats =
+      av1_firstpass_info_future_count(firstpass_info, this_stats_index) - 1;
 
   if (scenecut_mode == ENABLE_SCENECUT_MODE_1) {
-    curr_frame = *this_frame;
-    const FIRSTPASS_STATS *const start_position = twopass->stats_in;
-    for (num_future_frames = 0; num_future_frames < SCENE_CUT_KEY_TEST_INTERVAL;
-         num_future_frames++)
-      if (EOF == input_stats(twopass, &curr_frame)) break;
-    reset_fpf_position(twopass, start_position);
-    if (num_future_frames < 3) {
+    if (stats_after_this_stats < 3) {
       return 0;
     } else {
-      total_frames_to_test = 3;
+      frames_to_test_after_candidate_key = 3;
       count_for_tolerable_prediction = 1;
     }
   }
+  // Make sure we have enough stats after the candidate key.
+  frames_to_test_after_candidate_key =
+      AOMMIN(frames_to_test_after_candidate_key, stats_after_this_stats);
 
   // Does the frame satisfy the primary criteria of a key frame?
   // See above for an explanation of the test criteria.
   // If so, then examine how well it predicts subsequent frames.
   if (IMPLIES(rc_mode == AOM_Q, frame_count_so_far >= 3) &&
-      (this_frame->pcnt_second_ref < second_ref_usage_thresh) &&
-      (next_frame->pcnt_second_ref < second_ref_usage_thresh) &&
-      ((this_frame->pcnt_inter < VERY_LOW_INTER_THRESH) ||
-       slide_transition(this_frame, last_frame, next_frame) ||
+      (this_stats->pcnt_second_ref < second_ref_usage_thresh) &&
+      (next_stats->pcnt_second_ref < second_ref_usage_thresh) &&
+      ((this_stats->pcnt_inter < VERY_LOW_INTER_THRESH) ||
+       slide_transition(this_stats, last_stats, next_stats) ||
        ((pcnt_intra > MIN_INTRA_LEVEL) &&
         (pcnt_intra > (INTRA_VS_INTER_THRESH * modified_pcnt_inter)) &&
-        ((this_frame->intra_error /
-          DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) <
+        ((this_stats->intra_error /
+          DOUBLE_DIVIDE_CHECK(this_stats->coded_error)) <
          KF_II_ERR_THRESHOLD) &&
-        ((fabs(last_frame->coded_error - this_frame->coded_error) /
-              DOUBLE_DIVIDE_CHECK(this_frame->coded_error) >
+        ((fabs(last_stats->coded_error - this_stats->coded_error) /
+              DOUBLE_DIVIDE_CHECK(this_stats->coded_error) >
           ERR_CHANGE_THRESHOLD) ||
-         (fabs(last_frame->intra_error - this_frame->intra_error) /
-              DOUBLE_DIVIDE_CHECK(this_frame->intra_error) >
+         (fabs(last_stats->intra_error - this_stats->intra_error) /
+              DOUBLE_DIVIDE_CHECK(this_stats->intra_error) >
           ERR_CHANGE_THRESHOLD) ||
-         ((next_frame->intra_error /
-           DOUBLE_DIVIDE_CHECK(next_frame->coded_error)) >
+         ((next_stats->intra_error /
+           DOUBLE_DIVIDE_CHECK(next_stats->coded_error)) >
           II_IMPROVEMENT_THRESHOLD))))) {
     int i;
-    const FIRSTPASS_STATS *start_pos = twopass->stats_in;
     double boost_score = 0.0;
     double old_boost_score = 0.0;
     double decay_accumulator = 1.0;
 
     // Examine how well the key frame predicts subsequent frames.
-    for (i = 0; i < total_frames_to_test; ++i) {
+    for (i = 1; i <= frames_to_test_after_candidate_key; ++i) {
       // Get the next frame details
-      FIRSTPASS_STATS local_next_frame;
-      if (EOF == input_stats(twopass, &local_next_frame)) break;
-      double next_iiratio = (BOOST_FACTOR * local_next_frame.intra_error /
-                             DOUBLE_DIVIDE_CHECK(local_next_frame.coded_error));
+      const FIRSTPASS_STATS *local_next_frame =
+          av1_firstpass_info_peek(firstpass_info, this_stats_index + i);
+      double next_iiratio =
+          (BOOST_FACTOR * local_next_frame->intra_error /
+           DOUBLE_DIVIDE_CHECK(local_next_frame->coded_error));
 
       if (next_iiratio > KF_II_MAX) next_iiratio = KF_II_MAX;
 
       // Cumulative effect of decay in prediction quality.
-      if (local_next_frame.pcnt_inter > 0.85)
-        decay_accumulator *= local_next_frame.pcnt_inter;
+      if (local_next_frame->pcnt_inter > 0.85)
+        decay_accumulator *= local_next_frame->pcnt_inter;
       else
-        decay_accumulator *= (0.85 + local_next_frame.pcnt_inter) / 2.0;
+        decay_accumulator *= (0.85 + local_next_frame->pcnt_inter) / 2.0;
 
       // Keep a running total.
       boost_score += (decay_accumulator * next_iiratio);
 
       // Test various breakout clauses.
-      if ((local_next_frame.pcnt_inter < 0.05) || (next_iiratio < 1.5) ||
-          (((local_next_frame.pcnt_inter - local_next_frame.pcnt_neutral) <
+      // TODO(any): Test of intra error should be normalized to an MB.
+      if ((local_next_frame->pcnt_inter < 0.05) || (next_iiratio < 1.5) ||
+          (((local_next_frame->pcnt_inter - local_next_frame->pcnt_neutral) <
             0.20) &&
            (next_iiratio < 3.0)) ||
           ((boost_score - old_boost_score) < 3.0) ||
-          (local_next_frame.intra_error < 200)) {
+          (local_next_frame->intra_error < (200.0 / (double)num_mbs))) {
         break;
       }
 
@@ -2822,9 +2662,6 @@
     } else {
       is_viable_kf = 0;
     }
-
-    // Reset the file position
-    reset_fpf_position(twopass, start_pos);
   }
   return is_viable_kf;
 }
@@ -2837,10 +2674,8 @@
 #define MIN_STATIC_KF_BOOST 5400  // Minimum boost for static KF interval
 
 static int detect_app_forced_key(AV1_COMP *cpi) {
-  if (cpi->oxcf.kf_cfg.fwd_kf_enabled) cpi->rc.next_is_fwd_key = 1;
   int num_frames_to_app_forced_key = is_forced_keyframe_pending(
       cpi->ppi->lookahead, cpi->ppi->lookahead->max_sz, cpi->compressor_stage);
-  if (num_frames_to_app_forced_key != -1) cpi->rc.next_is_fwd_key = 0;
   return num_frames_to_app_forced_key;
 }
 
@@ -2850,16 +2685,16 @@
    * all stats needed for prior boost calculation are available.
    * Hence projecting the prior boost is not needed in this cases.
    */
-  if (cpi->rc.num_stats_used_for_kf_boost >= cpi->rc.frames_to_key)
-    return cpi->rc.kf_boost;
+  if (cpi->ppi->p_rc.num_stats_used_for_kf_boost >= cpi->rc.frames_to_key)
+    return cpi->ppi->p_rc.kf_boost;
 
   // Get the current tpl factor (number of frames = frames_to_key).
   double tpl_factor = av1_get_kf_boost_projection_factor(cpi->rc.frames_to_key);
   // Get the tpl factor when number of frames = num_stats_used_for_kf_boost.
-  double tpl_factor_num_stats =
-      av1_get_kf_boost_projection_factor(cpi->rc.num_stats_used_for_kf_boost);
+  double tpl_factor_num_stats = av1_get_kf_boost_projection_factor(
+      cpi->ppi->p_rc.num_stats_used_for_kf_boost);
   int projected_kf_boost =
-      (int)rint((tpl_factor * cpi->rc.kf_boost) / tpl_factor_num_stats);
+      (int)rint((tpl_factor * cpi->ppi->p_rc.kf_boost) / tpl_factor_num_stats);
   return projected_kf_boost;
 }
 
@@ -2870,26 +2705,29 @@
  * scenecut is detected or the maximum key frame distance is reached.
  *
  * \param[in]    cpi              Top-level encoder structure
- * \param[in]    this_frame       Pointer to first pass stats
- * \param[out]   kf_group_err     The total error in the KF group
+ * \param[in]    firstpass_info   struct for firstpass info
  * \param[in]    num_frames_to_detect_scenecut Maximum lookahead frames.
+ * \param[in]    search_start_idx   the start index for searching key frame.
+ *                                  Set it to one if we already know the
+ *                                  current frame is key frame. Otherwise,
+ *                                  set it to zero.
  *
- * \return       Number of frames to the next key.
+ * \return       Number of frames to the next key including the current frame.
  */
-static int define_kf_interval(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame,
-                              double *kf_group_err,
-                              int num_frames_to_detect_scenecut) {
-  TWO_PASS *const twopass = &cpi->twopass;
-  RATE_CONTROL *const rc = &cpi->rc;
+static int define_kf_interval(AV1_COMP *cpi,
+                              const FIRSTPASS_INFO *firstpass_info,
+                              int num_frames_to_detect_scenecut,
+                              int search_start_idx) {
+  const TWO_PASS *const twopass = &cpi->ppi->twopass;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
   const KeyFrameCfg *const kf_cfg = &oxcf->kf_cfg;
   double recent_loop_decay[FRAMES_TO_CHECK_DECAY];
-  FIRSTPASS_STATS last_frame;
   double decay_accumulator = 1.0;
   int i = 0, j;
-  int frames_to_key = 1;
+  int frames_to_key = search_start_idx;
   int frames_since_key = rc->frames_since_key + 1;
-  FRAME_INFO *const frame_info = &cpi->frame_info;
   int num_stats_used_for_kf_boost = 1;
   int scenecut_detected = 0;
 
@@ -2910,37 +2748,36 @@
   for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j) recent_loop_decay[j] = 1.0;
 
   i = 0;
-  while (twopass->stats_in < twopass->stats_buf_ctx->stats_in_end &&
+  const int num_mbs = (oxcf->resize_cfg.resize_mode != RESIZE_NONE)
+                          ? cpi->initial_mbs
+                          : cpi->common.mi_params.MBs;
+  const int future_stats_count =
+      av1_firstpass_info_future_count(firstpass_info, 0);
+  while (frames_to_key < future_stats_count &&
          frames_to_key < num_frames_to_detect_scenecut) {
     // Accumulate total number of stats available till next key frame
     num_stats_used_for_kf_boost++;
 
-    // Accumulate kf group error.
-    if (kf_group_err != NULL)
-      *kf_group_err +=
-          calculate_modified_err(frame_info, twopass, oxcf, this_frame);
-
-    // Load the next frame's stats.
-    last_frame = *this_frame;
-    input_stats(twopass, this_frame);
-
     // Provided that we are not at the end of the file...
-    if ((cpi->rc.enable_scenecut_detection > 0) && kf_cfg->auto_key &&
-        twopass->stats_in < twopass->stats_buf_ctx->stats_in_end) {
+    if ((cpi->ppi->p_rc.enable_scenecut_detection > 0) && kf_cfg->auto_key &&
+        frames_to_key + 1 < future_stats_count) {
       double loop_decay_rate;
 
       // Check for a scene cut.
-      if (frames_since_key >= kf_cfg->key_freq_min &&
-          test_candidate_kf(twopass, &last_frame, this_frame, twopass->stats_in,
-                            frames_since_key, oxcf->rc_cfg.mode,
-                            cpi->rc.enable_scenecut_detection)) {
-        scenecut_detected = 1;
-        break;
+      if (frames_since_key >= kf_cfg->key_freq_min) {
+        scenecut_detected = test_candidate_kf(
+            &twopass->firstpass_info, frames_to_key, frames_since_key,
+            oxcf->rc_cfg.mode, cpi->ppi->p_rc.enable_scenecut_detection,
+            num_mbs);
+        if (scenecut_detected) {
+          break;
+        }
       }
 
       // How fast is the prediction quality decaying?
-      loop_decay_rate =
-          get_prediction_decay_rate(frame_info, twopass->stats_in);
+      const FIRSTPASS_STATS *next_stats =
+          av1_firstpass_info_peek(firstpass_info, frames_to_key + 1);
+      loop_decay_rate = get_prediction_decay_rate(next_stats);
 
       // We want to know something about the recent past... rather than
       // as used elsewhere where we are concerned with decay in prediction
@@ -2952,16 +2789,17 @@
 
       // Special check for transition or high motion followed by a
       // static scene.
-      if (frames_since_key >= kf_cfg->key_freq_min &&
-          detect_transition_to_still(twopass, rc->min_gf_interval, i,
-                                     kf_cfg->key_freq_max - i, loop_decay_rate,
-                                     decay_accumulator)) {
-        scenecut_detected = 1;
-        // In the case of transition followed by a static scene, the key frame
-        // could be a good predictor for the following frames, therefore we
-        // do not use an arf.
-        rc->use_arf_in_this_kf_group = 0;
-        break;
+      if (frames_since_key >= kf_cfg->key_freq_min) {
+        scenecut_detected = detect_transition_to_still(
+            firstpass_info, frames_to_key + 1, rc->min_gf_interval, i,
+            kf_cfg->key_freq_max - i, loop_decay_rate, decay_accumulator);
+        if (scenecut_detected) {
+          // In the case of transition followed by a static scene, the key frame
+          // could be a good predictor for the following frames, therefore we
+          // do not use an arf.
+          p_rc->use_arf_in_this_kf_group = 0;
+          break;
+        }
       }
 
       // Step on to the next frame.
@@ -2970,28 +2808,23 @@
 
       // If we don't have a real key frame within the next two
       // key_freq_max intervals then break out of the loop.
-      if (frames_to_key >= 2 * kf_cfg->key_freq_max) break;
+      if (frames_to_key >= 2 * kf_cfg->key_freq_max) {
+        break;
+      }
     } else {
       ++frames_to_key;
       ++frames_since_key;
     }
     ++i;
   }
-
-  if (kf_group_err != NULL)
-    rc->num_stats_used_for_kf_boost = num_stats_used_for_kf_boost;
-
-  if (cpi->lap_enabled && !scenecut_detected)
+  if (cpi->ppi->lap_enabled && !scenecut_detected)
     frames_to_key = num_frames_to_next_key;
 
-  if (!kf_cfg->fwd_kf_enabled || scenecut_detected ||
-      twopass->stats_in >= twopass->stats_buf_ctx->stats_in_end)
-    rc->next_is_fwd_key = 0;
-
   return frames_to_key;
 }
 
 static double get_kf_group_avg_error(TWO_PASS *twopass,
+                                     TWO_PASS_FRAME *twopass_frame,
                                      const FIRSTPASS_STATS *first_frame,
                                      const FIRSTPASS_STATS *start_position,
                                      int frames_to_key) {
@@ -2999,11 +2832,11 @@
   int num_frames, i;
   double kf_group_avg_error = 0.0;
 
-  reset_fpf_position(twopass, start_position);
+  reset_fpf_position(twopass_frame, start_position);
 
   for (i = 0; i < frames_to_key; ++i) {
     kf_group_avg_error += cur_frame.coded_error;
-    if (EOF == input_stats(twopass, &cur_frame)) break;
+    if (EOF == input_stats(twopass, twopass_frame, &cur_frame)) break;
   }
   num_frames = i + 1;
   num_frames = AOMMIN(num_frames, frames_to_key);
@@ -3015,19 +2848,14 @@
 static int64_t get_kf_group_bits(AV1_COMP *cpi, double kf_group_err,
                                  double kf_group_avg_error) {
   RATE_CONTROL *const rc = &cpi->rc;
-  TWO_PASS *const twopass = &cpi->twopass;
+  TWO_PASS *const twopass = &cpi->ppi->twopass;
   int64_t kf_group_bits;
-  if (cpi->lap_enabled) {
+  if (cpi->ppi->lap_enabled) {
     kf_group_bits = (int64_t)rc->frames_to_key * rc->avg_frame_bandwidth;
     if (cpi->oxcf.rc_cfg.vbr_corpus_complexity_lap) {
-      const int num_mbs = (cpi->oxcf.resize_cfg.resize_mode != RESIZE_NONE)
-                              ? cpi->initial_mbs
-                              : cpi->common.mi_params.MBs;
-
       double vbr_corpus_complexity_lap =
           cpi->oxcf.rc_cfg.vbr_corpus_complexity_lap / 10.0;
       /* Get the average corpus complexity of the frame */
-      vbr_corpus_complexity_lap = vbr_corpus_complexity_lap * num_mbs;
       kf_group_bits = (int64_t)(
           kf_group_bits * (kf_group_avg_error / vbr_corpus_complexity_lap));
     }
@@ -3041,13 +2869,13 @@
 
 static int calc_avg_stats(AV1_COMP *cpi, FIRSTPASS_STATS *avg_frame_stat) {
   RATE_CONTROL *const rc = &cpi->rc;
-  TWO_PASS *const twopass = &cpi->twopass;
+  TWO_PASS *const twopass = &cpi->ppi->twopass;
   FIRSTPASS_STATS cur_frame;
   av1_zero(cur_frame);
   int num_frames = 0;
   // Accumulate total stat using available number of stats.
   for (num_frames = 0; num_frames < (rc->frames_to_key - 1); ++num_frames) {
-    if (EOF == input_stats(twopass, &cur_frame)) break;
+    if (EOF == input_stats(twopass, &cpi->twopass_frame, &cur_frame)) break;
     av1_accumulate_stats(avg_frame_stat, &cur_frame);
   }
 
@@ -3090,7 +2918,7 @@
                                  double *zero_motion_accumulator,
                                  double *sr_accumulator, int use_avg_stat) {
   RATE_CONTROL *const rc = &cpi->rc;
-  TWO_PASS *const twopass = &cpi->twopass;
+  TWO_PASS *const twopass = &cpi->ppi->twopass;
   FRAME_INFO *const frame_info = &cpi->frame_info;
   FIRSTPASS_STATS frame_stat;
   av1_zero(frame_stat);
@@ -3106,14 +2934,15 @@
   if (use_avg_stat) num_stat_used = calc_avg_stats(cpi, &frame_stat);
 
   for (i = num_stat_used; i < (rc->frames_to_key - 1); ++i) {
-    if (!use_avg_stat && EOF == input_stats(twopass, &frame_stat)) break;
+    if (!use_avg_stat &&
+        EOF == input_stats(twopass, &cpi->twopass_frame, &frame_stat))
+      break;
 
     // Monitor for static sections.
     // For the first frame in kf group, the second ref indicator is invalid.
     if (i > 0) {
       *zero_motion_accumulator =
-          AOMMIN(*zero_motion_accumulator,
-                 get_zero_motion_factor(frame_info, &frame_stat));
+          AOMMIN(*zero_motion_accumulator, get_zero_motion_factor(&frame_stat));
     } else {
       *zero_motion_accumulator = frame_stat.pcnt_inter - frame_stat.pcnt_motion;
     }
@@ -3128,8 +2957,9 @@
       zm_factor = (0.75 + (*zero_motion_accumulator / 2.0));
 
       if (i < 2) *sr_accumulator = 0.0;
-      frame_boost = calc_kf_frame_boost(rc, frame_info, &frame_stat,
-                                        sr_accumulator, kf_max_boost);
+      frame_boost =
+          calc_kf_frame_boost(&cpi->ppi->p_rc, frame_info, &frame_stat,
+                              sr_accumulator, kf_max_boost);
       boost_score += frame_boost * zm_factor;
     }
   }
@@ -3153,8 +2983,9 @@
  */
 static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   RATE_CONTROL *const rc = &cpi->rc;
-  TWO_PASS *const twopass = &cpi->twopass;
-  GF_GROUP *const gf_group = &cpi->gf_group;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  TWO_PASS *const twopass = &cpi->ppi->twopass;
+  GF_GROUP *const gf_group = &cpi->ppi->gf_group;
   FRAME_INFO *const frame_info = &cpi->frame_info;
   AV1_COMMON *const cm = &cpi->common;
   CurrentFrame *const current_frame = &cm->current_frame;
@@ -3162,11 +2993,12 @@
   const KeyFrameCfg *const kf_cfg = &oxcf->kf_cfg;
   const FIRSTPASS_STATS first_frame = *this_frame;
   FIRSTPASS_STATS next_frame;
+  const FIRSTPASS_INFO *firstpass_info = &twopass->firstpass_info;
   av1_zero(next_frame);
 
   rc->frames_since_key = 0;
   // Use arfs if possible.
-  rc->use_arf_in_this_kf_group = is_altref_enabled(
+  p_rc->use_arf_in_this_kf_group = is_altref_enabled(
       oxcf->gf_cfg.lag_in_frames, oxcf->gf_cfg.enable_auto_arf);
 
   // Reset the GF group data structures.
@@ -3178,32 +3010,31 @@
 
   if (has_no_stats_stage(cpi)) {
     int num_frames_to_app_forced_key = detect_app_forced_key(cpi);
-    rc->this_key_frame_forced =
+    p_rc->this_key_frame_forced =
         current_frame->frame_number != 0 && rc->frames_to_key == 0;
     if (num_frames_to_app_forced_key != -1)
       rc->frames_to_key = num_frames_to_app_forced_key;
     else
       rc->frames_to_key = AOMMAX(1, kf_cfg->key_freq_max);
     correct_frames_to_key(cpi);
-    rc->kf_boost = DEFAULT_KF_BOOST;
+    p_rc->kf_boost = DEFAULT_KF_BOOST;
     gf_group->update_type[0] = KF_UPDATE;
     return;
   }
   int i;
-  const FIRSTPASS_STATS *const start_position = twopass->stats_in;
+  const FIRSTPASS_STATS *const start_position = cpi->twopass_frame.stats_in;
   int kf_bits = 0;
   double zero_motion_accumulator = 1.0;
   double boost_score = 0.0;
   double kf_raw_err = 0.0;
   double kf_mod_err = 0.0;
-  double kf_group_err = 0.0;
   double sr_accumulator = 0.0;
   double kf_group_avg_error = 0.0;
   int frames_to_key, frames_to_key_clipped = INT_MAX;
   int64_t kf_group_bits_clipped = INT64_MAX;
 
   // Is this a forced key frame by interval.
-  rc->this_key_frame_forced = rc->next_key_frame_forced;
+  p_rc->this_key_frame_forced = p_rc->next_key_frame_forced;
 
   twopass->kf_group_bits = 0;        // Total bits available to kf group
   twopass->kf_group_error_left = 0;  // Group modified error score.
@@ -3211,15 +3042,20 @@
   kf_raw_err = this_frame->intra_error;
   kf_mod_err = calculate_modified_err(frame_info, twopass, oxcf, this_frame);
 
-  frames_to_key =
-      define_kf_interval(cpi, this_frame, &kf_group_err, kf_cfg->key_freq_max);
+  // We assume the current frame is a key frame and we are looking for the next
+  // key frame. Therefore search_start_idx = 1
+  frames_to_key = define_kf_interval(cpi, firstpass_info, kf_cfg->key_freq_max,
+                                     /*search_start_idx=*/1);
 
-  if (frames_to_key != -1)
+  if (frames_to_key != -1) {
     rc->frames_to_key = AOMMIN(kf_cfg->key_freq_max, frames_to_key);
-  else
+  } else {
     rc->frames_to_key = kf_cfg->key_freq_max;
+  }
 
-  if (cpi->lap_enabled) correct_frames_to_key(cpi);
+  rc->frames_to_fwd_kf = kf_cfg->fwd_kf_dist;
+
+  if (cpi->ppi->lap_enabled) correct_frames_to_key(cpi);
 
   // If there is a max kf interval set by the user we must obey it.
   // We already breakout of the loop above at 2x max.
@@ -3231,38 +3067,38 @@
     rc->frames_to_key /= 2;
 
     // Reset to the start of the group.
-    reset_fpf_position(twopass, start_position);
-
-    kf_group_err = 0.0;
-
+    reset_fpf_position(&cpi->twopass_frame, start_position);
     // Rescan to get the correct error data for the forced kf group.
     for (i = 0; i < rc->frames_to_key; ++i) {
-      kf_group_err +=
-          calculate_modified_err(frame_info, twopass, oxcf, &tmp_frame);
-      if (EOF == input_stats(twopass, &tmp_frame)) break;
+      if (EOF == input_stats(twopass, &cpi->twopass_frame, &tmp_frame)) break;
     }
-    rc->next_key_frame_forced = 1;
-  } else if ((twopass->stats_in == twopass->stats_buf_ctx->stats_in_end &&
+    p_rc->next_key_frame_forced = 1;
+  } else if ((cpi->twopass_frame.stats_in ==
+                  twopass->stats_buf_ctx->stats_in_end &&
               is_stat_consumption_stage_twopass(cpi)) ||
              rc->frames_to_key >= kf_cfg->key_freq_max) {
-    rc->next_key_frame_forced = 1;
+    p_rc->next_key_frame_forced = 1;
   } else {
-    rc->next_key_frame_forced = 0;
+    p_rc->next_key_frame_forced = 0;
   }
 
-  if (kf_cfg->fwd_kf_enabled) rc->next_is_fwd_key |= rc->next_key_frame_forced;
-
-  // Special case for the last key frame of the file.
-  if (twopass->stats_in >= twopass->stats_buf_ctx->stats_in_end) {
-    // Accumulate kf group error.
-    kf_group_err +=
-        calculate_modified_err(frame_info, twopass, oxcf, this_frame);
-    rc->next_is_fwd_key = 0;
+  double kf_group_err = 0;
+  for (i = 0; i < rc->frames_to_key; ++i) {
+    const FIRSTPASS_STATS *this_stats =
+        av1_firstpass_info_peek(&twopass->firstpass_info, i);
+    if (this_stats != NULL) {
+      // Accumulate kf group error.
+      kf_group_err += calculate_modified_err_new(
+          frame_info, &firstpass_info->total_stats, this_stats,
+          oxcf->rc_cfg.vbrbias, twopass->modified_error_min,
+          twopass->modified_error_max);
+      ++p_rc->num_stats_used_for_kf_boost;
+    }
   }
 
   // Calculate the number of bits that should be assigned to the kf group.
   if ((twopass->bits_left > 0 && twopass->modified_error_left > 0.0) ||
-      (cpi->lap_enabled && oxcf->rc_cfg.mode != AOM_Q)) {
+      (cpi->ppi->lap_enabled && oxcf->rc_cfg.mode != AOM_Q)) {
     // Maximum number of bits for a single normal frame (not key frame).
     const int max_bits = frame_max_bits(rc, oxcf);
 
@@ -3270,8 +3106,9 @@
     int64_t max_grp_bits;
 
     if (oxcf->rc_cfg.vbr_corpus_complexity_lap) {
-      kf_group_avg_error = get_kf_group_avg_error(
-          twopass, &first_frame, start_position, rc->frames_to_key);
+      kf_group_avg_error =
+          get_kf_group_avg_error(twopass, &cpi->twopass_frame, &first_frame,
+                                 start_position, rc->frames_to_key);
     }
 
     // Default allocation based on bits left and relative
@@ -3287,7 +3124,7 @@
   }
   twopass->kf_group_bits = AOMMAX(0, twopass->kf_group_bits);
 
-  if (cpi->lap_enabled) {
+  if (cpi->ppi->lap_enabled) {
     // In the case of single pass based on LAP, frames to  key may have an
     // inaccurate value, and hence should be clipped to an appropriate
     // interval.
@@ -3304,13 +3141,13 @@
   }
 
   // Reset the first pass file position.
-  reset_fpf_position(twopass, start_position);
+  reset_fpf_position(&cpi->twopass_frame, start_position);
 
   // Scan through the kf group collating various stats used to determine
   // how many bits to spend on it.
   boost_score = get_kf_boost_score(cpi, kf_raw_err, &zero_motion_accumulator,
                                    &sr_accumulator, 0);
-  reset_fpf_position(twopass, start_position);
+  reset_fpf_position(&cpi->twopass_frame, start_position);
   // Store the zero motion percentage
   twopass->kf_zeromotion_pct = (int)(zero_motion_accumulator * 100.0);
 
@@ -3318,17 +3155,17 @@
   twopass->section_intra_rating = calculate_section_intra_ratio(
       start_position, twopass->stats_buf_ctx->stats_in_end, rc->frames_to_key);
 
-  rc->kf_boost = (int)boost_score;
+  p_rc->kf_boost = (int)boost_score;
 
-  if (cpi->lap_enabled) {
+  if (cpi->ppi->lap_enabled) {
     if (oxcf->rc_cfg.mode == AOM_Q) {
-      rc->kf_boost = get_projected_kf_boost(cpi);
+      p_rc->kf_boost = get_projected_kf_boost(cpi);
     } else {
       // TODO(any): Explore using average frame stats for AOM_Q as well.
       boost_score = get_kf_boost_score(
           cpi, kf_raw_err, &zero_motion_accumulator, &sr_accumulator, 1);
-      reset_fpf_position(twopass, start_position);
-      rc->kf_boost += (int)boost_score;
+      reset_fpf_position(&cpi->twopass_frame, start_position);
+      p_rc->kf_boost += (int)boost_score;
     }
   }
 
@@ -3336,13 +3173,13 @@
   // if the kf group is very short.
   if ((zero_motion_accumulator > STATIC_KF_GROUP_FLOAT_THRESH) &&
       (rc->frames_to_key > 8)) {
-    rc->kf_boost = AOMMAX(rc->kf_boost, MIN_STATIC_KF_BOOST);
+    p_rc->kf_boost = AOMMAX(p_rc->kf_boost, MIN_STATIC_KF_BOOST);
   } else {
     // Apply various clamps for min and max boost
-    rc->kf_boost = AOMMAX(rc->kf_boost, (rc->frames_to_key * 3));
-    rc->kf_boost = AOMMAX(rc->kf_boost, MIN_KF_BOOST);
+    p_rc->kf_boost = AOMMAX(p_rc->kf_boost, (rc->frames_to_key * 3));
+    p_rc->kf_boost = AOMMAX(p_rc->kf_boost, MIN_KF_BOOST);
 #ifdef STRICT_RC
-    rc->kf_boost = AOMMIN(rc->kf_boost, MAX_KF_BOOST);
+    p_rc->kf_boost = AOMMIN(p_rc->kf_boost, MAX_KF_BOOST);
 #endif
   }
 
@@ -3351,9 +3188,10 @@
   // very high, we calculate the bits based on a clipped value of
   // frames_to_key.
   kf_bits = calculate_boost_bits(
-      AOMMIN(rc->frames_to_key, frames_to_key_clipped) - 1, rc->kf_boost,
+      AOMMIN(rc->frames_to_key, frames_to_key_clipped) - 1, p_rc->kf_boost,
       AOMMIN(twopass->kf_group_bits, kf_group_bits_clipped));
-  // printf("kf boost = %d kf_bits = %d kf_zeromotion_pct = %d\n", rc->kf_boost,
+  // printf("kf boost = %d kf_bits = %d kf_zeromotion_pct = %d\n",
+  // p_rc->kf_boost,
   //        kf_bits, twopass->kf_zeromotion_pct);
   kf_bits = adjust_boost_bits_for_target_level(cpi, rc, kf_bits,
                                                twopass->kf_group_bits, 0);
@@ -3365,13 +3203,13 @@
   gf_group->update_type[0] = KF_UPDATE;
 
   // Note the total error score of the kf group minus the key frame itself.
-  if (cpi->lap_enabled)
+  if (cpi->ppi->lap_enabled)
     // As we don't have enough stats to know the actual error of the group,
     // we assume the complexity of each frame to be equal to 1, and set the
     // error as the number of frames in the group(minus the keyframe).
-    twopass->kf_group_error_left = (int)(rc->frames_to_key - 1);
+    twopass->kf_group_error_left = (double)(rc->frames_to_key - 1);
   else
-    twopass->kf_group_error_left = (int)(kf_group_err - kf_mod_err);
+    twopass->kf_group_error_left = kf_group_err - kf_mod_err;
 
   // Adjust the count of total modified error left.
   // The count of bits left is adjusted elsewhere based on real coded frame
@@ -3379,26 +3217,6 @@
   twopass->modified_error_left -= kf_group_err;
 }
 
-static int is_skippable_frame(const AV1_COMP *cpi) {
-  if (has_no_stats_stage(cpi)) return 0;
-  // If the current frame does not have non-zero motion vector detected in the
-  // first  pass, and so do its previous and forward frames, then this frame
-  // can be skipped for partition check, and the partition size is assigned
-  // according to the variance
-  const TWO_PASS *const twopass = &cpi->twopass;
-
-  return (!frame_is_intra_only(&cpi->common) &&
-          twopass->stats_in - 2 > twopass->stats_buf_ctx->stats_in_start &&
-          twopass->stats_in < twopass->stats_buf_ctx->stats_in_end &&
-          (twopass->stats_in - 1)->pcnt_inter -
-                  (twopass->stats_in - 1)->pcnt_motion ==
-              1 &&
-          (twopass->stats_in - 2)->pcnt_inter -
-                  (twopass->stats_in - 2)->pcnt_motion ==
-              1 &&
-          twopass->stats_in->pcnt_inter - twopass->stats_in->pcnt_motion == 1);
-}
-
 #define ARF_STATS_OUTPUT 0
 #if ARF_STATS_OUTPUT
 unsigned int arf_count = 0;
@@ -3408,34 +3226,58 @@
   AV1_COMMON *const cm = &cpi->common;
   CurrentFrame *const current_frame = &cm->current_frame;
   RATE_CONTROL *const rc = &cpi->rc;
-  TWO_PASS *const twopass = &cpi->twopass;
+  TWO_PASS *const twopass = &cpi->ppi->twopass;
   int section_target_bandwidth;
   const int frames_left = (int)(twopass->stats_buf_ctx->total_stats->count -
                                 current_frame->frame_number);
-  if (cpi->lap_enabled)
+  if (cpi->ppi->lap_enabled)
     section_target_bandwidth = (int)rc->avg_frame_bandwidth;
   else
     section_target_bandwidth = (int)(twopass->bits_left / frames_left);
   return section_target_bandwidth;
 }
 
+static INLINE void set_twopass_params_based_on_fp_stats(
+    AV1_COMP *cpi, const FIRSTPASS_STATS *this_frame_ptr) {
+  if (this_frame_ptr == NULL) return;
+
+  TWO_PASS_FRAME *twopass_frame = &cpi->twopass_frame;
+  // Derive the frame's average MB energy from the first-pass intra error;
+  // the +1.0 keeps the logarithm well defined when intra_error is zero.
+  twopass_frame->mb_av_energy = log((this_frame_ptr->intra_error) + 1.0);
+
+  const FIRSTPASS_STATS *const total_stats =
+      cpi->ppi->twopass.stats_buf_ctx->total_stats;
+  if (is_fp_wavelet_energy_invalid(total_stats) == 0) {
+    twopass_frame->frame_avg_haar_energy =
+        log((this_frame_ptr->frame_avg_wavelet_energy) + 1.0);
+  }
+
+  // Set the frame content type flag.
+  if (this_frame_ptr->intra_skip_pct >= FC_ANIMATION_THRESH)
+    twopass_frame->fr_content_type = FC_GRAPHICS_ANIMATION;
+  else
+    twopass_frame->fr_content_type = FC_NORMAL;
+}
+
 static void process_first_pass_stats(AV1_COMP *cpi,
                                      FIRSTPASS_STATS *this_frame) {
   AV1_COMMON *const cm = &cpi->common;
   CurrentFrame *const current_frame = &cm->current_frame;
   RATE_CONTROL *const rc = &cpi->rc;
-  TWO_PASS *const twopass = &cpi->twopass;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  TWO_PASS *const twopass = &cpi->ppi->twopass;
+  FIRSTPASS_STATS *total_stats = twopass->stats_buf_ctx->total_stats;
 
   if (cpi->oxcf.rc_cfg.mode != AOM_Q && current_frame->frame_number == 0 &&
-      cpi->gf_frame_index == 0 && cpi->twopass.stats_buf_ctx->total_stats &&
-      cpi->twopass.stats_buf_ctx->total_left_stats) {
-    if (cpi->lap_enabled) {
+      cpi->gf_frame_index == 0 && total_stats &&
+      cpi->ppi->twopass.stats_buf_ctx->total_left_stats) {
+    if (cpi->ppi->lap_enabled) {
       /*
        * Accumulate total_stats using available limited number of stats,
        * and assign it to total_left_stats.
        */
-      *cpi->twopass.stats_buf_ctx->total_left_stats =
-          *cpi->twopass.stats_buf_ctx->total_stats;
+      *cpi->ppi->twopass.stats_buf_ctx->total_left_stats = *total_stats;
     }
     // Special case code for first frame.
     const int section_target_bandwidth = get_section_target_bandwidth(cpi);
@@ -3455,42 +3297,24 @@
 
     rc->active_worst_quality = tmp_q;
     rc->ni_av_qi = tmp_q;
-    rc->last_q[INTER_FRAME] = tmp_q;
-    rc->avg_q = av1_convert_qindex_to_q(tmp_q, cm->seq_params.bit_depth);
-    rc->avg_frame_qindex[INTER_FRAME] = tmp_q;
-    rc->last_q[KEY_FRAME] = (tmp_q + cpi->oxcf.rc_cfg.best_allowed_q) / 2;
-    rc->avg_frame_qindex[KEY_FRAME] = rc->last_q[KEY_FRAME];
+    p_rc->last_q[INTER_FRAME] = tmp_q;
+    p_rc->avg_q = av1_convert_qindex_to_q(tmp_q, cm->seq_params->bit_depth);
+    p_rc->avg_frame_qindex[INTER_FRAME] = tmp_q;
+    p_rc->last_q[KEY_FRAME] = (tmp_q + cpi->oxcf.rc_cfg.best_allowed_q) / 2;
+    p_rc->avg_frame_qindex[KEY_FRAME] = p_rc->last_q[KEY_FRAME];
   }
 
-  int err = 0;
-  if (cpi->lap_enabled) {
-    err = input_stats_lap(twopass, this_frame);
-  } else {
-    err = input_stats(twopass, this_frame);
+  if (cpi->twopass_frame.stats_in <
+      cpi->ppi->twopass.stats_buf_ctx->stats_in_end) {
+    *this_frame = *cpi->twopass_frame.stats_in;
+    ++cpi->twopass_frame.stats_in;
   }
-  if (err == EOF) return;
-
-  {
-    const int num_mbs = (cpi->oxcf.resize_cfg.resize_mode != RESIZE_NONE)
-                            ? cpi->initial_mbs
-                            : cm->mi_params.MBs;
-    // The multiplication by 256 reverses a scaling factor of (>> 8)
-    // applied when combining MB error values for the frame.
-    twopass->mb_av_energy = log((this_frame->intra_error / num_mbs) + 1.0);
-    twopass->frame_avg_haar_energy =
-        log((this_frame->frame_avg_wavelet_energy / num_mbs) + 1.0);
-  }
-
-  // Set the frame content type flag.
-  if (this_frame->intra_skip_pct >= FC_ANIMATION_THRESH)
-    twopass->fr_content_type = FC_GRAPHICS_ANIMATION;
-  else
-    twopass->fr_content_type = FC_NORMAL;
+  set_twopass_params_based_on_fp_stats(cpi, this_frame);
 }
 
 static void setup_target_rate(AV1_COMP *cpi) {
   RATE_CONTROL *const rc = &cpi->rc;
-  GF_GROUP *const gf_group = &cpi->gf_group;
+  GF_GROUP *const gf_group = &cpi->ppi->gf_group;
 
   int target_rate = gf_group->bit_allocation[cpi->gf_frame_index];
 
@@ -3502,19 +3326,156 @@
   rc->base_frame_target = target_rate;
 }
 
+static void mark_flashes(FIRSTPASS_STATS *first_stats,
+                         FIRSTPASS_STATS *last_stats) {
+  FIRSTPASS_STATS *this_stats = first_stats, *next_stats;
+  while (this_stats < last_stats - 1) {
+    next_stats = this_stats + 1;
+    if (next_stats->pcnt_second_ref > next_stats->pcnt_inter &&
+        next_stats->pcnt_second_ref >= 0.5) {
+      this_stats->is_flash = 1;
+    } else {
+      this_stats->is_flash = 0;
+    }
+    this_stats = next_stats;
+  }
+  // Always treat the last frame as a non-flash frame.
+  if (last_stats - 1 >= first_stats) {
+    (last_stats - 1)->is_flash = 0;
+  }
+}
+
+// Estimate the noise variance of each frame from the first pass stats
+static void estimate_noise(FIRSTPASS_STATS *first_stats,
+                           FIRSTPASS_STATS *last_stats) {
+  FIRSTPASS_STATS *this_stats, *next_stats;
+  double C1, C2, C3, noise;
+  int count = 0;
+  for (this_stats = first_stats + 2; this_stats < last_stats; this_stats++) {
+    this_stats->noise_var = 0.0;
+    // Flashes tend to have a high correlation of innovations, so skip them.
+    if (this_stats->is_flash || (this_stats - 1)->is_flash ||
+        (this_stats - 2)->is_flash)
+      continue;
+
+    C1 = (this_stats - 1)->intra_error *
+         (this_stats->intra_error - this_stats->coded_error);
+    C2 = (this_stats - 2)->intra_error *
+         ((this_stats - 1)->intra_error - (this_stats - 1)->coded_error);
+    C3 = (this_stats - 2)->intra_error *
+         (this_stats->intra_error - this_stats->sr_coded_error);
+    if (C1 <= 0 || C2 <= 0 || C3 <= 0) continue;
+    C1 = sqrt(C1);
+    C2 = sqrt(C2);
+    C3 = sqrt(C3);
+
+    noise = (this_stats - 1)->intra_error - C1 * C2 / C3;
+    noise = AOMMAX(noise, 0.01);
+    this_stats->noise_var = noise;
+    count++;
+  }
+
+  // Copy noise from the neighbor if the noise value is not trustworthy
+  for (this_stats = first_stats + 2; this_stats < last_stats; this_stats++) {
+    if (this_stats->is_flash || (this_stats - 1)->is_flash ||
+        (this_stats - 2)->is_flash)
+      continue;
+    if (this_stats->noise_var < 1.0) {
+      int found = 0;
+      // TODO(bohanli): consider expanding to two directions at the same time
+      for (next_stats = this_stats + 1; next_stats < last_stats; next_stats++) {
+        if (next_stats->is_flash || (next_stats - 1)->is_flash ||
+            (next_stats - 2)->is_flash || next_stats->noise_var < 1.0)
+          continue;
+        found = 1;
+        this_stats->noise_var = next_stats->noise_var;
+        break;
+      }
+      if (found) continue;
+      for (next_stats = this_stats - 1; next_stats >= first_stats + 2;
+           next_stats--) {
+        if (next_stats->is_flash || (next_stats - 1)->is_flash ||
+            (next_stats - 2)->is_flash || next_stats->noise_var < 1.0)
+          continue;
+        this_stats->noise_var = next_stats->noise_var;
+        break;
+      }
+    }
+  }
+
+  // If this frame is a flash, copy the noise estimate from a neighboring
+  for (this_stats = first_stats + 2; this_stats < last_stats; this_stats++) {
+    if (this_stats->is_flash || (this_stats - 1)->is_flash ||
+        (this_stats - 2)->is_flash) {
+      int found = 0;
+      for (next_stats = this_stats + 1; next_stats < last_stats; next_stats++) {
+        if (next_stats->is_flash || (next_stats - 1)->is_flash ||
+            (next_stats - 2)->is_flash)
+          continue;
+        found = 1;
+        this_stats->noise_var = next_stats->noise_var;
+        break;
+      }
+      if (found) continue;
+      for (next_stats = this_stats - 1; next_stats >= first_stats + 2;
+           next_stats--) {
+        if (next_stats->is_flash || (next_stats - 1)->is_flash ||
+            (next_stats - 2)->is_flash)
+          continue;
+        this_stats->noise_var = next_stats->noise_var;
+        break;
+      }
+    }
+  }
+
+  // For the first two frames, copy the noise estimate from the third frame.
+  for (this_stats = first_stats;
+       this_stats < first_stats + 2 && (first_stats + 2) < last_stats;
+       this_stats++) {
+    this_stats->noise_var = (first_stats + 2)->noise_var;
+  }
+}
+
+// Estimate correlation coefficient of each frame with its previous frame.
+static void estimate_coeff(FIRSTPASS_STATS *first_stats,
+                           FIRSTPASS_STATS *last_stats) {
+  FIRSTPASS_STATS *this_stats;
+  for (this_stats = first_stats + 1; this_stats < last_stats; this_stats++) {
+    const double C =
+        sqrt(AOMMAX((this_stats - 1)->intra_error *
+                        (this_stats->intra_error - this_stats->coded_error),
+                    0.001));
+    const double cor_coeff =
+        C /
+        AOMMAX((this_stats - 1)->intra_error - this_stats->noise_var, 0.001);
+
+    this_stats->cor_coeff =
+        cor_coeff *
+        sqrt(AOMMAX((this_stats - 1)->intra_error - this_stats->noise_var,
+                    0.001) /
+             AOMMAX(this_stats->intra_error - this_stats->noise_var, 0.001));
+    // clip correlation coefficient.
+    this_stats->cor_coeff = AOMMIN(AOMMAX(this_stats->cor_coeff, 0), 1);
+  }
+  first_stats->cor_coeff = 1.0;
+}
+
 void av1_get_second_pass_params(AV1_COMP *cpi,
                                 EncodeFrameParams *const frame_params,
                                 const EncodeFrameInput *const frame_input,
                                 unsigned int frame_flags) {
   RATE_CONTROL *const rc = &cpi->rc;
-  TWO_PASS *const twopass = &cpi->twopass;
-  GF_GROUP *const gf_group = &cpi->gf_group;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  TWO_PASS *const twopass = &cpi->ppi->twopass;
+  GF_GROUP *const gf_group = &cpi->ppi->gf_group;
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
 
-  const FIRSTPASS_STATS *const start_pos = twopass->stats_in;
+  const FIRSTPASS_STATS *const start_pos = cpi->twopass_frame.stats_in;
+  int update_total_stats = 0;
 
-  if (is_stat_consumption_stage(cpi) && !twopass->stats_in) return;
+  if (is_stat_consumption_stage(cpi) && !cpi->twopass_frame.stats_in) return;
 
+  assert(cpi->twopass_frame.stats_in != NULL);
   const int update_type = gf_group->update_type[cpi->gf_frame_index];
   frame_params->frame_type = gf_group->frame_type[cpi->gf_frame_index];
 
@@ -3526,206 +3487,185 @@
     // If this is an arf frame then we dont want to read the stats file or
     // advance the input pointer as we already have what we need.
     if (update_type == ARF_UPDATE || update_type == INTNL_ARF_UPDATE) {
-      // Do the firstpass stats indicate that this frame is skippable for the
-      // partition search?
-      if (cpi->sf.part_sf.allow_partition_search_skip && oxcf->pass == 2) {
-        cpi->partition_search_skippable_frame = is_skippable_frame(cpi);
-      }
+      const FIRSTPASS_STATS *const this_frame_ptr =
+          read_frame_stats(twopass, &cpi->twopass_frame,
+                           gf_group->arf_src_offset[cpi->gf_frame_index]);
+      set_twopass_params_based_on_fp_stats(cpi, this_frame_ptr);
       return;
     }
   }
 
-  aom_clear_system_state();
-
   if (oxcf->rc_cfg.mode == AOM_Q)
     rc->active_worst_quality = oxcf->rc_cfg.cq_level;
   FIRSTPASS_STATS this_frame;
   av1_zero(this_frame);
   // call above fn
   if (is_stat_consumption_stage(cpi)) {
-    if (cpi->gf_frame_index < gf_group->size || rc->frames_to_key == 0)
+    if (cpi->gf_frame_index < gf_group->size || rc->frames_to_key == 0) {
       process_first_pass_stats(cpi, &this_frame);
+      update_total_stats = 1;
+    }
   } else {
     rc->active_worst_quality = oxcf->rc_cfg.cq_level;
   }
 
+  if (cpi->gf_frame_index == gf_group->size) {
+    if (cpi->ppi->lap_enabled && cpi->ppi->p_rc.enable_scenecut_detection) {
+      const int num_frames_to_detect_scenecut = MAX_GF_LENGTH_LAP + 1;
+      const int frames_to_key = define_kf_interval(
+          cpi, &twopass->firstpass_info, num_frames_to_detect_scenecut,
+          /*search_start_idx=*/0);
+      if (frames_to_key != -1)
+        rc->frames_to_key = AOMMIN(rc->frames_to_key, frames_to_key);
+    }
+  }
+
   // Keyframe and section processing.
   FIRSTPASS_STATS this_frame_copy;
   this_frame_copy = this_frame;
-  int is_overlay_forward_kf =
-      rc->frames_to_key == 0 &&
-      gf_group->update_type[cpi->gf_frame_index] == OVERLAY_UPDATE;
-  if (rc->frames_to_key <= 0 && !is_overlay_forward_kf) {
-    assert(rc->frames_to_key >= -1);
+  if (rc->frames_to_key <= 0) {
+    assert(rc->frames_to_key == 0);
     // Define next KF group and assign bits to it.
-    int kf_offset = rc->frames_to_key;
-    if (rc->frames_to_key < 0) {
-      this_frame = *(twopass->stats_in - 1);
-    } else {
-      frame_params->frame_type = KEY_FRAME;
-    }
+    frame_params->frame_type = KEY_FRAME;
     find_next_key_frame(cpi, &this_frame);
-    rc->frames_since_key -= kf_offset;
-    rc->frames_to_key += kf_offset;
     this_frame = this_frame_copy;
-  } else {
-    const int altref_enabled = is_altref_enabled(oxcf->gf_cfg.lag_in_frames,
-                                                 oxcf->gf_cfg.enable_auto_arf);
-    const int sframe_dist = oxcf->kf_cfg.sframe_dist;
-    const int sframe_mode = oxcf->kf_cfg.sframe_mode;
-    CurrentFrame *const current_frame = &cpi->common.current_frame;
-    if (sframe_dist != 0) {
-      if (altref_enabled) {
-        if (sframe_mode == 1) {
-          // sframe_mode == 1: insert sframe if it matches altref frame.
-          if (current_frame->frame_number % sframe_dist == 0 &&
-              current_frame->frame_number != 0 && update_type == ARF_UPDATE) {
-            frame_params->frame_type = S_FRAME;
-          }
-        } else {
-          // sframe_mode != 1: if sframe will be inserted at the next available
-          // altref frame
-          if (current_frame->frame_number % sframe_dist == 0 &&
-              current_frame->frame_number != 0) {
-            rc->sframe_due = 1;
-          }
-          if (rc->sframe_due && update_type == ARF_UPDATE) {
-            frame_params->frame_type = S_FRAME;
-            rc->sframe_due = 0;
-          }
-        }
-      } else {
-        if (current_frame->frame_number % sframe_dist == 0 &&
-            current_frame->frame_number != 0) {
-          frame_params->frame_type = S_FRAME;
-        }
-      }
-    }
   }
 
+  if (rc->frames_to_fwd_kf <= 0)
+    rc->frames_to_fwd_kf = oxcf->kf_cfg.fwd_kf_dist;
+
   // Define a new GF/ARF group. (Should always enter here for key frames).
   if (cpi->gf_frame_index == gf_group->size) {
-    assert(cpi->common.current_frame.frame_number == 0 ||
-           cpi->gf_frame_index == gf_group->size);
-    const FIRSTPASS_STATS *const start_position = twopass->stats_in;
-
-    if (cpi->lap_enabled && cpi->rc.enable_scenecut_detection) {
-      int num_frames_to_detect_scenecut, frames_to_key;
-      num_frames_to_detect_scenecut = MAX_GF_LENGTH_LAP + 1;
-      frames_to_key = define_kf_interval(cpi, &this_frame, NULL,
-                                         num_frames_to_detect_scenecut);
-      if (frames_to_key != -1)
-        rc->frames_to_key = AOMMIN(rc->frames_to_key, frames_to_key);
-    }
-
-    reset_fpf_position(twopass, start_position);
-
+#if CONFIG_BITRATE_ACCURACY
+    vbr_rc_reset_gop_data(&cpi->vbr_rc_info);
+#endif  // CONFIG_BITRATE_ACCURACY
     int max_gop_length =
-        (oxcf->gf_cfg.lag_in_frames >= 32 &&
-         is_stat_consumption_stage_twopass(cpi))
+        (oxcf->gf_cfg.lag_in_frames >= 32)
             ? AOMMIN(MAX_GF_INTERVAL, oxcf->gf_cfg.lag_in_frames -
                                           oxcf->algo_cfg.arnr_max_frames / 2)
             : MAX_GF_LENGTH_LAP;
 
+    // In the low-delay setting (zero lag), use the configured GOP size.
+    if (oxcf->gf_cfg.lag_in_frames == 0) max_gop_length = rc->max_gf_interval;
+
     // Identify regions if needed.
+    // TODO(bohanli): identify regions for all stats available.
     if (rc->frames_since_key == 0 || rc->frames_since_key == 1 ||
-        (rc->frames_till_regions_update - rc->frames_since_key <
+        (p_rc->frames_till_regions_update - rc->frames_since_key <
              rc->frames_to_key &&
-         rc->frames_till_regions_update - rc->frames_since_key <
+         p_rc->frames_till_regions_update - rc->frames_since_key <
              max_gop_length + 1)) {
-      int is_first_stat =
-          twopass->stats_in == twopass->stats_buf_ctx->stats_in_start;
-      const FIRSTPASS_STATS *stats_start = twopass->stats_in + is_first_stat;
-      // offset of stats_start from the current frame
-      int offset = is_first_stat || (rc->frames_since_key == 0);
-      // offset of the region indices from the previous key frame
-      rc->regions_offset = rc->frames_since_key;
       // how many frames we can analyze from this frame
-      int rest_frames = AOMMIN(rc->frames_to_key + rc->next_is_fwd_key,
-                               MAX_FIRSTPASS_ANALYSIS_FRAMES);
+      int rest_frames =
+          AOMMIN(rc->frames_to_key, MAX_FIRSTPASS_ANALYSIS_FRAMES);
       rest_frames =
-          AOMMIN(rest_frames,
-                 (int)(twopass->stats_buf_ctx->stats_in_end - stats_start + 1) +
-                     offset);
+          AOMMIN(rest_frames, (int)(twopass->stats_buf_ctx->stats_in_end -
+                                    cpi->twopass_frame.stats_in +
+                                    (rc->frames_since_key == 0)));
+      p_rc->frames_till_regions_update = rest_frames;
 
-      rc->frames_till_regions_update = rest_frames;
-
-      identify_regions(stats_start, rest_frames - offset, offset, rc->regions,
-                       &rc->num_regions, rc->cor_coeff, rc->noise_var);
+      if (cpi->ppi->lap_enabled) {
+        mark_flashes(twopass->stats_buf_ctx->stats_in_start,
+                     twopass->stats_buf_ctx->stats_in_end);
+        estimate_noise(twopass->stats_buf_ctx->stats_in_start,
+                       twopass->stats_buf_ctx->stats_in_end);
+        estimate_coeff(twopass->stats_buf_ctx->stats_in_start,
+                       twopass->stats_buf_ctx->stats_in_end);
+        identify_regions(cpi->twopass_frame.stats_in, rest_frames,
+                         (rc->frames_since_key == 0), p_rc->regions,
+                         &p_rc->num_regions);
+      } else {
+        identify_regions(
+            cpi->twopass_frame.stats_in - (rc->frames_since_key == 0),
+            rest_frames, 0, p_rc->regions, &p_rc->num_regions);
+      }
     }
 
     int cur_region_idx =
-        find_regions_index(rc->regions, rc->num_regions,
-                           rc->frames_since_key - rc->regions_offset);
+        find_regions_index(p_rc->regions, p_rc->num_regions,
+                           rc->frames_since_key - p_rc->regions_offset);
     if ((cur_region_idx >= 0 &&
-         rc->regions[cur_region_idx].type == SCENECUT_REGION) ||
+         p_rc->regions[cur_region_idx].type == SCENECUT_REGION) ||
         rc->frames_since_key == 0) {
       // If we start from a scenecut, then the last GOP's arf boost is not
       // needed for this GOP.
-      cpi->gf_state.arf_gf_boost_lst = 0;
+      cpi->ppi->gf_state.arf_gf_boost_lst = 0;
     }
 
-    // TODO(jingning): Resoleve the redundant calls here.
-    if (rc->intervals_till_gf_calculate_due == 0 || 1) {
-      calculate_gf_length(cpi, max_gop_length, MAX_NUM_GF_INTERVALS);
+    int need_gf_len = 1;
+    if (cpi->third_pass_ctx && oxcf->pass == AOM_RC_THIRD_PASS) {
+      if (!cpi->third_pass_ctx->input_file_name && oxcf->two_pass_output) {
+        cpi->third_pass_ctx->input_file_name = oxcf->two_pass_output;
+      }
+      if (cpi->third_pass_ctx->input_file_name) {
+        int gf_len;
+        const int order_hint_bits =
+            cpi->common.seq_params->order_hint_info.order_hint_bits_minus_1 + 1;
+        av1_set_gop_third_pass(cpi->third_pass_ctx, gf_group, order_hint_bits,
+                               &gf_len);
+        p_rc->cur_gf_index = 0;
+        p_rc->gf_intervals[0] = gf_len;
+        need_gf_len = 0;
+      }
     }
 
-    if (max_gop_length > 16 && oxcf->algo_cfg.enable_tpl_model &&
-        !cpi->sf.tpl_sf.disable_gop_length_decision) {
-      int this_idx = rc->frames_since_key + rc->gf_intervals[rc->cur_gf_index] -
-                     rc->regions_offset - 1;
-      int this_region =
-          find_regions_index(rc->regions, rc->num_regions, this_idx);
-      int next_region =
-          find_regions_index(rc->regions, rc->num_regions, this_idx + 1);
-      int is_last_scenecut =
-          (rc->gf_intervals[rc->cur_gf_index] >= rc->frames_to_key ||
-           rc->regions[this_region].type == SCENECUT_REGION ||
-           rc->regions[next_region].type == SCENECUT_REGION);
-      int ori_gf_int = rc->gf_intervals[rc->cur_gf_index];
+    if (need_gf_len) {
+      // TODO(jingning): Resolve the redundant calls here.
+      if (rc->intervals_till_gf_calculate_due == 0 || 1) {
+        calculate_gf_length(cpi, max_gop_length, MAX_NUM_GF_INTERVALS);
+      }
 
-      if (rc->gf_intervals[rc->cur_gf_index] > 16) {
-        // The calculate_gf_length function is previously used with
-        // max_gop_length = 32 with look-ahead gf intervals.
-        define_gf_group(cpi, &this_frame, frame_params, max_gop_length, 0);
-        this_frame = this_frame_copy;
-        int is_temporal_filter_enabled =
-            (rc->frames_since_key > 0 && gf_group->arf_index > -1);
-        if (is_temporal_filter_enabled) {
-          int arf_src_index = gf_group->arf_src_offset[gf_group->arf_index];
-          FRAME_UPDATE_TYPE arf_update_type =
-              gf_group->update_type[gf_group->arf_index];
-          int is_forward_keyframe = 0;
-          av1_temporal_filter(cpi, arf_src_index, arf_update_type,
-                              is_forward_keyframe, NULL);
-          aom_extend_frame_borders(&cpi->alt_ref_buffer,
-                                   av1_num_planes(&cpi->common));
-        }
-        if (!av1_tpl_setup_stats(cpi, 1, frame_params, frame_input)) {
-          // Tpl decides that a shorter gf interval is better.
-          // TODO(jingning): Remove redundant computations here.
-          max_gop_length = 16;
-          calculate_gf_length(cpi, max_gop_length, 1);
-          if (is_last_scenecut &&
-              (ori_gf_int - rc->gf_intervals[rc->cur_gf_index] < 4)) {
-            rc->gf_intervals[rc->cur_gf_index] = ori_gf_int;
+      if (max_gop_length > 16 && oxcf->algo_cfg.enable_tpl_model &&
+          oxcf->gf_cfg.lag_in_frames >= 32 &&
+          cpi->sf.tpl_sf.gop_length_decision_method != 3) {
+        int this_idx = rc->frames_since_key +
+                       p_rc->gf_intervals[p_rc->cur_gf_index] -
+                       p_rc->regions_offset - 1;
+        int this_region =
+            find_regions_index(p_rc->regions, p_rc->num_regions, this_idx);
+        int next_region =
+            find_regions_index(p_rc->regions, p_rc->num_regions, this_idx + 1);
+        // TODO(angiebird): Figure out why this_region and next_region are -1 in
+        // unit test like AltRefFramePresenceTestLarge (aomedia:3134)
+        int is_last_scenecut =
+            p_rc->gf_intervals[p_rc->cur_gf_index] >= rc->frames_to_key ||
+            (this_region != -1 &&
+             p_rc->regions[this_region].type == SCENECUT_REGION) ||
+            (next_region != -1 &&
+             p_rc->regions[next_region].type == SCENECUT_REGION);
+
+        int ori_gf_int = p_rc->gf_intervals[p_rc->cur_gf_index];
+
+        if (p_rc->gf_intervals[p_rc->cur_gf_index] > 16 &&
+            rc->min_gf_interval <= 16) {
+          // The calculate_gf_length function is previously used with
+          // max_gop_length = 32 with look-ahead gf intervals.
+          define_gf_group(cpi, frame_params, 0);
+          this_frame = this_frame_copy;
+
+          if (is_shorter_gf_interval_better(cpi, frame_params, frame_input)) {
+            // A shorter gf interval is better.
+            // TODO(jingning): Remove redundant computations here.
+            max_gop_length = 16;
+            calculate_gf_length(cpi, max_gop_length, 1);
+            if (is_last_scenecut &&
+                (ori_gf_int - p_rc->gf_intervals[p_rc->cur_gf_index] < 4)) {
+              p_rc->gf_intervals[p_rc->cur_gf_index] = ori_gf_int;
+            }
           }
-        } else {
-          // Tpl stats is reused only when the ARF frame is temporally filtered
-          if (is_temporal_filter_enabled)
-            cpi->tpl_data.skip_tpl_setup_stats = 1;
         }
       }
     }
-    define_gf_group(cpi, &this_frame, frame_params, max_gop_length, 0);
+
+    define_gf_group(cpi, frame_params, 0);
 
     if (gf_group->update_type[cpi->gf_frame_index] != ARF_UPDATE &&
         rc->frames_since_key > 0)
       process_first_pass_stats(cpi, &this_frame);
 
-    define_gf_group(cpi, &this_frame, frame_params, max_gop_length, 1);
+    define_gf_group(cpi, frame_params, 1);
 
-    rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+    rc->frames_till_gf_update_due = p_rc->baseline_gf_interval;
     assert(cpi->gf_frame_index == 0);
 #if ARF_STATS_OUTPUT
     {
@@ -3734,8 +3674,8 @@
       ++arf_count;
       fprintf(fpfile, "%10d %10d %10d %10d %10d\n",
               cpi->common.current_frame.frame_number,
-              rc->frames_till_gf_update_due, rc->kf_boost, arf_count,
-              rc->gfu_boost);
+              rc->frames_till_gf_update_due, cpi->ppi->p_rc.kf_boost, arf_count,
+              p_rc->gfu_boost);
 
       fclose(fpfile);
     }
@@ -3745,34 +3685,37 @@
 
   if (gf_group->update_type[cpi->gf_frame_index] == ARF_UPDATE ||
       gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE) {
-    reset_fpf_position(twopass, start_pos);
+    reset_fpf_position(&cpi->twopass_frame, start_pos);
+
+    const FIRSTPASS_STATS *const this_frame_ptr =
+        read_frame_stats(twopass, &cpi->twopass_frame,
+                         gf_group->arf_src_offset[cpi->gf_frame_index]);
+    set_twopass_params_based_on_fp_stats(cpi, this_frame_ptr);
   } else {
-    // Update the total stats remaining structure.
-    if (twopass->stats_buf_ctx->total_left_stats)
-      subtract_stats(twopass->stats_buf_ctx->total_left_stats,
-                     &this_frame_copy);
+    // Back up this frame's stats for updating total stats during post encode.
+    cpi->twopass_frame.this_frame = update_total_stats ? start_pos : NULL;
   }
 
   frame_params->frame_type = gf_group->frame_type[cpi->gf_frame_index];
-
-  // Do the firstpass stats indicate that this frame is skippable for the
-  // partition search?
-  if (cpi->sf.part_sf.allow_partition_search_skip && oxcf->pass == 2) {
-    cpi->partition_search_skippable_frame = is_skippable_frame(cpi);
-  }
-
   setup_target_rate(cpi);
 }
 
 void av1_init_second_pass(AV1_COMP *cpi) {
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
-  TWO_PASS *const twopass = &cpi->twopass;
+  TWO_PASS *const twopass = &cpi->ppi->twopass;
   FRAME_INFO *const frame_info = &cpi->frame_info;
   double frame_rate;
   FIRSTPASS_STATS *stats;
 
   if (!twopass->stats_buf_ctx->stats_in_end) return;
 
+  mark_flashes(twopass->stats_buf_ctx->stats_in_start,
+               twopass->stats_buf_ctx->stats_in_end);
+  estimate_noise(twopass->stats_buf_ctx->stats_in_start,
+                 twopass->stats_buf_ctx->stats_in_end);
+  estimate_coeff(twopass->stats_buf_ctx->stats_in_start,
+                 twopass->stats_buf_ctx->stats_in_end);
+
   stats = twopass->stats_buf_ctx->total_stats;
 
   *stats = *twopass->stats_buf_ctx->stats_in_end;
@@ -3788,6 +3731,11 @@
   twopass->bits_left =
       (int64_t)(stats->duration * oxcf->rc_cfg.target_bandwidth / 10000000.0);
 
+#if CONFIG_BITRATE_ACCURACY
+  vbr_rc_init(&cpi->vbr_rc_info, cpi->ppi->twopass.bits_left,
+              (int)round(stats->count));
+#endif
+
   // This variable monitors how far behind the second ref update is lagging.
   twopass->sr_update_lag = 1;
 
@@ -3796,7 +3744,7 @@
   {
     const double avg_error =
         stats->coded_error / DOUBLE_DIVIDE_CHECK(stats->count);
-    const FIRSTPASS_STATS *s = twopass->stats_in;
+    const FIRSTPASS_STATS *s = cpi->twopass_frame.stats_in;
     double modified_error_total = 0.0;
     twopass->modified_error_min =
         (avg_error * oxcf->rc_cfg.vbrmin_section) / 100;
@@ -3811,10 +3759,10 @@
   }
 
   // Reset the vbr bits off target counters
-  cpi->rc.vbr_bits_off_target = 0;
-  cpi->rc.vbr_bits_off_target_fast = 0;
+  cpi->ppi->p_rc.vbr_bits_off_target = 0;
+  cpi->ppi->p_rc.vbr_bits_off_target_fast = 0;
 
-  cpi->rc.rate_error_estimate = 0;
+  cpi->ppi->p_rc.rate_error_estimate = 0;
 
   // Static sequence monitor variables.
   twopass->kf_zeromotion_pct = 100;
@@ -3829,7 +3777,7 @@
 }
 
 void av1_init_single_pass_lap(AV1_COMP *cpi) {
-  TWO_PASS *const twopass = &cpi->twopass;
+  TWO_PASS *const twopass = &cpi->ppi->twopass;
 
   if (!twopass->stats_buf_ctx->stats_in_end) return;
 
@@ -3842,10 +3790,10 @@
   twopass->modified_error_left = 0.0;
 
   // Reset the vbr bits off target counters
-  cpi->rc.vbr_bits_off_target = 0;
-  cpi->rc.vbr_bits_off_target_fast = 0;
+  cpi->ppi->p_rc.vbr_bits_off_target = 0;
+  cpi->ppi->p_rc.vbr_bits_off_target_fast = 0;
 
-  cpi->rc.rate_error_estimate = 0;
+  cpi->ppi->p_rc.rate_error_estimate = 0;
 
   // Static sequence monitor variables.
   twopass->kf_zeromotion_pct = 100;
@@ -3863,42 +3811,71 @@
 #define MINQ_ADJ_LIMIT_CQ 20
 #define HIGH_UNDERSHOOT_RATIO 2
 void av1_twopass_postencode_update(AV1_COMP *cpi) {
-  TWO_PASS *const twopass = &cpi->twopass;
+  TWO_PASS *const twopass = &cpi->ppi->twopass;
   RATE_CONTROL *const rc = &cpi->rc;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
   const RateControlCfg *const rc_cfg = &cpi->oxcf.rc_cfg;
 
+  // Increment the stats_in pointer.
+  if (is_stat_consumption_stage(cpi) &&
+      (cpi->gf_frame_index < cpi->ppi->gf_group.size ||
+       rc->frames_to_key == 0)) {
+    const int update_type = cpi->ppi->gf_group.update_type[cpi->gf_frame_index];
+    if (update_type != ARF_UPDATE && update_type != INTNL_ARF_UPDATE) {
+      FIRSTPASS_STATS this_frame;
+      --cpi->twopass_frame.stats_in;
+      if (cpi->ppi->lap_enabled) {
+        input_stats_lap(twopass, &cpi->twopass_frame, &this_frame);
+      } else {
+        input_stats(twopass, &cpi->twopass_frame, &this_frame);
+      }
+    } else if (cpi->ppi->lap_enabled) {
+      cpi->twopass_frame.stats_in =
+          cpi->ppi->twopass.stats_buf_ctx->stats_in_start;
+    }
+  }
+
   // VBR correction is done through rc->vbr_bits_off_target. Based on the
   // sign of this value, a limited % adjustment is made to the target rate
   // of subsequent frames, to try and push it back towards 0. This method
   // is designed to prevent extreme behaviour at the end of a clip
   // or group of frames.
-  rc->vbr_bits_off_target += rc->base_frame_target - rc->projected_frame_size;
+  p_rc->vbr_bits_off_target += rc->base_frame_target - rc->projected_frame_size;
   twopass->bits_left = AOMMAX(twopass->bits_left - rc->base_frame_target, 0);
 
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  if (cpi->do_update_vbr_bits_off_target_fast) {
+    // Subtract current frame's fast_extra_bits.
+    p_rc->vbr_bits_off_target_fast -= rc->frame_level_fast_extra_bits;
+    rc->frame_level_fast_extra_bits = 0;
+  }
+#endif
+
   // Target vs actual bits for this arf group.
   twopass->rolling_arf_group_target_bits += rc->base_frame_target;
   twopass->rolling_arf_group_actual_bits += rc->projected_frame_size;
 
   // Calculate the pct rc error.
-  if (rc->total_actual_bits) {
-    rc->rate_error_estimate =
-        (int)((rc->vbr_bits_off_target * 100) / rc->total_actual_bits);
-    rc->rate_error_estimate = clamp(rc->rate_error_estimate, -100, 100);
+  if (p_rc->total_actual_bits) {
+    p_rc->rate_error_estimate =
+        (int)((p_rc->vbr_bits_off_target * 100) / p_rc->total_actual_bits);
+    p_rc->rate_error_estimate = clamp(p_rc->rate_error_estimate, -100, 100);
   } else {
-    rc->rate_error_estimate = 0;
+    p_rc->rate_error_estimate = 0;
   }
 
   // Update the active best quality pyramid.
   if (!rc->is_src_frame_alt_ref) {
-    const int pyramid_level = cpi->gf_group.layer_depth[cpi->gf_frame_index];
+    const int pyramid_level =
+        cpi->ppi->gf_group.layer_depth[cpi->gf_frame_index];
     int i;
     for (i = pyramid_level; i <= MAX_ARF_LAYERS; ++i) {
-      rc->active_best_quality[i] = cpi->common.quant_params.base_qindex;
+      p_rc->active_best_quality[i] = cpi->common.quant_params.base_qindex;
 #if CONFIG_TUNE_VMAF
       if (cpi->vmaf_info.original_qindex != -1 &&
           (cpi->oxcf.tune_cfg.tuning >= AOM_TUNE_VMAF_WITH_PREPROCESSING &&
            cpi->oxcf.tune_cfg.tuning <= AOM_TUNE_VMAF_NEG_MAX_GAIN)) {
-        rc->active_best_quality[i] = cpi->vmaf_info.original_qindex;
+        p_rc->active_best_quality[i] = cpi->vmaf_info.original_qindex;
       }
 #endif
     }
@@ -3914,16 +3891,16 @@
             " %10d %10d %10d %10.4lf %10.4lf %10.4lf %10.4lf\n",
             cm->current_frame.frame_number, rc->base_frame_target,
             rc->projected_frame_size, rc->total_actual_bits,
-            rc->vbr_bits_off_target, rc->rate_error_estimate,
+            rc->vbr_bits_off_target, p_rc->rate_error_estimate,
             twopass->rolling_arf_group_target_bits,
             twopass->rolling_arf_group_actual_bits,
             (double)twopass->rolling_arf_group_actual_bits /
                 (double)twopass->rolling_arf_group_target_bits,
             twopass->bpm_factor,
             av1_convert_qindex_to_q(cpi->common.quant_params.base_qindex,
-                                    cm->seq_params.bit_depth),
+                                    cm->seq_params->bit_depth),
             av1_convert_qindex_to_q(rc->active_worst_quality,
-                                    cm->seq_params.bit_depth));
+                                    cm->seq_params->bit_depth));
     fclose(fpfile);
   }
 #endif
@@ -3936,36 +3913,43 @@
 
   // If the rate control is drifting consider adjustment to min or maxq.
   if ((rc_cfg->mode != AOM_Q) && !cpi->rc.is_src_frame_alt_ref) {
-    const int maxq_adj_limit = rc->worst_quality - rc->active_worst_quality;
-    const int minq_adj_limit =
+    int maxq_adj_limit;
+    int minq_adj_limit;
+    maxq_adj_limit = rc->worst_quality - rc->active_worst_quality;
+    minq_adj_limit =
         (rc_cfg->mode == AOM_CQ ? MINQ_ADJ_LIMIT_CQ : MINQ_ADJ_LIMIT);
-
     // Undershoot.
-    if (rc->rate_error_estimate > rc_cfg->under_shoot_pct) {
+    if (p_rc->rate_error_estimate > rc_cfg->under_shoot_pct) {
       --twopass->extend_maxq;
-      if (rc->rolling_target_bits >= rc->rolling_actual_bits)
+      if (p_rc->rolling_target_bits >= p_rc->rolling_actual_bits)
         ++twopass->extend_minq;
       // Overshoot.
-    } else if (rc->rate_error_estimate < -rc_cfg->over_shoot_pct) {
+    } else if (p_rc->rate_error_estimate < -rc_cfg->over_shoot_pct) {
       --twopass->extend_minq;
-      if (rc->rolling_target_bits < rc->rolling_actual_bits)
+      if (p_rc->rolling_target_bits < p_rc->rolling_actual_bits)
         ++twopass->extend_maxq;
     } else {
       // Adjustment for extreme local overshoot.
       if (rc->projected_frame_size > (2 * rc->base_frame_target) &&
           rc->projected_frame_size > (2 * rc->avg_frame_bandwidth))
         ++twopass->extend_maxq;
-
       // Unwind undershoot or overshoot adjustment.
-      if (rc->rolling_target_bits < rc->rolling_actual_bits)
+      if (p_rc->rolling_target_bits < p_rc->rolling_actual_bits)
         --twopass->extend_minq;
-      else if (rc->rolling_target_bits > rc->rolling_actual_bits)
+      else if (p_rc->rolling_target_bits > p_rc->rolling_actual_bits)
         --twopass->extend_maxq;
     }
-
     twopass->extend_minq = clamp(twopass->extend_minq, 0, minq_adj_limit);
     twopass->extend_maxq = clamp(twopass->extend_maxq, 0, maxq_adj_limit);
 
+#if CONFIG_FRAME_PARALLEL_ENCODE
+    if (!frame_is_kf_gf_arf(cpi) && !rc->is_src_frame_alt_ref &&
+        p_rc->vbr_bits_off_target_fast) {
+      // Subtract current frame's fast_extra_bits.
+      p_rc->vbr_bits_off_target_fast -= rc->frame_level_fast_extra_bits;
+    }
+#endif
+
     // If there is a big and undexpected undershoot then feed the extra
     // bits back in quickly. One situation where this may happen is if a
     // frame is unexpectedly almost perfectly predicted by the ARF or GF
@@ -3973,19 +3957,19 @@
     if (!frame_is_kf_gf_arf(cpi) && !cpi->rc.is_src_frame_alt_ref) {
       int fast_extra_thresh = rc->base_frame_target / HIGH_UNDERSHOOT_RATIO;
       if (rc->projected_frame_size < fast_extra_thresh) {
-        rc->vbr_bits_off_target_fast +=
+        p_rc->vbr_bits_off_target_fast +=
             fast_extra_thresh - rc->projected_frame_size;
-        rc->vbr_bits_off_target_fast =
-            AOMMIN(rc->vbr_bits_off_target_fast, (4 * rc->avg_frame_bandwidth));
+        p_rc->vbr_bits_off_target_fast = AOMMIN(p_rc->vbr_bits_off_target_fast,
+                                                (4 * rc->avg_frame_bandwidth));
 
         // Fast adaptation of minQ if necessary to use up the extra bits.
         if (rc->avg_frame_bandwidth) {
-          twopass->extend_minq_fast =
-              (int)(rc->vbr_bits_off_target_fast * 8 / rc->avg_frame_bandwidth);
+          twopass->extend_minq_fast = (int)(p_rc->vbr_bits_off_target_fast * 8 /
+                                            rc->avg_frame_bandwidth);
         }
         twopass->extend_minq_fast = AOMMIN(
             twopass->extend_minq_fast, minq_adj_limit - twopass->extend_minq);
-      } else if (rc->vbr_bits_off_target_fast) {
+      } else if (p_rc->vbr_bits_off_target_fast) {
         twopass->extend_minq_fast = AOMMIN(
             twopass->extend_minq_fast, minq_adj_limit - twopass->extend_minq);
       } else {
@@ -3993,4 +3977,84 @@
       }
     }
   }
+
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  // Update the frame probabilities obtained from parallel encode frames
+  FrameProbInfo *const frame_probs = &cpi->ppi->frame_probs;
+  int i, j, loop;
+  // Sequentially do average on temp_frame_probs_simulation which holds
+  // probabilities of last frame before parallel encode
+  for (loop = 0; loop <= cpi->num_frame_recode; loop++) {
+    // Sequentially update tx_type_probs
+    if (cpi->do_update_frame_probs_txtype[loop] &&
+        (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0)) {
+      const FRAME_UPDATE_TYPE update_type =
+          get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+      for (i = 0; i < TX_SIZES_ALL; i++) {
+        int left = 1024;
+
+        for (j = TX_TYPES - 1; j >= 0; j--) {
+          const int new_prob =
+              cpi->frame_new_probs[loop].tx_type_probs[update_type][i][j];
+          int prob =
+              (frame_probs->tx_type_probs[update_type][i][j] + new_prob) >> 1;
+          left -= prob;
+          if (j == 0) prob += left;
+          frame_probs->tx_type_probs[update_type][i][j] = prob;
+        }
+      }
+    }
+
+    // Sequentially update obmc_probs
+    if (cpi->do_update_frame_probs_obmc[loop] &&
+        cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) {
+      const FRAME_UPDATE_TYPE update_type =
+          get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+
+      for (i = 0; i < BLOCK_SIZES_ALL; i++) {
+        const int new_prob =
+            cpi->frame_new_probs[loop].obmc_probs[update_type][i];
+        frame_probs->obmc_probs[update_type][i] =
+            (frame_probs->obmc_probs[update_type][i] + new_prob) >> 1;
+      }
+    }
+
+    // Sequentially update warped_probs
+    if (cpi->do_update_frame_probs_warp[loop] &&
+        cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) {
+      const FRAME_UPDATE_TYPE update_type =
+          get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+      const int new_prob = cpi->frame_new_probs[loop].warped_probs[update_type];
+      frame_probs->warped_probs[update_type] =
+          (frame_probs->warped_probs[update_type] + new_prob) >> 1;
+    }
+
+    // Sequentially update switchable_interp_probs
+    if (cpi->do_update_frame_probs_interpfilter[loop] &&
+        cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) {
+      const FRAME_UPDATE_TYPE update_type =
+          get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+
+      for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
+        int left = 1536;
+
+        for (j = SWITCHABLE_FILTERS - 1; j >= 0; j--) {
+          const int new_prob = cpi->frame_new_probs[loop]
+                                   .switchable_interp_probs[update_type][i][j];
+          int prob = (frame_probs->switchable_interp_probs[update_type][i][j] +
+                      new_prob) >>
+                     1;
+          left -= prob;
+          if (j == 0) prob += left;
+          frame_probs->switchable_interp_probs[update_type][i][j] = prob;
+        }
+      }
+    }
+  }
+
+  // Update framerate obtained from parallel encode frames
+  if (cpi->common.show_frame &&
+      cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0)
+    cpi->framerate = cpi->new_framerate;
+#endif
 }
diff --git a/av1/encoder/pass2_strategy.h b/av1/encoder/pass2_strategy.h
index 9f6ce22..913a79f 100644
--- a/av1/encoder/pass2_strategy.h
+++ b/av1/encoder/pass2_strategy.h
@@ -42,10 +42,7 @@
   double abs_mv_in_out_accumulator;
 
   double avg_sr_coded_error;
-  double avg_tr_coded_error;
   double avg_pcnt_second_ref;
-  double avg_pcnt_third_ref;
-  double avg_pcnt_third_ref_nolast;
   double avg_new_mv_count;
   double avg_wavelet_energy;
   double avg_raw_err_stdev;
@@ -61,7 +58,6 @@
   double frame_err;
   double frame_coded_error;
   double frame_sr_coded_error;
-  double frame_tr_coded_error;
   /*!\endcond */
 } GF_FRAME_STATS;
 /*!cond */
diff --git a/av1/encoder/pickcdef.c b/av1/encoder/pickcdef.c
index 55e466d..676c55d 100644
--- a/av1/encoder/pickcdef.c
+++ b/av1/encoder/pickcdef.c
@@ -16,7 +16,6 @@
 #include "config/aom_scale_rtcd.h"
 
 #include "aom/aom_integer.h"
-#include "aom_ports/system_state.h"
 #include "av1/common/av1_common_int.h"
 #include "av1/common/reconinter.h"
 #include "av1/encoder/encoder.h"
@@ -29,9 +28,11 @@
                                              int *pri_strength,
                                              int *sec_strength,
                                              int strength_idx) {
-  const int tot_sec_filter = (pick_method >= CDEF_FAST_SEARCH_LVL3)
-                                 ? REDUCED_SEC_STRENGTHS_LVL3
-                                 : CDEF_SEC_STRENGTHS;
+  const int tot_sec_filter =
+      (pick_method == CDEF_FAST_SEARCH_LVL5)
+          ? REDUCED_SEC_STRENGTHS_LVL5
+          : ((pick_method >= CDEF_FAST_SEARCH_LVL3) ? REDUCED_SEC_STRENGTHS_LVL3
+                                                    : CDEF_SEC_STRENGTHS);
   const int pri_idx = strength_idx / tot_sec_filter;
   const int sec_idx = strength_idx % tot_sec_filter;
   *pri_strength = pri_idx;
@@ -49,6 +50,10 @@
       *pri_strength = priconv_lvl4[pri_idx];
       *sec_strength = secconv_lvl3[sec_idx];
       break;
+    case CDEF_FAST_SEARCH_LVL5:
+      *pri_strength = priconv_lvl5[pri_idx];
+      *sec_strength = secconv_lvl5[sec_idx];
+      break;
     default: assert(0 && "Invalid CDEF search method");
   }
 }
@@ -154,7 +159,7 @@
                                       CDEF_PICK_METHOD pick_method) {
   uint64_t best_tot_mse;
   int fast = (pick_method >= CDEF_FAST_SEARCH_LVL1 &&
-              pick_method <= CDEF_FAST_SEARCH_LVL4);
+              pick_method <= CDEF_FAST_SEARCH_LVL5);
   int i;
   best_tot_mse = (uint64_t)1 << 63;
   /* Greedy search: add one strength options at a time. */
@@ -454,13 +459,13 @@
       (mi_params->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
   cdef_search_ctx->nhfb =
       (mi_params->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
-  cdef_search_ctx->coeff_shift = AOMMAX(cm->seq_params.bit_depth - 8, 0);
+  cdef_search_ctx->coeff_shift = AOMMAX(cm->seq_params->bit_depth - 8, 0);
   cdef_search_ctx->damping = 3 + (cm->quant_params.base_qindex >> 6);
   cdef_search_ctx->total_strengths = nb_cdef_strengths[pick_method];
   cdef_search_ctx->num_planes = num_planes;
   cdef_search_ctx->pick_method = pick_method;
   cdef_search_ctx->sb_count = 0;
-  av1_setup_dst_planes(xd->plane, cm->seq_params.sb_size, frame, 0, 0, 0,
+  av1_setup_dst_planes(xd->plane, cm->seq_params->sb_size, frame, 0, 0, 0,
                        num_planes);
   // Initialize plane wise information.
   for (int pli = 0; pli < num_planes; pli++) {
@@ -478,7 +483,7 @@
   }
   // Function pointer initialization.
 #if CONFIG_AV1_HIGHBITDEPTH
-  if (cm->seq_params.use_highbitdepth) {
+  if (cm->seq_params->use_highbitdepth) {
     cdef_search_ctx->copy_fn = copy_sb16_16_highbd;
     cdef_search_ctx->compute_cdef_dist_fn = compute_cdef_dist_highbd;
   } else {
@@ -491,20 +496,26 @@
 #endif
 }
 
-static void pick_cdef_from_qp(AV1_COMMON *const cm) {
-  const int bd = cm->seq_params.bit_depth;
+static void pick_cdef_from_qp(AV1_COMMON *const cm, int skip_cdef,
+                              int frames_since_key) {
+  const int bd = cm->seq_params->bit_depth;
   const int q =
       av1_ac_quant_QTX(cm->quant_params.base_qindex, 0, bd) >> (bd - 8);
   CdefInfo *const cdef_info = &cm->cdef_info;
-  cdef_info->cdef_bits = 0;
-  cdef_info->nb_cdef_strengths = 1;
+  // Check the speed feature to avoid extra signaling.
+  if (skip_cdef) {
+    cdef_info->cdef_bits = 1;
+    cdef_info->nb_cdef_strengths = 2;
+  } else {
+    cdef_info->cdef_bits = 0;
+    cdef_info->nb_cdef_strengths = 1;
+  }
   cdef_info->cdef_damping = 3 + (cm->quant_params.base_qindex >> 6);
 
   int predicted_y_f1 = 0;
   int predicted_y_f2 = 0;
   int predicted_uv_f1 = 0;
   int predicted_uv_f2 = 0;
-  aom_clear_system_state();
   if (!frame_is_intra_only(cm)) {
     predicted_y_f1 = clamp((int)roundf(q * q * -0.0000023593946f +
                                        q * 0.0068615186f + 0.02709886f),
@@ -537,13 +548,22 @@
   cdef_info->cdef_uv_strengths[0] =
       predicted_uv_f1 * CDEF_SEC_STRENGTHS + predicted_uv_f2;
 
+  if (skip_cdef) {
+    cdef_info->cdef_strengths[1] = 0;
+    cdef_info->cdef_uv_strengths[1] = 0;
+  }
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
   const int nvfb = (mi_params->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
   const int nhfb = (mi_params->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
   MB_MODE_INFO **mbmi = mi_params->mi_grid_base;
   for (int r = 0; r < nvfb; ++r) {
     for (int c = 0; c < nhfb; ++c) {
-      mbmi[MI_SIZE_64X64 * c]->cdef_strength = 0;
+      MB_MODE_INFO *current_mbmi = mbmi[MI_SIZE_64X64 * c];
+      current_mbmi->cdef_strength = 0;
+      if (skip_cdef && current_mbmi->skip_cdef_curr_sb &&
+          frames_since_key > 10) {
+        current_mbmi->cdef_strength = 1;
+      }
     }
     mbmi += MI_SIZE_64X64 * mi_params->mi_stride;
   }
@@ -551,16 +571,16 @@
 
 void av1_cdef_search(MultiThreadInfo *mt_info, const YV12_BUFFER_CONFIG *frame,
                      const YV12_BUFFER_CONFIG *ref, AV1_COMMON *cm,
-                     MACROBLOCKD *xd, CDEF_PICK_METHOD pick_method,
-                     int rdmult) {
+                     MACROBLOCKD *xd, CDEF_PICK_METHOD pick_method, int rdmult,
+                     int skip_cdef_feature, int frames_since_key) {
   if (pick_method == CDEF_PICK_FROM_Q) {
-    pick_cdef_from_qp(cm);
+    pick_cdef_from_qp(cm, skip_cdef_feature, frames_since_key);
     return;
   }
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
   const int damping = 3 + (cm->quant_params.base_qindex >> 6);
   const int fast = (pick_method >= CDEF_FAST_SEARCH_LVL1 &&
-                    pick_method <= CDEF_FAST_SEARCH_LVL4);
+                    pick_method <= CDEF_FAST_SEARCH_LVL5);
   const int num_planes = av1_num_planes(cm);
   CdefSearchCtx cdef_search_ctx;
   // Initialize parameters related to CDEF search context.
@@ -628,7 +648,6 @@
     mi_params->mi_grid_base[cdef_search_ctx.sb_index[i]]->cdef_strength =
         best_gi;
   }
-
   if (fast) {
     for (int j = 0; j < cdef_info->nb_cdef_strengths; j++) {
       const int luma_strength = cdef_info->cdef_strengths[j];
diff --git a/av1/encoder/pickcdef.h b/av1/encoder/pickcdef.h
index 7fe1edb..a287870 100644
--- a/av1/encoder/pickcdef.h
+++ b/av1/encoder/pickcdef.h
@@ -24,6 +24,7 @@
 #define REDUCED_PRI_STRENGTHS_LVL1 8
 #define REDUCED_PRI_STRENGTHS_LVL2 5
 #define REDUCED_SEC_STRENGTHS_LVL3 2
+#define REDUCED_SEC_STRENGTHS_LVL5 1
 #define REDUCED_PRI_STRENGTHS_LVL4 2
 
 #define REDUCED_TOTAL_STRENGTHS_LVL1 \
@@ -34,19 +35,24 @@
   (REDUCED_PRI_STRENGTHS_LVL2 * REDUCED_SEC_STRENGTHS_LVL3)
 #define REDUCED_TOTAL_STRENGTHS_LVL4 \
   (REDUCED_PRI_STRENGTHS_LVL4 * REDUCED_SEC_STRENGTHS_LVL3)
+#define REDUCED_TOTAL_STRENGTHS_LVL5 \
+  (REDUCED_PRI_STRENGTHS_LVL4 * REDUCED_SEC_STRENGTHS_LVL5)
 #define TOTAL_STRENGTHS (CDEF_PRI_STRENGTHS * CDEF_SEC_STRENGTHS)
 
 static const int priconv_lvl1[REDUCED_PRI_STRENGTHS_LVL1] = { 0, 1, 2,  3,
                                                               5, 7, 10, 13 };
 static const int priconv_lvl2[REDUCED_PRI_STRENGTHS_LVL2] = { 0, 2, 4, 8, 14 };
 static const int priconv_lvl4[REDUCED_PRI_STRENGTHS_LVL4] = { 0, 11 };
+static const int priconv_lvl5[REDUCED_PRI_STRENGTHS_LVL4] = { 0, 5 };
 static const int secconv_lvl3[REDUCED_SEC_STRENGTHS_LVL3] = { 0, 2 };
+static const int secconv_lvl5[REDUCED_SEC_STRENGTHS_LVL5] = { 0 };
 static const int nb_cdef_strengths[CDEF_PICK_METHODS] = {
   TOTAL_STRENGTHS,
   REDUCED_TOTAL_STRENGTHS_LVL1,
   REDUCED_TOTAL_STRENGTHS_LVL2,
   REDUCED_TOTAL_STRENGTHS_LVL3,
   REDUCED_TOTAL_STRENGTHS_LVL4,
+  REDUCED_TOTAL_STRENGTHS_LVL5,
   TOTAL_STRENGTHS
 };
 
@@ -58,20 +64,6 @@
                                         BLOCK_SIZE bsize, int coeff_shift,
                                         int row, int col);
 
-// Data related to CDEF search multi-thread synchronization.
-typedef struct AV1CdefSyncData {
-#if CONFIG_MULTITHREAD
-  // Mutex lock used while dispatching jobs.
-  pthread_mutex_t *mutex_;
-#endif  // CONFIG_MULTITHREAD
-  // Flag to indicate all blocks are processed and end of frame is reached
-  int end_of_frame;
-  // Row index in units of 64x64 block
-  int fbr;
-  // Column index in units of 64x64 block
-  int fbc;
-} AV1CdefSync;
-
 /*! \brief CDEF search context.
  */
 typedef struct {
@@ -224,6 +216,8 @@
  * \param[in]      xd           Pointer to common current coding block structure
  * \param[in]      pick_method  The method used to select params
  * \param[in]      rdmult       rd multiplier to use in making param choices
+ * \param[in]      skip_cdef_feature Speed feature to skip cdef
+ * \param[in]      frames_since_key Number of frames since key frame
  *
  * \return Nothing is returned. Instead, optimal CDEF parameters are stored
  * in the \c cdef_info structure of type \ref CdefInfo inside \c cm:
@@ -239,7 +233,8 @@
 void av1_cdef_search(struct MultiThreadInfo *mt_info,
                      const YV12_BUFFER_CONFIG *frame,
                      const YV12_BUFFER_CONFIG *ref, AV1_COMMON *cm,
-                     MACROBLOCKD *xd, CDEF_PICK_METHOD pick_method, int rdmult);
+                     MACROBLOCKD *xd, CDEF_PICK_METHOD pick_method, int rdmult,
+                     int skip_cdef_feature, int frames_since_key);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/av1/encoder/picklpf.c b/av1/encoder/picklpf.c
index 9b3924f..7608749 100644
--- a/av1/encoder/picklpf.c
+++ b/av1/encoder/picklpf.c
@@ -39,8 +39,8 @@
 
 int av1_get_max_filter_level(const AV1_COMP *cpi) {
   if (is_stat_consumption_stage_twopass(cpi)) {
-    return cpi->twopass.section_intra_rating > 8 ? MAX_LOOP_FILTER * 3 / 4
-                                                 : MAX_LOOP_FILTER;
+    return cpi->ppi->twopass.section_intra_rating > 8 ? MAX_LOOP_FILTER * 3 / 4
+                                                      : MAX_LOOP_FILTER;
   } else {
     return MAX_LOOP_FILTER;
   }
@@ -69,25 +69,12 @@
     case 2: cm->lf.filter_level_v = filter_level[0]; break;
   }
 
-  // TODO(any): please enable multi-thread and remove the flag when loop
-  // filter mask is compatible with multi-thread.
-  if (num_workers > 1)
-    av1_loop_filter_frame_mt(&cm->cur_frame->buf, cm, &cpi->td.mb.e_mbd, plane,
-                             plane + 1, partial_frame,
-#if CONFIG_LPF_MASK
-                             0,
-#endif
-                             mt_info->workers, num_workers,
-                             &mt_info->lf_row_sync);
-  else
-    av1_loop_filter_frame(&cm->cur_frame->buf, cm, &cpi->td.mb.e_mbd,
-#if CONFIG_LPF_MASK
-                          0,
-#endif
-                          plane, plane + 1, partial_frame);
+  av1_loop_filter_frame_mt(&cm->cur_frame->buf, cm, &cpi->td.mb.e_mbd, plane,
+                           plane + 1, partial_frame, mt_info->workers,
+                           num_workers, &mt_info->lf_row_sync, 0);
 
   filt_err = aom_get_sse_plane(sd, &cm->cur_frame->buf, plane,
-                               cm->seq_params.use_highbitdepth);
+                               cm->seq_params->use_highbitdepth);
 
   // Re-instate the unfiltered frame
   yv12_copy_plane(&cpi->last_frame_uf, &cm->cur_frame->buf, plane);
@@ -153,8 +140,8 @@
     int64_t bias = (best_err >> (15 - (filt_mid / 8))) * filter_step;
 
     if ((is_stat_consumption_stage_twopass(cpi)) &&
-        (cpi->twopass.section_intra_rating < 20))
-      bias = (bias * cpi->twopass.section_intra_rating) / 20;
+        (cpi->ppi->twopass.section_intra_rating < 20))
+      bias = (bias * cpi->ppi->twopass.section_intra_rating) / 20;
 
     // yx, bias less for large block size
     if (cm->features.tx_mode != ONLY_4X4) bias >>= 1;
@@ -205,7 +192,7 @@
 
   if (best_cost_ret)
     *best_cost_ret = RDCOST_DBL_WITH_NATIVE_BD_DIST(
-        x->rdmult, 0, (best_err << 4), cm->seq_params.bit_depth);
+        x->rdmult, 0, (best_err << 4), cm->seq_params->bit_depth);
   return filt_best;
 }
 
@@ -226,7 +213,7 @@
     const int min_filter_level = 0;
     const int max_filter_level = av1_get_max_filter_level(cpi);
     const int q = av1_ac_quant_QTX(cm->quant_params.base_qindex, 0,
-                                   cm->seq_params.bit_depth);
+                                   cm->seq_params->bit_depth);
     // based on tests result for rtc test set
     // 0.04590 boosted or 0.02295 non-booseted in 18-bit fixed point
     const int strength_boost_q_treshold = 0;
@@ -244,7 +231,7 @@
     // And high bit depth separately:
     // filt_guess = q * 0.316206 + 3.87252
     int filt_guess;
-    switch (cm->seq_params.bit_depth) {
+    switch (cm->seq_params->bit_depth) {
       case AOM_BITS_8:
         filt_guess =
             (cm->current_frame.frame_type == KEY_FRAME)
@@ -263,7 +250,7 @@
                "or AOM_BITS_12");
         return;
     }
-    if (cm->seq_params.bit_depth != AOM_BITS_8 &&
+    if (cm->seq_params->bit_depth != AOM_BITS_8 &&
         cm->current_frame.frame_type == KEY_FRAME)
       filt_guess -= 4;
     // TODO(chengchen): retrain the model for Y, U, V filter levels
@@ -272,10 +259,20 @@
     lf->filter_level_u = clamp(filt_guess, min_filter_level, max_filter_level);
     lf->filter_level_v = clamp(filt_guess, min_filter_level, max_filter_level);
   } else {
-    const int last_frame_filter_level[4] = { lf->filter_level[0],
-                                             lf->filter_level[1],
-                                             lf->filter_level_u,
-                                             lf->filter_level_v };
+    int last_frame_filter_level[4] = { 0 };
+    if (!frame_is_intra_only(cm)) {
+#if CONFIG_FRAME_PARALLEL_ENCODE
+      last_frame_filter_level[0] = cpi->ppi->filter_level[0];
+      last_frame_filter_level[1] = cpi->ppi->filter_level[1];
+      last_frame_filter_level[2] = cpi->ppi->filter_level_u;
+      last_frame_filter_level[3] = cpi->ppi->filter_level_v;
+#else
+      last_frame_filter_level[0] = lf->filter_level[0];
+      last_frame_filter_level[1] = lf->filter_level[1];
+      last_frame_filter_level[2] = lf->filter_level_u;
+      last_frame_filter_level[3] = lf->filter_level_v;
+#endif
+    }
 
     lf->filter_level[0] = lf->filter_level[1] =
         search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE,
diff --git a/av1/encoder/pickrst.c b/av1/encoder/pickrst.c
index 2196513..176f98f 100644
--- a/av1/encoder/pickrst.c
+++ b/av1/encoder/pickrst.c
@@ -19,17 +19,16 @@
 
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/binary_codes_writer.h"
+#include "aom_dsp/mathutils.h"
 #include "aom_dsp/psnr.h"
 #include "aom_mem/aom_mem.h"
 #include "aom_ports/mem.h"
-#include "aom_ports/system_state.h"
 #include "av1/common/av1_common_int.h"
 #include "av1/common/quant_common.h"
 #include "av1/common/restoration.h"
 
 #include "av1/encoder/av1_quantize.h"
 #include "av1/encoder/encoder.h"
-#include "av1/encoder/mathutils.h"
 #include "av1/encoder/picklpf.h"
 #include "av1/encoder/pickrst.h"
 
@@ -199,8 +198,8 @@
   const int is_uv = plane > 0;
   const RestorationInfo *rsi = &cm->rst_info[plane];
   RestorationLineBuffers rlbs;
-  const int bit_depth = cm->seq_params.bit_depth;
-  const int highbd = cm->seq_params.use_highbitdepth;
+  const int bit_depth = cm->seq_params->bit_depth;
+  const int highbd = cm->seq_params->use_highbitdepth;
 
   const YV12_BUFFER_CONFIG *fts = &cm->cur_frame->buf;
   // TODO(yunqing): For now, only use optimized LR filter in decoder. Can be
@@ -209,8 +208,8 @@
 
   av1_loop_restoration_filter_unit(
       limits, rui, &rsi->boundaries, &rlbs, tile_rect, rsc->tile_stripe0,
-      is_uv && cm->seq_params.subsampling_x,
-      is_uv && cm->seq_params.subsampling_y, highbd, bit_depth,
+      is_uv && cm->seq_params->subsampling_x,
+      is_uv && cm->seq_params->subsampling_y, highbd, bit_depth,
       fts->buffers[plane], fts->strides[is_uv], rsc->dst->buffers[plane],
       rsc->dst->strides[is_uv], cm->rst_tmpbuf, optimized_lr);
 
@@ -774,12 +773,10 @@
   int exq[2];
   apply_sgr(ep, dat8, width, height, dat_stride, use_highbitdepth, bit_depth,
             pu_width, pu_height, flt0, flt1, flt_stride);
-  aom_clear_system_state();
   const sgr_params_type *const params = &av1_sgr_params[ep];
   get_proj_subspace(src8, width, height, src_stride, dat8, dat_stride,
                     use_highbitdepth, flt0, flt_stride, flt1, flt_stride, exq,
                     params);
-  aom_clear_system_state();
   encode_xq(exq, exqd, params);
   *err = finer_search_pixel_proj_error(
       src8, width, height, src_stride, dat8, dat_stride, use_highbitdepth, flt0,
@@ -886,8 +883,8 @@
 
   const MACROBLOCK *const x = rsc->x;
   const AV1_COMMON *const cm = rsc->cm;
-  const int highbd = cm->seq_params.use_highbitdepth;
-  const int bit_depth = cm->seq_params.bit_depth;
+  const int highbd = cm->seq_params->use_highbitdepth;
+  const int bit_depth = cm->seq_params->bit_depth;
 
   const int64_t bits_none = x->mode_costs.sgrproj_restore_cost[0];
   // Prune evaluation of RESTORE_SGRPROJ if 'skip_sgr_eval' is set
@@ -905,8 +902,8 @@
       rsc->src_buffer + limits->v_start * rsc->src_stride + limits->h_start;
 
   const int is_uv = rsc->plane > 0;
-  const int ss_x = is_uv && cm->seq_params.subsampling_x;
-  const int ss_y = is_uv && cm->seq_params.subsampling_y;
+  const int ss_x = is_uv && cm->seq_params->subsampling_x;
+  const int ss_y = is_uv && cm->seq_params->subsampling_y;
   const int procunit_width = RESTORATION_PROC_UNIT_SIZE >> ss_x;
   const int procunit_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
 
@@ -1247,8 +1244,6 @@
   const int plane_off = (WIENER_WIN - wiener_win) >> 1;
   const int wiener_win2 = wiener_win * wiener_win;
 
-  aom_clear_system_state();
-
   a[WIENER_HALFWIN] = b[WIENER_HALFWIN] = WIENER_FILT_STEP;
   for (i = 0; i < WIENER_HALFWIN; ++i) {
     a[i] = a[WIENER_WIN - i - 1] = vfilt[i];
@@ -1474,12 +1469,12 @@
     const int scale[3] = { 0, 1, 2 };
     // Obtain the normalized Qscale
     const int qs = av1_dc_quant_QTX(rsc->cm->quant_params.base_qindex, 0,
-                                    rsc->cm->seq_params.bit_depth) >>
+                                    rsc->cm->seq_params->bit_depth) >>
                    3;
     // Derive threshold as sqr(normalized Qscale) * scale / 16,
     const uint64_t thresh =
         (qs * qs * scale[rsc->lpf_sf->prune_wiener_based_on_src_var]) >> 4;
-    const int highbd = rsc->cm->seq_params.use_highbitdepth;
+    const int highbd = rsc->cm->seq_params->use_highbitdepth;
     const uint64_t src_var =
         var_restoration_unit(limits, rsc->src, rsc->plane, highbd);
     // Do not perform Wiener search if source variance is lower than threshold
@@ -1510,11 +1505,11 @@
 
 #if CONFIG_AV1_HIGHBITDEPTH
   const AV1_COMMON *const cm = rsc->cm;
-  if (cm->seq_params.use_highbitdepth) {
+  if (cm->seq_params->use_highbitdepth) {
     av1_compute_stats_highbd(reduced_wiener_win, rsc->dgd_buffer,
                              rsc->src_buffer, limits->h_start, limits->h_end,
                              limits->v_start, limits->v_end, rsc->dgd_stride,
-                             rsc->src_stride, M, H, cm->seq_params.bit_depth);
+                             rsc->src_stride, M, H, cm->seq_params->bit_depth);
   } else {
     av1_compute_stats(reduced_wiener_win, rsc->dgd_buffer, rsc->src_buffer,
                       limits->h_start, limits->h_end, limits->v_start,
@@ -1547,8 +1542,6 @@
     return;
   }
 
-  aom_clear_system_state();
-
   rusi->sse[RESTORE_WIENER] = finer_tile_search_wiener(
       rsc, limits, tile_rect, &rui, reduced_wiener_win);
   rusi->wiener = rui.wiener_info;
@@ -1567,10 +1560,10 @@
 
   double cost_none = RDCOST_DBL_WITH_NATIVE_BD_DIST(
       x->rdmult, bits_none >> 4, rusi->sse[RESTORE_NONE],
-      rsc->cm->seq_params.bit_depth);
+      rsc->cm->seq_params->bit_depth);
   double cost_wiener = RDCOST_DBL_WITH_NATIVE_BD_DIST(
       x->rdmult, bits_wiener >> 4, rusi->sse[RESTORE_WIENER],
-      rsc->cm->seq_params.bit_depth);
+      rsc->cm->seq_params->bit_depth);
 
   RestorationType rtype =
       (cost_wiener < cost_none) ? RESTORE_WIENER : RESTORE_NONE;
@@ -1601,7 +1594,7 @@
   RestSearchCtxt *rsc = (RestSearchCtxt *)priv;
   RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx];
 
-  const int highbd = rsc->cm->seq_params.use_highbitdepth;
+  const int highbd = rsc->cm->seq_params->use_highbitdepth;
   rusi->sse[RESTORE_NONE] = sse_restoration_unit(
       limits, rsc->src, &rsc->cm->cur_frame->buf, rsc->plane, highbd);
 
@@ -1653,8 +1646,8 @@
     }
     const int64_t coeff_bits = coeff_pcost << AV1_PROB_COST_SHIFT;
     const int64_t bits = x->mode_costs.switchable_restore_cost[r] + coeff_bits;
-    double cost = RDCOST_DBL_WITH_NATIVE_BD_DIST(x->rdmult, bits >> 4, sse,
-                                                 rsc->cm->seq_params.bit_depth);
+    double cost = RDCOST_DBL_WITH_NATIVE_BD_DIST(
+        x->rdmult, bits >> 4, sse, rsc->cm->seq_params->bit_depth);
     if (r == RESTORE_SGRPROJ && rusi->sgrproj.ep < 10)
       cost *= (1 + DUAL_SGR_PENALTY_MULT * rsc->lpf_sf->dual_sgr_penalty_level);
     if (r == 0 || cost < best_cost) {
@@ -1694,7 +1687,7 @@
   av1_foreach_rest_unit_in_plane(rsc->cm, rsc->plane, funs[rtype], rsc,
                                  &rsc->tile_rect, rsc->cm->rst_tmpbuf, NULL);
   return RDCOST_DBL_WITH_NATIVE_BD_DIST(
-      rsc->x->rdmult, rsc->bits >> 4, rsc->sse, rsc->cm->seq_params.bit_depth);
+      rsc->x->rdmult, rsc->bits >> 4, rsc->sse, rsc->cm->seq_params->bit_depth);
 }
 
 static int rest_tiles_in_plane(const AV1_COMMON *cm, int plane) {
@@ -1740,8 +1733,9 @@
     double best_cost = 0;
     RestorationType best_rtype = RESTORE_NONE;
 
-    const int highbd = rsc.cm->seq_params.use_highbitdepth;
-    if (!cpi->sf.lpf_sf.disable_loop_restoration_chroma || !plane) {
+    const int highbd = rsc.cm->seq_params->use_highbitdepth;
+    if ((plane && !cpi->sf.lpf_sf.disable_loop_restoration_chroma) ||
+        (!plane && !cpi->sf.lpf_sf.disable_loop_restoration_luma)) {
       av1_extend_frame(rsc.dgd_buffer, rsc.plane_width, rsc.plane_height,
                        rsc.dgd_stride, RESTORATION_BORDER, RESTORATION_BORDER,
                        highbd);
diff --git a/av1/encoder/pickrst.h b/av1/encoder/pickrst.h
index 2463361..46a4b48 100644
--- a/av1/encoder/pickrst.h
+++ b/av1/encoder/pickrst.h
@@ -16,7 +16,6 @@
 #endif
 
 #include "av1/encoder/encoder.h"
-#include "aom_ports/system_state.h"
 
 struct yv12_buffer_config;
 struct AV1_COMP;
diff --git a/av1/encoder/ransac.c b/av1/encoder/ransac.c
index 07e1a5f..ff00a46 100644
--- a/av1/encoder/ransac.c
+++ b/av1/encoder/ransac.c
@@ -15,8 +15,8 @@
 #include <stdlib.h>
 #include <assert.h>
 
+#include "aom_dsp/mathutils.h"
 #include "av1/encoder/ransac.h"
-#include "av1/encoder/mathutils.h"
 #include "av1/encoder/random.h"
 
 #define MAX_MINPTS 4
diff --git a/av1/encoder/ratectrl.c b/av1/encoder/ratectrl.c
index 3ab1597..292bdeb 100644
--- a/av1/encoder/ratectrl.c
+++ b/av1/encoder/ratectrl.c
@@ -19,7 +19,6 @@
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_mem/aom_mem.h"
 #include "aom_ports/mem.h"
-#include "aom_ports/system_state.h"
 
 #include "av1/common/alloccommon.h"
 #include "av1/encoder/aq_cyclicrefresh.h"
@@ -232,31 +231,34 @@
     const int layer =
         LAYER_IDS_TO_IDX(svc->spatial_layer_id, i, svc->number_temporal_layers);
     LAYER_CONTEXT *lc = &svc->layer_context[layer];
-    RATE_CONTROL *lrc = &lc->rc;
-    lrc->bits_off_target +=
+    PRIMARY_RATE_CONTROL *lp_rc = &lc->p_rc;
+    lp_rc->bits_off_target +=
         (int)(lc->target_bandwidth / lc->framerate) - encoded_frame_size;
     // Clip buffer level to maximum buffer size for the layer.
-    lrc->bits_off_target =
-        AOMMIN(lrc->bits_off_target, lrc->maximum_buffer_size);
-    lrc->buffer_level = lrc->bits_off_target;
+    lp_rc->bits_off_target =
+        AOMMIN(lp_rc->bits_off_target, lp_rc->maximum_buffer_size);
+    lp_rc->buffer_level = lp_rc->bits_off_target;
   }
 }
 // Update the buffer level: leaky bucket model.
 static void update_buffer_level(AV1_COMP *cpi, int encoded_frame_size) {
   const AV1_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
 
   // Non-viewable frames are a special case and are treated as pure overhead.
   if (!cm->show_frame)
-    rc->bits_off_target -= encoded_frame_size;
+    p_rc->bits_off_target -= encoded_frame_size;
   else
-    rc->bits_off_target += rc->avg_frame_bandwidth - encoded_frame_size;
+    p_rc->bits_off_target += rc->avg_frame_bandwidth - encoded_frame_size;
 
   // Clip the buffer level to the maximum specified buffer size.
-  rc->bits_off_target = AOMMIN(rc->bits_off_target, rc->maximum_buffer_size);
-  rc->buffer_level = rc->bits_off_target;
+  p_rc->bits_off_target =
+      AOMMIN(p_rc->bits_off_target, p_rc->maximum_buffer_size);
+  p_rc->buffer_level = p_rc->bits_off_target;
 
-  if (cpi->use_svc) update_layer_buffer_level(&cpi->svc, encoded_frame_size);
+  if (cpi->ppi->use_svc)
+    update_layer_buffer_level(&cpi->svc, encoded_frame_size);
 }
 
 int av1_rc_get_default_min_gf_interval(int width, int height,
@@ -285,49 +287,69 @@
   return AOMMAX(interval, min_gf_interval);
 }
 
-void av1_rc_init(const AV1EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) {
+void av1_primary_rc_init(const AV1EncoderConfig *oxcf,
+                         PRIMARY_RATE_CONTROL *p_rc) {
   const RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
-  int i;
 
-  if (pass == 0 && rc_cfg->mode == AOM_CBR) {
-    rc->avg_frame_qindex[KEY_FRAME] = rc_cfg->worst_allowed_q;
-    rc->avg_frame_qindex[INTER_FRAME] = rc_cfg->worst_allowed_q;
-  } else {
-    rc->avg_frame_qindex[KEY_FRAME] =
-        (rc_cfg->worst_allowed_q + rc_cfg->best_allowed_q) / 2;
-    rc->avg_frame_qindex[INTER_FRAME] =
-        (rc_cfg->worst_allowed_q + rc_cfg->best_allowed_q) / 2;
+  int worst_allowed_q = rc_cfg->worst_allowed_q;
+
+  int min_gf_interval = oxcf->gf_cfg.min_gf_interval;
+  int max_gf_interval = oxcf->gf_cfg.max_gf_interval;
+  if (min_gf_interval == 0)
+    min_gf_interval = av1_rc_get_default_min_gf_interval(
+        oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height,
+        oxcf->input_cfg.init_framerate);
+  if (max_gf_interval == 0)
+    max_gf_interval = av1_rc_get_default_max_gf_interval(
+        oxcf->input_cfg.init_framerate, min_gf_interval);
+  p_rc->baseline_gf_interval = (min_gf_interval + max_gf_interval) / 2;
+  p_rc->this_key_frame_forced = 0;
+  p_rc->next_key_frame_forced = 0;
+  p_rc->ni_frames = 0;
+
+  p_rc->tot_q = 0.0;
+  p_rc->total_actual_bits = 0;
+  p_rc->total_target_bits = 0;
+  p_rc->buffer_level = p_rc->starting_buffer_level;
+
+  if (oxcf->target_seq_level_idx[0] < SEQ_LEVELS) {
+    worst_allowed_q = 255;
   }
+  if (oxcf->pass == AOM_RC_ONE_PASS && rc_cfg->mode == AOM_CBR) {
+    p_rc->avg_frame_qindex[KEY_FRAME] = worst_allowed_q;
+    p_rc->avg_frame_qindex[INTER_FRAME] = worst_allowed_q;
+  } else {
+    p_rc->avg_frame_qindex[KEY_FRAME] =
+        (worst_allowed_q + rc_cfg->best_allowed_q) / 2;
+    p_rc->avg_frame_qindex[INTER_FRAME] =
+        (worst_allowed_q + rc_cfg->best_allowed_q) / 2;
+  }
+  p_rc->avg_q = av1_convert_qindex_to_q(rc_cfg->worst_allowed_q,
+                                        oxcf->tool_cfg.bit_depth);
+  p_rc->last_q[KEY_FRAME] = rc_cfg->best_allowed_q;
+  p_rc->last_q[INTER_FRAME] = rc_cfg->worst_allowed_q;
 
-  rc->last_q[KEY_FRAME] = rc_cfg->best_allowed_q;
-  rc->last_q[INTER_FRAME] = rc_cfg->worst_allowed_q;
+  for (int i = 0; i < RATE_FACTOR_LEVELS; ++i) {
+    p_rc->rate_correction_factors[i] = 0.7;
+  }
+  p_rc->rate_correction_factors[KF_STD] = 1.0;
+  p_rc->bits_off_target = p_rc->starting_buffer_level;
 
-  rc->buffer_level = rc->starting_buffer_level;
-  rc->bits_off_target = rc->starting_buffer_level;
+  p_rc->rolling_target_bits =
+      (int)(oxcf->rc_cfg.target_bandwidth / oxcf->input_cfg.init_framerate);
+  p_rc->rolling_actual_bits =
+      (int)(oxcf->rc_cfg.target_bandwidth / oxcf->input_cfg.init_framerate);
+}
 
-  rc->rolling_target_bits = rc->avg_frame_bandwidth;
-  rc->rolling_actual_bits = rc->avg_frame_bandwidth;
-
-  rc->total_actual_bits = 0;
-  rc->total_target_bits = 0;
+void av1_rc_init(const AV1EncoderConfig *oxcf, RATE_CONTROL *rc) {
+  const RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
 
   rc->frames_since_key = 8;  // Sensible default for first frame.
-  rc->this_key_frame_forced = 0;
-  rc->next_key_frame_forced = 0;
 
   rc->frames_till_gf_update_due = 0;
   rc->ni_av_qi = rc_cfg->worst_allowed_q;
   rc->ni_tot_qi = 0;
-  rc->ni_frames = 0;
 
-  rc->tot_q = 0.0;
-  rc->avg_q = av1_convert_qindex_to_q(rc_cfg->worst_allowed_q,
-                                      oxcf->tool_cfg.bit_depth);
-
-  for (i = 0; i < RATE_FACTOR_LEVELS; ++i) {
-    rc->rate_correction_factors[i] = 0.7;
-  }
-  rc->rate_correction_factors[KF_STD] = 1.0;
   rc->min_gf_interval = oxcf->gf_cfg.min_gf_interval;
   rc->max_gf_interval = oxcf->gf_cfg.max_gf_interval;
   if (rc->min_gf_interval == 0)
@@ -337,33 +359,37 @@
   if (rc->max_gf_interval == 0)
     rc->max_gf_interval = av1_rc_get_default_max_gf_interval(
         oxcf->input_cfg.init_framerate, rc->min_gf_interval);
-  rc->baseline_gf_interval = (rc->min_gf_interval + rc->max_gf_interval) / 2;
   rc->avg_frame_low_motion = 0;
 
   rc->resize_state = ORIG;
   rc->resize_avg_qp = 0;
   rc->resize_buffer_underflow = 0;
   rc->resize_count = 0;
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  rc->frame_level_fast_extra_bits = 0;
+#endif
 }
 
 int av1_rc_drop_frame(AV1_COMP *cpi) {
   const AV1EncoderConfig *oxcf = &cpi->oxcf;
   RATE_CONTROL *const rc = &cpi->rc;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  int64_t buffer_level = p_rc->buffer_level;
 
   if (!oxcf->rc_cfg.drop_frames_water_mark) {
     return 0;
   } else {
-    if (rc->buffer_level < 0) {
+    if (buffer_level < 0) {
       // Always drop if buffer is below 0.
       return 1;
     } else {
       // If buffer is below drop_mark, for now just drop every other frame
       // (starting with the next frame) until it increases back over drop_mark.
       int drop_mark = (int)(oxcf->rc_cfg.drop_frames_water_mark *
-                            rc->optimal_buffer_level / 100);
-      if ((rc->buffer_level > drop_mark) && (rc->decimation_factor > 0)) {
+                            p_rc->optimal_buffer_level / 100);
+      if ((buffer_level > drop_mark) && (rc->decimation_factor > 0)) {
         --rc->decimation_factor;
-      } else if (rc->buffer_level <= drop_mark && rc->decimation_factor == 0) {
+      } else if (buffer_level <= drop_mark && rc->decimation_factor == 0) {
         rc->decimation_factor = 1;
       }
       if (rc->decimation_factor > 0) {
@@ -384,8 +410,9 @@
 
 static int adjust_q_cbr(const AV1_COMP *cpi, int q, int active_worst_quality) {
   const RATE_CONTROL *const rc = &cpi->rc;
+  const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
   const AV1_COMMON *const cm = &cpi->common;
-  const RefreshFrameFlagsInfo *const refresh_frame_flags = &cpi->refresh_frame;
+  const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
   const int max_delta = 16;
   const int change_avg_frame_bandwidth =
       abs(rc->avg_frame_bandwidth - rc->prev_avg_frame_bandwidth) >
@@ -397,11 +424,10 @@
       (cm->width != cm->prev_frame->width ||
        cm->height != cm->prev_frame->height || change_avg_frame_bandwidth);
   // Apply some control/clamp to QP under certain conditions.
-  if (cm->current_frame.frame_type != KEY_FRAME && !cpi->use_svc &&
+  if (cm->current_frame.frame_type != KEY_FRAME && !cpi->ppi->use_svc &&
       rc->frames_since_key > 1 && !change_target_bits_mb &&
       (!cpi->oxcf.rc_cfg.gf_cbr_boost_pct ||
-       !(refresh_frame_flags->alt_ref_frame ||
-         refresh_frame_flags->golden_frame))) {
+       !(refresh_frame->alt_ref_frame || refresh_frame->golden_frame))) {
     // Make sure q is between oscillating Qs to prevent resonance.
     if (rc->rc_1_frame * rc->rc_2_frame == -1 &&
         rc->q_1_frame != rc->q_2_frame) {
@@ -411,7 +437,7 @@
     // Adjust Q base on source content change from scene detection.
     if (cpi->sf.rt_sf.check_scene_detection && rc->prev_avg_source_sad > 0 &&
         rc->frames_since_key > 10) {
-      const int bit_depth = cm->seq_params.bit_depth;
+      const int bit_depth = cm->seq_params->bit_depth;
       double delta =
           (double)rc->avg_source_sad / (double)rc->prev_avg_source_sad - 1.0;
       // Push Q downwards if content change is decreasing and buffer level
@@ -419,14 +445,15 @@
       // only for high Q to avoid excess overshoot.
       // Else reduce decrease in Q from previous frame if content change is
       // increasing and buffer is below max (so not undershooting).
-      if (delta < 0.0 && rc->buffer_level > (rc->optimal_buffer_level >> 2) &&
+      if (delta < 0.0 &&
+          p_rc->buffer_level > (p_rc->optimal_buffer_level >> 2) &&
           q > (rc->worst_quality >> 1)) {
         double q_adj_factor = 1.0 + 0.5 * tanh(4.0 * delta);
         double q_val = av1_convert_qindex_to_q(q, bit_depth);
         q += av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, bit_depth);
       } else if (rc->q_1_frame - q > 0 && delta > 0.1 &&
-                 rc->buffer_level < AOMMIN(rc->maximum_buffer_size,
-                                           rc->optimal_buffer_level << 1)) {
+                 p_rc->buffer_level < AOMMIN(p_rc->maximum_buffer_size,
+                                             p_rc->optimal_buffer_level << 1)) {
         q = (3 * q + rc->q_1_frame) >> 2;
       }
     }
@@ -475,24 +502,55 @@
 static double get_rate_correction_factor(const AV1_COMP *cpi, int width,
                                          int height) {
   const RATE_CONTROL *const rc = &cpi->rc;
-  const RefreshFrameFlagsInfo *const refresh_frame_flags = &cpi->refresh_frame;
+  const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
   double rcf;
+  double rate_correction_factors_kfstd;
+  double rate_correction_factors_gfarfstd;
+  double rate_correction_factors_internormal;
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  rate_correction_factors_kfstd =
+      (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0)
+          ? rc->frame_level_rate_correction_factors[KF_STD]
+          : p_rc->rate_correction_factors[KF_STD];
+  rate_correction_factors_gfarfstd =
+      (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0)
+          ? rc->frame_level_rate_correction_factors[GF_ARF_STD]
+          : p_rc->rate_correction_factors[GF_ARF_STD];
+  rate_correction_factors_internormal =
+      (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0)
+          ? rc->frame_level_rate_correction_factors[INTER_NORMAL]
+          : p_rc->rate_correction_factors[INTER_NORMAL];
+#else
+  rate_correction_factors_kfstd = p_rc->rate_correction_factors[KF_STD];
+  rate_correction_factors_gfarfstd = p_rc->rate_correction_factors[GF_ARF_STD];
+  rate_correction_factors_internormal =
+      p_rc->rate_correction_factors[INTER_NORMAL];
+#endif
 
   if (cpi->common.current_frame.frame_type == KEY_FRAME) {
-    rcf = rc->rate_correction_factors[KF_STD];
+    rcf = rate_correction_factors_kfstd;
   } else if (is_stat_consumption_stage(cpi)) {
     const RATE_FACTOR_LEVEL rf_lvl =
-        get_rate_factor_level(&cpi->gf_group, cpi->gf_frame_index);
-    rcf = rc->rate_correction_factors[rf_lvl];
+        get_rate_factor_level(&cpi->ppi->gf_group, cpi->gf_frame_index);
+    double rate_correction_factors_rflvl;
+#if CONFIG_FRAME_PARALLEL_ENCODE
+    rate_correction_factors_rflvl =
+        (cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0)
+            ? rc->frame_level_rate_correction_factors[rf_lvl]
+            : p_rc->rate_correction_factors[rf_lvl];
+#else
+    rate_correction_factors_rflvl = p_rc->rate_correction_factors[rf_lvl];
+#endif
+    rcf = rate_correction_factors_rflvl;
   } else {
-    if ((refresh_frame_flags->alt_ref_frame ||
-         refresh_frame_flags->golden_frame) &&
-        !rc->is_src_frame_alt_ref && !cpi->use_svc &&
+    if ((refresh_frame->alt_ref_frame || refresh_frame->golden_frame) &&
+        !rc->is_src_frame_alt_ref && !cpi->ppi->use_svc &&
         (cpi->oxcf.rc_cfg.mode != AOM_CBR ||
          cpi->oxcf.rc_cfg.gf_cbr_boost_pct > 20))
-      rcf = rc->rate_correction_factors[GF_ARF_STD];
+      rcf = rate_correction_factors_gfarfstd;
     else
-      rcf = rc->rate_correction_factors[INTER_NORMAL];
+      rcf = rate_correction_factors_internormal;
   }
   rcf *= resize_rate_factor(&cpi->oxcf.frm_dim_cfg, width, height);
   return fclamp(rcf, MIN_BPB_FACTOR, MAX_BPB_FACTOR);
@@ -513,36 +571,58 @@
  * \return None but updates the rate correction factor for the
  *         current frame type in cpi->rc.
  */
-static void set_rate_correction_factor(AV1_COMP *cpi, double factor, int width,
-                                       int height) {
+static void set_rate_correction_factor(AV1_COMP *cpi,
+#if CONFIG_FRAME_PARALLEL_ENCODE
+                                       int is_encode_stage,
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+                                       double factor, int width, int height) {
   RATE_CONTROL *const rc = &cpi->rc;
-  const RefreshFrameFlagsInfo *const refresh_frame_flags = &cpi->refresh_frame;
-
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+  int update_default_rcf = 1;
   // Normalize RCF to account for the size-dependent scaling factor.
   factor /= resize_rate_factor(&cpi->oxcf.frm_dim_cfg, width, height);
 
   factor = fclamp(factor, MIN_BPB_FACTOR, MAX_BPB_FACTOR);
 
   if (cpi->common.current_frame.frame_type == KEY_FRAME) {
-    rc->rate_correction_factors[KF_STD] = factor;
+    p_rc->rate_correction_factors[KF_STD] = factor;
   } else if (is_stat_consumption_stage(cpi)) {
     const RATE_FACTOR_LEVEL rf_lvl =
-        get_rate_factor_level(&cpi->gf_group, cpi->gf_frame_index);
-    rc->rate_correction_factors[rf_lvl] = factor;
+        get_rate_factor_level(&cpi->ppi->gf_group, cpi->gf_frame_index);
+#if CONFIG_FRAME_PARALLEL_ENCODE
+    if (is_encode_stage &&
+        cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) {
+      rc->frame_level_rate_correction_factors[rf_lvl] = factor;
+      update_default_rcf = 0;
+    }
+#endif
+    if (update_default_rcf) p_rc->rate_correction_factors[rf_lvl] = factor;
   } else {
-    if ((refresh_frame_flags->alt_ref_frame ||
-         refresh_frame_flags->golden_frame) &&
-        !rc->is_src_frame_alt_ref && !cpi->use_svc &&
+    if ((refresh_frame->alt_ref_frame || refresh_frame->golden_frame) &&
+        !rc->is_src_frame_alt_ref && !cpi->ppi->use_svc &&
         (cpi->oxcf.rc_cfg.mode != AOM_CBR ||
-         cpi->oxcf.rc_cfg.gf_cbr_boost_pct > 20))
-      rc->rate_correction_factors[GF_ARF_STD] = factor;
-    else
-      rc->rate_correction_factors[INTER_NORMAL] = factor;
+         cpi->oxcf.rc_cfg.gf_cbr_boost_pct > 20)) {
+      p_rc->rate_correction_factors[GF_ARF_STD] = factor;
+    } else {
+#if CONFIG_FRAME_PARALLEL_ENCODE
+      if (is_encode_stage &&
+          cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0) {
+        rc->frame_level_rate_correction_factors[INTER_NORMAL] = factor;
+        update_default_rcf = 0;
+      }
+#endif
+      if (update_default_rcf)
+        p_rc->rate_correction_factors[INTER_NORMAL] = factor;
+    }
   }
 }
 
-void av1_rc_update_rate_correction_factors(AV1_COMP *cpi, int width,
-                                           int height) {
+void av1_rc_update_rate_correction_factors(AV1_COMP *cpi,
+#if CONFIG_FRAME_PARALLEL_ENCODE
+                                           int is_encode_stage,
+#endif
+                                           int width, int height) {
   const AV1_COMMON *const cm = &cpi->common;
   int correction_factor = 100;
   double rate_correction_factor =
@@ -556,7 +636,6 @@
   if (cpi->rc.is_src_frame_alt_ref) return;
 
   // Clear down mmx registers to allow floating point in what follows
-  aom_clear_system_state();
 
   // Work out how big we would have expected the frame to be at this Q given
   // the current correction factor.
@@ -567,7 +646,7 @@
   } else {
     projected_size_based_on_q = av1_estimate_bits_at_q(
         cm->current_frame.frame_type, cm->quant_params.base_qindex, MBs,
-        rate_correction_factor, cm->seq_params.bit_depth,
+        rate_correction_factor, cm->seq_params->bit_depth,
         cpi->is_screen_content_type);
   }
   // Work out a size correction factor.
@@ -613,7 +692,11 @@
       rate_correction_factor = MIN_BPB_FACTOR;
   }
 
-  set_rate_correction_factor(cpi, rate_correction_factor, width, height);
+  set_rate_correction_factor(cpi,
+#if CONFIG_FRAME_PARALLEL_ENCODE
+                             is_encode_stage,
+#endif
+                             rate_correction_factor, width, height);
 }
 
 // Calculate rate for the given 'q'.
@@ -623,7 +706,7 @@
   return use_cyclic_refresh
              ? av1_cyclic_refresh_rc_bits_per_mb(cpi, q, correction_factor)
              : av1_rc_bits_per_mb(cm->current_frame.frame_type, q,
-                                  correction_factor, cm->seq_params.bit_depth,
+                                  correction_factor, cm->seq_params->bit_depth,
                                   cpi->is_screen_content_type);
 }
 
@@ -727,26 +810,31 @@
   }
 }
 
-static int get_kf_active_quality(const RATE_CONTROL *const rc, int q,
+static int get_kf_active_quality(const PRIMARY_RATE_CONTROL *const p_rc, int q,
                                  aom_bit_depth_t bit_depth) {
   int *kf_low_motion_minq;
   int *kf_high_motion_minq;
   ASSIGN_MINQ_TABLE(bit_depth, kf_low_motion_minq);
   ASSIGN_MINQ_TABLE(bit_depth, kf_high_motion_minq);
-  return get_active_quality(q, rc->kf_boost, kf_low, kf_high,
+  return get_active_quality(q, p_rc->kf_boost, kf_low, kf_high,
                             kf_low_motion_minq, kf_high_motion_minq);
 }
 
-static int get_gf_active_quality(const RATE_CONTROL *const rc, int q,
-                                 aom_bit_depth_t bit_depth) {
+static int get_gf_active_quality_no_rc(int gfu_boost, int q,
+                                       aom_bit_depth_t bit_depth) {
   int *arfgf_low_motion_minq;
   int *arfgf_high_motion_minq;
   ASSIGN_MINQ_TABLE(bit_depth, arfgf_low_motion_minq);
   ASSIGN_MINQ_TABLE(bit_depth, arfgf_high_motion_minq);
-  return get_active_quality(q, rc->gfu_boost, gf_low, gf_high,
+  return get_active_quality(q, gfu_boost, gf_low, gf_high,
                             arfgf_low_motion_minq, arfgf_high_motion_minq);
 }
 
+static int get_gf_active_quality(const PRIMARY_RATE_CONTROL *const p_rc, int q,
+                                 aom_bit_depth_t bit_depth) {
+  return get_gf_active_quality_no_rc(p_rc->gfu_boost, q, bit_depth);
+}
+
 static int get_gf_high_motion_quality(int q, aom_bit_depth_t bit_depth) {
   int *arfgf_high_motion_minq;
   ASSIGN_MINQ_TABLE(bit_depth, arfgf_high_motion_minq);
@@ -755,22 +843,27 @@
 
 static int calc_active_worst_quality_no_stats_vbr(const AV1_COMP *cpi) {
   const RATE_CONTROL *const rc = &cpi->rc;
-  const RefreshFrameFlagsInfo *const refresh_frame_flags = &cpi->refresh_frame;
+  const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
   const unsigned int curr_frame = cpi->common.current_frame.frame_number;
   int active_worst_quality;
+  int last_q_key_frame;
+  int last_q_inter_frame;
+  last_q_key_frame = p_rc->last_q[KEY_FRAME];
+  last_q_inter_frame = p_rc->last_q[INTER_FRAME];
 
   if (cpi->common.current_frame.frame_type == KEY_FRAME) {
     active_worst_quality =
-        curr_frame == 0 ? rc->worst_quality : rc->last_q[KEY_FRAME] * 2;
+        curr_frame == 0 ? rc->worst_quality : last_q_key_frame * 2;
   } else {
-    if (!rc->is_src_frame_alt_ref && (refresh_frame_flags->golden_frame ||
-                                      refresh_frame_flags->bwd_ref_frame ||
-                                      refresh_frame_flags->alt_ref_frame)) {
-      active_worst_quality = curr_frame == 1 ? rc->last_q[KEY_FRAME] * 5 / 4
-                                             : rc->last_q[INTER_FRAME];
+    if (!rc->is_src_frame_alt_ref &&
+        (refresh_frame->golden_frame || refresh_frame->bwd_ref_frame ||
+         refresh_frame->alt_ref_frame)) {
+      active_worst_quality =
+          curr_frame == 1 ? last_q_key_frame * 5 / 4 : last_q_inter_frame;
     } else {
-      active_worst_quality = curr_frame == 1 ? rc->last_q[KEY_FRAME] * 2
-                                             : rc->last_q[INTER_FRAME] * 2;
+      active_worst_quality =
+          curr_frame == 1 ? last_q_key_frame * 2 : last_q_inter_frame * 2;
     }
   }
   return AOMMIN(active_worst_quality, rc->worst_quality);
@@ -785,8 +878,11 @@
   // (at buffer = critical level).
   const AV1_COMMON *const cm = &cpi->common;
   const RATE_CONTROL *rc = &cpi->rc;
+  const PRIMARY_RATE_CONTROL *p_rc = &cpi->ppi->p_rc;
+  const SVC *const svc = &cpi->svc;
+  unsigned int num_frames_weight_key = 5 * cpi->svc.number_temporal_layers;
   // Buffer level below which we push active_worst to worst_quality.
-  int64_t critical_level = rc->optimal_buffer_level >> 3;
+  int64_t critical_level = p_rc->optimal_buffer_level >> 3;
   int64_t buff_lvl_step = 0;
   int adjustment = 0;
   int active_worst_quality;
@@ -796,31 +892,42 @@
   // for the first few frames following key frame. These are both initialized
   // to worst_quality and updated with (3/4, 1/4) average in postencode_update.
   // So for first few frames following key, the qp of that key frame is weighted
-  // into the active_worst_quality setting.
-  ambient_qp = (cm->current_frame.frame_number < 5)
-                   ? AOMMIN(rc->avg_frame_qindex[INTER_FRAME],
-                            rc->avg_frame_qindex[KEY_FRAME])
-                   : rc->avg_frame_qindex[INTER_FRAME];
+  // into the active_worst_quality setting. For SVC the key frame should
+  // correspond to layer (0, 0), so use that for layer context.
+  int avg_qindex_key = p_rc->avg_frame_qindex[KEY_FRAME];
+  if (svc->number_temporal_layers > 1) {
+    int layer = LAYER_IDS_TO_IDX(0, 0, svc->number_temporal_layers);
+    const LAYER_CONTEXT *lc = &svc->layer_context[layer];
+    const PRIMARY_RATE_CONTROL *const lp_rc = &lc->p_rc;
+    avg_qindex_key = lp_rc->avg_frame_qindex[KEY_FRAME];
+    if (svc->temporal_layer_id == 0)
+      avg_qindex_key =
+          AOMMIN(lp_rc->avg_frame_qindex[KEY_FRAME], lp_rc->last_q[KEY_FRAME]);
+  }
+  ambient_qp = (cm->current_frame.frame_number < num_frames_weight_key)
+                   ? AOMMIN(p_rc->avg_frame_qindex[INTER_FRAME], avg_qindex_key)
+                   : p_rc->avg_frame_qindex[INTER_FRAME];
   active_worst_quality = AOMMIN(rc->worst_quality, ambient_qp * 5 / 4);
-  if (rc->buffer_level > rc->optimal_buffer_level) {
+  if (p_rc->buffer_level > p_rc->optimal_buffer_level) {
     // Adjust down.
     // Maximum limit for down adjustment, ~30%.
     int max_adjustment_down = active_worst_quality / 3;
     if (max_adjustment_down) {
-      buff_lvl_step = ((rc->maximum_buffer_size - rc->optimal_buffer_level) /
-                       max_adjustment_down);
+      buff_lvl_step =
+          ((p_rc->maximum_buffer_size - p_rc->optimal_buffer_level) /
+           max_adjustment_down);
       if (buff_lvl_step)
-        adjustment = (int)((rc->buffer_level - rc->optimal_buffer_level) /
+        adjustment = (int)((p_rc->buffer_level - p_rc->optimal_buffer_level) /
                            buff_lvl_step);
       active_worst_quality -= adjustment;
     }
-  } else if (rc->buffer_level > critical_level) {
+  } else if (p_rc->buffer_level > critical_level) {
     // Adjust up from ambient Q.
     if (critical_level) {
-      buff_lvl_step = (rc->optimal_buffer_level - critical_level);
+      buff_lvl_step = (p_rc->optimal_buffer_level - critical_level);
       if (buff_lvl_step) {
         adjustment = (int)((rc->worst_quality - ambient_qp) *
-                           (rc->optimal_buffer_level - rc->buffer_level) /
+                           (p_rc->optimal_buffer_level - p_rc->buffer_level) /
                            buff_lvl_step);
       }
       active_worst_quality = ambient_qp + adjustment;
@@ -838,10 +945,11 @@
                                                  int width, int height) {
   const AV1_COMMON *const cm = &cpi->common;
   const RATE_CONTROL *const rc = &cpi->rc;
-  const RefreshFrameFlagsInfo *const refresh_frame_flags = &cpi->refresh_frame;
+  const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
   const CurrentFrame *const current_frame = &cm->current_frame;
   int *rtc_minq;
-  const int bit_depth = cm->seq_params.bit_depth;
+  const int bit_depth = cm->seq_params->bit_depth;
   int active_best_quality = rc->best_quality;
   ASSIGN_MINQ_TABLE(bit_depth, rtc_minq);
 
@@ -849,8 +957,8 @@
     // Handle the special case for key frames forced when we have reached
     // the maximum key frame interval. Here force the Q to a range
     // based on the ambient Q to reduce the risk of popping.
-    if (rc->this_key_frame_forced) {
-      int qindex = rc->last_boosted_qindex;
+    if (p_rc->this_key_frame_forced) {
+      int qindex = p_rc->last_boosted_qindex;
       double last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
       int delta_qindex = av1_compute_qdelta(rc, last_boosted_q,
                                             (last_boosted_q * 0.75), bit_depth);
@@ -859,8 +967,8 @@
       // not first frame of one pass and kf_boost is set
       double q_adj_factor = 1.0;
       double q_val;
-      active_best_quality =
-          get_kf_active_quality(rc, rc->avg_frame_qindex[KEY_FRAME], bit_depth);
+      active_best_quality = get_kf_active_quality(
+          p_rc, p_rc->avg_frame_qindex[KEY_FRAME], bit_depth);
       // Allow somewhat lower kf minq with small image formats.
       if ((width * height) <= (352 * 288)) {
         q_adj_factor -= 0.25;
@@ -871,25 +979,24 @@
       active_best_quality +=
           av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, bit_depth);
     }
-  } else if (!rc->is_src_frame_alt_ref && !cpi->use_svc &&
+  } else if (!rc->is_src_frame_alt_ref && !cpi->ppi->use_svc &&
              cpi->oxcf.rc_cfg.gf_cbr_boost_pct &&
-             (refresh_frame_flags->golden_frame ||
-              refresh_frame_flags->alt_ref_frame)) {
+             (refresh_frame->golden_frame || refresh_frame->alt_ref_frame)) {
     // Use the lower of active_worst_quality and recent
     // average Q as basis for GF/ARF best Q limit unless last frame was
     // a key frame.
     int q = active_worst_quality;
     if (rc->frames_since_key > 1 &&
-        rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) {
-      q = rc->avg_frame_qindex[INTER_FRAME];
+        p_rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) {
+      q = p_rc->avg_frame_qindex[INTER_FRAME];
     }
-    active_best_quality = get_gf_active_quality(rc, q, bit_depth);
+    active_best_quality = get_gf_active_quality(p_rc, q, bit_depth);
   } else {
     // Use the lower of active_worst_quality and recent/average Q.
     FRAME_TYPE frame_type =
         (current_frame->frame_number > 1) ? INTER_FRAME : KEY_FRAME;
-    if (rc->avg_frame_qindex[frame_type] < active_worst_quality)
-      active_best_quality = rtc_minq[rc->avg_frame_qindex[frame_type]];
+    if (p_rc->avg_frame_qindex[frame_type] < active_worst_quality)
+      active_best_quality = rtc_minq[p_rc->avg_frame_qindex[frame_type]];
     else
       active_best_quality = rtc_minq[active_worst_quality];
   }
@@ -916,9 +1023,10 @@
                                              int *top_index) {
   const AV1_COMMON *const cm = &cpi->common;
   const RATE_CONTROL *const rc = &cpi->rc;
+  const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
   const CurrentFrame *const current_frame = &cm->current_frame;
   int q;
-  const int bit_depth = cm->seq_params.bit_depth;
+  const int bit_depth = cm->seq_params->bit_depth;
   int active_worst_quality = calc_active_worst_quality_no_stats_cbr(cpi);
   int active_best_quality = calc_active_best_quality_no_stats_cbr(
       cpi, active_worst_quality, width, height);
@@ -935,10 +1043,9 @@
   *bottom_index = active_best_quality;
 
   // Limit Q range for the adaptive loop.
-  if (current_frame->frame_type == KEY_FRAME && !rc->this_key_frame_forced &&
+  if (current_frame->frame_type == KEY_FRAME && !p_rc->this_key_frame_forced &&
       current_frame->frame_number != 0) {
     int qdelta = 0;
-    aom_clear_system_state();
     qdelta = av1_compute_qdelta_by_rate(&cpi->rc, current_frame->frame_type,
                                         active_worst_quality, 2.0,
                                         cpi->is_screen_content_type, bit_depth);
@@ -947,8 +1054,8 @@
   }
 
   // Special case code to try and match quality with forced key frames
-  if (current_frame->frame_type == KEY_FRAME && rc->this_key_frame_forced) {
-    q = rc->last_boosted_qindex;
+  if (current_frame->frame_type == KEY_FRAME && p_rc->this_key_frame_forced) {
+    q = p_rc->last_boosted_qindex;
   } else {
     q = av1_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality,
                           active_worst_quality, width, height);
@@ -973,6 +1080,7 @@
 }
 
 static int get_active_cq_level(const RATE_CONTROL *rc,
+                               const PRIMARY_RATE_CONTROL *p_rc,
                                const AV1EncoderConfig *const oxcf,
                                int intra_only, aom_superres_mode superres_mode,
                                int superres_denom) {
@@ -997,8 +1105,8 @@
           active_cq_level - ((superres_denom - SCALE_NUMERATOR) * mult), 0);
     }
   }
-  if (rc_cfg->mode == AOM_CQ && rc->total_target_bits > 0) {
-    const double x = (double)rc->total_actual_bits / rc->total_target_bits;
+  if (rc_cfg->mode == AOM_CQ && p_rc->total_target_bits > 0) {
+    const double x = (double)p_rc->total_actual_bits / p_rc->total_target_bits;
     if (x < cq_adjust_threshold) {
       active_cq_level = (int)(active_cq_level * x / cq_adjust_threshold);
     }
@@ -1006,62 +1114,6 @@
   return active_cq_level;
 }
 
-/*! \brief Pick q index for this frame using fixed q index offsets.
- *
- * The q index offsets are fixed in the sense that they are independent of the
- * video content. The offsets for each pyramid level are taken from
- * \c oxcf->q_cfg.fixed_qp_offsets array.
- *
- * \ingroup rate_control
- * \param[in]   oxcf        Top level encoder configuration
- * \param[in]   rc          Top level rate control structure
- * \param[in]   gf_group    Configuration of current golden frame group
- * \param[in]   gf_index    Index of this frame in the golden frame group
- * \param[in]   cq_level    Upper bound for q index (this may be same as
- *                          \c oxcf->cq_level, or slightly modified for some
- *                          special cases)
- * \param[in]   bit_depth   Bit depth of the codec (same as
- *                          \c cm->seq_params.bit_depth)
- * \return Returns selected q index to be used for encoding this frame.
- */
-static int get_q_using_fixed_offsets(const AV1EncoderConfig *const oxcf,
-                                     const RATE_CONTROL *const rc,
-                                     const GF_GROUP *const gf_group,
-                                     int gf_index, int cq_level,
-                                     int bit_depth) {
-  assert(oxcf->q_cfg.use_fixed_qp_offsets);
-  assert(oxcf->rc_cfg.mode == AOM_Q);
-  const FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_index];
-
-  int offset_idx = -1;
-  if (update_type == KF_UPDATE) {
-    if (rc->frames_to_key <= 1) {
-      // Image / intra-only coding: ignore offsets.
-      return cq_level;
-    }
-    offset_idx = 0;
-  } else if (update_type == ARF_UPDATE || update_type == GF_UPDATE) {
-    offset_idx = 1;
-  } else if (update_type == INTNL_ARF_UPDATE) {
-    offset_idx =
-        AOMMIN(gf_group->layer_depth[gf_index], FIXED_QP_OFFSET_COUNT - 1);
-  } else {  // Leaf level / overlay frame.
-    assert(update_type == LF_UPDATE || update_type == OVERLAY_UPDATE ||
-           update_type == INTNL_OVERLAY_UPDATE);
-    return cq_level;  // Directly Return worst quality allowed.
-  }
-  assert(offset_idx >= 0 && offset_idx < FIXED_QP_OFFSET_COUNT);
-  assert(oxcf->q_cfg.fixed_qp_offsets[offset_idx] >= 0);
-
-  // Get qindex offset, by first converting to 'q' and then back.
-  const double q_val_orig = av1_convert_qindex_to_q(cq_level, bit_depth);
-  const double q_val_target =
-      AOMMAX(q_val_orig - oxcf->q_cfg.fixed_qp_offsets[offset_idx], 0.0);
-  const int delta_qindex =
-      av1_compute_qdelta(rc, q_val_orig, q_val_target, bit_depth);
-  return AOMMAX(cq_level + delta_qindex, 0);
-}
-
 /*!\brief Picks q and q bounds given non-CBR rate control params in \c cpi->rc.
  *
  * Handles the special case when using:
@@ -1074,38 +1126,30 @@
  * \param[in]       cpi          Top level encoder structure
  * \param[in]       width        Coded frame width
  * \param[in]       height       Coded frame height
- * \param[in]       gf_index     Index of this frame in the golden frame group
  * \param[out]      bottom_index Bottom bound for q index (best quality)
  * \param[out]      top_index    Top bound for q index (worst quality)
  * \return Returns selected q index to be used for encoding this frame.
  */
 static int rc_pick_q_and_bounds_no_stats(const AV1_COMP *cpi, int width,
-                                         int height, int gf_index,
-                                         int *bottom_index, int *top_index) {
+                                         int height, int *bottom_index,
+                                         int *top_index) {
   const AV1_COMMON *const cm = &cpi->common;
   const RATE_CONTROL *const rc = &cpi->rc;
+  const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
   const CurrentFrame *const current_frame = &cm->current_frame;
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
-  const RefreshFrameFlagsInfo *const refresh_frame_flags = &cpi->refresh_frame;
-  const GF_GROUP *const gf_group = &cpi->gf_group;
+  const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
   const enum aom_rc_mode rc_mode = oxcf->rc_cfg.mode;
 
   assert(has_no_stats_stage(cpi));
   assert(rc_mode == AOM_VBR ||
          (!USE_UNRESTRICTED_Q_IN_CQ_MODE && rc_mode == AOM_CQ) ||
          rc_mode == AOM_Q);
-  assert(
-      IMPLIES(rc_mode == AOM_Q, gf_group->update_type[gf_index] == ARF_UPDATE));
 
   const int cq_level =
-      get_active_cq_level(rc, oxcf, frame_is_intra_only(cm), cpi->superres_mode,
-                          cm->superres_scale_denominator);
-  const int bit_depth = cm->seq_params.bit_depth;
-
-  if (oxcf->q_cfg.use_fixed_qp_offsets) {
-    return get_q_using_fixed_offsets(oxcf, rc, gf_group, gf_index, cq_level,
-                                     bit_depth);
-  }
+      get_active_cq_level(rc, p_rc, oxcf, frame_is_intra_only(cm),
+                          cpi->superres_mode, cm->superres_scale_denominator);
+  const int bit_depth = cm->seq_params->bit_depth;
 
   int active_best_quality;
   int active_worst_quality = calc_active_worst_quality_no_stats_vbr(cpi);
@@ -1120,8 +1164,8 @@
       const int delta_qindex =
           av1_compute_qdelta(rc, q_val, q_val * 0.25, bit_depth);
       active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
-    } else if (rc->this_key_frame_forced) {
-      const int qindex = rc->last_boosted_qindex;
+    } else if (p_rc->this_key_frame_forced) {
+      int qindex = p_rc->last_boosted_qindex;
       const double last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
       const int delta_qindex = av1_compute_qdelta(
           rc, last_boosted_q, last_boosted_q * 0.75, bit_depth);
@@ -1129,8 +1173,8 @@
     } else {  // not first frame of one pass and kf_boost is set
       double q_adj_factor = 1.0;
 
-      active_best_quality =
-          get_kf_active_quality(rc, rc->avg_frame_qindex[KEY_FRAME], bit_depth);
+      active_best_quality = get_kf_active_quality(
+          p_rc, p_rc->avg_frame_qindex[KEY_FRAME], bit_depth);
 
       // Allow somewhat lower kf minq with small image formats.
       if ((width * height) <= (352 * 288)) {
@@ -1146,31 +1190,30 @@
       }
     }
   } else if (!rc->is_src_frame_alt_ref &&
-             (refresh_frame_flags->golden_frame ||
-              refresh_frame_flags->alt_ref_frame)) {
+             (refresh_frame->golden_frame || refresh_frame->alt_ref_frame)) {
     // Use the lower of active_worst_quality and recent
     // average Q as basis for GF/ARF best Q limit unless last frame was
     // a key frame.
     q = (rc->frames_since_key > 1 &&
-         rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality)
-            ? rc->avg_frame_qindex[INTER_FRAME]
-            : rc->avg_frame_qindex[KEY_FRAME];
+         p_rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality)
+            ? p_rc->avg_frame_qindex[INTER_FRAME]
+            : p_rc->avg_frame_qindex[KEY_FRAME];
     // For constrained quality dont allow Q less than the cq level
     if (rc_mode == AOM_CQ) {
       if (q < cq_level) q = cq_level;
-      active_best_quality = get_gf_active_quality(rc, q, bit_depth);
+      active_best_quality = get_gf_active_quality(p_rc, q, bit_depth);
       // Constrained quality use slightly lower active best.
       active_best_quality = active_best_quality * 15 / 16;
     } else if (rc_mode == AOM_Q) {
       const int qindex = cq_level;
       const double q_val = av1_convert_qindex_to_q(qindex, bit_depth);
       const int delta_qindex =
-          (refresh_frame_flags->alt_ref_frame)
+          (refresh_frame->alt_ref_frame)
               ? av1_compute_qdelta(rc, q_val, q_val * 0.40, bit_depth)
               : av1_compute_qdelta(rc, q_val, q_val * 0.50, bit_depth);
       active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
     } else {
-      active_best_quality = get_gf_active_quality(rc, q, bit_depth);
+      active_best_quality = get_gf_active_quality(p_rc, q, bit_depth);
     }
   } else {
     if (rc_mode == AOM_Q) {
@@ -1185,9 +1228,10 @@
       active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
     } else {
       // Use the lower of active_worst_quality and recent/average Q.
-      active_best_quality = (current_frame->frame_number > 1)
-                                ? inter_minq[rc->avg_frame_qindex[INTER_FRAME]]
-                                : inter_minq[rc->avg_frame_qindex[KEY_FRAME]];
+      active_best_quality =
+          (current_frame->frame_number > 1)
+              ? inter_minq[p_rc->avg_frame_qindex[INTER_FRAME]]
+              : inter_minq[p_rc->avg_frame_qindex[KEY_FRAME]];
       // For the constrained quality mode we don't want
       // q to fall below the cq level.
       if ((rc_mode == AOM_CQ) && (active_best_quality < cq_level)) {
@@ -1208,15 +1252,13 @@
   // Limit Q range for the adaptive loop.
   {
     int qdelta = 0;
-    aom_clear_system_state();
-    if (current_frame->frame_type == KEY_FRAME && !rc->this_key_frame_forced &&
-        current_frame->frame_number != 0) {
+    if (current_frame->frame_type == KEY_FRAME &&
+        !p_rc->this_key_frame_forced && current_frame->frame_number != 0) {
       qdelta = av1_compute_qdelta_by_rate(
           &cpi->rc, current_frame->frame_type, active_worst_quality, 2.0,
           cpi->is_screen_content_type, bit_depth);
     } else if (!rc->is_src_frame_alt_ref &&
-               (refresh_frame_flags->golden_frame ||
-                refresh_frame_flags->alt_ref_frame)) {
+               (refresh_frame->golden_frame || refresh_frame->alt_ref_frame)) {
       qdelta = av1_compute_qdelta_by_rate(
           &cpi->rc, current_frame->frame_type, active_worst_quality, 1.75,
           cpi->is_screen_content_type, bit_depth);
@@ -1229,8 +1271,8 @@
     q = active_best_quality;
     // Special case code to try and match quality with forced key frames
   } else if ((current_frame->frame_type == KEY_FRAME) &&
-             rc->this_key_frame_forced) {
-    q = rc->last_boosted_qindex;
+             p_rc->this_key_frame_forced) {
+    q = p_rc->last_boosted_qindex;
   } else {
     q = av1_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality,
                           active_worst_quality, width, height);
@@ -1254,7 +1296,7 @@
                                                              1.50, 1.25, 1.15,
                                                              1.0 };
 int av1_frame_type_qdelta(const AV1_COMP *cpi, int q) {
-  const GF_GROUP *const gf_group = &cpi->gf_group;
+  const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
   const RATE_FACTOR_LEVEL rf_lvl =
       get_rate_factor_level(gf_group, cpi->gf_frame_index);
   const FRAME_TYPE frame_type = gf_group->frame_type[cpi->gf_frame_index];
@@ -1264,7 +1306,7 @@
 
   return av1_compute_qdelta_by_rate(&cpi->rc, frame_type, q, rate_factor,
                                     cpi->is_screen_content_type,
-                                    cpi->common.seq_params.bit_depth);
+                                    cpi->common.seq_params->bit_depth);
 }
 
 // This unrestricted Q selection on CQ mode is useful when testing new features,
@@ -1279,7 +1321,7 @@
   const int cq_level =
       get_active_cq_level(rc, oxcf, frame_is_intra_only(cm), cpi->superres_mode,
                           cm->superres_scale_denominator);
-  const int bit_depth = cm->seq_params.bit_depth;
+  const int bit_depth = cm->seq_params->bit_depth;
   const int q = (int)av1_convert_qindex_to_q(cq_level, bit_depth);
   (void)width;
   (void)height;
@@ -1296,13 +1338,14 @@
 #define STATIC_MOTION_THRESH 95
 static void get_intra_q_and_bounds(const AV1_COMP *cpi, int width, int height,
                                    int *active_best, int *active_worst,
-                                   int cq_level, int is_fwd_kf) {
+                                   int cq_level) {
   const AV1_COMMON *const cm = &cpi->common;
   const RATE_CONTROL *const rc = &cpi->rc;
+  const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
   int active_best_quality;
   int active_worst_quality = *active_worst;
-  const int bit_depth = cm->seq_params.bit_depth;
+  const int bit_depth = cm->seq_params->bit_depth;
 
   if (rc->frames_to_key <= 1 && oxcf->rc_cfg.mode == AOM_Q) {
     // If the next frame is also a key frame or the current frame is the
@@ -1310,26 +1353,17 @@
     // as q.
     active_best_quality = cq_level;
     active_worst_quality = cq_level;
-  } else if (is_fwd_kf) {
-    // Handle the special case for forward reference key frames.
-    // Increase the boost because this keyframe is used as a forward and
-    // backward reference.
-    const int qindex = rc->last_boosted_qindex;
-    const double last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
-    const int delta_qindex = av1_compute_qdelta(
-        rc, last_boosted_q, last_boosted_q * 0.25, bit_depth);
-    active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
-  } else if (rc->this_key_frame_forced) {
+  } else if (p_rc->this_key_frame_forced) {
     // Handle the special case for key frames forced when we have reached
     // the maximum key frame interval. Here force the Q to a range
     // based on the ambient Q to reduce the risk of popping.
     double last_boosted_q;
     int delta_qindex;
     int qindex;
-
+    int last_boosted_qindex = p_rc->last_boosted_qindex;
     if (is_stat_consumption_stage_twopass(cpi) &&
-        cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) {
-      qindex = AOMMIN(rc->last_kf_qindex, rc->last_boosted_qindex);
+        cpi->ppi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) {
+      qindex = AOMMIN(p_rc->last_kf_qindex, last_boosted_qindex);
       active_best_quality = qindex;
       last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
       delta_qindex = av1_compute_qdelta(rc, last_boosted_q,
@@ -1337,7 +1371,7 @@
       active_worst_quality =
           AOMMIN(qindex + delta_qindex, active_worst_quality);
     } else {
-      qindex = rc->last_boosted_qindex;
+      qindex = last_boosted_qindex;
       last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
       delta_qindex = av1_compute_qdelta(rc, last_boosted_q,
                                         last_boosted_q * 0.50, bit_depth);
@@ -1350,13 +1384,13 @@
 
     // Baseline value derived from cpi->active_worst_quality and kf boost.
     active_best_quality =
-        get_kf_active_quality(rc, active_worst_quality, bit_depth);
+        get_kf_active_quality(p_rc, active_worst_quality, bit_depth);
     if (cpi->is_screen_content_type) {
       active_best_quality /= 2;
     }
 
     if (is_stat_consumption_stage_twopass(cpi) &&
-        cpi->twopass.kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH) {
+        cpi->ppi->twopass.kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH) {
       active_best_quality /= 3;
     }
 
@@ -1367,7 +1401,8 @@
 
     // Make a further adjustment based on the kf zero motion measure.
     if (is_stat_consumption_stage_twopass(cpi))
-      q_adj_factor += 0.05 - (0.001 * (double)cpi->twopass.kf_zeromotion_pct);
+      q_adj_factor +=
+          0.05 - (0.001 * (double)cpi->ppi->twopass.kf_zeromotion_pct);
 
     // Convert the adjustment factor to a qindex delta
     // on active_best_quality.
@@ -1398,8 +1433,9 @@
                                                  int *active_best) {
   const AV1_COMMON *const cm = &cpi->common;
   const RATE_CONTROL *const rc = &cpi->rc;
-  const RefreshFrameFlagsInfo *const refresh_frame_flags = &cpi->refresh_frame;
-  const int bit_depth = cpi->common.seq_params.bit_depth;
+  const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+  const int bit_depth = cpi->common.seq_params->bit_depth;
   int active_best_quality = *active_best;
   int active_worst_quality = *active_worst;
   // Extension to max or min Q if undershoot or overshoot is outside
@@ -1407,23 +1443,23 @@
   if (cpi->oxcf.rc_cfg.mode != AOM_Q) {
     if (frame_is_intra_only(cm) ||
         (!rc->is_src_frame_alt_ref &&
-         (refresh_frame_flags->golden_frame || is_intrl_arf_boost ||
-          refresh_frame_flags->alt_ref_frame))) {
+         (refresh_frame->golden_frame || is_intrl_arf_boost ||
+          refresh_frame->alt_ref_frame))) {
       active_best_quality -=
-          (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast);
-      active_worst_quality += (cpi->twopass.extend_maxq / 2);
+          (cpi->ppi->twopass.extend_minq + cpi->ppi->twopass.extend_minq_fast);
+      active_worst_quality += (cpi->ppi->twopass.extend_maxq / 2);
     } else {
       active_best_quality -=
-          (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast) / 2;
-      active_worst_quality += cpi->twopass.extend_maxq;
+          (cpi->ppi->twopass.extend_minq + cpi->ppi->twopass.extend_minq_fast) /
+          2;
+      active_worst_quality += cpi->ppi->twopass.extend_maxq;
     }
   }
 
-  aom_clear_system_state();
 #ifndef STRICT_RC
   // Static forced key frames Q restrictions dealt with elsewhere.
-  if (!(frame_is_intra_only(cm)) || !rc->this_key_frame_forced ||
-      (cpi->twopass.last_kfgroup_zeromotion_pct < STATIC_MOTION_THRESH)) {
+  if (!(frame_is_intra_only(cm)) || !p_rc->this_key_frame_forced ||
+      (cpi->ppi->twopass.last_kfgroup_zeromotion_pct < STATIC_MOTION_THRESH)) {
     const int qdelta = av1_frame_type_qdelta(cpi, active_worst_quality);
     active_worst_quality =
         AOMMAX(active_worst_quality + qdelta, active_best_quality);
@@ -1468,20 +1504,22 @@
                  const int active_best_quality) {
   const AV1_COMMON *const cm = &cpi->common;
   const RATE_CONTROL *const rc = &cpi->rc;
+  const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
   int q;
+  int last_boosted_qindex = p_rc->last_boosted_qindex;
 
   if (cpi->oxcf.rc_cfg.mode == AOM_Q ||
-      (frame_is_intra_only(cm) && !rc->this_key_frame_forced &&
-       cpi->twopass.kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH &&
+      (frame_is_intra_only(cm) && !p_rc->this_key_frame_forced &&
+       cpi->ppi->twopass.kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH &&
        rc->frames_to_key > 1)) {
     q = active_best_quality;
     // Special case code to try and match quality with forced key frames.
-  } else if (frame_is_intra_only(cm) && rc->this_key_frame_forced) {
+  } else if (frame_is_intra_only(cm) && p_rc->this_key_frame_forced) {
     // If static since last kf use better of last boosted and last kf q.
-    if (cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) {
-      q = AOMMIN(rc->last_kf_qindex, rc->last_boosted_qindex);
+    if (cpi->ppi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) {
+      q = AOMMIN(p_rc->last_kf_qindex, last_boosted_qindex);
     } else {
-      q = AOMMIN(rc->last_boosted_qindex,
+      q = AOMMIN(last_boosted_qindex,
                  (active_best_quality + active_worst_quality) / 2);
     }
     q = clamp(q, active_best_quality, active_worst_quality);
@@ -1508,20 +1546,28 @@
                                    const int active_worst_quality,
                                    const int cq_level, const int gf_index) {
   const AV1_COMMON *const cm = &cpi->common;
-  const int bit_depth = cm->seq_params.bit_depth;
+  const int bit_depth = cm->seq_params->bit_depth;
   const RATE_CONTROL *const rc = &cpi->rc;
+  const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
-  const RefreshFrameFlagsInfo *const refresh_frame_flags = &cpi->refresh_frame;
-  const GF_GROUP *gf_group = &cpi->gf_group;
+  const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+  const GF_GROUP *gf_group = &cpi->ppi->gf_group;
   const enum aom_rc_mode rc_mode = oxcf->rc_cfg.mode;
   int *inter_minq;
   ASSIGN_MINQ_TABLE(bit_depth, inter_minq);
   int active_best_quality = 0;
   const int is_intrl_arf_boost =
       gf_group->update_type[gf_index] == INTNL_ARF_UPDATE;
-  const int is_leaf_frame =
-      !(refresh_frame_flags->golden_frame ||
-        refresh_frame_flags->alt_ref_frame || is_intrl_arf_boost);
+  int is_leaf_frame =
+      !(gf_group->update_type[gf_index] == ARF_UPDATE ||
+        gf_group->update_type[gf_index] == GF_UPDATE || is_intrl_arf_boost);
+
+  // TODO(jingning): Consider to rework this hack that covers issues incurred
+  // in lightfield setting.
+  if (cm->tiles.large_scale) {
+    is_leaf_frame = !(refresh_frame->golden_frame ||
+                      refresh_frame->alt_ref_frame || is_intrl_arf_boost);
+  }
   const int is_overlay_frame = rc->is_src_frame_alt_ref;
 
   if (is_leaf_frame || is_overlay_frame) {
@@ -1536,31 +1582,25 @@
     return active_best_quality;
   }
 
-  // TODO(chengchen): can we remove this condition?
-  if (rc_mode == AOM_Q && !refresh_frame_flags->alt_ref_frame &&
-      !refresh_frame_flags->golden_frame && !is_intrl_arf_boost) {
-    return cq_level;
-  }
-
   // Determine active_best_quality for frames that are not leaf or overlay.
   int q = active_worst_quality;
   // Use the lower of active_worst_quality and recent
   // average Q as basis for GF/ARF best Q limit unless last frame was
   // a key frame.
   if (rc->frames_since_key > 1 &&
-      rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) {
-    q = rc->avg_frame_qindex[INTER_FRAME];
+      p_rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) {
+    q = p_rc->avg_frame_qindex[INTER_FRAME];
   }
   if (rc_mode == AOM_CQ && q < cq_level) q = cq_level;
-  active_best_quality = get_gf_active_quality(rc, q, bit_depth);
+  active_best_quality = get_gf_active_quality(p_rc, q, bit_depth);
   // Constrained quality use slightly lower active best.
   if (rc_mode == AOM_CQ) active_best_quality = active_best_quality * 15 / 16;
   const int min_boost = get_gf_high_motion_quality(q, bit_depth);
   const int boost = min_boost - active_best_quality;
-  active_best_quality = min_boost - (int)(boost * rc->arf_boost_factor);
+  active_best_quality = min_boost - (int)(boost * p_rc->arf_boost_factor);
   if (!is_intrl_arf_boost) return active_best_quality;
 
-  if (rc_mode == AOM_Q || rc_mode == AOM_CQ) active_best_quality = rc->arf_q;
+  if (rc_mode == AOM_Q || rc_mode == AOM_CQ) active_best_quality = p_rc->arf_q;
   int this_height = gf_group_pyramid_level(gf_group, gf_index);
   while (this_height > 1) {
     active_best_quality = (active_best_quality + active_worst_quality + 1) / 2;
@@ -1569,6 +1609,84 @@
   return active_best_quality;
 }
 
+// Returns the q_index for a single frame in the GOP.
+// This function assumes that rc_mode == AOM_Q mode.
+int av1_q_mode_get_q_index(int base_q_index, int gf_update_type,
+                           int gf_pyramid_level, int arf_q) {
+  const int is_intrl_arf_boost = gf_update_type == INTNL_ARF_UPDATE;
+  int is_leaf_or_overlay_frame = gf_update_type == LF_UPDATE ||
+                                 gf_update_type == OVERLAY_UPDATE ||
+                                 gf_update_type == INTNL_OVERLAY_UPDATE;
+
+  if (is_leaf_or_overlay_frame) return base_q_index;
+
+  if (!is_intrl_arf_boost) return arf_q;
+
+  int active_best_quality = arf_q;
+  int active_worst_quality = base_q_index;
+
+  while (gf_pyramid_level > 1) {
+    active_best_quality = (active_best_quality + active_worst_quality + 1) / 2;
+    --gf_pyramid_level;
+  }
+  return active_best_quality;
+}
+
+// Returns the q_index for the ARF in the GOP.
+int av1_get_arf_q_index(int base_q_index, int gfu_boost, int bit_depth,
+                        double arf_boost_factor) {
+  int active_best_quality =
+      get_gf_active_quality_no_rc(gfu_boost, base_q_index, bit_depth);
+  const int min_boost = get_gf_high_motion_quality(base_q_index, bit_depth);
+  const int boost = min_boost - active_best_quality;
+  return min_boost - (int)(boost * arf_boost_factor);
+}
+
+static int rc_pick_q_and_bounds_q_mode(const AV1_COMP *cpi, int width,
+                                       int height, int gf_index,
+                                       int *bottom_index, int *top_index) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  const int cq_level =
+      get_active_cq_level(rc, p_rc, oxcf, frame_is_intra_only(cm),
+                          cpi->superres_mode, cm->superres_scale_denominator);
+  int active_best_quality = 0;
+  int active_worst_quality = rc->active_worst_quality;
+  int q;
+
+  if (frame_is_intra_only(cm)) {
+    get_intra_q_and_bounds(cpi, width, height, &active_best_quality,
+                           &active_worst_quality, cq_level);
+  } else {
+    //  Active best quality limited by previous layer.
+    active_best_quality =
+        get_active_best_quality(cpi, active_worst_quality, cq_level, gf_index);
+  }
+
+  *top_index = active_worst_quality;
+  *bottom_index = active_best_quality;
+
+  *top_index = AOMMAX(*top_index, rc->best_quality);
+  *top_index = AOMMIN(*top_index, rc->worst_quality);
+
+  *bottom_index = AOMMAX(*bottom_index, rc->best_quality);
+  *bottom_index = AOMMIN(*bottom_index, rc->worst_quality);
+
+  q = active_best_quality;
+
+  q = AOMMAX(q, rc->best_quality);
+  q = AOMMIN(q, rc->worst_quality);
+
+  assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality);
+  assert(*bottom_index <= rc->worst_quality &&
+         *bottom_index >= rc->best_quality);
+  assert(q <= rc->worst_quality && q >= rc->best_quality);
+
+  return q;
+}
+
 /*!\brief Picks q and q bounds given rate control parameters in \c cpi->rc.
  *
  * Handles the the general cases not covered by
@@ -1589,20 +1707,20 @@
                                 int *top_index) {
   const AV1_COMMON *const cm = &cpi->common;
   const RATE_CONTROL *const rc = &cpi->rc;
+  const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
-  const RefreshFrameFlagsInfo *const refresh_frame_flags = &cpi->refresh_frame;
-  const GF_GROUP *gf_group = &cpi->gf_group;
+  const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
+  const GF_GROUP *gf_group = &cpi->ppi->gf_group;
   assert(IMPLIES(has_no_stats_stage(cpi),
                  cpi->oxcf.rc_cfg.mode == AOM_Q &&
                      gf_group->update_type[gf_index] != ARF_UPDATE));
   const int cq_level =
-      get_active_cq_level(rc, oxcf, frame_is_intra_only(cm), cpi->superres_mode,
-                          cm->superres_scale_denominator);
-  const int bit_depth = cm->seq_params.bit_depth;
+      get_active_cq_level(rc, p_rc, oxcf, frame_is_intra_only(cm),
+                          cpi->superres_mode, cm->superres_scale_denominator);
 
-  if (oxcf->q_cfg.use_fixed_qp_offsets) {
-    return get_q_using_fixed_offsets(oxcf, rc, gf_group, cpi->gf_frame_index,
-                                     cq_level, bit_depth);
+  if (oxcf->rc_cfg.mode == AOM_Q) {
+    return rc_pick_q_and_bounds_q_mode(cpi, width, height, gf_index,
+                                       bottom_index, top_index);
   }
 
   int active_best_quality = 0;
@@ -1613,10 +1731,8 @@
       gf_group->update_type[gf_index] == INTNL_ARF_UPDATE;
 
   if (frame_is_intra_only(cm)) {
-    const int is_fwd_kf = cm->current_frame.frame_type == KEY_FRAME &&
-                          cm->show_frame == 0 && cpi->no_show_fwd_kf;
     get_intra_q_and_bounds(cpi, width, height, &active_best_quality,
-                           &active_worst_quality, cq_level, is_fwd_kf);
+                           &active_worst_quality, cq_level);
 #ifdef STRICT_RC
     active_best_quality = 0;
 #endif
@@ -1624,12 +1740,11 @@
     //  Active best quality limited by previous layer.
     const int pyramid_level = gf_group_pyramid_level(gf_group, gf_index);
 
-    if ((pyramid_level <= 1) || (pyramid_level > MAX_ARF_LAYERS) ||
-        (oxcf->rc_cfg.mode == AOM_Q)) {
+    if ((pyramid_level <= 1) || (pyramid_level > MAX_ARF_LAYERS)) {
       active_best_quality = get_active_best_quality(cpi, active_worst_quality,
                                                     cq_level, gf_index);
     } else {
-      active_best_quality = rc->active_best_quality[pyramid_level - 1] + 1;
+      active_best_quality = p_rc->active_best_quality[pyramid_level - 1] + 1;
       active_best_quality = AOMMIN(active_best_quality, active_worst_quality);
 #ifdef STRICT_RC
       active_best_quality += (active_worst_quality - active_best_quality) / 16;
@@ -1644,8 +1759,8 @@
     // leaf (non arf) frames. This is important to the TPL model which assumes
     // Q drops with each arf level.
     if (!(rc->is_src_frame_alt_ref) &&
-        (refresh_frame_flags->golden_frame ||
-         refresh_frame_flags->alt_ref_frame || is_intrl_arf_boost)) {
+        (refresh_frame->golden_frame || refresh_frame->alt_ref_frame ||
+         is_intrl_arf_boost)) {
       active_worst_quality =
           (active_best_quality + (3 * active_worst_quality) + 2) / 4;
     }
@@ -1672,13 +1787,13 @@
   return q;
 }
 
-int av1_rc_pick_q_and_bounds(const AV1_COMP *cpi, RATE_CONTROL *rc, int width,
-                             int height, int gf_index, int *bottom_index,
-                             int *top_index) {
+int av1_rc_pick_q_and_bounds(const AV1_COMP *cpi, int width, int height,
+                             int gf_index, int *bottom_index, int *top_index) {
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
   int q;
   // TODO(sarahparker) merge no-stats vbr and altref q computation
   // with rc_pick_q_and_bounds().
-  const GF_GROUP *gf_group = &cpi->gf_group;
+  const GF_GROUP *gf_group = &cpi->ppi->gf_group;
   if ((cpi->oxcf.rc_cfg.mode != AOM_Q ||
        gf_group->update_type[gf_index] == ARF_UPDATE) &&
       has_no_stats_stage(cpi)) {
@@ -1691,14 +1806,14 @@
                                            top_index);
 #endif  // USE_UNRESTRICTED_Q_IN_CQ_MODE
     } else {
-      q = rc_pick_q_and_bounds_no_stats(cpi, width, height, gf_index,
-                                        bottom_index, top_index);
+      q = rc_pick_q_and_bounds_no_stats(cpi, width, height, bottom_index,
+                                        top_index);
     }
   } else {
     q = rc_pick_q_and_bounds(cpi, width, height, gf_index, bottom_index,
                              top_index);
   }
-  if (gf_group->update_type[gf_index] == ARF_UPDATE) rc->arf_q = q;
+  if (gf_group->update_type[gf_index] == ARF_UPDATE) p_rc->arf_q = q;
 
   return q;
 }
@@ -1760,8 +1875,9 @@
   const AV1_COMMON *const cm = &cpi->common;
   const CurrentFrame *const current_frame = &cm->current_frame;
   RATE_CONTROL *const rc = &cpi->rc;
-  const GF_GROUP *const gf_group = &cpi->gf_group;
-  const RefreshFrameFlagsInfo *const refresh_frame_flags = &cpi->refresh_frame;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+  const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame;
 
   const int is_intrnl_arf =
       gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE;
@@ -1772,44 +1888,47 @@
   rc->projected_frame_size = (int)(bytes_used << 3);
 
   // Post encode loop adjustment of Q prediction.
-  av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
+  av1_rc_update_rate_correction_factors(cpi,
+#if CONFIG_FRAME_PARALLEL_ENCODE
+                                        0,
+#endif
+                                        cm->width, cm->height);
 
   // Keep a record of last Q and ambient average Q.
   if (current_frame->frame_type == KEY_FRAME) {
-    rc->last_q[KEY_FRAME] = qindex;
-    rc->avg_frame_qindex[KEY_FRAME] =
-        ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[KEY_FRAME] + qindex, 2);
+    p_rc->last_q[KEY_FRAME] = qindex;
+    p_rc->avg_frame_qindex[KEY_FRAME] =
+        ROUND_POWER_OF_TWO(3 * p_rc->avg_frame_qindex[KEY_FRAME] + qindex, 2);
   } else {
-    if ((cpi->use_svc && cpi->oxcf.rc_cfg.mode == AOM_CBR) ||
+    if ((cpi->ppi->use_svc && cpi->oxcf.rc_cfg.mode == AOM_CBR) ||
         (!rc->is_src_frame_alt_ref &&
-         !(refresh_frame_flags->golden_frame || is_intrnl_arf ||
-           refresh_frame_flags->alt_ref_frame))) {
-      rc->last_q[INTER_FRAME] = qindex;
-      rc->avg_frame_qindex[INTER_FRAME] =
-          ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[INTER_FRAME] + qindex, 2);
-      rc->ni_frames++;
-      rc->tot_q += av1_convert_qindex_to_q(qindex, cm->seq_params.bit_depth);
-      rc->avg_q = rc->tot_q / rc->ni_frames;
+         !(refresh_frame->golden_frame || is_intrnl_arf ||
+           refresh_frame->alt_ref_frame))) {
+      p_rc->last_q[INTER_FRAME] = qindex;
+      p_rc->avg_frame_qindex[INTER_FRAME] = ROUND_POWER_OF_TWO(
+          3 * p_rc->avg_frame_qindex[INTER_FRAME] + qindex, 2);
+      p_rc->ni_frames++;
+      p_rc->tot_q += av1_convert_qindex_to_q(qindex, cm->seq_params->bit_depth);
+      p_rc->avg_q = p_rc->tot_q / p_rc->ni_frames;
       // Calculate the average Q for normal inter frames (not key or GFU
       // frames).
       rc->ni_tot_qi += qindex;
-      rc->ni_av_qi = rc->ni_tot_qi / rc->ni_frames;
+      rc->ni_av_qi = rc->ni_tot_qi / p_rc->ni_frames;
     }
   }
-
   // Keep record of last boosted (KF/GF/ARF) Q value.
   // If the current frame is coded at a lower Q then we also update it.
   // If all mbs in this group are skipped only update if the Q value is
   // better than that already stored.
   // This is used to help set quality in forced key frames to reduce popping
-  if ((qindex < rc->last_boosted_qindex) ||
+  if ((qindex < p_rc->last_boosted_qindex) ||
       (current_frame->frame_type == KEY_FRAME) ||
-      (!rc->constrained_gf_group &&
-       (refresh_frame_flags->alt_ref_frame || is_intrnl_arf ||
-        (refresh_frame_flags->golden_frame && !rc->is_src_frame_alt_ref)))) {
-    rc->last_boosted_qindex = qindex;
+      (!p_rc->constrained_gf_group &&
+       (refresh_frame->alt_ref_frame || is_intrnl_arf ||
+        (refresh_frame->golden_frame && !rc->is_src_frame_alt_ref)))) {
+    p_rc->last_boosted_qindex = qindex;
   }
-  if (current_frame->frame_type == KEY_FRAME) rc->last_kf_qindex = qindex;
+  if (current_frame->frame_type == KEY_FRAME) p_rc->last_kf_qindex = qindex;
 
   update_buffer_level(cpi, rc->projected_frame_size);
   rc->prev_avg_frame_bandwidth = rc->avg_frame_bandwidth;
@@ -1821,19 +1940,19 @@
                                   resize_rate_factor(&cpi->oxcf.frm_dim_cfg,
                                                      cm->width, cm->height));
   if (current_frame->frame_type != KEY_FRAME) {
-    rc->rolling_target_bits = (int)ROUND_POWER_OF_TWO_64(
-        rc->rolling_target_bits * 3 + rc->this_frame_target, 2);
-    rc->rolling_actual_bits = (int)ROUND_POWER_OF_TWO_64(
-        rc->rolling_actual_bits * 3 + rc->projected_frame_size, 2);
+    p_rc->rolling_target_bits = (int)ROUND_POWER_OF_TWO_64(
+        p_rc->rolling_target_bits * 3 + rc->this_frame_target, 2);
+    p_rc->rolling_actual_bits = (int)ROUND_POWER_OF_TWO_64(
+        p_rc->rolling_actual_bits * 3 + rc->projected_frame_size, 2);
   }
 
   // Actual bits spent
-  rc->total_actual_bits += rc->projected_frame_size;
-  rc->total_target_bits += cm->show_frame ? rc->avg_frame_bandwidth : 0;
+  p_rc->total_actual_bits += rc->projected_frame_size;
+  p_rc->total_target_bits += cm->show_frame ? rc->avg_frame_bandwidth : 0;
 
   if (is_altref_enabled(cpi->oxcf.gf_cfg.lag_in_frames,
                         cpi->oxcf.gf_cfg.enable_auto_arf) &&
-      refresh_frame_flags->alt_ref_frame &&
+      refresh_frame->alt_ref_frame &&
       (current_frame->frame_type != KEY_FRAME && !frame_is_sframe(cm)))
     // Update the alternate reference frame stats as appropriate.
     update_alt_ref_frame_stats(cpi);
@@ -1857,6 +1976,7 @@
   cpi->rc.frames_to_key--;
   cpi->rc.rc_2_frame = 0;
   cpi->rc.rc_1_frame = 0;
+  cpi->rc.prev_avg_frame_bandwidth = cpi->rc.avg_frame_bandwidth;
 }
 
 int av1_find_qindex(double desired_q, aom_bit_depth_t bit_depth,
@@ -1940,9 +2060,9 @@
 
   // Special case code for 1 pass fixed Q mode tests
   if ((has_no_stats_stage(cpi)) && (oxcf->rc_cfg.mode == AOM_Q)) {
-    rc->max_gf_interval = FIXED_GF_INTERVAL;
-    rc->min_gf_interval = FIXED_GF_INTERVAL;
-    rc->static_scene_max_gf_interval = FIXED_GF_INTERVAL;
+    rc->max_gf_interval = oxcf->gf_cfg.max_gf_interval;
+    rc->min_gf_interval = oxcf->gf_cfg.min_gf_interval;
+    rc->static_scene_max_gf_interval = rc->min_gf_interval + 1;
   } else {
     // Set Maximum gf/arf interval
     rc->max_gf_interval = oxcf->gf_cfg.max_gf_interval;
@@ -1958,7 +2078,7 @@
      * The no.of.stats available in the case of LAP is limited,
      * hence setting to max_gf_interval.
      */
-    if (cpi->lap_enabled)
+    if (cpi->ppi->lap_enabled)
       rc->static_scene_max_gf_interval = rc->max_gf_interval + 1;
     else
       rc->static_scene_max_gf_interval = MAX_STATIC_GF_GROUP_LENGTH;
@@ -2005,10 +2125,11 @@
 // For VBR...adjustment to the frame target based on error from previous frames
 static void vbr_rate_correction(AV1_COMP *cpi, int *this_frame_target) {
   RATE_CONTROL *const rc = &cpi->rc;
-  int64_t vbr_bits_off_target = rc->vbr_bits_off_target;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  int64_t vbr_bits_off_target = p_rc->vbr_bits_off_target;
   const int stats_count =
-      cpi->twopass.stats_buf_ctx->total_stats != NULL
-          ? (int)cpi->twopass.stats_buf_ctx->total_stats->count
+      cpi->ppi->twopass.stats_buf_ctx->total_stats != NULL
+          ? (int)cpi->ppi->twopass.stats_buf_ctx->total_stats->count
           : 0;
   const int frame_window = AOMMIN(
       16, (int)(stats_count - (int)cpi->common.current_frame.frame_number));
@@ -2026,15 +2147,29 @@
   // Fast redistribution of bits arising from massive local undershoot.
   // Dont do it for kf,arf,gf or overlay frames.
   if (!frame_is_kf_gf_arf(cpi) && !rc->is_src_frame_alt_ref &&
-      rc->vbr_bits_off_target_fast) {
+      p_rc->vbr_bits_off_target_fast) {
     int one_frame_bits = AOMMAX(rc->avg_frame_bandwidth, *this_frame_target);
     int fast_extra_bits;
-    fast_extra_bits = (int)AOMMIN(rc->vbr_bits_off_target_fast, one_frame_bits);
+    fast_extra_bits =
+        (int)AOMMIN(p_rc->vbr_bits_off_target_fast, one_frame_bits);
     fast_extra_bits = (int)AOMMIN(
         fast_extra_bits,
-        AOMMAX(one_frame_bits / 8, rc->vbr_bits_off_target_fast / 8));
-    *this_frame_target += (int)fast_extra_bits;
-    rc->vbr_bits_off_target_fast -= fast_extra_bits;
+        AOMMAX(one_frame_bits / 8, p_rc->vbr_bits_off_target_fast / 8));
+    if (fast_extra_bits > 0) {
+      // Update this_frame_target only if additional bits are available from
+      // local undershoot.
+      *this_frame_target += (int)fast_extra_bits;
+    }
+#if CONFIG_FRAME_PARALLEL_ENCODE
+    // Store the fast_extra_bits of the frame and reduce it from
+    // vbr_bits_off_target_fast during postencode stage.
+    rc->frame_level_fast_extra_bits = fast_extra_bits;
+    // Retaining the condition to update during postencode stage since
+    // fast_extra_bits are calculated based on vbr_bits_off_target_fast.
+    cpi->do_update_vbr_bits_off_target_fast = 1;
+#else
+    p_rc->vbr_bits_off_target_fast -= fast_extra_bits;
+#endif
   }
 }
 
@@ -2052,16 +2187,17 @@
     const AV1_COMP *const cpi, FRAME_UPDATE_TYPE frame_update_type) {
   static const int af_ratio = 10;
   const RATE_CONTROL *const rc = &cpi->rc;
+  const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
   int64_t target;
 #if USE_ALTREF_FOR_ONE_PASS
   if (frame_update_type == KF_UPDATE || frame_update_type == GF_UPDATE ||
       frame_update_type == ARF_UPDATE) {
-    target = ((int64_t)rc->avg_frame_bandwidth * rc->baseline_gf_interval *
+    target = ((int64_t)rc->avg_frame_bandwidth * p_rc->baseline_gf_interval *
               af_ratio) /
-             (rc->baseline_gf_interval + af_ratio - 1);
+             (p_rc->baseline_gf_interval + af_ratio - 1);
   } else {
-    target = ((int64_t)rc->avg_frame_bandwidth * rc->baseline_gf_interval) /
-             (rc->baseline_gf_interval + af_ratio - 1);
+    target = ((int64_t)rc->avg_frame_bandwidth * p_rc->baseline_gf_interval) /
+             (p_rc->baseline_gf_interval + af_ratio - 1);
   }
   if (target > INT_MAX) target = INT_MAX;
 #else
@@ -2081,9 +2217,10 @@
     const AV1_COMP *cpi, FRAME_UPDATE_TYPE frame_update_type) {
   const AV1EncoderConfig *oxcf = &cpi->oxcf;
   const RATE_CONTROL *rc = &cpi->rc;
+  const PRIMARY_RATE_CONTROL *p_rc = &cpi->ppi->p_rc;
   const RateControlCfg *rc_cfg = &oxcf->rc_cfg;
-  const int64_t diff = rc->optimal_buffer_level - rc->buffer_level;
-  const int64_t one_pct_bits = 1 + rc->optimal_buffer_level / 100;
+  const int64_t diff = p_rc->optimal_buffer_level - p_rc->buffer_level;
+  const int64_t one_pct_bits = 1 + p_rc->optimal_buffer_level / 100;
   int min_frame_target =
       AOMMAX(rc->avg_frame_bandwidth >> 4, FRAME_OVERHEAD_BITS);
   int target;
@@ -2091,17 +2228,17 @@
   if (rc_cfg->gf_cbr_boost_pct) {
     const int af_ratio_pct = rc_cfg->gf_cbr_boost_pct + 100;
     if (frame_update_type == GF_UPDATE || frame_update_type == OVERLAY_UPDATE) {
-      target =
-          (rc->avg_frame_bandwidth * rc->baseline_gf_interval * af_ratio_pct) /
-          (rc->baseline_gf_interval * 100 + af_ratio_pct - 100);
+      target = (rc->avg_frame_bandwidth * p_rc->baseline_gf_interval *
+                af_ratio_pct) /
+               (p_rc->baseline_gf_interval * 100 + af_ratio_pct - 100);
     } else {
-      target = (rc->avg_frame_bandwidth * rc->baseline_gf_interval * 100) /
-               (rc->baseline_gf_interval * 100 + af_ratio_pct - 100);
+      target = (rc->avg_frame_bandwidth * p_rc->baseline_gf_interval * 100) /
+               (p_rc->baseline_gf_interval * 100 + af_ratio_pct - 100);
     }
   } else {
     target = rc->avg_frame_bandwidth;
   }
-  if (cpi->use_svc) {
+  if (cpi->ppi->use_svc) {
     // Note that for layers, avg_frame_bandwidth is the cumulative
     // per-frame-bandwidth. For the target size of this frame, use the
     // layer average frame size (i.e., non-cumulative per-frame-bw).
@@ -2133,11 +2270,15 @@
 
 int av1_calc_iframe_target_size_one_pass_cbr(const AV1_COMP *cpi) {
   const RATE_CONTROL *rc = &cpi->rc;
+  const PRIMARY_RATE_CONTROL *p_rc = &cpi->ppi->p_rc;
   int target;
   if (cpi->common.current_frame.frame_number == 0) {
-    target = ((rc->starting_buffer_level / 2) > INT_MAX)
+    target = ((p_rc->starting_buffer_level / 2) > INT_MAX)
                  ? INT_MAX
-                 : (int)(rc->starting_buffer_level / 2);
+                 : (int)(p_rc->starting_buffer_level / 2);
+    if (cpi->svc.number_temporal_layers > 1 && target < (INT_MAX >> 2)) {
+      target = target << AOMMIN(2, (cpi->svc.number_temporal_layers - 1));
+    }
   } else {
     int kf_boost = 32;
     double framerate = cpi->framerate;
@@ -2151,6 +2292,86 @@
   return av1_rc_clamp_iframe_target_size(cpi, target);
 }
 
+#define DEFAULT_KF_BOOST_RT 2300
+#define DEFAULT_GF_BOOST_RT 2000
+
+static void set_baseline_gf_interval(AV1_COMP *cpi, FRAME_TYPE frame_type) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+  if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ)
+    av1_cyclic_refresh_set_golden_update(cpi);
+  else
+    p_rc->baseline_gf_interval = FIXED_GF_INTERVAL;
+  if (p_rc->baseline_gf_interval > rc->frames_to_key)
+    p_rc->baseline_gf_interval = rc->frames_to_key;
+  p_rc->gfu_boost = DEFAULT_GF_BOOST_RT;
+  p_rc->constrained_gf_group =
+      (p_rc->baseline_gf_interval >= rc->frames_to_key) ? 1 : 0;
+  rc->frames_till_gf_update_due = p_rc->baseline_gf_interval;
+  cpi->gf_frame_index = 0;
+  // SVC does not use GF as periodic boost.
+  // TODO(marpan): Find better way to disable this for SVC.
+  if (cpi->ppi->use_svc) {
+    SVC *const svc = &cpi->svc;
+    p_rc->baseline_gf_interval = MAX_STATIC_GF_GROUP_LENGTH - 1;
+    p_rc->gfu_boost = 1;
+    p_rc->constrained_gf_group = 0;
+    rc->frames_till_gf_update_due = p_rc->baseline_gf_interval;
+    for (int layer = 0;
+         layer < svc->number_spatial_layers * svc->number_temporal_layers;
+         ++layer) {
+      LAYER_CONTEXT *const lc = &svc->layer_context[layer];
+      lc->p_rc.baseline_gf_interval = p_rc->baseline_gf_interval;
+      lc->p_rc.gfu_boost = p_rc->gfu_boost;
+      lc->p_rc.constrained_gf_group = p_rc->constrained_gf_group;
+      lc->rc.frames_till_gf_update_due = rc->frames_till_gf_update_due;
+      lc->group_index = 0;
+    }
+  }
+  gf_group->size = p_rc->baseline_gf_interval;
+  gf_group->update_type[0] = (frame_type == KEY_FRAME) ? KF_UPDATE : GF_UPDATE;
+  gf_group->refbuf_state[cpi->gf_frame_index] =
+      (frame_type == KEY_FRAME) ? REFBUF_RESET : REFBUF_UPDATE;
+}
+
+void av1_adjust_gf_refresh_qp_one_pass_rt(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  SVC *const svc = &cpi->svc;
+  const int resize_pending = is_frame_resize_pending(cpi);
+  if (!resize_pending && !rc->high_source_sad) {
+    // Check if we should disable GF refresh (if period is up),
+    // or force a GF refresh update (if we are at least halfway through
+    // period) based on QP. Look into adding info on segment deltaq.
+    PRIMARY_RATE_CONTROL *p_rc = &cpi->ppi->p_rc;
+    const int avg_qp = p_rc->avg_frame_qindex[INTER_FRAME];
+    int gf_update_changed = 0;
+    int thresh = 87;
+    if (rc->frames_till_gf_update_due == 1 &&
+        cm->quant_params.base_qindex > avg_qp) {
+      // Disable GF refresh since QP is above the running average QP.
+      svc->refresh[svc->gld_idx_1layer] = 0;
+      gf_update_changed = 1;
+    } else if (rc->frames_till_gf_update_due <
+                   (p_rc->baseline_gf_interval >> 1) &&
+               cm->quant_params.base_qindex < thresh * avg_qp / 100) {
+      // Force refresh since QP is well below average QP.
+      svc->refresh[svc->gld_idx_1layer] = 1;
+      gf_update_changed = 1;
+    }
+    if (gf_update_changed) {
+      set_baseline_gf_interval(cpi, INTER_FRAME);
+      int refresh_mask = 0;
+      for (unsigned int i = 0; i < INTER_REFS_PER_FRAME; i++) {
+        int ref_frame_map_idx = svc->ref_idx[i];
+        refresh_mask |= svc->refresh[ref_frame_map_idx] << ref_frame_map_idx;
+      }
+      cm->current_frame.refresh_frame_flags = refresh_mask;
+    }
+  }
+}
+
 /*!\brief Setup the reference prediction structure for 1 pass real-time
  *
  * Set the reference prediction structure for 1 layer.
@@ -2180,6 +2401,7 @@
   int last_idx_refresh = 0;
   int gld_idx = 0;
   int alt_ref_idx = 0;
+  int last2_idx = 0;
   ext_refresh_frame_flags->update_pending = 1;
   svc->set_ref_frame_config = 1;
   ext_flags->ref_frame_flags = 0;
@@ -2188,10 +2410,12 @@
   ext_refresh_frame_flags->alt_ref_frame = 0;
   for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) svc->ref_idx[i] = 7;
   for (int i = 0; i < REF_FRAMES; ++i) svc->refresh[i] = 0;
-  // Always reference LAST, GOLDEN, ALTREF
+  // Set the reference frame flags.
   ext_flags->ref_frame_flags ^= AOM_LAST_FLAG;
-  ext_flags->ref_frame_flags ^= AOM_GOLD_FLAG;
   ext_flags->ref_frame_flags ^= AOM_ALT_FLAG;
+  ext_flags->ref_frame_flags ^= AOM_GOLD_FLAG;
+  if (cpi->sf.rt_sf.ref_frame_comp_nonrd[1])
+    ext_flags->ref_frame_flags ^= AOM_LAST2_FLAG;
   const int sh = 7 - gld_fixed_slot;
   // Moving index slot for last: 0 - (sh - 1).
   if (cm->current_frame.frame_number > 1)
@@ -2209,10 +2433,19 @@
   // Moving index for alt_ref, lag behind LAST by lag_alt frames.
   if (cm->current_frame.frame_number > lag_alt)
     alt_ref_idx = ((cm->current_frame.frame_number - lag_alt) % sh);
+  if (cpi->sf.rt_sf.ref_frame_comp_nonrd[1]) {
+    // Moving index for LAST2, lag behind LAST by 2 frames.
+    if (cm->current_frame.frame_number > 2)
+      last2_idx = ((cm->current_frame.frame_number - 2) % sh);
+  }
   svc->ref_idx[0] = last_idx;          // LAST
   svc->ref_idx[1] = last_idx_refresh;  // LAST2 (for refresh of last).
-  svc->ref_idx[3] = gld_idx;           // GOLDEN
-  svc->ref_idx[6] = alt_ref_idx;       // ALT_REF
+  if (cpi->sf.rt_sf.ref_frame_comp_nonrd[1]) {
+    svc->ref_idx[1] = last2_idx;         // LAST2
+    svc->ref_idx[2] = last_idx_refresh;  // LAST3 (for refresh of last).
+  }
+  svc->ref_idx[3] = gld_idx;      // GOLDEN
+  svc->ref_idx[6] = alt_ref_idx;  // ALT_REF
   // Refresh this slot, which will become LAST on next frame.
   svc->refresh[last_idx_refresh] = 1;
   // Update GOLDEN on period for fixed slot case.
@@ -2220,6 +2453,7 @@
     ext_refresh_frame_flags->golden_frame = 1;
     svc->refresh[gld_idx] = 1;
   }
+  svc->gld_idx_1layer = gld_idx;
 }
 
 /*!\brief Check for scene detection, for 1 pass real-time mode.
@@ -2272,9 +2506,9 @@
     int num_samples = 0;
     const int thresh = 6;
     // SAD is computed on 64x64 blocks
-    const int sb_size_by_mb = (cm->seq_params.sb_size == BLOCK_128X128)
-                                  ? (cm->seq_params.mib_size >> 1)
-                                  : cm->seq_params.mib_size;
+    const int sb_size_by_mb = (cm->seq_params->sb_size == BLOCK_128X128)
+                                  ? (cm->seq_params->mib_size >> 1)
+                                  : cm->seq_params->mib_size;
     const int sb_cols = (num_mi_cols + sb_size_by_mb - 1) / sb_size_by_mb;
     const int sb_rows = (num_mi_rows + sb_size_by_mb - 1) / sb_size_by_mb;
     uint64_t sum_sq_thresh = 10000;  // sum = sqrt(thresh / 64*64)) ~1.5
@@ -2290,12 +2524,12 @@
              (sbi_row < sb_rows - 1 && sbi_col < sb_cols - 1) &&
              ((sbi_row % 2 == 0 && sbi_col % 2 == 0) ||
               (sbi_row % 2 != 0 && sbi_col % 2 != 0)))) {
-          tmp_sad = cpi->fn_ptr[bsize].sdf(src_y, src_ystride, last_src_y,
-                                           last_src_ystride);
+          tmp_sad = cpi->ppi->fn_ptr[bsize].sdf(src_y, src_ystride, last_src_y,
+                                                last_src_ystride);
           if (check_light_change) {
             unsigned int sse, variance;
-            variance = cpi->fn_ptr[bsize].vf(src_y, src_ystride, last_src_y,
-                                             last_src_ystride, &sse);
+            variance = cpi->ppi->fn_ptr[bsize].vf(
+                src_y, src_ystride, last_src_y, last_src_ystride, &sse);
             // Note: sse - variance = ((sum * sum) >> 12)
             // Detect large lighting change.
             if (variance < (sse >> 1) && (sse - variance) > sum_sq_thresh) {
@@ -2332,9 +2566,6 @@
   }
 }
 
-#define DEFAULT_KF_BOOST_RT 2300
-#define DEFAULT_GF_BOOST_RT 2000
-
 /*!\brief Set the GF baseline interval for 1 pass real-time mode.
  *
  *
@@ -2348,52 +2579,14 @@
 static int set_gf_interval_update_onepass_rt(AV1_COMP *cpi,
                                              FRAME_TYPE frame_type) {
   RATE_CONTROL *const rc = &cpi->rc;
-  GF_GROUP *const gf_group = &cpi->gf_group;
-  ResizePendingParams *const resize_pending_params =
-      &cpi->resize_pending_params;
   int gf_update = 0;
-  const int resize_pending =
-      (resize_pending_params->width && resize_pending_params->height &&
-       (cpi->common.width != resize_pending_params->width ||
-        cpi->common.height != resize_pending_params->height));
+  const int resize_pending = is_frame_resize_pending(cpi);
   // GF update based on frames_till_gf_update_due, also
   // force upddate on resize pending frame or for scene change.
   if ((resize_pending || rc->high_source_sad ||
        rc->frames_till_gf_update_due == 0) &&
       cpi->svc.temporal_layer_id == 0 && cpi->svc.spatial_layer_id == 0) {
-    if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ)
-      av1_cyclic_refresh_set_golden_update(cpi);
-    else
-      rc->baseline_gf_interval = MAX_GF_INTERVAL;
-    if (rc->baseline_gf_interval > rc->frames_to_key)
-      rc->baseline_gf_interval = rc->frames_to_key;
-    rc->gfu_boost = DEFAULT_GF_BOOST_RT;
-    rc->constrained_gf_group =
-        (rc->baseline_gf_interval >= rc->frames_to_key) ? 1 : 0;
-    rc->frames_till_gf_update_due = rc->baseline_gf_interval;
-    cpi->gf_frame_index = 0;
-    // SVC does not use GF as periodic boost.
-    // TODO(marpan): Find better way to disable this for SVC.
-    if (cpi->use_svc) {
-      SVC *const svc = &cpi->svc;
-      rc->baseline_gf_interval = MAX_STATIC_GF_GROUP_LENGTH - 1;
-      rc->gfu_boost = 1;
-      rc->constrained_gf_group = 0;
-      rc->frames_till_gf_update_due = rc->baseline_gf_interval;
-      for (int layer = 0;
-           layer < svc->number_spatial_layers * svc->number_temporal_layers;
-           ++layer) {
-        LAYER_CONTEXT *const lc = &svc->layer_context[layer];
-        lc->rc.baseline_gf_interval = rc->baseline_gf_interval;
-        lc->rc.gfu_boost = rc->gfu_boost;
-        lc->rc.constrained_gf_group = rc->constrained_gf_group;
-        lc->rc.frames_till_gf_update_due = rc->frames_till_gf_update_due;
-        lc->group_index = 0;
-      }
-    }
-    gf_group->size = rc->baseline_gf_interval;
-    gf_group->update_type[0] =
-        (frame_type == KEY_FRAME) ? KF_UPDATE : GF_UPDATE;
+    set_baseline_gf_interval(cpi, frame_type);
     gf_update = 1;
   }
   return gf_update;
@@ -2402,6 +2595,7 @@
 static void resize_reset_rc(AV1_COMP *cpi, int resize_width, int resize_height,
                             int prev_width, int prev_height) {
   RATE_CONTROL *const rc = &cpi->rc;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
   SVC *const svc = &cpi->svc;
   double tot_scale_change = 1.0;
   int target_bits_per_frame;
@@ -2410,16 +2604,16 @@
   tot_scale_change = (double)(resize_width * resize_height) /
                      (double)(prev_width * prev_height);
   // Reset buffer level to optimal, update target size.
-  rc->buffer_level = rc->optimal_buffer_level;
-  rc->bits_off_target = rc->optimal_buffer_level;
+  p_rc->buffer_level = p_rc->optimal_buffer_level;
+  p_rc->bits_off_target = p_rc->optimal_buffer_level;
   rc->this_frame_target =
       av1_calc_pframe_target_size_one_pass_cbr(cpi, INTER_FRAME);
   target_bits_per_frame = rc->this_frame_target;
   if (tot_scale_change > 4.0)
-    rc->avg_frame_qindex[INTER_FRAME] = rc->worst_quality;
+    p_rc->avg_frame_qindex[INTER_FRAME] = rc->worst_quality;
   else if (tot_scale_change > 1.0)
-    rc->avg_frame_qindex[INTER_FRAME] =
-        (rc->avg_frame_qindex[INTER_FRAME] + rc->worst_quality) >> 1;
+    p_rc->avg_frame_qindex[INTER_FRAME] =
+        (p_rc->avg_frame_qindex[INTER_FRAME] + rc->worst_quality) >> 1;
   active_worst_quality = calc_active_worst_quality_no_stats_cbr(cpi);
   qindex = av1_rc_regulate_q(cpi, target_bits_per_frame, rc->best_quality,
                              active_worst_quality, resize_width, resize_height);
@@ -2427,7 +2621,7 @@
   // and if so, reduce the rate correction factor (since likely can afford
   // lower q for resized frame).
   if (tot_scale_change < 1.0 && qindex > 90 * cpi->rc.worst_quality / 100)
-    rc->rate_correction_factors[INTER_NORMAL] *= 0.85;
+    p_rc->rate_correction_factors[INTER_NORMAL] *= 0.85;
   // Apply the same rate control reset to all temporal layers.
   for (int tl = 0; tl < svc->number_temporal_layers; tl++) {
     LAYER_CONTEXT *lc = NULL;
@@ -2435,10 +2629,10 @@
                                  svc->number_temporal_layers +
                              tl];
     lc->rc.resize_state = rc->resize_state;
-    lc->rc.buffer_level = lc->rc.optimal_buffer_level;
-    lc->rc.bits_off_target = lc->rc.optimal_buffer_level;
-    lc->rc.rate_correction_factors[INTER_FRAME] =
-        rc->rate_correction_factors[INTER_FRAME];
+    lc->p_rc.buffer_level = lc->p_rc.optimal_buffer_level;
+    lc->p_rc.bits_off_target = lc->p_rc.optimal_buffer_level;
+    lc->p_rc.rate_correction_factors[INTER_FRAME] =
+        p_rc->rate_correction_factors[INTER_FRAME];
   }
   // If resize is back up: check if projected q index is too much above the
   // previous index, and if so, reduce the rate correction factor
@@ -2446,10 +2640,11 @@
   // Also check if projected qindex is close to previous qindex, if so
   // increase correction factor (to push qindex higher and avoid overshoot).
   if (tot_scale_change >= 1.0) {
-    if (tot_scale_change < 4.0 && qindex > 130 * rc->last_q[INTER_FRAME] / 100)
-      rc->rate_correction_factors[INTER_NORMAL] *= 0.8;
-    if (qindex <= 120 * rc->last_q[INTER_FRAME] / 100)
-      rc->rate_correction_factors[INTER_NORMAL] *= 2.0;
+    if (tot_scale_change < 4.0 &&
+        qindex > 130 * p_rc->last_q[INTER_FRAME] / 100)
+      p_rc->rate_correction_factors[INTER_NORMAL] *= 0.8;
+    if (qindex <= 120 * p_rc->last_q[INTER_FRAME] / 100)
+      p_rc->rate_correction_factors[INTER_NORMAL] *= 2.0;
   }
 }
 
@@ -2468,6 +2663,7 @@
 static void dynamic_resize_one_pass_cbr(AV1_COMP *cpi) {
   const AV1_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
   RESIZE_ACTION resize_action = NO_RESIZE;
   const int avg_qp_thr1 = 70;
   const int avg_qp_thr2 = 50;
@@ -2489,8 +2685,9 @@
   // Ignore samples close to key frame, since QP is usually high after key.
   if (cpi->rc.frames_since_key > cpi->framerate) {
     const int window = AOMMIN(30, (int)(2 * cpi->framerate));
-    rc->resize_avg_qp += rc->last_q[INTER_FRAME];
-    if (cpi->rc.buffer_level < (int)(30 * rc->optimal_buffer_level / 100))
+    rc->resize_avg_qp += p_rc->last_q[INTER_FRAME];
+    if (cpi->ppi->p_rc.buffer_level <
+        (int)(30 * p_rc->optimal_buffer_level / 100))
       ++rc->resize_buffer_underflow;
     ++rc->resize_count;
     // Check for resize action every "window" frames.
@@ -2552,8 +2749,9 @@
                                 EncodeFrameParams *const frame_params,
                                 unsigned int frame_flags) {
   RATE_CONTROL *const rc = &cpi->rc;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
   AV1_COMMON *const cm = &cpi->common;
-  GF_GROUP *const gf_group = &cpi->gf_group;
+  GF_GROUP *const gf_group = &cpi->ppi->gf_group;
   SVC *const svc = &cpi->svc;
   ResizePendingParams *const resize_pending_params =
       &cpi->resize_pending_params;
@@ -2563,25 +2761,25 @@
                        svc->number_temporal_layers);
   // Turn this on to explicitly set the reference structure rather than
   // relying on internal/default structure.
-  if (cpi->use_svc) {
+  if (cpi->ppi->use_svc) {
     av1_update_temporal_layer_framerate(cpi);
     av1_restore_layer_context(cpi);
   }
   // Set frame type.
-  if ((!cpi->use_svc && rc->frames_to_key == 0) ||
-      (cpi->use_svc && svc->spatial_layer_id == 0 &&
+  if ((!cpi->ppi->use_svc && rc->frames_to_key == 0) ||
+      (cpi->ppi->use_svc && svc->spatial_layer_id == 0 &&
        (cpi->oxcf.kf_cfg.key_freq_max == 0 ||
         svc->current_superframe % cpi->oxcf.kf_cfg.key_freq_max == 0)) ||
       (frame_flags & FRAMEFLAGS_KEY)) {
     frame_params->frame_type = KEY_FRAME;
-    rc->this_key_frame_forced =
+    p_rc->this_key_frame_forced =
         cm->current_frame.frame_number != 0 && rc->frames_to_key == 0;
     rc->frames_to_key = cpi->oxcf.kf_cfg.key_freq_max;
-    rc->kf_boost = DEFAULT_KF_BOOST_RT;
+    p_rc->kf_boost = DEFAULT_KF_BOOST_RT;
     gf_group->update_type[cpi->gf_frame_index] = KF_UPDATE;
     gf_group->frame_type[cpi->gf_frame_index] = KEY_FRAME;
     gf_group->refbuf_state[cpi->gf_frame_index] = REFBUF_RESET;
-    if (cpi->use_svc) {
+    if (cpi->ppi->use_svc) {
       if (cm->current_frame.frame_number > 0)
         av1_svc_reset_temporal_layers(cpi, 1);
       svc->layer_context[layer].is_key_frame = 1;
@@ -2591,7 +2789,7 @@
     gf_group->update_type[cpi->gf_frame_index] = LF_UPDATE;
     gf_group->frame_type[cpi->gf_frame_index] = INTER_FRAME;
     gf_group->refbuf_state[cpi->gf_frame_index] = REFBUF_UPDATE;
-    if (cpi->use_svc) {
+    if (cpi->ppi->use_svc) {
       LAYER_CONTEXT *lc = &svc->layer_context[layer];
       lc->is_key_frame =
           svc->spatial_layer_id == 0
@@ -2600,7 +2798,7 @@
     }
   }
   // Check for scene change, for non-SVC for now.
-  if (!cpi->use_svc && cpi->sf.rt_sf.check_scene_detection)
+  if (!cpi->ppi->use_svc && cpi->sf.rt_sf.check_scene_detection)
     rc_scene_detection_onepass_rt(cpi);
   // Check for dynamic resize, for single spatial layer for now.
   // For temporal layers only check on base temporal layer.
@@ -2618,9 +2816,7 @@
       resize_pending_params->width = cpi->oxcf.frm_dim_cfg.width;
       resize_pending_params->height = cpi->oxcf.frm_dim_cfg.height;
     }
-  } else if (resize_pending_params->width && resize_pending_params->height &&
-             (cpi->common.width != resize_pending_params->width ||
-              cpi->common.height != resize_pending_params->height)) {
+  } else if (is_frame_resize_pending(cpi)) {
     resize_reset_rc(cpi, resize_pending_params->width,
                     resize_pending_params->height, cm->width, cm->height);
   }
@@ -2648,11 +2844,21 @@
   av1_rc_set_frame_target(cpi, target, cm->width, cm->height);
   rc->base_frame_target = target;
   cm->current_frame.frame_type = frame_params->frame_type;
+  // For fixed mode SVC: if KSVC is enabled remove inter layer
+  // prediction on spatial enhancement layer frames for frames
+  // whose base is not KEY frame.
+  if (cpi->ppi->use_svc && !svc->use_flexible_mode && svc->ksvc_fixed_mode &&
+      svc->number_spatial_layers > 1 &&
+      !svc->layer_context[layer].is_key_frame) {
+    ExternalFlags *const ext_flags = &cpi->ext_flags;
+    ext_flags->ref_frame_flags ^= AOM_GOLD_FLAG;
+  }
 }
 
 int av1_encodedframe_overshoot_cbr(AV1_COMP *cpi, int *q) {
   AV1_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
   SPEED_FEATURES *const sf = &cpi->sf;
   int thresh_qp = 7 * (rc->worst_quality >> 3);
   // Lower thresh_qp for video (more overshoot at lower Q) to be
@@ -2662,7 +2868,7 @@
   if (sf->rt_sf.overshoot_detection_cbr == FAST_DETECTION_MAXQ &&
       cm->quant_params.base_qindex < thresh_qp) {
     double rate_correction_factor =
-        cpi->rc.rate_correction_factors[INTER_NORMAL];
+        cpi->ppi->p_rc.rate_correction_factors[INTER_NORMAL];
     const int target_size = cpi->rc.avg_frame_bandwidth;
     double new_correction_factor;
     int target_bits_per_mb;
@@ -2673,9 +2879,9 @@
     // these parameters will affect QP selection for subsequent frames. If they
     // have settled down to a very different (low QP) state, then not adjusting
     // them may cause next frame to select low QP and overshoot again.
-    cpi->rc.avg_frame_qindex[INTER_FRAME] = *q;
-    rc->buffer_level = rc->optimal_buffer_level;
-    rc->bits_off_target = rc->optimal_buffer_level;
+    p_rc->avg_frame_qindex[INTER_FRAME] = *q;
+    p_rc->buffer_level = p_rc->optimal_buffer_level;
+    p_rc->bits_off_target = p_rc->optimal_buffer_level;
     // Reset rate under/over-shoot flags.
     cpi->rc.rc_1_frame = 0;
     cpi->rc.rc_2_frame = 0;
@@ -2684,7 +2890,7 @@
         (int)(((uint64_t)target_size << BPER_MB_NORMBITS) / cm->mi_params.MBs);
     // Rate correction factor based on target_bits_per_mb and qp (==max_QP).
     // This comes from the inverse computation of vp9_rc_bits_per_mb().
-    q2 = av1_convert_qindex_to_q(*q, cm->seq_params.bit_depth);
+    q2 = av1_convert_qindex_to_q(*q, cm->seq_params->bit_depth);
     enumerator = 1800000;  // Factor for inter frame.
     enumerator += (int)(enumerator * q2) >> 12;
     new_correction_factor = (double)target_bits_per_mb * q2 / enumerator;
@@ -2693,10 +2899,29 @@
           AOMMIN(2.0 * rate_correction_factor, new_correction_factor);
       if (rate_correction_factor > MAX_BPB_FACTOR)
         rate_correction_factor = MAX_BPB_FACTOR;
-      cpi->rc.rate_correction_factors[INTER_NORMAL] = rate_correction_factor;
+      cpi->ppi->p_rc.rate_correction_factors[INTER_NORMAL] =
+          rate_correction_factor;
     }
     return 1;
   } else {
     return 0;
   }
 }
+
+#if !CONFIG_REALTIME_ONLY
+// TODO(angiebird): move this function to tpl_model.c
+/*
+ * Compute the q_indices for the entire GOP.
+ * Intended to be used only with AOM_Q mode.
+ */
+void av1_q_mode_compute_gop_q_indices(int gf_frame_index, int base_q_index,
+                                      const double *qstep_ratio_list,
+                                      aom_bit_depth_t bit_depth,
+                                      const struct GF_GROUP *gf_group,
+                                      int *q_index_list) {
+  for (int i = gf_frame_index; i < gf_group->size; ++i) {
+    q_index_list[i] = av1_get_q_index_from_qstep_ratio(
+        base_q_index, qstep_ratio_list[i], bit_depth);
+  }
+}
+#endif  // !CONFIG_REALTIME_ONLY
diff --git a/av1/encoder/ratectrl.h b/av1/encoder/ratectrl.h
index 9c96c8d..07c98a4 100644
--- a/av1/encoder/ratectrl.h
+++ b/av1/encoder/ratectrl.h
@@ -129,11 +129,6 @@
   int this_frame_target;  // Actual frame target after rc adjustment.
 
   /*!
-   * Target bit budget for the current GF / ARF group of frame.
-   */
-  int64_t gf_group_bits;
-
-  /*!
    * Projected size for current frame
    */
   int projected_frame_size;
@@ -149,36 +144,6 @@
   int sb64_target_rate;
 
   /*!
-   * Q used on last encoded frame of the given type.
-   */
-  int last_q[FRAME_TYPES];
-
-  /*!
-   * Q used for last boosted (non leaf) frame (GF/KF/ARF)
-   */
-  int last_boosted_qindex;
-
-  /*!
-   * Q used for last boosted (non leaf) frame
-   */
-  int last_kf_qindex;
-
-  /*!
-   * Boost factor used to calculate the extra bits allocated to ARFs and GFs
-   */
-  int gfu_boost;
-  /*!
-   * Boost factor used to calculate the extra bits allocated to the key frame
-   */
-  int kf_boost;
-
-  /*!
-   * Correction factors used to adjust the q estimate for a given target rate
-   * in the encode loop.
-   */
-  double rate_correction_factors[RATE_FACTOR_LEVELS];
-
-  /*!
    * Number of frames since the last ARF / GF.
    */
   int frames_since_golden;
@@ -193,29 +158,10 @@
    */
   int intervals_till_gf_calculate_due;
 
-  /*!
-   * Stores the determined gf group lengths for a set of gf groups
-   */
-  int gf_intervals[MAX_NUM_GF_INTERVALS];
-
-  /*!
-   * The current group's index into gf_intervals[]
-   */
-  int cur_gf_index;
-
   /*!\cond */
-  int num_regions;
-  REGIONS regions[MAX_FIRSTPASS_ANALYSIS_FRAMES];
-  double cor_coeff[MAX_FIRSTPASS_ANALYSIS_FRAMES];
-  double noise_var[MAX_FIRSTPASS_ANALYSIS_FRAMES];
-  int regions_offset;  // offset of regions from the last keyframe
-  int frames_till_regions_update;
-
   int min_gf_interval;
   int max_gf_interval;
   int static_scene_max_gf_interval;
-  int baseline_gf_interval;
-  int constrained_gf_group;
   /*!\endcond */
   /*!
    * Frames before the next key frame
@@ -223,8 +169,7 @@
   int frames_to_key;
   /*!\cond */
   int frames_since_key;
-  int this_key_frame_forced;
-  int next_key_frame_forced;
+  int frames_to_fwd_kf;
   int is_src_frame_alt_ref;
   int sframe_due;
 
@@ -239,27 +184,10 @@
 
   int ni_av_qi;
   int ni_tot_qi;
-  int ni_frames;
-  int avg_frame_qindex[FRAME_TYPES];
-  double tot_q;
-  double avg_q;
-
-  int64_t buffer_level;
-  int64_t bits_off_target;
-  int64_t vbr_bits_off_target;
-  int64_t vbr_bits_off_target_fast;
 
   int decimation_factor;
   int decimation_count;
 
-  int rolling_target_bits;
-  int rolling_actual_bits;
-
-  int rate_error_estimate;
-
-  int64_t total_actual_bits;
-  int64_t total_target_bits;
-
   /*!\endcond */
   /*!
    * User specified maximum Q allowed for current frame
@@ -270,18 +198,6 @@
    */
   int best_quality;
 
-  /*!
-   * Initial buffuer level in ms for CBR / low delay encoding
-   */
-  int64_t starting_buffer_level;
-  /*!
-   * Optimum / target buffuer level in ms for CBR / low delay encoding
-   */
-  int64_t optimal_buffer_level;
-  /*!
-   * Maximum target buffuer level in ms for CBR / low delay encoding
-   */
-  int64_t maximum_buffer_size;
   /*!\cond */
 
   // rate control history for last frame(1) and the frame before(2).
@@ -293,34 +209,13 @@
   int q_1_frame;
   int q_2_frame;
 
-  float_t arf_boost_factor;
-
   /*!\endcond */
   /*!
-   * Q index used for ALT frame
-   */
-  int arf_q;
-  /*!
    * Proposed maximum alloed Q for current frame
    */
   int active_worst_quality;
-  /*!
-   * Proposed minimum allowed Q different layers in a coding pyramid
-   */
-  int active_best_quality[MAX_ARF_LAYERS + 1];
 
   /*!\cond */
-  int base_layer_qp;
-
-  // Total number of stats used only for kf_boost calculation.
-  int num_stats_used_for_kf_boost;
-  // Total number of stats used only for gfu_boost calculation.
-  int num_stats_used_for_gfu_boost;
-  // Total number of stats required by gfu_boost calculation.
-  int num_stats_required_for_gfu_boost;
-  int next_is_fwd_key;
-  int enable_scenecut_detection;
-  int use_arf_in_this_kf_group;
   // Track amount of low motion in scene
   int avg_frame_low_motion;
 
@@ -329,16 +224,195 @@
   int resize_avg_qp;
   int resize_buffer_underflow;
   int resize_count;
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  int frame_level_fast_extra_bits;
+  double frame_level_rate_correction_factors[RATE_FACTOR_LEVELS];
+#endif
   /*!\endcond */
 } RATE_CONTROL;
 
-/*!\cond */
+/*!
+ * \brief  Primary Rate Control parameters and status
+ */
+typedef struct {
+  // Sub-gop level Rate targeting variables
+
+  /*!
+   * Target bit budget for the current GF / ARF group of frame.
+   */
+  int64_t gf_group_bits;
+
+  /*!
+   * Boost factor used to calculate the extra bits allocated to the key frame
+   */
+  int kf_boost;
+
+  /*!
+   * Boost factor used to calculate the extra bits allocated to ARFs and GFs
+   */
+  int gfu_boost;
+
+  /*!
+   * Stores the determined gf group lengths for a set of gf groups
+   */
+  int gf_intervals[MAX_NUM_GF_INTERVALS];
+
+  /*!
+   * The current group's index into gf_intervals[]
+   */
+  int cur_gf_index;
+
+  /*!\cond */
+  int num_regions;
+
+  REGIONS regions[MAX_FIRSTPASS_ANALYSIS_FRAMES];
+  int regions_offset;  // offset of regions from the last keyframe
+  int frames_till_regions_update;
+
+  int baseline_gf_interval;
+
+  int constrained_gf_group;
+
+  int this_key_frame_forced;
+
+  int next_key_frame_forced;
+  /*!\endcond */
+
+  /*!
+   * Initial buffer level in ms for CBR / low delay encoding
+   */
+  int64_t starting_buffer_level;
+
+  /*!
+   * Optimum / target buffer level in ms for CBR / low delay encoding
+   */
+  int64_t optimal_buffer_level;
+
+  /*!
+   * Maximum target buffer level in ms for CBR / low delay encoding
+   */
+  int64_t maximum_buffer_size;
+
+  /*!
+   * Q index used for ALT frame
+   */
+  int arf_q;
+
+  /*!\cond */
+  float_t arf_boost_factor;
+
+  int base_layer_qp;
+
+  // Total number of stats used only for kf_boost calculation.
+  int num_stats_used_for_kf_boost;
+
+  // Total number of stats used only for gfu_boost calculation.
+  int num_stats_used_for_gfu_boost;
+
+  // Total number of stats required by gfu_boost calculation.
+  int num_stats_required_for_gfu_boost;
+
+  int enable_scenecut_detection;
+
+  int use_arf_in_this_kf_group;
+
+  int ni_frames;
+
+  double tot_q;
+  /*!\endcond */
+
+  /*!
+   * Q used for last boosted (non leaf) frame
+   */
+  int last_kf_qindex;
+
+  /*!
+   * Average of q index of previous encoded frames in a sequence.
+   */
+  int avg_frame_qindex[FRAME_TYPES];
+
+  /*!
+   * Proposed minimum allowed Q different layers in a coding pyramid
+   */
+  int active_best_quality[MAX_ARF_LAYERS + 1];
+
+  /*!
+   * Q used for last boosted (non leaf) frame (GF/KF/ARF)
+   */
+  int last_boosted_qindex;
+
+  /*!
+   * Average Q value of previous inter frames
+   */
+  double avg_q;
+
+  /*!
+   * Q used on last encoded frame of the given type.
+   */
+  int last_q[FRAME_TYPES];
+
+  /*!
+   * Correction factors used to adjust the q estimate for a given target rate
+   * in the encode loop.
+   */
+  double rate_correction_factors[RATE_FACTOR_LEVELS];
+
+  /*!
+   * Current total consumed bits.
+   */
+  int64_t total_actual_bits;
+
+  /*!
+   * Current total target bits.
+   */
+  int64_t total_target_bits;
+
+  /*!
+   * Current buffer level.
+   */
+  int64_t buffer_level;
+
+  /*!
+   * PCT rc error.
+   */
+  int rate_error_estimate;
+
+  /*!
+   * Error bits available from previously encoded frames.
+   */
+  int64_t vbr_bits_off_target;
+
+  /*!
+   * Error bits available from previously encoded frames undershoot.
+   */
+  int64_t vbr_bits_off_target_fast;
+
+  /*!
+   * Total bits deviated from the average frame target, from previously
+   * encoded frames.
+   */
+  int64_t bits_off_target;
+
+  /*!
+   * Rolling monitor target bits updated based on current frame target size.
+   */
+  int rolling_target_bits;
+
+  /*!
+   * Rolling monitor actual bits updated based on current frame final projected
+   * size.
+   */
+  int rolling_actual_bits;
+} PRIMARY_RATE_CONTROL;
 
 struct AV1_COMP;
 struct AV1EncoderConfig;
+struct GF_GROUP;
 
-void av1_rc_init(const struct AV1EncoderConfig *oxcf, int pass,
-                 RATE_CONTROL *rc);
+void av1_primary_rc_init(const struct AV1EncoderConfig *oxcf,
+                         PRIMARY_RATE_CONTROL *p_rc);
+
+void av1_rc_init(const struct AV1EncoderConfig *oxcf, RATE_CONTROL *rc);
 
 int av1_estimate_bits_at_q(FRAME_TYPE frame_kind, int q, int mbs,
                            double correction_factor, aom_bit_depth_t bit_depth,
@@ -363,8 +437,7 @@
 //
 // Then, call encode_frame_to_data_rate() to perform the
 // actual encode. This function will in turn call encode_frame()
-// one or more times, followed by one of:
-//   av1_rc_postencode_update()
+// one or more times, followed by:
 //   av1_rc_postencode_update_drop_frame()
 //
 // The majority of rate control parameters are only expected
@@ -396,8 +469,11 @@
  *
  * \return None but updates the relevant rate correction factor in cpi->rc
  */
-void av1_rc_update_rate_correction_factors(struct AV1_COMP *cpi, int width,
-                                           int height);
+void av1_rc_update_rate_correction_factors(struct AV1_COMP *cpi,
+#if CONFIG_FRAME_PARALLEL_ENCODE
+                                           int is_encode_stage,
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+                                           int width, int height);
 /*!\cond */
 
 // Decide if we should drop this frame: For 1-pass CBR.
@@ -416,7 +492,6 @@
  *
  * \ingroup rate_control
  * \param[in]       cpi          Top level encoder structure
- * \param[in,out]   rc           Top level rate control structure
  * \param[in]       width        Coded frame width
  * \param[in]       height       Coded frame height
  * \param[in]       gf_index     Index of this frame in the golden frame group
@@ -425,9 +500,8 @@
  * \return Returns selected q index to be used for encoding this frame.
  * Also, updates \c rc->arf_q.
  */
-int av1_rc_pick_q_and_bounds(const struct AV1_COMP *cpi, RATE_CONTROL *rc,
-                             int width, int height, int gf_index,
-                             int *bottom_index, int *top_index);
+int av1_rc_pick_q_and_bounds(const struct AV1_COMP *cpi, int width, int height,
+                             int gf_index, int *bottom_index, int *top_index);
 
 /*!\brief Estimates q to achieve a target bits per frame
  *
@@ -490,6 +564,8 @@
 void av1_rc_set_frame_target(struct AV1_COMP *cpi, int target, int width,
                              int height);
 
+void av1_adjust_gf_refresh_qp_one_pass_rt(struct AV1_COMP *cpi);
+
 void av1_set_reference_structure_one_pass_rt(struct AV1_COMP *cpi,
                                              int gf_update);
 
@@ -582,6 +658,63 @@
  */
 int av1_encodedframe_overshoot_cbr(struct AV1_COMP *cpi, int *q);
 
+#if !CONFIG_REALTIME_ONLY
+/*!\brief Compute the q_indices for the entire GOP.
+ *
+ * \param[in]       gf_frame_index    Index of the current frame
+ * \param[in]       base_q_index      Base q index
+ * \param[in]       qstep_ratio_list  Stores the qstep_ratio for each frame
+ * \param[in]       bit_depth         Bit depth
+ * \param[in]       gf_group          Pointer to the GOP
+ * \param[out]      q_index_list      An array to store output gop q indices.
+ *                                    The array size should be equal to or
+ *                                    greater than gf_group.size()
+ */
+void av1_q_mode_compute_gop_q_indices(int gf_frame_index, int base_q_index,
+                                      const double *qstep_ratio_list,
+                                      aom_bit_depth_t bit_depth,
+                                      const struct GF_GROUP *gf_group,
+                                      int *q_index_list);
+#endif  // !CONFIG_REALTIME_ONLY
+
+/*!\brief Compute the q_indices for a single frame.
+ *
+ * Intended to be used with AOM_Q mode.
+ *
+ * \param[in]       base_q_index      Base q index
+ * \param[in]       gf_update_type    GOP update type
+ * \param[in]       gf_pyramid_level  GOP level of the current frame
+ * \param[in]       arf_q             ARF q_index
+ *
+ * \return Returns the q_index for the current frame.
+ */
+int av1_q_mode_get_q_index(int base_q_index, int gf_update_type,
+                           int gf_pyramid_level, int arf_q);
+
+/*!\brief Compute the q_indices for the ARF of a GOP.
+ *
+ * \param[in]       base_q_index      Base q index
+ * \param[in]       gfu_boost         GFU boost
+ * \param[in]       bit_depth         Bit depth
+ * \param[in]       arf_boost_factor  ARF boost factor
+ *
+ * \return Returns the q_index for the ARF frame.
+ */
+int av1_get_arf_q_index(int base_q_index, int gfu_boost, int bit_depth,
+                        double arf_boost_factor);
+
+#if !CONFIG_REALTIME_ONLY
+struct TplDepFrame;
+/*!\brief Compute the q_indices for the ARF of a GOP in Q mode.
+ *
+ * \param[in]       cpi               Top level encoder structure
+ * \param[in]       tpl_frame         Tpl Frame stats
+ *
+ * \return Returns the q_index for the ARF frame.
+ */
+int av1_get_arf_q_index_q_mode(struct AV1_COMP *cpi,
+                               struct TplDepFrame *tpl_frame);
+#endif
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/av1/encoder/rc_utils.h b/av1/encoder/rc_utils.h
index 98cec2e..a518555 100644
--- a/av1/encoder/rc_utils.h
+++ b/av1/encoder/rc_utils.h
@@ -19,26 +19,52 @@
 extern "C" {
 #endif
 
-static AOM_INLINE void set_rc_buffer_sizes(RATE_CONTROL *rc,
-                                           const RateControlCfg *rc_cfg) {
+static AOM_INLINE void check_reset_rc_flag(AV1_COMP *cpi) {
+  RATE_CONTROL *rc = &cpi->rc;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
+  if (cpi->common.current_frame.frame_number >
+      (unsigned int)cpi->svc.number_spatial_layers) {
+    if (cpi->ppi->use_svc) {
+      av1_svc_check_reset_layer_rc_flag(cpi);
+    } else {
+      if (rc->avg_frame_bandwidth > (3 * rc->prev_avg_frame_bandwidth >> 1) ||
+          rc->avg_frame_bandwidth < (rc->prev_avg_frame_bandwidth >> 1)) {
+        rc->rc_1_frame = 0;
+        rc->rc_2_frame = 0;
+        p_rc->bits_off_target = p_rc->optimal_buffer_level;
+        p_rc->buffer_level = p_rc->optimal_buffer_level;
+      }
+    }
+  }
+}
+
+static AOM_INLINE void set_primary_rc_buffer_sizes(const AV1EncoderConfig *oxcf,
+                                                   AV1_PRIMARY *ppi) {
+  PRIMARY_RATE_CONTROL *p_rc = &ppi->p_rc;
+  const RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
+
   const int64_t bandwidth = rc_cfg->target_bandwidth;
   const int64_t starting = rc_cfg->starting_buffer_level_ms;
   const int64_t optimal = rc_cfg->optimal_buffer_level_ms;
   const int64_t maximum = rc_cfg->maximum_buffer_size_ms;
 
-  rc->starting_buffer_level = starting * bandwidth / 1000;
-  rc->optimal_buffer_level =
+  p_rc->starting_buffer_level = starting * bandwidth / 1000;
+  p_rc->optimal_buffer_level =
       (optimal == 0) ? bandwidth / 8 : optimal * bandwidth / 1000;
-  rc->maximum_buffer_size =
+  p_rc->maximum_buffer_size =
       (maximum == 0) ? bandwidth / 8 : maximum * bandwidth / 1000;
+
+  // Under a configuration change, where maximum_buffer_size may change,
+  // keep buffer level clipped to the maximum allowed buffer size.
+  p_rc->bits_off_target =
+      AOMMIN(p_rc->bits_off_target, p_rc->maximum_buffer_size);
+  p_rc->buffer_level = AOMMIN(p_rc->buffer_level, p_rc->maximum_buffer_size);
 }
 
 static AOM_INLINE void config_target_level(AV1_COMP *const cpi,
                                            AV1_LEVEL target_level, int tier) {
-  aom_clear_system_state();
-
   AV1EncoderConfig *const oxcf = &cpi->oxcf;
-  SequenceHeader *const seq_params = &cpi->common.seq_params;
+  SequenceHeader *const seq_params = cpi->common.seq_params;
   TileConfig *const tile_cfg = &oxcf->tile_cfg;
   RateControlCfg *const rc_cfg = &oxcf->rc_cfg;
 
@@ -48,11 +74,11 @@
       av1_get_max_bitrate_for_level(target_level, tier, profile);
   const int64_t max_bitrate = (int64_t)(level_bitrate_limit * 0.70);
   rc_cfg->target_bandwidth = AOMMIN(rc_cfg->target_bandwidth, max_bitrate);
-  // Also need to update cpi->twopass.bits_left.
-  TWO_PASS *const twopass = &cpi->twopass;
+  // Also need to update cpi->ppi->twopass.bits_left.
+  TWO_PASS *const twopass = &cpi->ppi->twopass;
   FIRSTPASS_STATS *stats = twopass->stats_buf_ctx->total_stats;
   if (stats != NULL)
-    cpi->twopass.bits_left =
+    cpi->ppi->twopass.bits_left =
         (int64_t)(stats->duration * rc_cfg->target_bandwidth / 10000000.0);
 
   // Adjust max over-shoot percentage.
@@ -154,13 +180,21 @@
   return factor;
 }
 
-static AOM_INLINE int get_regulated_q_overshoot(AV1_COMP *const cpi, int q_low,
-                                                int q_high, int top_index,
+static AOM_INLINE int get_regulated_q_overshoot(AV1_COMP *const cpi,
+#if CONFIG_FRAME_PARALLEL_ENCODE
+                                                int is_encode_stage,
+#endif
+                                                int q_low, int q_high,
+                                                int top_index,
                                                 int bottom_index) {
   const AV1_COMMON *const cm = &cpi->common;
   const RATE_CONTROL *const rc = &cpi->rc;
 
-  av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
+  av1_rc_update_rate_correction_factors(cpi,
+#if CONFIG_FRAME_PARALLEL_ENCODE
+                                        is_encode_stage,
+#endif
+                                        cm->width, cm->height);
 
   int q_regulated =
       av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
@@ -168,7 +202,11 @@
 
   int retries = 0;
   while (q_regulated < q_low && retries < 10) {
-    av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
+    av1_rc_update_rate_correction_factors(cpi,
+#if CONFIG_FRAME_PARALLEL_ENCODE
+                                          is_encode_stage,
+#endif
+                                          cm->width, cm->height);
     q_regulated =
         av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
                           AOMMAX(q_high, top_index), cm->width, cm->height);
@@ -178,18 +216,29 @@
 }
 
 static AOM_INLINE int get_regulated_q_undershoot(AV1_COMP *const cpi,
+#if CONFIG_FRAME_PARALLEL_ENCODE
+                                                 int is_encode_stage,
+#endif
                                                  int q_high, int top_index,
                                                  int bottom_index) {
   const AV1_COMMON *const cm = &cpi->common;
   const RATE_CONTROL *const rc = &cpi->rc;
 
-  av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
+  av1_rc_update_rate_correction_factors(cpi,
+#if CONFIG_FRAME_PARALLEL_ENCODE
+                                        is_encode_stage,
+#endif
+                                        cm->width, cm->height);
   int q_regulated = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
                                       top_index, cm->width, cm->height);
 
   int retries = 0;
   while (q_regulated > q_high && retries < 10) {
-    av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
+    av1_rc_update_rate_correction_factors(cpi,
+#if CONFIG_FRAME_PARALLEL_ENCODE
+                                          is_encode_stage,
+#endif
+                                          cm->width, cm->height);
     q_regulated = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
                                     top_index, cm->width, cm->height);
     retries++;
@@ -226,6 +275,7 @@
     int *const low_cr_seen, const int loop_count) {
   AV1_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
+  PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
   const RateControlCfg *const rc_cfg = &cpi->oxcf.rc_cfg;
   *loop = 0;
 
@@ -236,7 +286,6 @@
 
   const int min_cr = rc_cfg->min_cr;
   if (min_cr > 0) {
-    aom_clear_system_state();
     const double compression_ratio =
         av1_get_compression_ratio(cm, rc->projected_frame_size >> 3);
     const double target_cr = min_cr / 100.0;
@@ -263,14 +312,15 @@
                                    &frame_over_shoot_limit);
   if (frame_over_shoot_limit == 0) frame_over_shoot_limit = 1;
 
-  if (cm->current_frame.frame_type == KEY_FRAME && rc->this_key_frame_forced &&
+  if (cm->current_frame.frame_type == KEY_FRAME &&
+      p_rc->this_key_frame_forced &&
       rc->projected_frame_size < rc->max_frame_bandwidth) {
     int64_t kf_err;
     const int64_t high_err_target = cpi->ambient_err;
     const int64_t low_err_target = cpi->ambient_err >> 1;
 
 #if CONFIG_AV1_HIGHBITDEPTH
-    if (cm->seq_params.use_highbitdepth) {
+    if (cm->seq_params->use_highbitdepth) {
       kf_err = aom_highbd_get_y_sse(cpi->source, &cm->cur_frame->buf);
     } else {
       kf_err = aom_get_y_sse(cpi->source, &cm->cur_frame->buf);
@@ -323,11 +373,11 @@
       if (*q == *q_high &&
           rc->projected_frame_size >= rc->max_frame_bandwidth) {
         const double q_val_high_current =
-            av1_convert_qindex_to_q(*q_high, cm->seq_params.bit_depth);
+            av1_convert_qindex_to_q(*q_high, cm->seq_params->bit_depth);
         const double q_val_high_new =
             q_val_high_current *
             ((double)rc->projected_frame_size / rc->max_frame_bandwidth);
-        *q_high = av1_find_qindex(q_val_high_new, cm->seq_params.bit_depth,
+        *q_high = av1_find_qindex(q_val_high_new, cm->seq_params->bit_depth,
                                   rc->best_quality, rc->worst_quality);
       }
 
@@ -336,19 +386,31 @@
 
       if (*undershoot_seen || loop_count > 2 ||
           (loop_count == 2 && !frame_is_intra_only(cm))) {
-        av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
+        av1_rc_update_rate_correction_factors(cpi,
+#if CONFIG_FRAME_PARALLEL_ENCODE
+                                              1,
+#endif
+                                              cm->width, cm->height);
 
         *q = (*q_high + *q_low + 1) / 2;
       } else if (loop_count == 2 && frame_is_intra_only(cm)) {
         const int q_mid = (*q_high + *q_low + 1) / 2;
-        const int q_regulated = get_regulated_q_overshoot(
-            cpi, *q_low, *q_high, top_index, bottom_index);
+        const int q_regulated =
+            get_regulated_q_overshoot(cpi,
+#if CONFIG_FRAME_PARALLEL_ENCODE
+                                      1,
+#endif
+                                      *q_low, *q_high, top_index, bottom_index);
         // Get 'q' in-between 'q_mid' and 'q_regulated' for a smooth
         // transition between loop_count < 2 and loop_count > 2.
         *q = (q_mid + q_regulated + 1) / 2;
       } else {
-        *q = get_regulated_q_overshoot(cpi, *q_low, *q_high, top_index,
-                                       bottom_index);
+        *q =
+            get_regulated_q_overshoot(cpi,
+#if CONFIG_FRAME_PARALLEL_ENCODE
+                                      1,
+#endif
+                                      *q_low, *q_high, top_index, bottom_index);
       }
 
       *overshoot_seen = 1;
@@ -358,12 +420,20 @@
 
       if (*overshoot_seen || loop_count > 2 ||
           (loop_count == 2 && !frame_is_intra_only(cm))) {
-        av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
+        av1_rc_update_rate_correction_factors(cpi,
+#if CONFIG_FRAME_PARALLEL_ENCODE
+                                              1,
+#endif
+                                              cm->width, cm->height);
         *q = (*q_high + *q_low) / 2;
       } else if (loop_count == 2 && frame_is_intra_only(cm)) {
         const int q_mid = (*q_high + *q_low) / 2;
         const int q_regulated =
-            get_regulated_q_undershoot(cpi, *q_high, top_index, bottom_index);
+            get_regulated_q_undershoot(cpi,
+#if CONFIG_FRAME_PARALLEL_ENCODE
+                                       1,
+#endif
+                                       *q_high, top_index, bottom_index);
         // Get 'q' in-between 'q_mid' and 'q_regulated' for a smooth
         // transition between loop_count < 2 and loop_count > 2.
         *q = (q_mid + q_regulated) / 2;
@@ -376,7 +446,11 @@
           *q_low = *q;
         }
       } else {
-        *q = get_regulated_q_undershoot(cpi, *q_high, top_index, bottom_index);
+        *q = get_regulated_q_undershoot(cpi,
+#if CONFIG_FRAME_PARALLEL_ENCODE
+                                        1,
+#endif
+                                        *q_high, top_index, bottom_index);
 
         // Special case reset for qlow for constrained quality.
         // This should only trigger where there is very substantial
diff --git a/av1/encoder/rd.c b/av1/encoder/rd.c
index 6260ade..d88f563 100644
--- a/av1/encoder/rd.c
+++ b/av1/encoder/rd.c
@@ -19,7 +19,6 @@
 #include "aom_mem/aom_mem.h"
 #include "aom_ports/bitops.h"
 #include "aom_ports/mem.h"
-#include "aom_ports/system_state.h"
 
 #include "av1/common/common.h"
 #include "av1/common/entropy.h"
@@ -354,11 +353,44 @@
   160, 160, 160, 160, 192, 208, 224
 };
 
-int av1_compute_rd_mult_based_on_qindex(const AV1_COMP *cpi, int qindex) {
-  const int q = av1_dc_quant_QTX(qindex, 0, cpi->common.seq_params.bit_depth);
-  int rdmult = (int)(((int64_t)88 * q * q) / 24);
+// Returns the default rd multiplier for inter frames for a given qindex.
+// The function here is a first pass estimate based on data from
+// a previous Vizer run
+static double def_inter_rd_multiplier(int qindex) {
+  return 3.2 + (0.0035 * (double)qindex);
+}
 
-  switch (cpi->common.seq_params.bit_depth) {
+// Returns the default rd multiplier for ARF/Golden Frames for a given qindex.
+// The function here is a first pass estimate based on data from
+// a previous Vizer run
+static double def_arf_rd_multiplier(int qindex) {
+  return 3.25 + (0.0035 * (double)qindex);
+}
+
+// Returns the default rd multiplier for key frames for a given qindex.
+// The function here is a first pass estimate based on data from
+// a previous Vizer run
+static double def_kf_rd_multiplier(int qindex) {
+  return 3.3 + (0.0035 * (double)qindex);
+}
+
+int av1_compute_rd_mult_based_on_qindex(aom_bit_depth_t bit_depth,
+                                        FRAME_UPDATE_TYPE update_type,
+                                        int qindex) {
+  const int q = av1_dc_quant_QTX(qindex, 0, bit_depth);
+  int rdmult = q * q;
+  if (update_type == KF_UPDATE) {
+    double def_rd_q_mult = def_kf_rd_multiplier(qindex);
+    rdmult = (int)((double)rdmult * def_rd_q_mult);
+  } else if ((update_type == GF_UPDATE) || (update_type == ARF_UPDATE)) {
+    double def_rd_q_mult = def_arf_rd_multiplier(qindex);
+    rdmult = (int)((double)rdmult * def_rd_q_mult);
+  } else {
+    double def_rd_q_mult = def_inter_rd_multiplier(qindex);
+    rdmult = (int)((double)rdmult * def_rd_q_mult);
+  }
+
+  switch (bit_depth) {
     case AOM_BITS_8: break;
     case AOM_BITS_10: rdmult = ROUND_POWER_OF_TWO(rdmult, 4); break;
     case AOM_BITS_12: rdmult = ROUND_POWER_OF_TWO(rdmult, 8); break;
@@ -370,11 +402,15 @@
 }
 
 int av1_compute_rd_mult(const AV1_COMP *cpi, int qindex) {
-  int64_t rdmult = av1_compute_rd_mult_based_on_qindex(cpi, qindex);
-  if (is_stat_consumption_stage(cpi) &&
+  const aom_bit_depth_t bit_depth = cpi->common.seq_params->bit_depth;
+  const FRAME_UPDATE_TYPE update_type =
+      cpi->ppi->gf_group.update_type[cpi->gf_frame_index];
+  int64_t rdmult =
+      av1_compute_rd_mult_based_on_qindex(bit_depth, update_type, qindex);
+  if (is_stat_consumption_stage(cpi) && !cpi->oxcf.q_cfg.use_fixed_qp_offsets &&
       (cpi->common.current_frame.frame_type != KEY_FRAME)) {
-    const GF_GROUP *const gf_group = &cpi->gf_group;
-    const int boost_index = AOMMIN(15, (cpi->rc.gfu_boost / 100));
+    const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
+    const int boost_index = AOMMIN(15, (cpi->ppi->p_rc.gfu_boost / 100));
     const int layer_depth =
         AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], 6);
 
@@ -387,30 +423,52 @@
   return (int)rdmult;
 }
 
-int av1_get_deltaq_offset(const AV1_COMP *cpi, int qindex, double beta) {
+int av1_get_deltaq_offset(aom_bit_depth_t bit_depth, int qindex, double beta) {
   assert(beta > 0.0);
-  int q = av1_dc_quant_QTX(qindex, 0, cpi->common.seq_params.bit_depth);
+  int q = av1_dc_quant_QTX(qindex, 0, bit_depth);
   int newq = (int)rint(q / sqrt(beta));
   int orig_qindex = qindex;
+  if (newq == q) {
+    return 0;
+  }
   if (newq < q) {
-    do {
+    while (qindex > 0) {
       qindex--;
-      q = av1_dc_quant_QTX(qindex, 0, cpi->common.seq_params.bit_depth);
-    } while (newq < q && qindex > 0);
+      q = av1_dc_quant_QTX(qindex, 0, bit_depth);
+      if (newq >= q) {
+        break;
+      }
+    }
   } else {
-    do {
+    while (qindex < MAXQ) {
       qindex++;
-      q = av1_dc_quant_QTX(qindex, 0, cpi->common.seq_params.bit_depth);
-    } while (newq > q && qindex < MAXQ);
+      q = av1_dc_quant_QTX(qindex, 0, bit_depth);
+      if (newq <= q) {
+        break;
+      }
+    }
   }
   return qindex - orig_qindex;
 }
 
+int av1_adjust_q_from_delta_q_res(int delta_q_res, int prev_qindex,
+                                  int curr_qindex) {
+  curr_qindex = clamp(curr_qindex, delta_q_res, 256 - delta_q_res);
+  const int sign_deltaq_index = curr_qindex - prev_qindex >= 0 ? 1 : -1;
+  const int deltaq_deadzone = delta_q_res / 4;
+  const int qmask = ~(delta_q_res - 1);
+  int abs_deltaq_index = abs(curr_qindex - prev_qindex);
+  abs_deltaq_index = (abs_deltaq_index + deltaq_deadzone) & qmask;
+  int adjust_qindex = prev_qindex + sign_deltaq_index * abs_deltaq_index;
+  adjust_qindex = AOMMAX(adjust_qindex, MINQ + 1);
+  return adjust_qindex;
+}
+
 int av1_get_adaptive_rdmult(const AV1_COMP *cpi, double beta) {
   assert(beta > 0.0);
   const AV1_COMMON *cm = &cpi->common;
   int q = av1_dc_quant_QTX(cm->quant_params.base_qindex, 0,
-                           cm->seq_params.bit_depth);
+                           cm->seq_params->bit_depth);
 
   return (int)(av1_compute_rd_mult(cpi, q) / beta);
 }
@@ -434,7 +492,7 @@
 }
 
 void av1_set_sad_per_bit(const AV1_COMP *cpi, int *sadperbit, int qindex) {
-  switch (cpi->common.seq_params.bit_depth) {
+  switch (cpi->common.seq_params->bit_depth) {
     case AOM_BITS_8: *sadperbit = sad_per_bit_lut_8[qindex]; break;
     case AOM_BITS_10: *sadperbit = sad_per_bit_lut_10[qindex]; break;
     case AOM_BITS_12: *sadperbit = sad_per_bit_lut_12[qindex]; break;
@@ -451,7 +509,7 @@
         av1_get_qindex(&cm->seg, segment_id, cm->quant_params.base_qindex) +
             cm->quant_params.y_dc_delta_q,
         0, MAXQ);
-    const int q = compute_rd_thresh_factor(qindex, cm->seq_params.bit_depth);
+    const int q = compute_rd_thresh_factor(qindex, cm->seq_params->bit_depth);
 
     for (bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {
       // Threshold here seems unnecessarily harsh but fine given actual
@@ -596,10 +654,17 @@
       frame_is_intra_only(cm) || (cm->current_frame.frame_number & 0x07) == 1;
   int num_planes = av1_num_planes(cm);
 
-  aom_clear_system_state();
-
   rd->RDMULT = av1_compute_rd_mult(
       cpi, cm->quant_params.base_qindex + cm->quant_params.y_dc_delta_q);
+#if CONFIG_RD_COMMAND
+  if (cpi->oxcf.pass == 2) {
+    const RD_COMMAND *rd_command = &cpi->rd_command;
+    if (rd_command->option_ls[rd_command->frame_index] ==
+        RD_OPTION_SET_Q_RDMULT) {
+      rd->RDMULT = rd_command->rdmult_ls[rd_command->frame_index];
+    }
+  }
+#endif  // CONFIG_RD_COMMAND
 
   av1_set_error_per_bit(&x->errorperbit, rd->RDMULT);
 
@@ -1019,12 +1084,16 @@
     const uint8_t *const ref_y_ptr =
         &ref_y_buffer[ref_y_stride * fp_row + fp_col];
     // Find sad for current vector.
-    const int this_sad = cpi->fn_ptr[block_size].sdf(
+    const int this_sad = cpi->ppi->fn_ptr[block_size].sdf(
         src_y_ptr, x->plane[0].src.stride, ref_y_ptr, ref_y_stride);
     // Note if it is the best so far.
     if (this_sad < best_sad) {
       best_sad = this_sad;
     }
+    if (i == 0)
+      x->pred_mv0_sad[ref_frame] = this_sad;
+    else if (i == 1)
+      x->pred_mv1_sad[ref_frame] = this_sad;
   }
 
   // Note the index of the mv that worked best in the reference list.
@@ -1282,27 +1351,12 @@
   rd->thresh_mult[THR_D45_PRED] = 2500;
 }
 
-void av1_update_rd_thresh_fact(const AV1_COMMON *const cm,
-                               int (*factor_buf)[MAX_MODES],
-                               int use_adaptive_rd_thresh, BLOCK_SIZE bsize,
-                               THR_MODES best_mode_index) {
-  assert(use_adaptive_rd_thresh > 0);
-  const THR_MODES top_mode = MAX_MODES;
-  const int max_rd_thresh_factor = use_adaptive_rd_thresh * RD_THRESH_MAX_FACT;
-
-  const int bsize_is_1_to_4 = bsize > cm->seq_params.sb_size;
-  BLOCK_SIZE min_size, max_size;
-  if (bsize_is_1_to_4) {
-    // This part handles block sizes with 1:4 and 4:1 aspect ratios
-    // TODO(any): Experiment with threshold update for parent/child blocks
-    min_size = bsize;
-    max_size = bsize;
-  } else {
-    min_size = AOMMAX(bsize - 2, BLOCK_4X4);
-    max_size = AOMMIN(bsize + 2, (int)cm->seq_params.sb_size);
-  }
-
-  for (THR_MODES mode = 0; mode < top_mode; ++mode) {
+static INLINE void update_thr_fact(int (*factor_buf)[MAX_MODES],
+                                   THR_MODES best_mode_index,
+                                   THR_MODES mode_start, THR_MODES mode_end,
+                                   BLOCK_SIZE min_size, BLOCK_SIZE max_size,
+                                   int max_rd_thresh_factor) {
+  for (THR_MODES mode = mode_start; mode < mode_end; ++mode) {
     for (BLOCK_SIZE bs = min_size; bs <= max_size; ++bs) {
       int *const fact = &factor_buf[bs][mode];
       if (mode == best_mode_index) {
@@ -1314,6 +1368,32 @@
   }
 }
 
+void av1_update_rd_thresh_fact(
+    const AV1_COMMON *const cm, int (*factor_buf)[MAX_MODES],
+    int use_adaptive_rd_thresh, BLOCK_SIZE bsize, THR_MODES best_mode_index,
+    THR_MODES inter_mode_start, THR_MODES inter_mode_end,
+    THR_MODES intra_mode_start, THR_MODES intra_mode_end) {
+  assert(use_adaptive_rd_thresh > 0);
+  const int max_rd_thresh_factor = use_adaptive_rd_thresh * RD_THRESH_MAX_FACT;
+
+  const int bsize_is_1_to_4 = bsize > cm->seq_params->sb_size;
+  BLOCK_SIZE min_size, max_size;
+  if (bsize_is_1_to_4) {
+    // This part handles block sizes with 1:4 and 4:1 aspect ratios
+    // TODO(any): Experiment with threshold update for parent/child blocks
+    min_size = bsize;
+    max_size = bsize;
+  } else {
+    min_size = AOMMAX(bsize - 2, BLOCK_4X4);
+    max_size = AOMMIN(bsize + 2, (int)cm->seq_params->sb_size);
+  }
+
+  update_thr_fact(factor_buf, best_mode_index, inter_mode_start, inter_mode_end,
+                  min_size, max_size, max_rd_thresh_factor);
+  update_thr_fact(factor_buf, best_mode_index, intra_mode_start, intra_mode_end,
+                  min_size, max_size, max_rd_thresh_factor);
+}
+
 int av1_get_intra_cost_penalty(int qindex, int qdelta,
                                aom_bit_depth_t bit_depth) {
   const int q = av1_dc_quant_QTX(qindex, qdelta, bit_depth);
diff --git a/av1/encoder/rd.h b/av1/encoder/rd.h
index bbc668b..7256ff9 100644
--- a/av1/encoder/rd.h
+++ b/av1/encoder/rd.h
@@ -19,6 +19,7 @@
 #include "av1/encoder/block.h"
 #include "av1/encoder/context_tree.h"
 #include "av1/encoder/cost.h"
+#include "av1/encoder/ratectrl.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -121,7 +122,12 @@
 
 static INLINE void av1_merge_rd_stats(RD_STATS *rd_stats_dst,
                                       const RD_STATS *rd_stats_src) {
-  assert(rd_stats_dst->rate != INT_MAX && rd_stats_src->rate != INT_MAX);
+  if (rd_stats_dst->rate == INT_MAX || rd_stats_src->rate == INT_MAX) {
+    // If rd_stats_dst or rd_stats_src has invalid rate, we will make
+    // rd_stats_dst invalid.
+    av1_invalid_rd_stats(rd_stats_dst);
+    return;
+  }
   rd_stats_dst->rate = (int)AOMMIN(
       ((int64_t)rd_stats_dst->rate + (int64_t)rd_stats_src->rate), INT_MAX);
   if (!rd_stats_dst->zero_rate)
@@ -186,7 +192,17 @@
 struct AV1_COMP;
 struct macroblock;
 
-int av1_compute_rd_mult_based_on_qindex(const struct AV1_COMP *cpi, int qindex);
+/*!\brief Compute rdmult based on q index and frame update type
+ *
+ * \param[in]       bit_depth       bit depth
+ * \param[in]       update_type     frame update type
+ * \param[in]       qindex          q index
+ *
+ * \return rdmult
+ */
+int av1_compute_rd_mult_based_on_qindex(aom_bit_depth_t bit_depth,
+                                        FRAME_UPDATE_TYPE update_type,
+                                        int qindex);
 
 int av1_compute_rd_mult(const struct AV1_COMP *cpi, int qindex);
 
@@ -223,7 +239,11 @@
 
 void av1_update_rd_thresh_fact(const AV1_COMMON *const cm,
                                int (*fact)[MAX_MODES], int rd_thresh,
-                               BLOCK_SIZE bsize, THR_MODES best_mode_index);
+                               BLOCK_SIZE bsize, THR_MODES best_mode_index,
+                               THR_MODES inter_mode_start,
+                               THR_MODES inter_mode_end,
+                               THR_MODES intra_mode_start,
+                               THR_MODES intra_mode_end);
 
 static INLINE void reset_thresh_freq_fact(MACROBLOCK *const x) {
   for (int i = 0; i < BLOCK_SIZES_ALL; ++i) {
@@ -279,37 +299,8 @@
 }
 
 // Used to reset the state of tx/mb rd hash information
-static INLINE void reset_hash_records(TxfmSearchInfo *const txfm_info,
-                                      int use_inter_txb_hash) {
-  int32_t record_idx;
+static INLINE void reset_hash_records(TxfmSearchInfo *const txfm_info) {
   if (!txfm_info->txb_rd_records) return;
-  // Reset the state for use_inter_txb_hash
-  if (use_inter_txb_hash) {
-    for (record_idx = 0;
-         record_idx < ((MAX_MIB_SIZE >> 1) * (MAX_MIB_SIZE >> 1)); record_idx++)
-      txfm_info->txb_rd_records->txb_rd_record_8X8[record_idx].num =
-          txfm_info->txb_rd_records->txb_rd_record_8X8[record_idx].index_start =
-              0;
-    for (record_idx = 0;
-         record_idx < ((MAX_MIB_SIZE >> 2) * (MAX_MIB_SIZE >> 2)); record_idx++)
-      txfm_info->txb_rd_records->txb_rd_record_16X16[record_idx].num =
-          txfm_info->txb_rd_records->txb_rd_record_16X16[record_idx]
-              .index_start = 0;
-    for (record_idx = 0;
-         record_idx < ((MAX_MIB_SIZE >> 3) * (MAX_MIB_SIZE >> 3)); record_idx++)
-      txfm_info->txb_rd_records->txb_rd_record_32X32[record_idx].num =
-          txfm_info->txb_rd_records->txb_rd_record_32X32[record_idx]
-              .index_start = 0;
-    for (record_idx = 0;
-         record_idx < ((MAX_MIB_SIZE >> 4) * (MAX_MIB_SIZE >> 4)); record_idx++)
-      txfm_info->txb_rd_records->txb_rd_record_64X64[record_idx].num =
-          txfm_info->txb_rd_records->txb_rd_record_64X64[record_idx]
-              .index_start = 0;
-  }
-
-  // Reset the state for use_intra_txb_hash
-  txfm_info->txb_rd_records->txb_rd_record_intra.num =
-      txfm_info->txb_rd_records->txb_rd_record_intra.index_start = 0;
 
   // Reset the state for use_mb_rd_hash
   txfm_info->txb_rd_records->mb_rd_record.num =
@@ -341,7 +332,18 @@
 
 int av1_get_adaptive_rdmult(const struct AV1_COMP *cpi, double beta);
 
-int av1_get_deltaq_offset(const struct AV1_COMP *cpi, int qindex, double beta);
+int av1_get_deltaq_offset(aom_bit_depth_t bit_depth, int qindex, double beta);
+
+/*!\brief Adjust current superblock's q_index based on delta q resolution
+ *
+ * \param[in]       delta_q_res       delta q resolution
+ * \param[in]       prev_qindex       previous superblock's q index
+ * \param[in]       curr_qindex       current superblock's q index
+ *
+ * \return the current superblock's adjusted q_index
+ */
+int av1_adjust_q_from_delta_q_res(int delta_q_res, int prev_qindex,
+                                  int curr_qindex);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index f462b65..0da6596 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -22,7 +22,6 @@
 #include "aom_mem/aom_mem.h"
 #include "aom_ports/aom_timer.h"
 #include "aom_ports/mem.h"
-#include "aom_ports/system_state.h"
 
 #include "av1/common/av1_common_int.h"
 #include "av1/common/cfl.h"
@@ -363,7 +362,6 @@
 static int get_est_rate_dist(const TileDataEnc *tile_data, BLOCK_SIZE bsize,
                              int64_t sse, int *est_residue_cost,
                              int64_t *est_dist) {
-  aom_clear_system_state();
   const InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize];
   if (md->ready) {
     if (sse < md->dist_mean) {
@@ -396,7 +394,6 @@
 }
 
 void av1_inter_mode_data_fit(TileDataEnc *tile_data, int rdmult) {
-  aom_clear_system_state();
   for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {
     const int block_idx = inter_mode_data_block_idx(bsize);
     InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize];
@@ -454,7 +451,6 @@
   if (block_idx == -1) return;
   InterModeRdModel *rd_model = &tile_data->inter_mode_rd_models[bsize];
   if (rd_model->num < INTER_MODE_RD_DATA_OVERALL_SIZE) {
-    aom_clear_system_state();
     const double ld = (sse - dist) * 1. / residue_cost;
     ++rd_model->num;
     rd_model->dist_sum += dist;
@@ -627,8 +623,8 @@
         get_plane_block_size(mbmi->bsize, pd->subsampling_x, pd->subsampling_y);
     unsigned int sse;
 
-    cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
-                       &sse);
+    cpi->ppi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf,
+                            pd->dst.stride, &sse);
     total_sse += sse;
     if (!plane && sse_y) *sse_y = sse;
   }
@@ -1156,24 +1152,25 @@
 
     int_mv best_mv;
     av1_single_motion_search(cpi, x, bsize, ref_idx, rate_mv, search_range,
-                             mode_info, &best_mv);
+                             mode_info, &best_mv, args);
     if (best_mv.as_int == INVALID_MV) return INT64_MAX;
 
     args->single_newmv[ref_mv_idx][refs[0]] = best_mv;
     args->single_newmv_rate[ref_mv_idx][refs[0]] = *rate_mv;
     args->single_newmv_valid[ref_mv_idx][refs[0]] = 1;
     cur_mv[0].as_int = best_mv.as_int;
+
+    // Return after single_newmv is set.
+    if (mode_info[mbmi->ref_mv_idx].skip) return INT64_MAX;
   }
 
   return 0;
 }
 
-static INLINE void update_mode_start_end_index(const AV1_COMP *const cpi,
-                                               int *mode_index_start,
-                                               int *mode_index_end,
-                                               int last_motion_mode_allowed,
-                                               int interintra_allowed,
-                                               int eval_motion_mode) {
+static INLINE void update_mode_start_end_index(
+    const AV1_COMP *const cpi, const MB_MODE_INFO *const mbmi,
+    int *mode_index_start, int *mode_index_end, int last_motion_mode_allowed,
+    int interintra_allowed, int eval_motion_mode) {
   *mode_index_start = (int)SIMPLE_TRANSLATION;
   *mode_index_end = (int)last_motion_mode_allowed + interintra_allowed;
   if (cpi->sf.winner_mode_sf.motion_mode_for_winner_cand) {
@@ -1185,6 +1182,8 @@
       *mode_index_start = 1;
     }
   }
+  if (cpi->sf.inter_sf.extra_prune_warped && mbmi->bsize > BLOCK_16X16)
+    *mode_index_end = SIMPLE_TRANSLATION;
 }
 
 /*!\brief AV1 motion mode search
@@ -1276,7 +1275,7 @@
   uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
   uint8_t best_tx_type_map[MAX_MIB_SIZE * MAX_MIB_SIZE];
   const int rate_mv0 = *rate_mv;
-  const int interintra_allowed = cm->seq_params.enable_interintra_compound &&
+  const int interintra_allowed = cm->seq_params->enable_interintra_compound &&
                                  is_interintra_allowed(mbmi) &&
                                  mbmi->compound_idx;
   WARP_SAMPLE_INFO *const warp_sample_info =
@@ -1287,7 +1286,6 @@
   assert(mbmi->ref_frame[1] != INTRA_FRAME);
   const MV_REFERENCE_FRAME ref_frame_1 = mbmi->ref_frame[1];
   av1_invalid_rd_stats(&best_rd_stats);
-  aom_clear_system_state();
   mbmi->num_proj_ref = 1;  // assume num_proj_ref >=1
   MOTION_MODE last_motion_mode_allowed = SIMPLE_TRANSLATION;
   *yrd = INT64_MAX;
@@ -1319,7 +1317,7 @@
   const int switchable_rate =
       av1_is_interp_needed(xd)
           ? av1_get_switchable_rate(x, xd, interp_filter,
-                                    cm->seq_params.enable_dual_filter)
+                                    cm->seq_params->enable_dual_filter)
           : 0;
   int64_t best_rd = INT64_MAX;
   int best_rate_mv = rate_mv0;
@@ -1330,7 +1328,7 @@
   // if SIMPLE_TRANSLATION has already been searched according to
   // the motion_mode_for_winner_cand speed feature, update the mode_index_start
   // to avoid searching it again.
-  update_mode_start_end_index(cpi, &mode_index_start, &mode_index_end,
+  update_mode_start_end_index(cpi, mbmi, &mode_index_start, &mode_index_end,
                               last_motion_mode_allowed, interintra_allowed,
                               eval_motion_mode);
   // Main function loop. This loops over all of the possible motion modes and
@@ -1356,11 +1354,11 @@
     // Do not search OBMC if the probability of selecting it is below a
     // predetermined threshold for this update_type and block size.
     const FRAME_UPDATE_TYPE update_type =
-        get_frame_update_type(&cpi->gf_group, cpi->gf_frame_index);
-    const int prune_obmc = cpi->frame_probs.obmc_probs[update_type][bsize] <
-                           cpi->sf.inter_sf.prune_obmc_prob_thresh;
-    if ((!cpi->oxcf.motion_mode_cfg.enable_obmc ||
-         cpi->sf.rt_sf.use_nonrd_pick_mode || prune_obmc) &&
+        get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+    const int prune_obmc =
+        cpi->ppi->frame_probs.obmc_probs[update_type][bsize] <
+        cpi->sf.inter_sf.prune_obmc_prob_thresh;
+    if ((!cpi->oxcf.motion_mode_cfg.enable_obmc || prune_obmc) &&
         mbmi->motion_mode == OBMC_CAUSAL)
       continue;
 
@@ -1374,7 +1372,7 @@
       assert(!is_comp_pred);
       if (have_newmv_in_inter_mode(this_mode)) {
         av1_single_motion_search(cpi, x, bsize, 0, &tmp_rate_mv, INT_MAX, NULL,
-                                 &mbmi->mv[0]);
+                                 &mbmi->mv[0], NULL);
         tmp_rate2 = rate2_nocoeff - rate_mv0 + tmp_rate_mv;
       }
       if ((mbmi->mv[0].as_int != cur_mv) || eval_motion_mode) {
@@ -1737,6 +1735,41 @@
   return 1;
 }
 
+// Skip NEARESTMV and NEARMV modes based on refmv weight computed in ref mv list
+// population
+static INLINE int skip_nearest_near_mv_using_refmv_weight(
+    const MACROBLOCK *const x, const PREDICTION_MODE this_mode,
+    const int8_t ref_frame_type) {
+  if (this_mode != NEARESTMV && this_mode != NEARMV) return 0;
+
+  const MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
+  const uint16_t *const ref_mv_weight = mbmi_ext->weight[ref_frame_type];
+  const int ref_mv_count =
+      AOMMIN(MAX_REF_MV_SEARCH, mbmi_ext->ref_mv_count[ref_frame_type]);
+
+  if (ref_mv_count == 0) return 0;
+  // If ref mv list has at least one nearest candidate do not prune NEARESTMV
+  if (this_mode == NEARESTMV && ref_mv_weight[0] >= REF_CAT_LEVEL) return 0;
+
+  // Count number of ref mvs populated from nearest candidates
+  int nearest_refmv_count = 0;
+  for (int ref_mv_idx = 0; ref_mv_idx < ref_mv_count; ref_mv_idx++) {
+    if (ref_mv_weight[ref_mv_idx] >= REF_CAT_LEVEL) nearest_refmv_count++;
+  }
+
+  // nearest_refmv_count indicates the closeness of block motion characteristics
+  // with respect to its spatial neighbor. Smaller value of nearest_refmv_count
+  // w.r.t. ref_mv_count means less correlation with its spatial neighbors.
+  // Hence less possibility for NEARESTMV and NEARMV modes becoming the best
+  // mode since these modes work well for blocks that share similar motion
+  // characteristics with its neighbor. Thus, NEARMV mode is pruned when
+  // nearest_refmv_count is relatively smaller than ref_mv_count and NEARESTMV
+  // mode is pruned if none of the ref mvs are populated from nearest candidate.
+  const int prune_thresh = 1 + (ref_mv_count >= 2);
+  if (nearest_refmv_count < prune_thresh) return 1;
+  return 0;
+}
+
 // This function update the non-new mv for the current prediction mode
 static INLINE int build_cur_mv(int_mv *cur_mv, PREDICTION_MODE this_mode,
                                const AV1_COMMON *cm, const MACROBLOCK *x,
@@ -1898,10 +1931,11 @@
 }
 
 // Compute the estimated RD cost for the motion vector with simple translation.
-static int64_t simple_translation_pred_rd(
-    AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats,
-    HandleInterModeArgs *args, int ref_mv_idx, inter_mode_info *mode_info,
-    int64_t ref_best_rd, BLOCK_SIZE bsize) {
+static int64_t simple_translation_pred_rd(AV1_COMP *const cpi, MACROBLOCK *x,
+                                          RD_STATS *rd_stats,
+                                          HandleInterModeArgs *args,
+                                          int ref_mv_idx, int64_t ref_best_rd,
+                                          BLOCK_SIZE bsize) {
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = xd->mi[0];
   MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
@@ -1934,7 +1968,6 @@
   const int drl_cost =
       get_drl_cost(mbmi, mbmi_ext, mode_costs->drl_mode_cost0, ref_frame_type);
   rd_stats->rate += drl_cost;
-  mode_info[ref_mv_idx].drl_cost = drl_cost;
 
   int_mv cur_mv[2];
   if (!build_cur_mv(cur_mv, mbmi->mode, cm, x, 0)) {
@@ -1988,8 +2021,8 @@
 static int ref_mv_idx_to_search(AV1_COMP *const cpi, MACROBLOCK *x,
                                 RD_STATS *rd_stats,
                                 HandleInterModeArgs *const args,
-                                int64_t ref_best_rd, inter_mode_info *mode_info,
-                                BLOCK_SIZE bsize, const int ref_set) {
+                                int64_t ref_best_rd, BLOCK_SIZE bsize,
+                                const int ref_set) {
   AV1_COMMON *const cm = &cpi->common;
   const MACROBLOCKD *const xd = &x->e_mbd;
   const MB_MODE_INFO *const mbmi = xd->mi[0];
@@ -2028,7 +2061,7 @@
       continue;
     }
     idx_rdcost[ref_mv_idx] = simple_translation_pred_rd(
-        cpi, x, rd_stats, args, ref_mv_idx, mode_info, ref_best_rd, bsize);
+        cpi, x, rd_stats, args, ref_mv_idx, ref_best_rd, bsize);
   }
   // Find the index with the best RD cost.
   int best_idx = 0;
@@ -2174,18 +2207,12 @@
     PruneInfoFromTpl *inter_cost_info_from_tpl) {
   AV1_COMMON *const cm = &cpi->common;
 
-  assert(IMPLIES(cpi->gf_group.size > 0,
-                 cpi->gf_frame_index < cpi->gf_group.size));
+  assert(IMPLIES(cpi->ppi->gf_group.size > 0,
+                 cpi->gf_frame_index < cpi->ppi->gf_group.size));
   const int tpl_idx = cpi->gf_frame_index;
-  TplParams *const tpl_data = &cpi->tpl_data;
-  if (tpl_idx >= MAX_TPL_FRAME_IDX) {
-    return;
-  }
+  TplParams *const tpl_data = &cpi->ppi->tpl_data;
+  if (!av1_tpl_stats_ready(tpl_data, tpl_idx)) return;
   const TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_idx];
-  if (!tpl_frame->is_valid) {
-    return;
-  }
-
   const TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
   const int mi_wide = mi_size_wide[bsize];
   const int mi_high = mi_size_high[bsize];
@@ -2278,101 +2305,6 @@
   return 0;
 }
 
-// If the current mode being searched is NEWMV, this function will look
-// at previously searched MVs and check if they are the same
-// as the current MV. If it finds that this MV is repeated, it compares
-// the cost to the previous MV and skips the rest of the search if it is
-// more expensive.
-static int skip_repeated_newmv(
-    AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
-    const int do_tx_search, const PREDICTION_MODE this_mode,
-    MB_MODE_INFO *best_mbmi, motion_mode_candidate *motion_mode_cand,
-    int64_t *ref_best_rd, RD_STATS *best_rd_stats, RD_STATS *best_rd_stats_y,
-    RD_STATS *best_rd_stats_uv, inter_mode_info *mode_info,
-    HandleInterModeArgs *args, int drl_cost, const int *refs, int_mv *cur_mv,
-    int64_t *best_rd, const BUFFER_SET orig_dst, int ref_mv_idx) {
-  // This feature only works for NEWMV when a previous mv has been searched
-  if (this_mode != NEWMV || ref_mv_idx == 0) return 0;
-  MACROBLOCKD *xd = &x->e_mbd;
-  const AV1_COMMON *cm = &cpi->common;
-  const int num_planes = av1_num_planes(cm);
-
-  int skip = 0;
-  int this_rate_mv = 0;
-  int i;
-  for (i = 0; i < ref_mv_idx; ++i) {
-    // Check if the motion search result same as previous results
-    if (cur_mv[0].as_int == args->single_newmv[i][refs[0]].as_int &&
-        args->single_newmv_valid[i][refs[0]]) {
-      // If the compared mode has no valid rd, it is unlikely this
-      // mode will be the best mode
-      if (mode_info[i].rd == INT64_MAX) {
-        skip = 1;
-        break;
-      }
-      // Compare the cost difference including drl cost and mv cost
-      if (mode_info[i].mv.as_int != INVALID_MV) {
-        const int compare_cost = mode_info[i].rate_mv + mode_info[i].drl_cost;
-        const int_mv ref_mv = av1_get_ref_mv(x, 0);
-        this_rate_mv = av1_mv_bit_cost(
-            &mode_info[i].mv.as_mv, &ref_mv.as_mv, x->mv_costs->nmv_joint_cost,
-            x->mv_costs->mv_cost_stack, MV_COST_WEIGHT);
-        const int this_cost = this_rate_mv + drl_cost;
-
-        if (compare_cost <= this_cost) {
-          // Skip this mode if it is more expensive as the previous result
-          // for this MV
-          skip = 1;
-          break;
-        } else {
-          // If the cost is less than current best result, make this
-          // the best and update corresponding variables unless the
-          // best_mv is the same as ref_mv. In this case we skip and
-          // rely on NEAR(EST)MV instead
-          if (best_mbmi->ref_mv_idx == i &&
-              best_mbmi->mv[0].as_int != ref_mv.as_int) {
-            assert(*best_rd != INT64_MAX);
-            assert(best_mbmi->mv[0].as_int == mode_info[i].mv.as_int);
-            best_mbmi->ref_mv_idx = ref_mv_idx;
-            motion_mode_cand->rate_mv = this_rate_mv;
-            best_rd_stats->rate += this_cost - compare_cost;
-            *best_rd =
-                RDCOST(x->rdmult, best_rd_stats->rate, best_rd_stats->dist);
-            // We also need to update mode_info here because we are setting
-            // (ref_)best_rd here. So we will not be able to search the same
-            // mode again with the current configuration.
-            mode_info[ref_mv_idx].mv.as_int = best_mbmi->mv[0].as_int;
-            mode_info[ref_mv_idx].rate_mv = this_rate_mv;
-            mode_info[ref_mv_idx].rd = *best_rd;
-            if (*best_rd < *ref_best_rd) *ref_best_rd = *best_rd;
-            break;
-          }
-        }
-      }
-    }
-  }
-  if (skip) {
-    const THR_MODES mode_enum = get_prediction_mode_idx(
-        best_mbmi->mode, best_mbmi->ref_frame[0], best_mbmi->ref_frame[1]);
-    // Collect mode stats for multiwinner mode processing
-    store_winner_mode_stats(
-        &cpi->common, x, best_mbmi, best_rd_stats, best_rd_stats_y,
-        best_rd_stats_uv, mode_enum, NULL, bsize, *best_rd,
-        cpi->sf.winner_mode_sf.multi_winner_mode_type, do_tx_search);
-    args->modelled_rd[this_mode][ref_mv_idx][refs[0]] =
-        args->modelled_rd[this_mode][i][refs[0]];
-    args->simple_rd[this_mode][ref_mv_idx][refs[0]] =
-        args->simple_rd[this_mode][i][refs[0]];
-    mode_info[ref_mv_idx].rd = mode_info[i].rd;
-    mode_info[ref_mv_idx].rate_mv = this_rate_mv;
-    mode_info[ref_mv_idx].mv.as_int = mode_info[i].mv.as_int;
-
-    restore_dst_buf(xd, orig_dst, num_planes);
-    return 1;
-  }
-  return 0;
-}
-
 /*!\brief High level function to select parameters for compound mode.
  *
  * \ingroup inter_mode_search
@@ -2431,7 +2363,7 @@
   MB_MODE_INFO *mbmi = xd->mi[0];
   const AV1_COMMON *cm = &cpi->common;
   const int masked_compound_used = is_any_masked_compound_used(bsize) &&
-                                   cm->seq_params.enable_masked_compound;
+                                   cm->seq_params->enable_masked_compound;
   int mode_search_mask = (1 << COMPOUND_AVERAGE) | (1 << COMPOUND_DISTWTD) |
                          (1 << COMPOUND_WEDGE) | (1 << COMPOUND_DIFFWTD);
 
@@ -2510,6 +2442,76 @@
   return 0;
 }
 
+/*!\brief Prunes ZeroMV Search Using Best NEWMV's SSE
+ *
+ * \ingroup inter_mode_search
+ *
+ * Compares the sse of zero mv and the best sse found in single new_mv. If the
+ * sse of the zero_mv is higher, returns 1 to signal zero_mv can be skipped.
+ * Else returns 0.
+ *
+ * Note that the sse here comes from single_motion_search. So it is
+ * interpolated with the filter in motion search, not the actual interpolation
+ * filter used in encoding.
+ *
+ * \param[in]     fn_ptr            A table of function pointers to compute SSE.
+ * \param[in]     x                 Pointer to struct holding all the data for
+ *                                  the current macroblock.
+ * \param[in]     bsize             The current block_size.
+ * \param[in]     args              The args to handle_inter_mode, used to track
+ *                                  the best SSE.
+ * \return Returns 1 if zero_mv is pruned, 0 otherwise.
+ */
+static AOM_INLINE int prune_zero_mv_with_sse(
+    const aom_variance_fn_ptr_t *fn_ptr, const MACROBLOCK *x, BLOCK_SIZE bsize,
+    const HandleInterModeArgs *args) {
+  const MACROBLOCKD *xd = &x->e_mbd;
+  const MB_MODE_INFO *mbmi = xd->mi[0];
+
+  const int is_comp_pred = has_second_ref(mbmi);
+  const MV_REFERENCE_FRAME *refs = mbmi->ref_frame;
+
+  // Check that the global mv is the same as ZEROMV
+  assert(mbmi->mv[0].as_int == 0);
+  assert(IMPLIES(is_comp_pred, mbmi->mv[0].as_int == 0));
+  assert(xd->global_motion[refs[0]].wmtype == TRANSLATION ||
+         xd->global_motion[refs[0]].wmtype == IDENTITY);
+
+  // Don't prune if we have invalid data
+  for (int idx = 0; idx < 1 + is_comp_pred; idx++) {
+    assert(mbmi->mv[0].as_int == 0);
+    if (args->best_single_sse_in_refs[refs[idx]] == INT32_MAX) {
+      return 0;
+    }
+  }
+
+  // Sum up the sse of ZEROMV and best NEWMV
+  unsigned int this_sse_sum = 0;
+  unsigned int best_sse_sum = 0;
+  for (int idx = 0; idx < 1 + is_comp_pred; idx++) {
+    const struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y];
+    const struct macroblockd_plane *pd = xd->plane;
+    const struct buf_2d *src_buf = &p->src;
+    const struct buf_2d *ref_buf = &pd->pre[idx];
+    const uint8_t *src = src_buf->buf;
+    const uint8_t *ref = ref_buf->buf;
+    const int src_stride = src_buf->stride;
+    const int ref_stride = ref_buf->stride;
+
+    unsigned int this_sse;
+    fn_ptr[bsize].vf(ref, ref_stride, src, src_stride, &this_sse);
+    this_sse_sum += this_sse;
+
+    const unsigned int best_sse = args->best_single_sse_in_refs[refs[idx]];
+    best_sse_sum += best_sse;
+  }
+  if (this_sse_sum > best_sse_sum) {
+    return 1;
+  }
+
+  return 0;
+}
+
 /*!\brief AV1 inter mode RD computation
  *
  * \ingroup inter_mode_search
@@ -2593,11 +2595,14 @@
   const int is_comp_pred = has_second_ref(mbmi);
   const PREDICTION_MODE this_mode = mbmi->mode;
 
-  const int tpl_idx = cpi->gf_frame_index;
-  TplParams *const tpl_data = &cpi->tpl_data;
+#if CONFIG_REALTIME_ONLY
+  const int prune_modes_based_on_tpl = 0;
+#else   // CONFIG_REALTIME_ONLY
+  const TplParams *const tpl_data = &cpi->ppi->tpl_data;
   const int prune_modes_based_on_tpl =
       cpi->sf.inter_sf.prune_inter_modes_based_on_tpl &&
-      tpl_idx < MAX_TPL_FRAME_IDX && tpl_data->tpl_frame[tpl_idx].is_valid;
+      av1_tpl_stats_ready(tpl_data, cpi->gf_frame_index);
+#endif  // CONFIG_REALTIME_ONLY
   int i;
   // Reference frames for this mode
   const int refs[2] = { mbmi->ref_frame[0],
@@ -2609,10 +2614,10 @@
   // of these currently holds the best predictor, and use the other
   // one for future predictions. In the end, copy from tmp_buf to
   // dst if necessary.
-  struct macroblockd_plane *p = xd->plane;
+  struct macroblockd_plane *pd = xd->plane;
   const BUFFER_SET orig_dst = {
-    { p[0].dst.buf, p[1].dst.buf, p[2].dst.buf },
-    { p[0].dst.stride, p[1].dst.stride, p[2].dst.stride },
+    { pd[0].dst.buf, pd[1].dst.buf, pd[2].dst.buf },
+    { pd[0].dst.stride, pd[1].dst.stride, pd[2].dst.stride },
   };
   const BUFFER_SET tmp_dst = { { tmp_buf, tmp_buf + 1 * MAX_SB_SQUARE,
                                  tmp_buf + 2 * MAX_SB_SQUARE },
@@ -2648,8 +2653,8 @@
   // Save MV results from first 2 ref_mv_idx.
   int_mv save_mv[MAX_REF_MV_SEARCH - 1][2];
   int best_ref_mv_idx = -1;
-  const int idx_mask = ref_mv_idx_to_search(cpi, x, rd_stats, args, ref_best_rd,
-                                            mode_info, bsize, ref_set);
+  const int idx_mask =
+      ref_mv_idx_to_search(cpi, x, rd_stats, args, ref_best_rd, bsize, ref_set);
   const int16_t mode_ctx =
       av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame);
   const ModeCosts *mode_costs = &x->mode_costs;
@@ -2657,6 +2662,16 @@
   const int base_rate =
       args->ref_frame_cost + args->single_comp_cost + ref_mv_cost;
 
+  // As per experiments, in the real-time preset the encoding-time impact of
+  // model-rd-based breakouts is small when the following conditions are true.
+  //    (1) compound mode is disabled
+  //    (2) interpolation filter search is disabled
+  // TODO(any): Check the impact of model rd based breakouts in other presets
+  const int skip_interp_search_modelrd_calc =
+      cpi->oxcf.mode == REALTIME &&
+      cm->current_frame.reference_mode == SINGLE_REFERENCE &&
+      cpi->sf.rt_sf.skip_interp_filter_search;
+
   for (i = 0; i < MAX_REF_MV_SEARCH - 1; ++i) {
     save_mv[i][0].as_int = INVALID_MV;
     save_mv[i][1].as_int = INVALID_MV;
@@ -2672,9 +2687,14 @@
   //        WARPED_CAUSAL)
   //    6.) Update stats if best so far
   for (int ref_mv_idx = 0; ref_mv_idx < ref_set; ++ref_mv_idx) {
+    mbmi->ref_mv_idx = ref_mv_idx;
+
     mode_info[ref_mv_idx].full_search_mv.as_int = INVALID_MV;
-    mode_info[ref_mv_idx].mv.as_int = INVALID_MV;
-    mode_info[ref_mv_idx].rd = INT64_MAX;
+    mode_info[ref_mv_idx].full_mv_bestsme = INT_MAX;
+    const int drl_cost = get_drl_cost(
+        mbmi, mbmi_ext, mode_costs->drl_mode_cost0, ref_frame_type);
+    mode_info[ref_mv_idx].drl_cost = drl_cost;
+    mode_info[ref_mv_idx].skip = 0;
 
     if (!mask_check_bit(idx_mask, ref_mv_idx)) {
       // MV did not perform well in simple translation search. Skip it.
@@ -2698,14 +2718,10 @@
 
     mbmi->num_proj_ref = 0;
     mbmi->motion_mode = SIMPLE_TRANSLATION;
-    mbmi->ref_mv_idx = ref_mv_idx;
 
     // Compute cost for signalling this DRL index
     rd_stats->rate = base_rate;
-    const int drl_cost = get_drl_cost(
-        mbmi, mbmi_ext, mode_costs->drl_mode_cost0, ref_frame_type);
     rd_stats->rate += drl_cost;
-    mode_info[ref_mv_idx].drl_cost = drl_cost;
 
     int rs = 0;
     int compmode_interinter_cost = 0;
@@ -2734,17 +2750,16 @@
 
       if (newmv_ret_val != 0) continue;
 
-      rd_stats->rate += rate_mv;
+      if (is_inter_singleref_mode(this_mode) &&
+          cur_mv[0].as_int != INVALID_MV) {
+        const MV_REFERENCE_FRAME ref = refs[0];
+        const unsigned int this_sse = x->pred_sse[ref];
+        if (this_sse < args->best_single_sse_in_refs[ref]) {
+          args->best_single_sse_in_refs[ref] = this_sse;
+        }
+      }
 
-      // skip NEWMV mode in drl if the motion search result is the same
-      // as a previous result
-      if (cpi->sf.inter_sf.skip_repeated_newmv &&
-          skip_repeated_newmv(cpi, x, bsize, do_tx_search, this_mode,
-                              &best_mbmi, motion_mode_cand, &ref_best_rd,
-                              &best_rd_stats, &best_rd_stats_y,
-                              &best_rd_stats_uv, mode_info, args, drl_cost,
-                              refs, cur_mv, &best_rd, orig_dst, ref_mv_idx))
-        continue;
+      rd_stats->rate += rate_mv;
     }
     // Copy the motion vector for this mode into mbmi struct
     for (i = 0; i < is_comp_pred + 1; ++i) {
@@ -2763,9 +2778,14 @@
                                 cpi->sf.inter_sf.prune_ref_mv_idx_search))
       continue;
 
-#if CONFIG_COLLECT_COMPONENT_TIMING
-    start_timing(cpi, compound_type_rd_time);
-#endif
+    if (cpi->sf.gm_sf.prune_zero_mv_with_sse &&
+        cpi->sf.gm_sf.gm_search_type == GM_DISABLE_SEARCH &&
+        (this_mode == GLOBALMV || this_mode == GLOBAL_GLOBALMV)) {
+      if (prune_zero_mv_with_sse(cpi->ppi->fn_ptr, x, bsize, args)) {
+        continue;
+      }
+    }
+
     int skip_build_pred = 0;
     const int mi_row = xd->mi_row;
     const int mi_col = xd->mi_col;
@@ -2773,50 +2793,54 @@
     // Handle a compound predictor, continue if it is determined this
     // cannot be the best compound mode
     if (is_comp_pred) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+      start_timing(cpi, compound_type_rd_time);
+#endif
       const int not_best_mode = process_compound_inter_mode(
           cpi, x, args, ref_best_rd, cur_mv, bsize, &compmode_interinter_cost,
           rd_buffers, &orig_dst, &tmp_dst, &rate_mv, rd_stats, skip_rd,
           &skip_build_pred);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+      end_timing(cpi, compound_type_rd_time);
+#endif
       if (not_best_mode) continue;
     }
 
+    if (!skip_interp_search_modelrd_calc) {
 #if CONFIG_COLLECT_COMPONENT_TIMING
-    end_timing(cpi, compound_type_rd_time);
+      start_timing(cpi, interpolation_filter_search_time);
 #endif
+      // Determine the interpolation filter for this mode
+      ret_val = av1_interpolation_filter_search(
+          x, cpi, tile_data, bsize, &tmp_dst, &orig_dst, &rd, &rs,
+          &skip_build_pred, args, ref_best_rd);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+      end_timing(cpi, interpolation_filter_search_time);
+#endif
+      if (args->modelled_rd != NULL && !is_comp_pred) {
+        args->modelled_rd[this_mode][ref_mv_idx][refs[0]] = rd;
+      }
+      if (ret_val != 0) {
+        restore_dst_buf(xd, orig_dst, num_planes);
+        continue;
+      } else if (cpi->sf.inter_sf.model_based_post_interp_filter_breakout &&
+                 ref_best_rd != INT64_MAX && (rd >> 3) * 3 > ref_best_rd) {
+        restore_dst_buf(xd, orig_dst, num_planes);
+        continue;
+      }
 
-#if CONFIG_COLLECT_COMPONENT_TIMING
-    start_timing(cpi, interpolation_filter_search_time);
-#endif
-    // Determine the interpolation filter for this mode
-    ret_val = av1_interpolation_filter_search(
-        x, cpi, tile_data, bsize, &tmp_dst, &orig_dst, &rd, &rs,
-        &skip_build_pred, args, ref_best_rd);
-#if CONFIG_COLLECT_COMPONENT_TIMING
-    end_timing(cpi, interpolation_filter_search_time);
-#endif
-    if (args->modelled_rd != NULL && !is_comp_pred) {
-      args->modelled_rd[this_mode][ref_mv_idx][refs[0]] = rd;
-    }
-    if (ret_val != 0) {
-      restore_dst_buf(xd, orig_dst, num_planes);
-      continue;
-    } else if (cpi->sf.inter_sf.model_based_post_interp_filter_breakout &&
-               ref_best_rd != INT64_MAX && (rd >> 3) * 3 > ref_best_rd) {
-      restore_dst_buf(xd, orig_dst, num_planes);
-      continue;
-    }
-
-    // Compute modelled RD if enabled
-    if (args->modelled_rd != NULL) {
-      if (is_comp_pred) {
-        const int mode0 = compound_ref0_mode(this_mode);
-        const int mode1 = compound_ref1_mode(this_mode);
-        const int64_t mrd =
-            AOMMIN(args->modelled_rd[mode0][ref_mv_idx][refs[0]],
-                   args->modelled_rd[mode1][ref_mv_idx][refs[1]]);
-        if ((rd >> 3) * 6 > mrd && ref_best_rd < INT64_MAX) {
-          restore_dst_buf(xd, orig_dst, num_planes);
-          continue;
+      // Compute modelled RD if enabled
+      if (args->modelled_rd != NULL) {
+        if (is_comp_pred) {
+          const int mode0 = compound_ref0_mode(this_mode);
+          const int mode1 = compound_ref1_mode(this_mode);
+          const int64_t mrd =
+              AOMMIN(args->modelled_rd[mode0][ref_mv_idx][refs[0]],
+                     args->modelled_rd[mode1][ref_mv_idx][refs[1]]);
+          if ((rd >> 3) * 6 > mrd && ref_best_rd < INT64_MAX) {
+            restore_dst_buf(xd, orig_dst, num_planes);
+            continue;
+          }
         }
       }
     }
@@ -2846,12 +2870,6 @@
 
     if (ret_val != INT64_MAX) {
       int64_t tmp_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
-      if (tmp_rd < mode_info[ref_mv_idx].rd) {
-        // Only update mode_info if the new result is actually better.
-        mode_info[ref_mv_idx].mv.as_int = mbmi->mv[0].as_int;
-        mode_info[ref_mv_idx].rate_mv = rate_mv;
-        mode_info[ref_mv_idx].rd = tmp_rd;
-      }
       const THR_MODES mode_enum = get_prediction_mode_idx(
           mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]);
       // Collect mode stats for multiwinner mode processing
@@ -2918,7 +2936,8 @@
                                        RD_STATS *rd_stats, BLOCK_SIZE bsize,
                                        int64_t best_rd) {
   const AV1_COMMON *const cm = &cpi->common;
-  if (!av1_allow_intrabc(cm) || !cpi->oxcf.kf_cfg.enable_intrabc)
+  if (!av1_allow_intrabc(cm) || !cpi->oxcf.kf_cfg.enable_intrabc ||
+      cpi->sf.rt_sf.use_nonrd_pick_mode)
     return INT64_MAX;
   const int num_planes = av1_num_planes(cm);
 
@@ -2931,8 +2950,8 @@
   const int mi_col = xd->mi_col;
   const int w = block_size_wide[bsize];
   const int h = block_size_high[bsize];
-  const int sb_row = mi_row >> cm->seq_params.mib_size_log2;
-  const int sb_col = mi_col >> cm->seq_params.mib_size_log2;
+  const int sb_row = mi_row >> cm->seq_params->mib_size_log2;
+  const int sb_col = mi_col >> cm->seq_params->mib_size_log2;
 
   MB_MODE_INFO_EXT *const mbmi_ext = &x->mbmi_ext;
   const MV_REFERENCE_FRAME ref_frame = INTRA_FRAME;
@@ -2955,7 +2974,7 @@
 
   int_mv dv_ref = nearestmv.as_int == 0 ? nearmv : nearestmv;
   if (dv_ref.as_int == 0) {
-    av1_find_ref_dv(&dv_ref, tile, cm->seq_params.mib_size, mi_row);
+    av1_find_ref_dv(&dv_ref, tile, cm->seq_params->mib_size, mi_row);
   }
   // Ref DV should not have sub-pel.
   assert((dv_ref.as_mv.col & 7) == 0);
@@ -3000,19 +3019,19 @@
         fullms_params.mv_limits.row_min =
             (tile->mi_row_start - mi_row) * MI_SIZE;
         fullms_params.mv_limits.row_max =
-            (sb_row * cm->seq_params.mib_size - mi_row) * MI_SIZE - h;
+            (sb_row * cm->seq_params->mib_size - mi_row) * MI_SIZE - h;
         break;
       case IBC_MOTION_LEFT:
         fullms_params.mv_limits.col_min =
             (tile->mi_col_start - mi_col) * MI_SIZE;
         fullms_params.mv_limits.col_max =
-            (sb_col * cm->seq_params.mib_size - mi_col) * MI_SIZE - w;
+            (sb_col * cm->seq_params->mib_size - mi_col) * MI_SIZE - w;
         // TODO(aconverse@google.com): Minimize the overlap between above and
         // left areas.
         fullms_params.mv_limits.row_min =
             (tile->mi_row_start - mi_row) * MI_SIZE;
         int bottom_coded_mi_edge =
-            AOMMIN((sb_row + 1) * cm->seq_params.mib_size, tile->mi_row_end);
+            AOMMIN((sb_row + 1) * cm->seq_params->mib_size, tile->mi_row_end);
         fullms_params.mv_limits.row_max =
             (bottom_coded_mi_edge - mi_row) * MI_SIZE - h;
         break;
@@ -3050,7 +3069,7 @@
                                 get_fullmv_from_mv(&dv)))
       continue;
     if (!av1_is_dv_valid(dv, cm, xd, mi_row, mi_col, bsize,
-                         cm->seq_params.mib_size_log2))
+                         cm->seq_params->mib_size_log2))
       continue;
 
     // DV should not have sub-pel.
@@ -3661,6 +3680,13 @@
     disable_reference(INTRA_FRAME, mask->ref_combo);
   }
 
+  if (!cpi->oxcf.tool_cfg.enable_global_motion) {
+    for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+      mask->pred_modes[ref_frame] |= (1 << GLOBALMV);
+      mask->pred_modes[ref_frame] |= (1 << GLOBAL_GLOBALMV);
+    }
+  }
+
   mask->pred_modes[INTRA_FRAME] |=
       ~(sf->intra_sf.intra_y_mode_mask[max_txsize_lookup[bsize]]);
 }
@@ -3811,8 +3837,8 @@
 
   av1_count_overlappable_neighbors(cm, xd);
   const FRAME_UPDATE_TYPE update_type =
-      get_frame_update_type(&cpi->gf_group, cpi->gf_frame_index);
-  const int prune_obmc = cpi->frame_probs.obmc_probs[update_type][bsize] <
+      get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+  const int prune_obmc = cpi->ppi->frame_probs.obmc_probs[update_type][bsize] <
                          cpi->sf.inter_sf.prune_obmc_prob_thresh;
   if (cpi->oxcf.motion_mode_cfg.enable_obmc && !prune_obmc) {
     if (check_num_overlappable_neighbors(mbmi) &&
@@ -3844,6 +3870,10 @@
   set_mode_eval_params(cpi, x, MODE_EVAL);
 
   x->comp_rd_stats_idx = 0;
+
+  for (int idx = 0; idx < REF_FRAMES; idx++) {
+    args->best_single_sse_in_refs[idx] = INT32_MAX;
+  }
 }
 
 static AOM_INLINE void init_inter_mode_search_state(
@@ -4089,8 +4119,17 @@
     const MACROBLOCKD *const xd = &x->e_mbd;
     if (search_state->best_rd != INT64_MAX && xd->left_available &&
         xd->up_available) {
+      const int thresholds[PRUNE_NEARMV_MAX][3] = { { 1, 0, 0 },
+                                                    { 1, 1, 0 },
+                                                    { 2, 1, 0 } };
+      const int qindex_sub_range = x->qindex * 3 / QINDEX_RANGE;
+
+      assert(sf->inter_sf.prune_nearmv_using_neighbors <= PRUNE_NEARMV_MAX &&
+             qindex_sub_range < 3);
       const int num_ref_frame_pair_match_thresh =
-          2 - (x->qindex * 3 / QINDEX_RANGE);
+          thresholds[sf->inter_sf.prune_nearmv_using_neighbors - 1]
+                    [qindex_sub_range];
+
       assert(num_ref_frame_pair_match_thresh <= 2 &&
              num_ref_frame_pair_match_thresh >= 0);
       int num_ref_frame_pair_match = 0;
@@ -4099,10 +4138,7 @@
       num_ref_frame_pair_match +=
           match_ref_frame_pair(xd->above_mbmi, ref_frame);
 
-      // Prune modes if:
-      // num_ref_frame_pair_match < 2 for qindex   0 to 85
-      // num_ref_frame_pair_match < 1 for qindex  86 to 170
-      // No pruning for qindex 171 to 255
+      // Pruning based on ref frame pair match with neighbors.
       if (num_ref_frame_pair_match < num_ref_frame_pair_match_thresh) return 1;
     }
   }
@@ -4422,12 +4458,14 @@
 // Prune compound mode using ref frames of neighbor blocks.
 static INLINE int compound_skip_using_neighbor_refs(
     MACROBLOCKD *const xd, const PREDICTION_MODE this_mode,
-    const MV_REFERENCE_FRAME *ref_frames, int prune_compound_using_neighbors) {
+    const MV_REFERENCE_FRAME *ref_frames, int prune_ext_comp_using_neighbors) {
   // Exclude non-extended compound modes from pruning
   if (this_mode == NEAREST_NEARESTMV || this_mode == NEAR_NEARMV ||
       this_mode == NEW_NEWMV || this_mode == GLOBAL_GLOBALMV)
     return 0;
 
+  if (prune_ext_comp_using_neighbors >= 3) return 1;
+
   int is_ref_match[2] = { 0 };  // 0 - match for forward refs
                                 // 1 - match for backward refs
   // Check if ref frames of this block matches with left neighbor.
@@ -4442,7 +4480,7 @@
   const int track_ref_match = is_ref_match[0] + is_ref_match[1];
 
   // Pruning based on ref frame match with neighbors.
-  if (track_ref_match >= prune_compound_using_neighbors) return 0;
+  if (track_ref_match >= prune_ext_comp_using_neighbors) return 0;
   return 1;
 }
 
@@ -4599,10 +4637,10 @@
     if (!is_inter_singleref_mode(mbmi->mode)) continue;
 
     x->txfm_search_info.skip_txfm = 0;
-    struct macroblockd_plane *p = xd->plane;
+    struct macroblockd_plane *pd = xd->plane;
     const BUFFER_SET orig_dst = {
-      { p[0].dst.buf, p[1].dst.buf, p[2].dst.buf },
-      { p[0].dst.stride, p[1].dst.stride, p[2].dst.stride },
+      { pd[0].dst.buf, pd[1].dst.buf, pd[2].dst.buf },
+      { pd[0].dst.stride, pd[1].dst.stride, pd[2].dst.stride },
     };
 
     set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
@@ -4651,8 +4689,6 @@
   int skip_ref_frame_mask;
   int reach_first_comp_mode;
   int mode_thresh_mul_fact;
-  int intra_mode_idx_ls[INTRA_MODES];
-  int intra_mode_num;
   int num_single_modes_processed;
   int prune_cpd_using_sr_stats_ready;
 } InterModeSFArgs;
@@ -4663,7 +4699,6 @@
                            InterModeSFArgs *args) {
   const SPEED_FEATURES *const sf = &cpi->sf;
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = xd->mi[0];
   // Get the actual prediction mode we are trying in this iteration
   const THR_MODES mode_enum = av1_default_mode_order[midx];
   const MODE_DEFINITION *mode_def = &av1_mode_defs[mode_enum];
@@ -4673,6 +4708,8 @@
   const MV_REFERENCE_FRAME second_ref_frame = ref_frames[1];
   const int comp_pred = second_ref_frame > INTRA_FRAME;
 
+  if (ref_frame == INTRA_FRAME) return 1;
+
   // Check if this mode should be skipped because it is incompatible with the
   // current frame
   if (inter_mode_compatible_skip(cpi, x, bsize, this_mode, ref_frames))
@@ -4709,23 +4746,6 @@
       return 1;
   }
 
-  // Speed features to prune out INTRA frames
-  if (ref_frame == INTRA_FRAME) {
-    if ((!cpi->oxcf.intra_mode_cfg.enable_smooth_intra ||
-         sf->intra_sf.disable_smooth_intra) &&
-        (mbmi->mode == SMOOTH_PRED || mbmi->mode == SMOOTH_H_PRED ||
-         mbmi->mode == SMOOTH_V_PRED))
-      return 1;
-    if (!cpi->oxcf.intra_mode_cfg.enable_paeth_intra &&
-        mbmi->mode == PAETH_PRED)
-      return 1;
-
-    // Intra modes will be handled in another loop later.
-    assert(args->intra_mode_num < INTRA_MODES);
-    args->intra_mode_idx_ls[args->intra_mode_num++] = mode_enum;
-    return 1;
-  }
-
   if (sf->inter_sf.prune_compound_using_single_ref && comp_pred) {
     // After we done with single reference modes, find the 2nd best RD
     // for a reference frame. Only search compound modes that have a reference
@@ -4740,10 +4760,16 @@
       return 1;
   }
 
-  if (sf->inter_sf.prune_compound_using_neighbors && comp_pred) {
+  // Skip NEW_NEARMV and NEAR_NEWMV extended compound modes
+  if (sf->inter_sf.skip_ext_comp_nearmv_mode &&
+      (this_mode == NEW_NEARMV || this_mode == NEAR_NEWMV)) {
+    return 1;
+  }
+
+  if (sf->inter_sf.prune_ext_comp_using_neighbors && comp_pred) {
     if (compound_skip_using_neighbor_refs(
             xd, this_mode, ref_frames,
-            sf->inter_sf.prune_compound_using_neighbors))
+            sf->inter_sf.prune_ext_comp_using_neighbors))
       return 1;
   }
 
@@ -4754,6 +4780,12 @@
       return 1;
   }
 
+  if (sf->inter_sf.prune_nearest_near_mv_using_refmv_weight && !comp_pred) {
+    const int8_t ref_frame_type = av1_ref_frame_type(ref_frames);
+    if (skip_nearest_near_mv_using_refmv_weight(x, this_mode, ref_frame_type))
+      return 1;
+  }
+
   return 0;
 }
 
@@ -4821,13 +4853,46 @@
           : INT64_MAX;
   *yrd = INT64_MAX;
   int64_t best_rd_in_this_partition = INT64_MAX;
+  int num_inter_mode_cands = inter_modes_info->num;
+  int newmv_mode_evaled = 0;
+  int max_allowed_cands = INT_MAX;
+  if (cpi->sf.inter_sf.limit_inter_mode_cands) {
+    // The bound on the no. of inter mode candidates, beyond which the
+    // candidates are limited if a newmv mode got evaluated, is set as
+    // max_allowed_cands + 1.
+    const int num_allowed_cands[4] = { INT_MAX, 10, 9, 6 };
+    assert(cpi->sf.inter_sf.limit_inter_mode_cands <= 3);
+    max_allowed_cands =
+        num_allowed_cands[cpi->sf.inter_sf.limit_inter_mode_cands];
+  }
+
+  int num_mode_thresh = INT_MAX;
+  if (cpi->sf.inter_sf.limit_txfm_eval_per_mode) {
+    // Bound the no. of transform searches per prediction mode beyond a
+    // threshold.
+    const int num_mode_thresh_ary[3] = { INT_MAX, 4, 3 };
+    assert(cpi->sf.inter_sf.limit_txfm_eval_per_mode <= 2);
+    num_mode_thresh =
+        num_mode_thresh_ary[cpi->sf.inter_sf.limit_txfm_eval_per_mode];
+  }
+
+  int num_tx_cands = 0;
+  int num_tx_search_modes[INTER_MODE_END - INTER_MODE_START] = { 0 };
   // Iterate over best inter mode candidates and perform tx search
-  for (int j = 0; j < inter_modes_info->num; ++j) {
+  for (int j = 0; j < num_inter_mode_cands; ++j) {
     const int data_idx = inter_modes_info->rd_idx_pair_arr[j].idx;
     *mbmi = inter_modes_info->mbmi_arr[data_idx];
     int64_t curr_est_rd = inter_modes_info->est_rd_arr[data_idx];
     if (curr_est_rd * 0.80 > top_est_rd) break;
 
+    if (num_tx_cands > num_mode_thresh) {
+      if ((mbmi->mode != NEARESTMV &&
+           num_tx_search_modes[mbmi->mode - INTER_MODE_START] >= 1) ||
+          (mbmi->mode == NEARESTMV &&
+           num_tx_search_modes[mbmi->mode - INTER_MODE_START] >= 2))
+        continue;
+    }
+
     txfm_info->skip_txfm = 0;
     set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
 
@@ -4861,6 +4926,9 @@
       if (!eval_txfm) continue;
     }
 
+    num_tx_cands++;
+    if (have_newmv_in_inter_mode(mbmi->mode)) newmv_mode_evaled = 1;
+    num_tx_search_modes[mbmi->mode - INTER_MODE_START]++;
     int64_t this_yrd = INT64_MAX;
     // Do the transform search
     if (!av1_txfm_search(cpi, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv,
@@ -4900,7 +4968,31 @@
       update_search_state(search_state, rd_cost, ctx, &rd_stats, &rd_stats_y,
                           &rd_stats_uv, mode_enum, x, txfm_search_done);
       search_state->best_skip_rd[0] = skip_rd;
+      // Limit the total number of modes to be evaluated if the first candidate
+      // is valid and uses transform skip or is a compound mode.
+      if (cpi->sf.inter_sf.inter_mode_txfm_breakout) {
+        if (!j && (search_state->best_mbmode.skip_txfm || rd_stats.skip_txfm)) {
+          // Evaluate more candidates at high quantizers where occurrence of
+          // transform skip is high.
+          const int max_cands_cap[5] = { 2, 3, 5, 7, 9 };
+          const int qindex_band = (5 * x->qindex) >> QINDEX_BITS;
+          num_inter_mode_cands =
+              AOMMIN(max_cands_cap[qindex_band], inter_modes_info->num);
+        } else if (!j && has_second_ref(&search_state->best_mbmode)) {
+          const int aggr = cpi->sf.inter_sf.inter_mode_txfm_breakout - 1;
+          // Evaluate more candidates at low quantizers where occurrence of
+          // single reference mode is high.
+          const int max_cands_cap_cmp[2][4] = { { 10, 7, 5, 4 },
+                                                { 10, 7, 5, 3 } };
+          const int qindex_band_cmp = (4 * x->qindex) >> QINDEX_BITS;
+          num_inter_mode_cands = AOMMIN(
+              max_cands_cap_cmp[aggr][qindex_band_cmp], inter_modes_info->num);
+        }
+      }
     }
+    // If the number of candidates evaluated exceeds max_allowed_cands, break if
+    // a newmv mode was evaluated already.
+    if ((num_tx_cands > max_allowed_cands) && newmv_mode_evaled) break;
   }
 }
 
@@ -4955,10 +5047,8 @@
  * \ingroup intra_mode_search
  *
  * This function searches for the best intra mode when the current frame is an
- * interframe. The list of luma intra mode candidates to be searched are stored
- * in InterModeSFArgs::intra_mode_idx_ls. This function however does *not*
- * handle luma palette mode. Palette mode is currently handled by \ref
- * av1_search_palette_mode.
+ * interframe. This function however does *not* handle luma palette mode.
+ * Palette mode is currently handled by \ref av1_search_palette_mode.
  *
  * This function will first iterate through the luma mode candidates to find the
  * best luma intra mode. Once the best luma mode it's found, it will then search
@@ -5020,13 +5110,36 @@
   const int num_4x4 = bsize_to_num_blk(bsize);
 
   // Performs luma search
-  for (int j = 0; j < sf_args->intra_mode_num; ++j) {
+  int64_t best_model_rd = INT64_MAX;
+  int64_t top_intra_model_rd[TOP_INTRA_MODEL_COUNT];
+  for (int i = 0; i < TOP_INTRA_MODEL_COUNT; i++) {
+    top_intra_model_rd[i] = INT64_MAX;
+  }
+  for (int mode_idx = 0; mode_idx < LUMA_MODE_COUNT; ++mode_idx) {
     if (sf->intra_sf.skip_intra_in_interframe &&
         search_state->intra_search_state.skip_intra_modes)
       break;
-    const THR_MODES mode_enum = sf_args->intra_mode_idx_ls[j];
-    const MODE_DEFINITION *mode_def = &av1_mode_defs[mode_enum];
-    const PREDICTION_MODE this_mode = mode_def->mode;
+    set_y_mode_and_delta_angle(mode_idx, mbmi);
+    assert(mbmi->mode < INTRA_MODE_END);
+
+    // Use intra_y_mode_mask speed feature to skip intra mode evaluation.
+    if (sf_args->mode_skip_mask->pred_modes[INTRA_FRAME] & (1 << mbmi->mode))
+      continue;
+
+    const THR_MODES mode_enum =
+        get_prediction_mode_idx(mbmi->mode, INTRA_FRAME, NONE_FRAME);
+    if ((!cpi->oxcf.intra_mode_cfg.enable_smooth_intra ||
+         cpi->sf.intra_sf.disable_smooth_intra) &&
+        (mbmi->mode == SMOOTH_PRED || mbmi->mode == SMOOTH_H_PRED ||
+         mbmi->mode == SMOOTH_V_PRED))
+      continue;
+    if (!cpi->oxcf.intra_mode_cfg.enable_paeth_intra &&
+        mbmi->mode == PAETH_PRED)
+      continue;
+    if (av1_is_directional_mode(mbmi->mode) &&
+        av1_use_angle_delta(bsize) == 0 && mbmi->angle_delta[PLANE_TYPE_Y] != 0)
+      continue;
+    const PREDICTION_MODE this_mode = mbmi->mode;
 
     assert(av1_mode_defs[mode_enum].ref_frame[0] == INTRA_FRAME);
     assert(av1_mode_defs[mode_enum].ref_frame[1] == NONE_FRAME);
@@ -5054,7 +5167,8 @@
     int64_t intra_rd_y = INT64_MAX;
     const int is_luma_result_valid = av1_handle_intra_y_mode(
         intra_search_state, cpi, x, bsize, intra_ref_frame_cost, ctx,
-        &intra_rd_stats_y, search_state->best_rd, &mode_cost_y, &intra_rd_y);
+        &intra_rd_stats_y, search_state->best_rd, &mode_cost_y, &intra_rd_y,
+        &best_model_rd, top_intra_model_rd);
     if (is_luma_result_valid && intra_rd_y < yrd_threshold) {
       is_best_y_mode_intra = 1;
       if (intra_rd_y < best_rd_y) {
@@ -5117,12 +5231,6 @@
         intra_rd_stats_uv.rate +
         intra_mode_info_cost_uv(cpi, x, mbmi, bsize, uv_mode_cost);
   }
-  if (mode != DC_PRED && mode != PAETH_PRED) {
-    const int intra_cost_penalty = av1_get_intra_cost_penalty(
-        cm->quant_params.base_qindex, cm->quant_params.y_dc_delta_q,
-        cm->seq_params.bit_depth);
-    intra_rd_stats.rate += intra_cost_penalty;
-  }
 
   // Intra block is always coded as non-skip
   intra_rd_stats.skip_txfm = 0;
@@ -5159,6 +5267,95 @@
   }
 }
 
+#if !CONFIG_REALTIME_ONLY
+// Prepare inter_cost and intra_cost from TPL stats, which are used as ML
+// features in intra mode pruning.
+static AOM_INLINE void calculate_cost_from_tpl_data(
+    const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row,
+    int mi_col, int64_t *inter_cost, int64_t *intra_cost) {
+  const AV1_COMMON *const cm = &cpi->common;
+  // Only consider full SB.
+  const BLOCK_SIZE sb_size = cm->seq_params->sb_size;
+  const int tpl_bsize_1d = cpi->ppi->tpl_data.tpl_bsize_1d;
+  const int len = (block_size_wide[sb_size] / tpl_bsize_1d) *
+                  (block_size_high[sb_size] / tpl_bsize_1d);
+  SuperBlockEnc *sb_enc = &x->sb_enc;
+  if (sb_enc->tpl_data_count == len) {
+    const BLOCK_SIZE tpl_bsize = convert_length_to_bsize(tpl_bsize_1d);
+    const int tpl_stride = sb_enc->tpl_stride;
+    const int tplw = mi_size_wide[tpl_bsize];
+    const int tplh = mi_size_high[tpl_bsize];
+    const int nw = mi_size_wide[bsize] / tplw;
+    const int nh = mi_size_high[bsize] / tplh;
+    if (nw >= 1 && nh >= 1) {
+      const int of_h = mi_row % mi_size_high[sb_size];
+      const int of_w = mi_col % mi_size_wide[sb_size];
+      const int start = of_h / tplh * tpl_stride + of_w / tplw;
+
+      for (int k = 0; k < nh; k++) {
+        for (int l = 0; l < nw; l++) {
+          *inter_cost += sb_enc->tpl_inter_cost[start + k * tpl_stride + l];
+          *intra_cost += sb_enc->tpl_intra_cost[start + k * tpl_stride + l];
+        }
+      }
+      *inter_cost /= nw * nh;
+      *intra_cost /= nw * nh;
+    }
+  }
+}
+#endif  // !CONFIG_REALTIME_ONLY
+
+// When the speed feature skip_intra_in_interframe > 0, enable ML model to prune
+// intra mode search.
+static AOM_INLINE void skip_intra_modes_in_interframe(
+    AV1_COMMON *const cm, struct macroblock *x, BLOCK_SIZE bsize,
+    InterModeSearchState *search_state, int64_t inter_cost, int64_t intra_cost,
+    int skip_intra_in_interframe) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  // Prune intra search based on best inter mode being transfrom skip.
+  if ((skip_intra_in_interframe >= 2) && search_state->best_mbmode.skip_txfm) {
+    const int qindex_thresh[2] = { 200, MAXQ };
+    const int ind = (skip_intra_in_interframe >= 3) ? 1 : 0;
+    if (!have_newmv_in_inter_mode(search_state->best_mbmode.mode) &&
+        (x->qindex <= qindex_thresh[ind])) {
+      search_state->intra_search_state.skip_intra_modes = 1;
+      return;
+    } else if ((skip_intra_in_interframe >= 4) &&
+               (inter_cost < 0 || intra_cost < 0)) {
+      search_state->intra_search_state.skip_intra_modes = 1;
+      return;
+    }
+  }
+  // Use ML model to prune intra search.
+  if (inter_cost >= 0 && intra_cost >= 0) {
+    const NN_CONFIG *nn_config = (AOMMIN(cm->width, cm->height) <= 480)
+                                     ? &av1_intrap_nn_config
+                                     : &av1_intrap_hd_nn_config;
+    float nn_features[6];
+    float scores[2] = { 0.0f };
+
+    nn_features[0] = (float)search_state->best_mbmode.skip_txfm;
+    nn_features[1] = (float)mi_size_wide_log2[bsize];
+    nn_features[2] = (float)mi_size_high_log2[bsize];
+    nn_features[3] = (float)intra_cost;
+    nn_features[4] = (float)inter_cost;
+    const int ac_q = av1_ac_quant_QTX(x->qindex, 0, xd->bd);
+    const int ac_q_max = av1_ac_quant_QTX(255, 0, xd->bd);
+    nn_features[5] = (float)(ac_q_max / ac_q);
+
+    av1_nn_predict(nn_features, nn_config, 1, scores);
+
+    // For two parameters, the max prob returned from av1_nn_softmax equals
+    // 1.0 / (1.0 + e^(-|diff_score|)). Here use scores directly to avoid the
+    // calling of av1_nn_softmax.
+    const float thresh[5] = { 1.4f, 1.4f, 1.4f, 1.4f, 1.4f };
+    assert(skip_intra_in_interframe <= 5);
+    if (scores[1] > scores[0] + thresh[skip_intra_in_interframe - 1]) {
+      search_state->intra_search_state.skip_intra_modes = 1;
+    }
+  }
+}
+
 // TODO(chiyotsai@google.com): See the todo for av1_rd_pick_intra_mode_sb.
 void av1_rd_pick_inter_mode(struct AV1_COMP *cpi, struct TileDataEnc *tile_data,
                             struct macroblock *x, struct RD_STATS *rd_cost,
@@ -5201,6 +5398,7 @@
                                -1,
                                -1,
                                -1,
+                               { 0 },
                                { 0 } };
   for (i = 0; i < MODE_CTX_REF_FRAMES; ++i) args.cmp_mode[i] = -1;
   // Indicates the appropriate number of simple translation winner modes for
@@ -5235,10 +5433,13 @@
          mbmi->partition != PARTITION_HORZ) ||
         cpi->sf.inter_sf.prune_ref_frame_for_rect_partitions >= 2) {
       picked_ref_frames_mask =
-          fetch_picked_ref_frames_mask(x, bsize, cm->seq_params.mib_size);
+          fetch_picked_ref_frames_mask(x, bsize, cm->seq_params->mib_size);
     }
   }
 
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, set_params_rd_pick_inter_mode_time);
+#endif
   // Skip ref frames that never selected by square blocks.
   const int skip_ref_frame_mask =
       picked_ref_frames_mask ? ~picked_ref_frames_mask : 0;
@@ -5250,6 +5451,9 @@
   set_params_rd_pick_inter_mode(cpi, x, &args, bsize, &mode_skip_mask,
                                 skip_ref_frame_mask, ref_costs_single,
                                 ref_costs_comp, yv12_mb);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, set_params_rd_pick_inter_mode_time);
+#endif
 
   int64_t best_est_rd = INT64_MAX;
   const InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize];
@@ -5307,40 +5511,14 @@
   const int do_pruning =
       (AOMMIN(cm->width, cm->height) > 480 && cpi->speed <= 1) ? 0 : 1;
   if (do_pruning && sf->intra_sf.skip_intra_in_interframe &&
-      cpi->oxcf.algo_cfg.enable_tpl_model) {
-    // Only consider full SB.
-    const BLOCK_SIZE sb_size = cm->seq_params.sb_size;
-    const int tpl_bsize_1d = cpi->tpl_data.tpl_bsize_1d;
-    const int len = (block_size_wide[sb_size] / tpl_bsize_1d) *
-                    (block_size_high[sb_size] / tpl_bsize_1d);
-    SuperBlockEnc *sb_enc = &x->sb_enc;
-    if (sb_enc->tpl_data_count == len) {
-      const BLOCK_SIZE tpl_bsize = convert_length_to_bsize(tpl_bsize_1d);
-      const int tpl_stride = sb_enc->tpl_stride;
-      const int tplw = mi_size_wide[tpl_bsize];
-      const int tplh = mi_size_high[tpl_bsize];
-      const int nw = mi_size_wide[bsize] / tplw;
-      const int nh = mi_size_high[bsize] / tplh;
-      if (nw >= 1 && nh >= 1) {
-        const int of_h = mi_row % mi_size_high[sb_size];
-        const int of_w = mi_col % mi_size_wide[sb_size];
-        const int start = of_h / tplh * tpl_stride + of_w / tplw;
-
-        for (int k = 0; k < nh; k++) {
-          for (int l = 0; l < nw; l++) {
-            inter_cost += sb_enc->tpl_inter_cost[start + k * tpl_stride + l];
-            intra_cost += sb_enc->tpl_intra_cost[start + k * tpl_stride + l];
-          }
-        }
-        inter_cost /= nw * nh;
-        intra_cost /= nw * nh;
-      }
-    }
-  }
+      cpi->oxcf.algo_cfg.enable_tpl_model)
+    calculate_cost_from_tpl_data(cpi, x, bsize, mi_row, mi_col, &inter_cost,
+                                 &intra_cost);
 #endif  // !CONFIG_REALTIME_ONLY
 
   // Initialize best mode stats for winner mode processing
-  av1_zero_array(x->winner_mode_stats, MAX_WINNER_MODE_COUNT_INTER);
+  zero_winner_mode_stats(bsize, MAX_WINNER_MODE_COUNT_INTER,
+                         x->winner_mode_stats);
   x->winner_mode_count = 0;
   store_winner_mode_stats(&cpi->common, x, mbmi, NULL, NULL, NULL, THR_INVALID,
                           NULL, bsize, best_rd_so_far,
@@ -5359,20 +5537,29 @@
                               skip_ref_frame_mask,
                               0,
                               mode_thresh_mul_fact,
-                              { 0 },
-                              0,
                               0,
                               0 };
   int64_t best_inter_yrd = INT64_MAX;
 
-  // This is the main loop of this function. It loops over all possible modes
-  // and calls handle_inter_mode() to compute the RD for each.
+  // This is the main loop of this function. It loops over all possible inter
+  // modes and calls handle_inter_mode() to compute the RD for each.
   // Here midx is just an iterator index that should not be used by itself
   // except to keep track of the number of modes searched. It should be used
   // with av1_default_mode_order to get the enum that defines the mode, which
   // can be used with av1_mode_defs to get the prediction mode and the ref
   // frames.
-  for (THR_MODES midx = THR_MODE_START; midx < THR_MODE_END; ++midx) {
+  // TODO(yunqing, any): Setting mode_start and mode_end outside for-loop brings
+  // good speedup for real time case. If we decide to use compound mode in real
+  // time, maybe we can modify av1_default_mode_order table.
+  THR_MODES mode_start = THR_INTER_MODE_START;
+  THR_MODES mode_end = THR_INTER_MODE_END;
+  const CurrentFrame *const current_frame = &cm->current_frame;
+  if (current_frame->reference_mode == SINGLE_REFERENCE) {
+    mode_start = SINGLE_REF_MODE_START;
+    mode_end = SINGLE_REF_MODE_END;
+  }
+
+  for (THR_MODES midx = mode_start; midx < mode_end; ++midx) {
     // Get the actual prediction mode we are trying in this iteration
     const THR_MODES mode_enum = av1_default_mode_order[midx];
     const MODE_DEFINITION *mode_def = &av1_mode_defs[mode_enum];
@@ -5390,9 +5577,16 @@
     txfm_info->skip_txfm = 0;
     sf_args.num_single_modes_processed += is_single_pred;
     set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
-
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    start_timing(cpi, skip_inter_mode_time);
+#endif
     // Apply speed features to decide if this inter mode can be skipped
-    if (skip_inter_mode(cpi, x, bsize, ref_frame_rd, midx, &sf_args)) continue;
+    const int is_skip_inter_mode =
+        skip_inter_mode(cpi, x, bsize, ref_frame_rd, midx, &sf_args);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    end_timing(cpi, skip_inter_mode_time);
+#endif
+    if (is_skip_inter_mode) continue;
 
     // Select prediction reference frames.
     for (i = 0; i < num_planes; i++) {
@@ -5519,36 +5713,11 @@
   // Gate intra mode evaluation if best of inter is skip except when source
   // variance is extremely low
   const unsigned int src_var_thresh_intra_skip = 1;
-  if (sf->intra_sf.skip_intra_in_interframe &&
-      (x->source_variance > src_var_thresh_intra_skip)) {
-    if (inter_cost >= 0 && intra_cost >= 0) {
-      aom_clear_system_state();
-      const NN_CONFIG *nn_config = (AOMMIN(cm->width, cm->height) <= 480)
-                                       ? &av1_intrap_nn_config
-                                       : &av1_intrap_hd_nn_config;
-      float nn_features[6];
-      float scores[2] = { 0.0f };
-      float probs[2] = { 0.0f };
-
-      nn_features[0] = (float)search_state.best_mbmode.skip_txfm;
-      nn_features[1] = (float)mi_size_wide_log2[bsize];
-      nn_features[2] = (float)mi_size_high_log2[bsize];
-      nn_features[3] = (float)intra_cost;
-      nn_features[4] = (float)inter_cost;
-      const int ac_q = av1_ac_quant_QTX(x->qindex, 0, xd->bd);
-      const int ac_q_max = av1_ac_quant_QTX(255, 0, xd->bd);
-      nn_features[5] = (float)(ac_q_max / ac_q);
-
-      av1_nn_predict(nn_features, nn_config, 1, scores);
-      aom_clear_system_state();
-      av1_nn_softmax(scores, probs, 2);
-
-      if (probs[1] > 0.8) search_state.intra_search_state.skip_intra_modes = 1;
-    } else if ((search_state.best_mbmode.skip_txfm) &&
-               (sf->intra_sf.skip_intra_in_interframe >= 2)) {
-      search_state.intra_search_state.skip_intra_modes = 1;
-    }
-  }
+  const int skip_intra_in_interframe = sf->intra_sf.skip_intra_in_interframe;
+  if (skip_intra_in_interframe &&
+      (x->source_variance > src_var_thresh_intra_skip))
+    skip_intra_modes_in_interframe(cm, x, bsize, &search_state, inter_cost,
+                                   intra_cost, skip_intra_in_interframe);
 
   const unsigned int intra_ref_frame_cost = ref_costs_single[INTRA_FRAME];
   search_intra_modes_in_interframe(&search_state, cpi, x, rd_cost, bsize, ctx,
@@ -5558,6 +5727,9 @@
   end_timing(cpi, handle_intra_mode_time);
 #endif
 
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, refine_winner_mode_tx_time);
+#endif
   int winner_mode_count =
       cpi->sf.winner_mode_sf.multi_winner_mode_type ? x->winner_mode_count : 1;
   // In effect only when fast tx search speed features are enabled.
@@ -5565,6 +5737,9 @@
       cpi, x, rd_cost, bsize, ctx, &search_state.best_mode_index,
       &search_state.best_mbmode, yv12_mb, search_state.best_rate_y,
       search_state.best_rate_uv, &search_state.best_skip2, winner_mode_count);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, refine_winner_mode_tx_time);
+#endif
 
   // Initialize default mode evaluation params
   set_mode_eval_params(cpi, x, DEFAULT_EVAL);
@@ -5639,9 +5814,9 @@
          !is_inter_block(&search_state.best_mbmode));
 
   if (!cpi->rc.is_src_frame_alt_ref && cpi->sf.inter_sf.adaptive_rd_thresh) {
-    av1_update_rd_thresh_fact(cm, x->thresh_freq_fact,
-                              sf->inter_sf.adaptive_rd_thresh, bsize,
-                              search_state.best_mode_index);
+    av1_update_rd_thresh_fact(
+        cm, x->thresh_freq_fact, sf->inter_sf.adaptive_rd_thresh, bsize,
+        search_state.best_mode_index, mode_start, mode_end, THR_DC, MAX_MODES);
   }
 
   // macroblock modes
@@ -5773,7 +5948,7 @@
       for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
         mbmi->interp_filters = av1_broadcast_interp_filter(i);
         rs = av1_get_switchable_rate(x, xd, interp_filter,
-                                     cm->seq_params.enable_dual_filter);
+                                     cm->seq_params->enable_dual_filter);
         if (rs < best_rs) {
           best_rs = rs;
           best_filter = mbmi->interp_filters.as_filters.y_filter;
@@ -5784,7 +5959,7 @@
   // Set the appropriate filter
   mbmi->interp_filters = av1_broadcast_interp_filter(best_filter);
   rate2 += av1_get_switchable_rate(x, xd, interp_filter,
-                                   cm->seq_params.enable_dual_filter);
+                                   cm->seq_params->enable_dual_filter);
 
   if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT)
     rate2 += comp_inter_cost[comp_pred];
@@ -5810,7 +5985,8 @@
   if (cpi->sf.inter_sf.adaptive_rd_thresh) {
     av1_update_rd_thresh_fact(cm, x->thresh_freq_fact,
                               cpi->sf.inter_sf.adaptive_rd_thresh, bsize,
-                              THR_GLOBALMV);
+                              THR_GLOBALMV, THR_INTER_MODE_START,
+                              THR_INTER_MODE_END, THR_DC, MAX_MODES);
   }
 
   av1_zero(best_pred_diff);
diff --git a/av1/encoder/rdopt.h b/av1/encoder/rdopt.h
index 362da7b..055a49e 100644
--- a/av1/encoder/rdopt.h
+++ b/av1/encoder/rdopt.h
@@ -217,10 +217,10 @@
 static INLINE int av1_get_sb_mi_size(const AV1_COMMON *const cm) {
   const int mi_alloc_size_1d = mi_size_wide[cm->mi_params.mi_alloc_bsize];
   int sb_mi_rows =
-      (mi_size_wide[cm->seq_params.sb_size] + mi_alloc_size_1d - 1) /
+      (mi_size_wide[cm->seq_params->sb_size] + mi_alloc_size_1d - 1) /
       mi_alloc_size_1d;
-  assert(mi_size_wide[cm->seq_params.sb_size] ==
-         mi_size_high[cm->seq_params.sb_size]);
+  assert(mi_size_wide[cm->seq_params->sb_size] ==
+         mi_size_high[cm->seq_params->sb_size]);
   int sb_mi_size = sb_mi_rows * sb_mi_rows;
 
   return sb_mi_size;
diff --git a/av1/encoder/rdopt_utils.h b/av1/encoder/rdopt_utils.h
index ddd180f..8dacc2e 100644
--- a/av1/encoder/rdopt_utils.h
+++ b/av1/encoder/rdopt_utils.h
@@ -266,7 +266,8 @@
     return single_inter_to_mode_idx[this_mode - SINGLE_INTER_MODE_START]
                                    [ref_frame];
   }
-  if (this_mode >= COMP_INTER_MODE_START && this_mode < COMP_INTER_MODE_END) {
+  if (this_mode >= COMP_INTER_MODE_START && this_mode < COMP_INTER_MODE_END &&
+      second_ref_frame != NONE_FRAME) {
     assert((ref_frame > INTRA_FRAME) && (ref_frame <= ALTREF_FRAME));
     assert((second_ref_frame > INTRA_FRAME) &&
            (second_ref_frame <= ALTREF_FRAME));
@@ -386,7 +387,7 @@
   // TODO(any): Move block independent condition checks to frame level
   if (is_inter_block(mbmi)) {
     if (is_inter_mode(best_mode) &&
-        sf->tx_sf.tx_type_search.fast_inter_tx_type_search &&
+        (sf->tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh != INT_MAX) &&
         !cpi->oxcf.txfm_cfg.use_inter_dct_only)
       return 1;
   } else {
@@ -433,8 +434,10 @@
   txfm_params->prune_2d_txfm_mode = sf->tx_sf.tx_type_search.prune_2d_txfm_mode;
   if (!winner_mode_tx_type_pruning) return;
 
-  const int prune_mode[2][2] = { { TX_TYPE_PRUNE_4, TX_TYPE_PRUNE_0 },
-                                 { TX_TYPE_PRUNE_5, TX_TYPE_PRUNE_2 } };
+  const int prune_mode[4][2] = { { TX_TYPE_PRUNE_3, TX_TYPE_PRUNE_0 },
+                                 { TX_TYPE_PRUNE_4, TX_TYPE_PRUNE_0 },
+                                 { TX_TYPE_PRUNE_5, TX_TYPE_PRUNE_2 },
+                                 { TX_TYPE_PRUNE_5, TX_TYPE_PRUNE_3 } };
   txfm_params->prune_2d_txfm_mode =
       prune_mode[winner_mode_tx_type_pruning - 1][is_winner_mode];
 }
@@ -475,7 +478,7 @@
 
   switch (mode_eval_type) {
     case DEFAULT_EVAL:
-      txfm_params->use_default_inter_tx_type = 0;
+      txfm_params->default_inter_tx_type_prob_thresh = INT_MAX;
       txfm_params->use_default_intra_tx_type = 0;
       txfm_params->skip_txfm_level =
           winner_mode_params->skip_txfm_level[DEFAULT_EVAL];
@@ -497,8 +500,8 @@
       txfm_params->use_default_intra_tx_type =
           (cpi->sf.tx_sf.tx_type_search.fast_intra_tx_type_search ||
            cpi->oxcf.txfm_cfg.use_intra_default_tx_only);
-      txfm_params->use_default_inter_tx_type =
-          cpi->sf.tx_sf.tx_type_search.fast_inter_tx_type_search;
+      txfm_params->default_inter_tx_type_prob_thresh =
+          cpi->sf.tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh;
       txfm_params->skip_txfm_level =
           winner_mode_params->skip_txfm_level[MODE_EVAL];
       txfm_params->predict_dc_level =
@@ -524,7 +527,7 @@
                         0);
       break;
     case WINNER_MODE_EVAL:
-      txfm_params->use_default_inter_tx_type = 0;
+      txfm_params->default_inter_tx_type_prob_thresh = INT_MAX;
       txfm_params->use_default_intra_tx_type = 0;
       txfm_params->skip_txfm_level =
           winner_mode_params->skip_txfm_level[WINNER_MODE_EVAL];
@@ -550,14 +553,7 @@
       set_tx_type_prune(sf, txfm_params,
                         sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning,
                         1);
-
-      // Reset hash state for winner mode processing. Winner mode and subsequent
-      // transform/mode evaluations (palette/IntraBC) cann't reuse old data as
-      // the decisions would have been sub-optimal
-      // TODO(any): Move the evaluation of palette/IntraBC modes before winner
-      // mode is processed and clean-up the code below
-      reset_hash_records(txfm_info, cpi->sf.tx_sf.use_inter_txb_hash);
-
+      reset_hash_records(txfm_info);
       break;
     default: assert(0);
   }
@@ -569,7 +565,7 @@
                                                       const MACROBLOCK *x) {
   const MACROBLOCKD *xd = &x->e_mbd;
 
-  if (cm->seq_params.monochrome || !xd->is_chroma_ref) return CFL_DISALLOWED;
+  if (cm->seq_params->monochrome || !xd->is_chroma_ref) return CFL_DISALLOWED;
 
   if (!xd->is_chroma_ref) {
     // For non-chroma-reference blocks, we should always store the luma pixels,
diff --git a/av1/encoder/reconinter_enc.c b/av1/encoder/reconinter_enc.c
index 6020b94..ca939de 100644
--- a/av1/encoder/reconinter_enc.c
+++ b/av1/encoder/reconinter_enc.c
@@ -418,3 +418,283 @@
         ext_dst1[plane], ext_dst_stride1[plane]);
   }
 }
+
+// Get pred block from up-sampled reference.
+void aom_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+                          int mi_row, int mi_col, const MV *const mv,
+                          uint8_t *comp_pred, int width, int height,
+                          int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+                          int ref_stride, int subpel_search) {
+  // expect xd == NULL only in tests
+  if (xd != NULL) {
+    const MB_MODE_INFO *mi = xd->mi[0];
+    const int ref_num = 0;
+    const int is_intrabc = is_intrabc_block(mi);
+    const struct scale_factors *const sf =
+        is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
+    const int is_scaled = av1_is_scaled(sf);
+
+    if (is_scaled) {
+      int plane = 0;
+      const int mi_x = mi_col * MI_SIZE;
+      const int mi_y = mi_row * MI_SIZE;
+      const struct macroblockd_plane *const pd = &xd->plane[plane];
+      const struct buf_2d *const dst_buf = &pd->dst;
+      const struct buf_2d *const pre_buf =
+          is_intrabc ? dst_buf : &pd->pre[ref_num];
+
+      InterPredParams inter_pred_params;
+      inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
+      const int_interpfilters filters =
+          av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+      av1_init_inter_params(
+          &inter_pred_params, width, height, mi_y >> pd->subsampling_y,
+          mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y,
+          xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters);
+      av1_enc_build_one_inter_predictor(comp_pred, width, mv,
+                                        &inter_pred_params);
+      return;
+    }
+  }
+
+  const InterpFilterParams *filter = av1_get_filter(subpel_search);
+
+  if (!subpel_x_q3 && !subpel_y_q3) {
+    for (int i = 0; i < height; i++) {
+      memcpy(comp_pred, ref, width * sizeof(*comp_pred));
+      comp_pred += width;
+      ref += ref_stride;
+    }
+  } else if (!subpel_y_q3) {
+    const int16_t *const kernel =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+    aom_convolve8_horiz_c(ref, ref_stride, comp_pred, width, kernel, 16, NULL,
+                          -1, width, height);
+  } else if (!subpel_x_q3) {
+    const int16_t *const kernel =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+    aom_convolve8_vert_c(ref, ref_stride, comp_pred, width, NULL, -1, kernel,
+                         16, width, height);
+  } else {
+    DECLARE_ALIGNED(16, uint8_t,
+                    temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
+    const int16_t *const kernel_x =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+    const int16_t *const kernel_y =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+    const int intermediate_height =
+        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
+    assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
+    aom_convolve8_horiz_c(ref - ref_stride * ((filter->taps >> 1) - 1),
+                          ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1,
+                          width, intermediate_height);
+    aom_convolve8_vert_c(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1),
+                         MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16,
+                         width, height);
+  }
+}
+
+void aom_comp_avg_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+                                   int mi_row, int mi_col, const MV *const mv,
+                                   uint8_t *comp_pred, const uint8_t *pred,
+                                   int width, int height, int subpel_x_q3,
+                                   int subpel_y_q3, const uint8_t *ref,
+                                   int ref_stride, int subpel_search) {
+  int i, j;
+
+  aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+                       subpel_x_q3, subpel_y_q3, ref, ref_stride,
+                       subpel_search);
+  for (i = 0; i < height; i++) {
+    for (j = 0; j < width; j++) {
+      comp_pred[j] = ROUND_POWER_OF_TWO(comp_pred[j] + pred[j], 1);
+    }
+    comp_pred += width;
+    pred += width;
+  }
+}
+
+void aom_comp_mask_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+                                    int mi_row, int mi_col, const MV *const mv,
+                                    uint8_t *comp_pred, const uint8_t *pred,
+                                    int width, int height, int subpel_x_q3,
+                                    int subpel_y_q3, const uint8_t *ref,
+                                    int ref_stride, const uint8_t *mask,
+                                    int mask_stride, int invert_mask,
+                                    int subpel_search) {
+  if (subpel_x_q3 | subpel_y_q3) {
+    aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+                         subpel_x_q3, subpel_y_q3, ref, ref_stride,
+                         subpel_search);
+    ref = comp_pred;
+    ref_stride = width;
+  }
+  aom_comp_mask_pred_c(comp_pred, pred, width, height, ref, ref_stride, mask,
+                       mask_stride, invert_mask);
+}
+
+void aom_dist_wtd_comp_avg_upsampled_pred_c(
+    MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+    int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search) {
+  int i, j;
+  const int fwd_offset = jcp_param->fwd_offset;
+  const int bck_offset = jcp_param->bck_offset;
+
+  aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+                       subpel_x_q3, subpel_y_q3, ref, ref_stride,
+                       subpel_search);
+
+  for (i = 0; i < height; i++) {
+    for (j = 0; j < width; j++) {
+      int tmp = pred[j] * bck_offset + comp_pred[j] * fwd_offset;
+      tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
+      comp_pred[j] = (uint8_t)tmp;
+    }
+    comp_pred += width;
+    pred += width;
+  }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+void aom_highbd_upsampled_pred_c(MACROBLOCKD *xd,
+                                 const struct AV1Common *const cm, int mi_row,
+                                 int mi_col, const MV *const mv,
+                                 uint8_t *comp_pred8, int width, int height,
+                                 int subpel_x_q3, int subpel_y_q3,
+                                 const uint8_t *ref8, int ref_stride, int bd,
+                                 int subpel_search) {
+  // expect xd == NULL only in tests
+  if (xd != NULL) {
+    const MB_MODE_INFO *mi = xd->mi[0];
+    const int ref_num = 0;
+    const int is_intrabc = is_intrabc_block(mi);
+    const struct scale_factors *const sf =
+        is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
+    const int is_scaled = av1_is_scaled(sf);
+
+    if (is_scaled) {
+      int plane = 0;
+      const int mi_x = mi_col * MI_SIZE;
+      const int mi_y = mi_row * MI_SIZE;
+      const struct macroblockd_plane *const pd = &xd->plane[plane];
+      const struct buf_2d *const dst_buf = &pd->dst;
+      const struct buf_2d *const pre_buf =
+          is_intrabc ? dst_buf : &pd->pre[ref_num];
+
+      InterPredParams inter_pred_params;
+      inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
+      const int_interpfilters filters =
+          av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+      av1_init_inter_params(
+          &inter_pred_params, width, height, mi_y >> pd->subsampling_y,
+          mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y,
+          xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters);
+      av1_enc_build_one_inter_predictor(comp_pred8, width, mv,
+                                        &inter_pred_params);
+      return;
+    }
+  }
+
+  const InterpFilterParams *filter = av1_get_filter(subpel_search);
+
+  if (!subpel_x_q3 && !subpel_y_q3) {
+    const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+    uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+    for (int i = 0; i < height; i++) {
+      memcpy(comp_pred, ref, width * sizeof(*comp_pred));
+      comp_pred += width;
+      ref += ref_stride;
+    }
+  } else if (!subpel_y_q3) {
+    const int16_t *const kernel =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+    aom_highbd_convolve8_horiz_c(ref8, ref_stride, comp_pred8, width, kernel,
+                                 16, NULL, -1, width, height, bd);
+  } else if (!subpel_x_q3) {
+    const int16_t *const kernel =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+    aom_highbd_convolve8_vert_c(ref8, ref_stride, comp_pred8, width, NULL, -1,
+                                kernel, 16, width, height, bd);
+  } else {
+    DECLARE_ALIGNED(16, uint16_t,
+                    temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]);
+    const int16_t *const kernel_x =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+    const int16_t *const kernel_y =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+    const int intermediate_height =
+        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
+    assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
+    aom_highbd_convolve8_horiz_c(ref8 - ref_stride * ((filter->taps >> 1) - 1),
+                                 ref_stride, CONVERT_TO_BYTEPTR(temp),
+                                 MAX_SB_SIZE, kernel_x, 16, NULL, -1, width,
+                                 intermediate_height, bd);
+    aom_highbd_convolve8_vert_c(
+        CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1)),
+        MAX_SB_SIZE, comp_pred8, width, NULL, -1, kernel_y, 16, width, height,
+        bd);
+  }
+}
+
+void aom_highbd_comp_avg_upsampled_pred_c(
+    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+    int ref_stride, int bd, int subpel_search) {
+  int i, j;
+
+  const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+  aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
+                            height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
+                            bd, subpel_search);
+  for (i = 0; i < height; ++i) {
+    for (j = 0; j < width; ++j) {
+      comp_pred[j] = ROUND_POWER_OF_TWO(pred[j] + comp_pred[j], 1);
+    }
+    comp_pred += width;
+    pred += width;
+  }
+}
+
+void aom_highbd_dist_wtd_comp_avg_upsampled_pred_c(
+    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+    int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param,
+    int subpel_search) {
+  int i, j;
+  const int fwd_offset = jcp_param->fwd_offset;
+  const int bck_offset = jcp_param->bck_offset;
+  const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+  aom_highbd_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
+                              height, subpel_x_q3, subpel_y_q3, ref8,
+                              ref_stride, bd, subpel_search);
+
+  for (i = 0; i < height; i++) {
+    for (j = 0; j < width; j++) {
+      int tmp = pred[j] * bck_offset + comp_pred[j] * fwd_offset;
+      tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
+      comp_pred[j] = (uint16_t)tmp;
+    }
+    comp_pred += width;
+    pred += width;
+  }
+}
+
+void aom_highbd_comp_mask_upsampled_pred(
+    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+    int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
+    int bd, int subpel_search) {
+  aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
+                            height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
+                            bd, subpel_search);
+  aom_highbd_comp_mask_pred(comp_pred8, pred8, width, height, comp_pred8, width,
+                            mask, mask_stride, invert_mask);
+}
+#endif  // CONFIG_AV1_HIGHBITDEPTH
diff --git a/av1/encoder/reconinter_enc.h b/av1/encoder/reconinter_enc.h
index fdc1f31..468e32b 100644
--- a/av1/encoder/reconinter_enc.h
+++ b/av1/encoder/reconinter_enc.h
@@ -24,6 +24,13 @@
 extern "C" {
 #endif
 
+void aom_highbd_comp_mask_upsampled_pred(
+    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+    int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
+    int bd, int subpel_search);
+
 // Build single or compound reference inter predictors for all planes.
 // Can build inter-intra predictors, masked predictors etc as well.
 void av1_enc_build_inter_predictor(const AV1_COMMON *cm, MACROBLOCKD *xd,
diff --git a/av1/encoder/segmentation.c b/av1/encoder/segmentation.c
index de17d57..edb6ef6 100644
--- a/av1/encoder/segmentation.c
+++ b/av1/encoder/segmentation.c
@@ -175,6 +175,14 @@
   int no_pred_cost;
   int t_pred_cost = INT_MAX;
   int tile_col, tile_row, mi_row, mi_col;
+
+  if (!seg->update_map) return;
+  if (cm->features.primary_ref_frame == PRIMARY_REF_NONE) {
+    seg->temporal_update = 0;
+    assert(seg->update_data == 1);
+    return;
+  }
+
   unsigned temporal_predictor_count[SEG_TEMPORAL_PRED_CTXS][2] = { { 0 } };
   unsigned no_pred_segcounts[MAX_SEGMENTS] = { 0 };
   unsigned t_unpred_seg_counts[MAX_SEGMENTS] = { 0 };
@@ -194,15 +202,15 @@
                  tile_info.mi_row_start * cm->mi_params.mi_stride +
                  tile_info.mi_col_start;
         for (mi_row = tile_info.mi_row_start; mi_row < tile_info.mi_row_end;
-             mi_row += cm->seq_params.mib_size,
-            mi_ptr += cm->seq_params.mib_size * cm->mi_params.mi_stride) {
+             mi_row += cm->seq_params->mib_size,
+            mi_ptr += cm->seq_params->mib_size * cm->mi_params.mi_stride) {
           MB_MODE_INFO **mi = mi_ptr;
           for (mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end;
-               mi_col += cm->seq_params.mib_size,
-              mi += cm->seq_params.mib_size) {
+               mi_col += cm->seq_params->mib_size,
+              mi += cm->seq_params->mib_size) {
             count_segs_sb(cm, xd, &tile_info, mi, no_pred_segcounts,
                           temporal_predictor_count, t_unpred_seg_counts, mi_row,
-                          mi_col, cm->seq_params.sb_size);
+                          mi_col, cm->seq_params->sb_size);
           }
         }
       }
diff --git a/av1/encoder/sorting_network.h b/av1/encoder/sorting_network.h
new file mode 100644
index 0000000..54f4c19
--- /dev/null
+++ b/av1/encoder/sorting_network.h
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*! \file
+ * This file contains several utility functions used to sort small arrays with
+ * sorting networks.
+ *
+ * A sorting network is a (potentially branchless) way to quickly sort small
+ * arrays of known size. For more details, consult
+ * (https://en.wikipedia.org/wiki/Sorting_network).
+ */
+#ifndef AOM_AV1_ENCODER_SORTING_NETWORK_H_
+#define AOM_AV1_ENCODER_SORTING_NETWORK_H_
+
+#include "aom/aom_integer.h"
+
+#define SWAP(i, j)                                   \
+  do {                                               \
+    const float maxf = (k[i] >= k[j]) ? k[i] : k[j]; \
+    const float minf = (k[i] >= k[j]) ? k[j] : k[i]; \
+    const int maxi = (k[i] >= k[j]) ? v[i] : v[j];   \
+    const int mini = (k[i] >= k[j]) ? v[j] : v[i];   \
+    k[i] = maxf;                                     \
+    k[j] = minf;                                     \
+    v[i] = maxi;                                     \
+    v[j] = mini;                                     \
+  } while (0)
+
+/*!\brief Sorts two size-16 arrays of keys and values in descending order of
+ * keys.
+ *
+ * \param[in,out]    k          A length-16 array of floats serving as the keys.
+ * \param[in,out]    v          A length-16 array of int32 serving as the
+ *                              values.
+ */
+static AOM_INLINE void av1_sort_fi32_16(float k[], int32_t v[]) {
+  SWAP(0, 1);
+  SWAP(2, 3);
+  SWAP(4, 5);
+  SWAP(6, 7);
+  SWAP(8, 9);
+  SWAP(10, 11);
+  SWAP(12, 13);
+  SWAP(14, 15);
+  SWAP(0, 2);
+  SWAP(1, 3);
+  SWAP(4, 6);
+  SWAP(5, 7);
+  SWAP(8, 10);
+  SWAP(9, 11);
+  SWAP(12, 14);
+  SWAP(13, 15);
+  SWAP(1, 2);
+  SWAP(5, 6);
+  SWAP(0, 4);
+  SWAP(3, 7);
+  SWAP(9, 10);
+  SWAP(13, 14);
+  SWAP(8, 12);
+  SWAP(11, 15);
+  SWAP(1, 5);
+  SWAP(2, 6);
+  SWAP(9, 13);
+  SWAP(10, 14);
+  SWAP(0, 8);
+  SWAP(7, 15);
+  SWAP(1, 4);
+  SWAP(3, 6);
+  SWAP(9, 12);
+  SWAP(11, 14);
+  SWAP(2, 4);
+  SWAP(3, 5);
+  SWAP(10, 12);
+  SWAP(11, 13);
+  SWAP(1, 9);
+  SWAP(6, 14);
+  SWAP(3, 4);
+  SWAP(11, 12);
+  SWAP(1, 8);
+  SWAP(2, 10);
+  SWAP(5, 13);
+  SWAP(7, 14);
+  SWAP(3, 11);
+  SWAP(2, 8);
+  SWAP(4, 12);
+  SWAP(7, 13);
+  SWAP(3, 10);
+  SWAP(5, 12);
+  SWAP(3, 9);
+  SWAP(6, 12);
+  SWAP(3, 8);
+  SWAP(7, 12);
+  SWAP(5, 9);
+  SWAP(6, 10);
+  SWAP(4, 8);
+  SWAP(7, 11);
+  SWAP(5, 8);
+  SWAP(7, 10);
+  SWAP(6, 8);
+  SWAP(7, 9);
+  SWAP(7, 8);
+}
+
+/*!\brief Sorts two size-8 arrays of keys and values in descending order of
+ * keys.
+ *
+ * \param[in,out]    k          A length-8 array of floats serving as the keys.
+ * \param[in,out]    v          A length-8 array of int32 serving as the values.
+ */
+static AOM_INLINE void av1_sort_fi32_8(float k[], int32_t v[]) {
+  SWAP(0, 1);
+  SWAP(2, 3);
+  SWAP(4, 5);
+  SWAP(6, 7);
+  SWAP(0, 2);
+  SWAP(1, 3);
+  SWAP(4, 6);
+  SWAP(5, 7);
+  SWAP(1, 2);
+  SWAP(5, 6);
+  SWAP(0, 4);
+  SWAP(3, 7);
+  SWAP(1, 5);
+  SWAP(2, 6);
+  SWAP(1, 4);
+  SWAP(3, 6);
+  SWAP(2, 4);
+  SWAP(3, 5);
+  SWAP(3, 4);
+}
+#undef SWAP
+#endif  // AOM_AV1_ENCODER_SORTING_NETWORK_H_
diff --git a/av1/encoder/sparse_linear_solver.c b/av1/encoder/sparse_linear_solver.c
index 1c556c2..1600830 100644
--- a/av1/encoder/sparse_linear_solver.c
+++ b/av1/encoder/sparse_linear_solver.c
@@ -8,7 +8,6 @@
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
-#include <float.h>
 #include "av1/common/av1_common_int.h"
 #include "av1/encoder/sparse_linear_solver.h"
 #include "config/aom_config.h"
@@ -16,7 +15,6 @@
 #include "av1/common/alloccommon.h"
 
 #if CONFIG_OPTICAL_FLOW_API
-
 /*
  * Input:
  * rows: array of row positions
@@ -28,10 +26,13 @@
  *
  * Output:
  * sm: pointer to the sparse matrix to be initialized
+ *
+ * Return: 0  - success
+ *         -1 - failed
  */
-void av1_init_sparse_mtx(const int *rows, const int *cols, const double *values,
-                         int num_elem, int num_rows, int num_cols,
-                         SPARSE_MTX *sm) {
+int av1_init_sparse_mtx(const int *rows, const int *cols, const double *values,
+                        int num_elem, int num_rows, int num_cols,
+                        SPARSE_MTX *sm) {
   sm->n_elem = num_elem;
   sm->n_rows = num_rows;
   sm->n_cols = num_cols;
@@ -39,15 +40,22 @@
     sm->row_pos = NULL;
     sm->col_pos = NULL;
     sm->value = NULL;
-    return;
+    return 0;
   }
   sm->row_pos = aom_calloc(num_elem, sizeof(*sm->row_pos));
   sm->col_pos = aom_calloc(num_elem, sizeof(*sm->col_pos));
   sm->value = aom_calloc(num_elem, sizeof(*sm->value));
 
+  if (!sm->row_pos || !sm->col_pos || !sm->value) {
+    av1_free_sparse_mtx_elems(sm);
+    return -1;
+  }
+
   memcpy(sm->row_pos, rows, num_elem * sizeof(*sm->row_pos));
   memcpy(sm->col_pos, cols, num_elem * sizeof(*sm->col_pos));
   memcpy(sm->value, values, num_elem * sizeof(*sm->value));
+
+  return 0;
 }
 
 /*
@@ -61,12 +69,15 @@
  *
  * Output:
  * sm: the combined matrix
+ *
+ * Return: 0  - success
+ *         -1 - failed
  */
-void av1_init_combine_sparse_mtx(const SPARSE_MTX *sm1, const SPARSE_MTX *sm2,
-                                 SPARSE_MTX *sm, int row_offset1,
-                                 int col_offset1, int row_offset2,
-                                 int col_offset2, int new_n_rows,
-                                 int new_n_cols) {
+int av1_init_combine_sparse_mtx(const SPARSE_MTX *sm1, const SPARSE_MTX *sm2,
+                                SPARSE_MTX *sm, int row_offset1,
+                                int col_offset1, int row_offset2,
+                                int col_offset2, int new_n_rows,
+                                int new_n_cols) {
   sm->n_elem = sm1->n_elem + sm2->n_elem;
   sm->n_cols = new_n_cols;
   sm->n_rows = new_n_rows;
@@ -75,12 +86,18 @@
     sm->row_pos = NULL;
     sm->col_pos = NULL;
     sm->value = NULL;
-    return;
+    return 0;
   }
+
   sm->row_pos = aom_calloc(sm->n_elem, sizeof(*sm->row_pos));
   sm->col_pos = aom_calloc(sm->n_elem, sizeof(*sm->col_pos));
   sm->value = aom_calloc(sm->n_elem, sizeof(*sm->value));
 
+  if (!sm->row_pos || !sm->col_pos || !sm->value) {
+    av1_free_sparse_mtx_elems(sm);
+    return -1;
+  }
+
   for (int i = 0; i < sm1->n_elem; i++) {
     sm->row_pos[i] = sm1->row_pos[i] + row_offset1;
     sm->col_pos[i] = sm1->col_pos[i] + col_offset1;
@@ -92,6 +109,7 @@
     sm->col_pos[n_elem1 + i] = sm2->col_pos[i] + col_offset2;
   }
   memcpy(sm->value + n_elem1, sm2->value, sm2->n_elem * sizeof(*sm2->value));
+  return 0;
 }
 
 void av1_free_sparse_mtx_elems(SPARSE_MTX *sm) {
@@ -169,6 +187,19 @@
   }
 }
 
+static INLINE void free_solver_local_buf(double *buf1, double *buf2,
+                                         double *buf3, double *buf4,
+                                         double *buf5, double *buf6,
+                                         double *buf7) {
+  aom_free(buf1);
+  aom_free(buf2);
+  aom_free(buf3);
+  aom_free(buf4);
+  aom_free(buf5);
+  aom_free(buf6);
+  aom_free(buf7);
+}
+
 /*
  * Solve for Ax = b
  * no requirement on A
@@ -177,13 +208,18 @@
  * A: the sparse matrix
  * b: the vector b
  * bl: length of b
+ * x: the vector x
  *
  * Output:
  * x: pointer to the solution vector
+ *
+ * Return: 0  - success
+ *         -1 - failed
  */
-void av1_bi_conjugate_gradient_sparse(const SPARSE_MTX *A, const double *b,
-                                      int bl, double *x) {
-  double *r, *r_hat, *p, *p_hat, *Ap, *p_hatA, *x_hat;
+int av1_bi_conjugate_gradient_sparse(const SPARSE_MTX *A, const double *b,
+                                     int bl, double *x) {
+  double *r = NULL, *r_hat = NULL, *p = NULL, *p_hat = NULL, *Ap = NULL,
+         *p_hatA = NULL, *x_hat = NULL;
   double alpha, beta, rtr, r_norm_2;
   double denormtemp;
 
@@ -195,6 +231,10 @@
   Ap = aom_calloc(bl, sizeof(*Ap));
   p_hatA = aom_calloc(bl, sizeof(*p_hatA));
   x_hat = aom_calloc(bl, sizeof(*x_hat));
+  if (!r || !r_hat || !p || !p_hat || !Ap || !p_hatA || !x_hat) {
+    free_solver_local_buf(r, r_hat, p, p_hat, Ap, p_hatA, x_hat);
+    return -1;
+  }
 
   int i;
   for (i = 0; i < bl; i++) {
@@ -233,13 +273,8 @@
     }
   }
   // free
-  aom_free(r);
-  aom_free(r_hat);
-  aom_free(p);
-  aom_free(p_hat);
-  aom_free(Ap);
-  aom_free(p_hatA);
-  aom_free(x_hat);
+  free_solver_local_buf(r, r_hat, p, p_hat, Ap, p_hatA, x_hat);
+  return 0;
 }
 
 /*
@@ -249,13 +284,17 @@
  * A: the sparse matrix
  * b: the vector b
  * bl: length of b
+ * x: the vector x
  *
  * Output:
  * x: pointer to the solution vector
+ *
+ * Return: 0  - success
+ *         -1 - failed
  */
-void av1_conjugate_gradient_sparse(const SPARSE_MTX *A, const double *b, int bl,
-                                   double *x) {
-  double *r, *p, *Ap;
+int av1_conjugate_gradient_sparse(const SPARSE_MTX *A, const double *b, int bl,
+                                  double *x) {
+  double *r = NULL, *p = NULL, *Ap = NULL;
   double alpha, beta, rtr, r_norm_2;
   double denormtemp;
 
@@ -263,6 +302,10 @@
   r = aom_calloc(bl, sizeof(*r));
   p = aom_calloc(bl, sizeof(*p));
   Ap = aom_calloc(bl, sizeof(*Ap));
+  if (!r || !p || !Ap) {
+    free_solver_local_buf(r, p, Ap, NULL, NULL, NULL, NULL);
+    return -1;
+  }
 
   int i;
   for (i = 0; i < bl; i++) {
@@ -292,9 +335,9 @@
     }
   }
   // free
-  aom_free(r);
-  aom_free(p);
-  aom_free(Ap);
+  free_solver_local_buf(r, p, Ap, NULL, NULL, NULL, NULL);
+
+  return 0;
 }
 
 /*
@@ -304,18 +347,29 @@
  * A: the sparse matrix
  * b: the vector b
  * bl: length of b
+ * x: the vector x
  *
  * Output:
  * x: pointer to the solution vector
+ *
+ * Return: 0  - success
+ *         -1 - failed
  */
-void av1_jacobi_sparse(const SPARSE_MTX *A, const double *b, int bl,
-                       double *x) {
-  double *diags, *Rx, *x_last, *x_cur, *tempx;
+int av1_jacobi_sparse(const SPARSE_MTX *A, const double *b, int bl, double *x) {
+  double *diags = NULL, *Rx = NULL, *x_last = NULL, *x_cur = NULL,
+         *tempx = NULL;
   double resi2;
-  diags = aom_calloc(bl, sizeof((*diags)));
+
+  diags = aom_calloc(bl, sizeof(*diags));
   Rx = aom_calloc(bl, sizeof(*Rx));
   x_last = aom_calloc(bl, sizeof(*x_last));
   x_cur = aom_calloc(bl, sizeof(*x_cur));
+
+  if (!diags || !Rx || !x_last || !x_cur) {
+    free_solver_local_buf(diags, Rx, x_last, x_cur, NULL, NULL, NULL);
+    return -1;
+  }
+
   int i;
   memset(x_last, 0, sizeof(*x_last) * bl);
   // get the diagonals of A
@@ -348,10 +402,8 @@
   for (i = 0; i < bl; i++) {
     x[i] = x_cur[i];
   }
-  aom_free(diags);
-  aom_free(Rx);
-  aom_free(x_last);
-  aom_free(x_cur);
+  free_solver_local_buf(diags, Rx, x_last, x_cur, NULL, NULL, NULL);
+  return 0;
 }
 
 /*
@@ -361,17 +413,28 @@
  * A: the sparse matrix
  * b: the vector b
  * bl: length of b
+ * x: the vector x
  *
  * Output:
  * x: pointer to the solution vector
+ *
+ * Return: 0  - success
+ *         -1 - failed
  */
-void av1_steepest_descent_sparse(const SPARSE_MTX *A, const double *b, int bl,
-                                 double *x) {
-  double *d, *Ad, *Ax;
+int av1_steepest_descent_sparse(const SPARSE_MTX *A, const double *b, int bl,
+                                double *x) {
+  double *d = NULL, *Ad = NULL, *Ax = NULL;
   double resi2, resi2_last, dAd, diff, temp;
+
   d = aom_calloc(bl, sizeof(*d));
   Ax = aom_calloc(bl, sizeof(*Ax));
   Ad = aom_calloc(bl, sizeof(*Ad));
+
+  if (!d || !Ax || !Ad) {
+    free_solver_local_buf(d, Ax, Ad, NULL, NULL, NULL, NULL);
+    return -1;
+  }
+
   int i;
   // initialize with 0s
   resi2 = 0;
@@ -403,9 +466,9 @@
       break;
     }
   }
-  aom_free(d);
-  aom_free(Ax);
-  aom_free(Ad);
+  free_solver_local_buf(d, Ax, Ad, NULL, NULL, NULL, NULL);
+
+  return 0;
 }
 
-#endif  // CONFIG_OPFL
+#endif  // CONFIG_OPTICAL_FLOW_API
diff --git a/av1/encoder/sparse_linear_solver.h b/av1/encoder/sparse_linear_solver.h
index 3cacb51..f30fc0f 100644
--- a/av1/encoder/sparse_linear_solver.h
+++ b/av1/encoder/sparse_linear_solver.h
@@ -9,8 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
-#ifndef AV1_COMMON_SPARSE_LINEAR_SOLVER_H_
-#define AV1_COMMON_SPARSE_LINEAR_SOLVER_H_
+#ifndef AOM_AV1_ENCODER_SPARSE_LINEAR_SOLVER_H_
+#define AOM_AV1_ENCODER_SPARSE_LINEAR_SOLVER_H_
 
 #ifdef __cplusplus
 extern "C" {
@@ -33,14 +33,14 @@
   double *value;
 } SPARSE_MTX;
 
-void av1_init_sparse_mtx(const int *rows, const int *cols, const double *values,
-                         int num_elem, int num_rows, int num_cols,
-                         SPARSE_MTX *sm);
-void av1_init_combine_sparse_mtx(const SPARSE_MTX *sm1, const SPARSE_MTX *sm2,
-                                 SPARSE_MTX *sm, int row_offset1,
-                                 int col_offset1, int row_offset2,
-                                 int col_offset2, int new_n_rows,
-                                 int new_n_cols);
+int av1_init_sparse_mtx(const int *rows, const int *cols, const double *values,
+                        int num_elem, int num_rows, int num_cols,
+                        SPARSE_MTX *sm);
+int av1_init_combine_sparse_mtx(const SPARSE_MTX *sm1, const SPARSE_MTX *sm2,
+                                SPARSE_MTX *sm, int row_offset1,
+                                int col_offset1, int row_offset2,
+                                int col_offset2, int new_n_rows,
+                                int new_n_cols);
 void av1_free_sparse_mtx_elems(SPARSE_MTX *sm);
 
 void av1_mtx_vect_multi_right(const SPARSE_MTX *sm, const double *srcv,
@@ -50,13 +50,13 @@
 double av1_vect_vect_multi(const double *src1, int src1l, const double *src2);
 void av1_constant_multiply_sparse_matrix(SPARSE_MTX *sm, double c);
 
-void av1_conjugate_gradient_sparse(const SPARSE_MTX *A, const double *b, int bl,
-                                   double *x);
-void av1_bi_conjugate_gradient_sparse(const SPARSE_MTX *A, const double *b,
-                                      int bl, double *x);
-void av1_jacobi_sparse(const SPARSE_MTX *A, const double *b, int bl, double *x);
-void av1_steepest_descent_sparse(const SPARSE_MTX *A, const double *b, int bl,
-                                 double *x);
+int av1_conjugate_gradient_sparse(const SPARSE_MTX *A, const double *b, int bl,
+                                  double *x);
+int av1_bi_conjugate_gradient_sparse(const SPARSE_MTX *A, const double *b,
+                                     int bl, double *x);
+int av1_jacobi_sparse(const SPARSE_MTX *A, const double *b, int bl, double *x);
+int av1_steepest_descent_sparse(const SPARSE_MTX *A, const double *b, int bl,
+                                double *x);
 
 #endif  // CONFIG_OPTICAL_FLOW_API
 
@@ -64,4 +64,4 @@
 }  // extern "C"
 #endif
 
-#endif /* AV1_COMMON_SPARSE_LINEAR_SOLVER_H_ */
+#endif /* AOM_AV1_ENCODER_SPARSE_LINEAR_SOLVER_H_ */
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index 24e5136..9d5b4de 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -92,10 +92,11 @@
 // (Eg : IntraBc) Index 1: Mode evaluation. Index 2: Winner mode evaluation.
 // Index 1 and 2 are applicable when enable_winner_mode_for_tx_size_srch speed
 // feature is ON
-static TX_SIZE_SEARCH_METHOD tx_size_search_methods[3][MODE_EVAL_TYPES] = {
+static TX_SIZE_SEARCH_METHOD tx_size_search_methods[4][MODE_EVAL_TYPES] = {
   { USE_FULL_RD, USE_LARGESTALL, USE_FULL_RD },
   { USE_FAST_RD, USE_LARGESTALL, USE_FULL_RD },
-  { USE_LARGESTALL, USE_LARGESTALL, USE_FULL_RD }
+  { USE_LARGESTALL, USE_LARGESTALL, USE_FULL_RD },
+  { USE_LARGESTALL, USE_LARGESTALL, USE_LARGESTALL }
 };
 
 // Predict transform skip levels to be used for default, mode and winner mode
@@ -274,6 +275,23 @@
 
     sf->part_sf.use_square_partition_only_threshold = BLOCK_16X16;
   }
+
+  if (speed >= 7) {
+    // TODO(kyslov): add more speed features to control speed/quality
+  }
+
+  if (speed >= 8) {
+    if (!is_480p_or_larger) {
+      sf->rt_sf.nonrd_check_partition_merge_mode = 2;
+    }
+    if (is_720p_or_larger) {
+      sf->rt_sf.force_large_partition_blocks_intra = 1;
+    }
+  }
+
+  if (speed >= 9) {
+    // TODO(kyslov): add more speed features to control speed/quality
+  }
 }
 
 static void set_allintra_speed_features_framesize_independent(
@@ -289,9 +307,13 @@
   sf->part_sf.prune_part4_search = 2;
   sf->part_sf.simple_motion_search_prune_rect = 1;
   sf->part_sf.ml_predict_breakout_level = use_hbd ? 1 : 3;
+  sf->part_sf.reuse_prev_rd_results_for_part_ab = 1;
+  sf->part_sf.use_best_rd_for_pruning = 1;
 
   sf->intra_sf.intra_pruning_with_hog = 1;
+  sf->intra_sf.prune_luma_palette_size_search_level = 1;
   sf->intra_sf.dv_cost_upd_level = INTERNAL_COST_UPD_OFF;
+  sf->intra_sf.early_term_chroma_palette_size_search = 1;
 
   sf->tx_sf.adaptive_txb_search_level = 1;
   sf->tx_sf.intra_tx_size_search_init_depth_sqr = 1;
@@ -301,7 +323,7 @@
   sf->rt_sf.use_nonrd_pick_mode = 0;
   sf->rt_sf.use_real_time_ref_set = 0;
 
-  if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION ||
+  if (cpi->twopass_frame.fr_content_type == FC_GRAPHICS_ANIMATION ||
       cpi->use_screen_content_tools) {
     sf->mv_sf.exhaustive_searches_thresh = (1 << 20);
   } else {
@@ -312,7 +334,8 @@
   sf->hl_sf.superres_auto_search_type = SUPERRES_AUTO_DUAL;
 
   if (speed >= 1) {
-    sf->part_sf.intra_cnn_split = 1;
+    sf->part_sf.intra_cnn_based_part_prune_level =
+        allow_screen_content_tools ? 0 : 2;
     sf->part_sf.simple_motion_search_early_term_none = 1;
     // TODO(Venkat): Clean-up frame type dependency for
     // simple_motion_search_split in partition search function and set the
@@ -324,6 +347,8 @@
     sf->mv_sf.exhaustive_searches_thresh <<= 1;
 
     sf->intra_sf.prune_palette_search_level = 1;
+    sf->intra_sf.prune_luma_palette_size_search_level = 2;
+    sf->intra_sf.top_intra_model_count_allowed = 3;
 
     sf->tx_sf.adaptive_txb_search_level = 2;
     sf->tx_sf.inter_tx_size_search_init_depth_rect = 1;
@@ -344,12 +369,11 @@
   }
 
   if (speed >= 2) {
-    sf->part_sf.allow_partition_search_skip = 1;
-
     sf->mv_sf.auto_mv_step_size = 1;
 
     sf->intra_sf.disable_smooth_intra = 1;
     sf->intra_sf.intra_pruning_with_hog = 2;
+    sf->intra_sf.prune_filter_intra_level = 1;
 
     sf->rd_sf.perform_coeff_opt = 3;
 
@@ -396,12 +420,6 @@
     sf->part_sf.early_term_after_none_split = 1;
     sf->part_sf.ml_predict_breakout_level = 3;
 
-    sf->intra_sf.intra_uv_mode_mask[TX_16X16] = UV_INTRA_DC_H_V_CFL;
-    sf->intra_sf.intra_uv_mode_mask[TX_32X32] = UV_INTRA_DC_H_V_CFL;
-    sf->intra_sf.intra_uv_mode_mask[TX_64X64] = UV_INTRA_DC_H_V_CFL;
-    sf->intra_sf.intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
-    sf->intra_sf.intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
-    sf->intra_sf.intra_y_mode_mask[TX_64X64] = INTRA_DC_H_V;
     sf->intra_sf.prune_chroma_modes_using_luma_winner = 1;
 
     sf->mv_sf.simple_motion_subpel_force_stop = HALF_PEL;
@@ -410,7 +428,7 @@
     sf->tpl_sf.subpel_force_stop = HALF_PEL;
     sf->tpl_sf.search_method = FAST_BIGDIA;
 
-    sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 1;
+    sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 2;
     sf->tx_sf.tx_type_search.fast_intra_tx_type_search = 1;
     sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_3;
     sf->tx_sf.tx_type_search.prune_tx_type_est_rd = 1;
@@ -433,6 +451,8 @@
     sf->part_sf.simple_motion_search_prune_agg = 3;
     sf->part_sf.ext_partition_eval_thresh =
         allow_screen_content_tools ? BLOCK_8X8 : BLOCK_16X16;
+    sf->part_sf.intra_cnn_based_part_prune_level =
+        allow_screen_content_tools ? 1 : 2;
 
     sf->intra_sf.chroma_intra_pruning_with_hog = 3;
 
@@ -445,13 +465,13 @@
   }
 
   if (speed >= 6) {
-    sf->intra_sf.disable_filter_intra = 1;
+    sf->intra_sf.prune_filter_intra_level = 2;
     sf->intra_sf.chroma_intra_pruning_with_hog = 4;
     sf->intra_sf.intra_pruning_with_hog = 4;
     sf->intra_sf.cfl_search_range = 1;
 
     sf->part_sf.prune_rectangular_split_based_on_qidx =
-        allow_screen_content_tools ? 0 : 1;
+        allow_screen_content_tools ? 0 : 2;
     sf->part_sf.prune_sub_8x8_partition_level =
         allow_screen_content_tools ? 0 : 1;
     sf->part_sf.prune_part4_search = 3;
@@ -461,33 +481,58 @@
 
     sf->mv_sf.use_bsize_dependent_search_method = 1;
 
-    sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 2;
+    sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 3;
     sf->tx_sf.tx_type_search.prune_tx_type_est_rd = 0;
-    // Use largest txfm block size for square coding blocks.
-    sf->tx_sf.intra_tx_size_search_init_depth_sqr = 2;
-    sf->tx_sf.tx_type_search.use_reduced_intra_txset = 2;
 
     sf->rd_sf.perform_coeff_opt = 6;
     sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL4;
+    sf->lpf_sf.lpf_pick = LPF_PICK_FROM_Q;
 
     sf->winner_mode_sf.multi_winner_mode_type = MULTI_WINNER_MODE_OFF;
   }
+  // The following should make all-intra mode speed 7 approximately equal
+  // to real-time speed 6,
+  // all-intra speed 8 close to real-time speed 7, and all-intra speed 9
+  // close to real-time speed 8
+  if (speed >= 7) {
+    sf->part_sf.default_min_partition_size = BLOCK_8X8;
+    sf->part_sf.partition_search_type = VAR_BASED_PARTITION;
+    sf->lpf_sf.cdef_pick_method = CDEF_PICK_FROM_Q;
+    sf->rt_sf.mode_search_skip_flags |= FLAG_SKIP_INTRA_DIRMISMATCH;
+  }
 
-  // Intra txb hash is currently not compatible with multi-winner mode as the
-  // hashes got reset during multi-winner mode processing.
-  assert(IMPLIES(
-      sf->winner_mode_sf.multi_winner_mode_type != MULTI_WINNER_MODE_OFF,
-      !sf->tx_sf.use_intra_txb_hash));
+  if (speed >= 8) {
+    sf->rt_sf.hybrid_intra_pickmode = 1;
+    sf->rt_sf.use_nonrd_pick_mode = 1;
+    sf->rt_sf.nonrd_check_partition_merge_mode = 1;
+    sf->rt_sf.nonrd_check_partition_split = 0;
+    // Set mask for intra modes.
+    for (int i = 0; i < BLOCK_SIZES; ++i)
+      if (i >= BLOCK_32X32)
+        sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC;
+      else
+        // Use DC, H, V intra mode for block sizes < 32X32.
+        sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC_H_V;
+  }
+
+  if (speed >= 9) {
+    sf->rt_sf.nonrd_check_partition_merge_mode = 0;
+    sf->rt_sf.hybrid_intra_pickmode = 0;
+  }
 }
 
 static void set_good_speed_feature_framesize_dependent(
     const AV1_COMP *const cpi, SPEED_FEATURES *const sf, int speed) {
   const AV1_COMMON *const cm = &cpi->common;
+  const int is_480p_or_lesser = AOMMIN(cm->width, cm->height) <= 480;
   const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480;
   const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
   const int is_1080p_or_larger = AOMMIN(cm->width, cm->height) >= 1080;
   const int is_4k_or_larger = AOMMIN(cm->width, cm->height) >= 2160;
   const bool use_hbd = cpi->oxcf.use_highbitdepth;
+  const int boosted = frame_is_boosted(cpi);
+  const int is_lf_frame =
+      cpi->ppi->gf_group.frame_type[cpi->gf_frame_index] == LF_UPDATE;
 
   if (is_480p_or_larger) {
     sf->part_sf.use_square_partition_only_threshold = BLOCK_128X128;
@@ -521,7 +566,16 @@
     sf->mv_sf.use_downsampled_sad = 1;
   }
 
+  if (!is_720p_or_larger) {
+    const RateControlCfg *const rc_cfg = &cpi->oxcf.rc_cfg;
+    const int rate_tolerance =
+        AOMMIN(rc_cfg->under_shoot_pct, rc_cfg->over_shoot_pct);
+    sf->hl_sf.recode_tolerance = 25 + (rate_tolerance >> 2);
+  }
+
   if (speed >= 1) {
+    if (is_480p_or_lesser) sf->inter_sf.skip_newmv_in_drl = 1;
+
     if (is_720p_or_larger) {
       sf->part_sf.use_square_partition_only_threshold = BLOCK_128X128;
     } else if (is_480p_or_larger) {
@@ -564,29 +618,75 @@
     }
 
     if (is_480p_or_larger) {
+      sf->inter_sf.disable_interintra_wedge_var_thresh = 100;
+    } else {
+      sf->inter_sf.disable_interintra_wedge_var_thresh = UINT_MAX;
+    }
+
+    if (is_480p_or_lesser) sf->inter_sf.skip_ext_comp_nearmv_mode = 1;
+
+    if (is_720p_or_larger) {
+      sf->inter_sf.limit_inter_mode_cands = is_lf_frame ? 1 : 0;
+    } else {
+      sf->inter_sf.limit_inter_mode_cands = is_lf_frame ? 2 : 0;
+    }
+
+    if (is_480p_or_larger) {
       sf->tx_sf.tx_type_search.prune_tx_type_using_stats = 1;
       if (use_hbd) sf->tx_sf.prune_tx_size_level = 2;
     } else {
       if (use_hbd) sf->tx_sf.prune_tx_size_level = 3;
     }
 
-    if (!is_720p_or_larger) sf->mv_sf.disable_second_mv = 1;
+    if (!is_720p_or_larger) {
+      sf->mv_sf.disable_second_mv = 1;
+    } else {
+      sf->mv_sf.disable_second_mv = boosted ? 0 : 2;
+    }
 
     if (!is_720p_or_larger) sf->hl_sf.recode_tolerance = 50;
   }
 
   if (speed >= 3) {
+    sf->inter_sf.skip_newmv_in_drl = 2;
+    sf->inter_sf.skip_ext_comp_nearmv_mode = 1;
+    sf->inter_sf.limit_inter_mode_cands = is_lf_frame ? 3 : 0;
+
     sf->part_sf.ml_early_term_after_part_split_level = 0;
 
     if (is_720p_or_larger) {
       sf->part_sf.partition_search_breakout_dist_thr = (1 << 25);
       sf->part_sf.partition_search_breakout_rate_thr = 200;
+      sf->part_sf.skip_non_sq_part_based_on_none = is_lf_frame ? 2 : 0;
     } else {
       sf->part_sf.max_intra_bsize = BLOCK_32X32;
       sf->part_sf.partition_search_breakout_dist_thr = (1 << 23);
       sf->part_sf.partition_search_breakout_rate_thr = 120;
+      sf->part_sf.skip_non_sq_part_based_on_none = is_lf_frame ? 1 : 0;
     }
     if (use_hbd) sf->tx_sf.prune_tx_size_level = 3;
+
+    if (is_480p_or_larger) {
+      sf->intra_sf.top_intra_model_count_allowed = 2;
+      sf->part_sf.early_term_after_none_split = 1;
+    } else {
+      sf->part_sf.early_term_after_none_split = 0;
+    }
+    if (is_720p_or_larger) {
+      sf->intra_sf.skip_intra_in_interframe = boosted ? 1 : 2;
+    } else {
+      sf->intra_sf.skip_intra_in_interframe = boosted ? 1 : 3;
+    }
+
+    if (is_720p_or_larger) {
+      sf->inter_sf.disable_interinter_wedge_var_thresh = 100;
+      sf->inter_sf.limit_txfm_eval_per_mode = boosted ? 0 : 1;
+    } else {
+      sf->inter_sf.disable_interinter_wedge_var_thresh = UINT_MAX;
+      sf->inter_sf.limit_txfm_eval_per_mode = boosted ? 0 : 2;
+    }
+
+    sf->inter_sf.disable_interintra_wedge_var_thresh = UINT_MAX;
   }
 
   if (speed >= 4) {
@@ -595,17 +695,24 @@
     } else {
       sf->part_sf.partition_search_breakout_dist_thr = (1 << 24);
     }
+    sf->part_sf.early_term_after_none_split = 1;
 
     if (is_480p_or_larger) {
       sf->tx_sf.tx_type_search.prune_tx_type_using_stats = 2;
     }
 
+    sf->inter_sf.disable_interinter_wedge_var_thresh = UINT_MAX;
     sf->inter_sf.prune_obmc_prob_thresh = INT_MAX;
+    sf->inter_sf.limit_txfm_eval_per_mode = boosted ? 0 : 2;
+    if (is_480p_or_lesser) sf->inter_sf.skip_newmv_in_drl = 3;
 
     if (is_720p_or_larger)
       sf->hl_sf.recode_tolerance = 32;
     else
       sf->hl_sf.recode_tolerance = 55;
+
+    sf->intra_sf.top_intra_model_count_allowed = 2;
+    sf->intra_sf.skip_intra_in_interframe = 4;
   }
 
   if (speed >= 5) {
@@ -615,9 +722,27 @@
       sf->inter_sf.prune_warped_prob_thresh = 8;
     }
     if (is_720p_or_larger) sf->hl_sf.recode_tolerance = 40;
+
+    sf->inter_sf.skip_newmv_in_drl = 4;
+
+    if (!is_720p_or_larger) {
+      sf->inter_sf.mv_cost_upd_level = INTERNAL_COST_UPD_SBROW_SET;
+    }
+
+    if (!is_480p_or_larger) {
+      sf->tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh =
+          boosted ? INT_MAX : 250;
+    }
+
+    if (is_480p_or_lesser) {
+      sf->inter_sf.prune_nearmv_using_neighbors = PRUNE_NEARMV_LEVEL1;
+    } else {
+      sf->inter_sf.prune_nearmv_using_neighbors = PRUNE_NEARMV_LEVEL2;
+    }
   }
 
   if (speed >= 6) {
+    sf->inter_sf.prune_nearmv_using_neighbors = PRUNE_NEARMV_LEVEL3;
     if (is_720p_or_larger) {
       sf->part_sf.auto_max_partition_based_on_simple_motion = NOT_IN_USE;
     } else if (is_480p_or_larger) {
@@ -633,7 +758,8 @@
     }
 
     if (!is_720p_or_larger) {
-      sf->inter_sf.mv_cost_upd_level = INTERNAL_COST_UPD_SBROW_SET;
+      sf->inter_sf.coeff_cost_upd_level = INTERNAL_COST_UPD_SBROW;
+      sf->inter_sf.mode_cost_upd_level = INTERNAL_COST_UPD_SBROW;
     }
 
     if (is_720p_or_larger) {
@@ -647,16 +773,22 @@
     } else {
       sf->inter_sf.prune_ref_mv_idx_search = 1;
     }
+
+    if (!is_720p_or_larger) {
+      sf->tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh = 150;
+    }
   }
 }
 
 static void set_good_speed_features_framesize_independent(
     const AV1_COMP *const cpi, SPEED_FEATURES *const sf, int speed) {
   const AV1_COMMON *const cm = &cpi->common;
-  const GF_GROUP *const gf_group = &cpi->gf_group;
+  const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
   const int boosted = frame_is_boosted(cpi);
   const int is_boosted_arf2_bwd_type =
       boosted || gf_group->update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE;
+  const int is_lf_frame =
+      gf_group->frame_type[cpi->gf_frame_index] == LF_UPDATE;
   const int allow_screen_content_tools =
       cm->features.allow_screen_content_tools;
   const int use_hbd = cpi->oxcf.use_highbitdepth;
@@ -673,6 +805,10 @@
   sf->part_sf.prune_part4_search = 2;
   sf->part_sf.simple_motion_search_prune_rect = 1;
   sf->part_sf.ml_predict_breakout_level = use_hbd ? 1 : 3;
+  sf->part_sf.reuse_prev_rd_results_for_part_ab = 1;
+  sf->part_sf.use_best_rd_for_pruning = 1;
+  sf->part_sf.simple_motion_search_prune_agg =
+      allow_screen_content_tools ? -1 : 0;
 
   // TODO(debargha): Test, tweak and turn on either 1 or 2
   sf->inter_sf.inter_mode_rd_model_estimation = 1;
@@ -701,7 +837,7 @@
   sf->rt_sf.use_nonrd_pick_mode = 0;
   sf->rt_sf.use_real_time_ref_set = 0;
 
-  if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION ||
+  if (cpi->twopass_frame.fr_content_type == FC_GRAPHICS_ANIMATION ||
       cpi->use_screen_content_tools) {
     sf->mv_sf.exhaustive_searches_thresh = (1 << 20);
   } else {
@@ -715,7 +851,8 @@
     sf->gm_sf.gm_search_type = GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2;
     sf->gm_sf.prune_ref_frame_for_gm_search = boosted ? 0 : 1;
 
-    sf->part_sf.intra_cnn_split = 1;
+    sf->part_sf.intra_cnn_based_part_prune_level =
+        allow_screen_content_tools ? 0 : 2;
     sf->part_sf.simple_motion_search_early_term_none = 1;
     // TODO(Venkat): Clean-up frame type dependency for
     // simple_motion_search_split in partition search function and set the
@@ -738,7 +875,6 @@
     sf->inter_sf.reduce_inter_modes = boosted ? 1 : 3;
     sf->inter_sf.reuse_inter_intra_mode = 1;
     sf->inter_sf.selective_ref_frame = 2;
-    sf->inter_sf.skip_repeated_newmv = 1;
 
     sf->interp_sf.use_interp_filter = 1;
 
@@ -768,7 +904,8 @@
   if (speed >= 2) {
     sf->hl_sf.recode_loop = ALLOW_RECODE_KFARFGF;
 
-    sf->part_sf.allow_partition_search_skip = 1;
+    sf->fp_sf.skip_motion_search_threshold = 25;
+
     sf->part_sf.reuse_best_prediction_for_part_ab =
         !frame_is_intra_only(&cpi->common);
 
@@ -781,12 +918,10 @@
     // clips (e.g. 5% loss on dinner_1080p). We need to examine the sequence a
     // bit more closely to figure out why.
     sf->inter_sf.adaptive_rd_thresh = 1;
-    sf->inter_sf.comp_inter_joint_search_thresh = BLOCK_SIZES_ALL;
-    sf->inter_sf.disable_interintra_wedge_var_thresh = 100;
     sf->inter_sf.disable_interinter_wedge_var_thresh = 100;
     sf->inter_sf.fast_interintra_wedge_search = 1;
     sf->inter_sf.prune_comp_search_by_single_result = boosted ? 4 : 1;
-    sf->inter_sf.prune_compound_using_neighbors = 1;
+    sf->inter_sf.prune_ext_comp_using_neighbors = 1;
     sf->inter_sf.prune_comp_using_best_single_mode_ref = 2;
     sf->inter_sf.prune_comp_type_by_comp_avg = 2;
     sf->inter_sf.selective_ref_frame = 3;
@@ -795,26 +930,31 @@
     sf->inter_sf.enable_fast_compound_mode_search = 1;
     sf->inter_sf.reuse_mask_search_results = 1;
     sf->inter_sf.txfm_rd_gate_level = boosted ? 0 : 1;
+    sf->inter_sf.disable_interinter_wedge_newmv_search =
+        is_boosted_arf2_bwd_type ? 0 : 1;
+    sf->inter_sf.inter_mode_txfm_breakout = boosted ? 0 : 1;
 
-    // TODO(Sachin): Enable/Enhance this speed feature for speed 2 & 3
     sf->interp_sf.adaptive_interp_filter_search = 1;
     sf->interp_sf.disable_dual_filter = 1;
 
     sf->intra_sf.disable_smooth_intra =
         !frame_is_intra_only(&cpi->common) || (cpi->rc.frames_to_key > 1);
     sf->intra_sf.intra_pruning_with_hog = 2;
+    sf->intra_sf.skip_intra_in_interframe = is_lf_frame ? 2 : 1;
+    sf->intra_sf.skip_filter_intra_in_inter_frames = 1;
 
     sf->tpl_sf.prune_starting_mv = 1;
     sf->tpl_sf.search_method = DIAMOND;
 
     sf->rd_sf.perform_coeff_opt = is_boosted_arf2_bwd_type ? 3 : 4;
+    sf->rd_sf.use_mb_rd_hash = 1;
 
     sf->lpf_sf.prune_wiener_based_on_src_var = 1;
     sf->lpf_sf.prune_sgr_based_on_wiener = 1;
+    sf->lpf_sf.disable_loop_restoration_chroma = boosted ? 0 : 1;
+    sf->lpf_sf.reduce_wiener_window_size = boosted ? 0 : 1;
 
-    // TODO(any): Move this from speed 3 to speed 2 so that TPL multithread
-    // is re-enabled at speed 2. This also makes encoder faster. After TPL MT is
-    // fixed and works with compound pred, we can re-evaluate this feature.
+    // TODO(any): Re-evaluate this feature set to 1 in speed 2.
     sf->tpl_sf.allow_compound_pred = 0;
     sf->tpl_sf.prune_ref_frames_in_tpl = 1;
   }
@@ -825,7 +965,8 @@
     sf->gm_sf.gm_search_type = GM_DISABLE_SEARCH;
 
     sf->part_sf.less_rectangular_check_level = 2;
-    sf->part_sf.simple_motion_search_prune_agg = 1;
+    sf->part_sf.simple_motion_search_prune_agg =
+        allow_screen_content_tools ? 0 : 1;
     sf->part_sf.prune_ext_part_using_split_info = 1;
     sf->part_sf.simple_motion_search_rect_split = 1;
 
@@ -833,12 +974,11 @@
     sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED;
     sf->mv_sf.search_method = DIAMOND;
     sf->mv_sf.disable_second_mv = 2;
+    sf->mv_sf.reduce_search_range = 1;
 
     sf->inter_sf.disable_interinter_wedge_newmv_search = boosted ? 0 : 1;
     sf->inter_sf.mv_cost_upd_level = INTERNAL_COST_UPD_SBROW;
     sf->inter_sf.disable_onesided_comp = 1;
-    // TODO(yunqing): evaluate this speed feature for speed 1 & 2, and combine
-    // it with cpi->sf.disable_wedge_search_var_thresh.
     sf->inter_sf.disable_interintra_wedge_var_thresh = UINT_MAX;
     // TODO(any): Experiment with the early exit mechanism for speeds 0, 1 and 2
     // and clean-up the speed feature
@@ -847,10 +987,11 @@
     sf->inter_sf.prune_comp_search_by_single_result = boosted ? 4 : 2;
     sf->inter_sf.selective_ref_frame = 5;
     sf->inter_sf.skip_repeated_ref_mv = 1;
-    sf->inter_sf.skip_repeated_full_newmv = 1;
     sf->inter_sf.reuse_compound_type_decision = 1;
     sf->inter_sf.txfm_rd_gate_level =
         boosted ? 0 : (is_boosted_arf2_bwd_type ? 1 : 2);
+    sf->inter_sf.enable_fast_wedge_mask_search = 1;
+    sf->inter_sf.inter_mode_txfm_breakout = boosted ? 0 : 2;
 
     sf->interp_sf.adaptive_interp_filter_search = 2;
 
@@ -861,19 +1002,21 @@
     sf->intra_sf.intra_pruning_with_hog = 3;
     sf->intra_sf.prune_palette_search_level = 2;
 
+    sf->tpl_sf.prune_starting_mv = 2;
     sf->tpl_sf.skip_alike_starting_mv = 2;
     sf->tpl_sf.prune_intra_modes = 1;
     sf->tpl_sf.reduce_first_step_size = 6;
     sf->tpl_sf.subpel_force_stop = QUARTER_PEL;
+    sf->tpl_sf.gop_length_decision_method = 1;
 
     sf->tx_sf.adaptive_txb_search_level = boosted ? 2 : 3;
     sf->tx_sf.tx_type_search.use_skip_flag_prediction = 2;
-    sf->tx_sf.use_intra_txb_hash = 1;
+    sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_3;
+    sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 1;
 
     // TODO(any): Refactor the code related to following winner mode speed
     // features
     sf->winner_mode_sf.enable_winner_mode_for_coeff_opt = 1;
-    // TODO(any): Experiment with this speed feature by enabling for key frames
     sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch =
         frame_is_intra_only(&cpi->common) ? 0 : 1;
     sf->winner_mode_sf.enable_winner_mode_for_use_tx_domain_dist = 1;
@@ -883,30 +1026,34 @@
                       ? 1
                       : 2;
 
-    // TODO(any): evaluate if these lpf features can be moved to speed 2.
     // For screen content, "prune_sgr_based_on_wiener = 2" cause large quality
     // loss.
     sf->lpf_sf.prune_sgr_based_on_wiener = allow_screen_content_tools ? 1 : 2;
-    sf->lpf_sf.disable_loop_restoration_chroma = boosted ? 0 : 1;
-    sf->lpf_sf.reduce_wiener_window_size = !boosted;
     sf->lpf_sf.prune_wiener_based_on_src_var = 2;
   }
 
   if (speed >= 4) {
+    sf->gm_sf.prune_zero_mv_with_sse = 1;
+
     sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
 
-    sf->part_sf.simple_motion_search_prune_agg = 2;
+    sf->part_sf.simple_motion_search_prune_agg =
+        allow_screen_content_tools ? 0 : 2;
     sf->part_sf.simple_motion_search_reduce_search_steps = 4;
     sf->part_sf.prune_ext_part_using_split_info = 2;
-    sf->part_sf.early_term_after_none_split = 1;
     sf->part_sf.ml_predict_breakout_level = 3;
+    sf->part_sf.prune_rectangular_split_based_on_qidx =
+        (allow_screen_content_tools || frame_is_intra_only(&cpi->common)) ? 0
+                                                                          : 1;
 
     sf->inter_sf.alt_ref_search_fp = 1;
     sf->inter_sf.txfm_rd_gate_level = boosted ? 0 : 3;
 
     sf->inter_sf.prune_inter_modes_based_on_tpl = boosted ? 0 : 2;
-    sf->inter_sf.prune_compound_using_neighbors = 2;
+    sf->inter_sf.prune_ext_comp_using_neighbors = 2;
     sf->inter_sf.prune_obmc_prob_thresh = INT_MAX;
+    sf->inter_sf.disable_interinter_wedge_var_thresh = UINT_MAX;
+    sf->inter_sf.prune_nearest_near_mv_using_refmv_weight = boosted ? 0 : 1;
 
     sf->interp_sf.cb_pred_filter_search = 1;
     sf->interp_sf.skip_sharp_interp_filter_search = 1;
@@ -915,26 +1062,20 @@
     sf->intra_sf.intra_uv_mode_mask[TX_16X16] = UV_INTRA_DC_H_V_CFL;
     sf->intra_sf.intra_uv_mode_mask[TX_32X32] = UV_INTRA_DC_H_V_CFL;
     sf->intra_sf.intra_uv_mode_mask[TX_64X64] = UV_INTRA_DC_H_V_CFL;
-    sf->intra_sf.intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
-    sf->intra_sf.intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
-    sf->intra_sf.intra_y_mode_mask[TX_64X64] = INTRA_DC_H_V;
-    // TODO(any): Experiment with this speed feature set to 2 for higher quality
-    // presets as well
-    sf->intra_sf.skip_intra_in_interframe = 2;
+    // TODO(any): "intra_y_mode_mask" doesn't help much at speed 4.
+    // sf->intra_sf.intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
+    // sf->intra_sf.intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
+    // sf->intra_sf.intra_y_mode_mask[TX_64X64] = INTRA_DC_H_V;
+    sf->intra_sf.skip_intra_in_interframe = 4;
 
     sf->mv_sf.simple_motion_subpel_force_stop = HALF_PEL;
+    sf->mv_sf.prune_mesh_search = 1;
 
-    sf->tpl_sf.prune_starting_mv = 2;
     sf->tpl_sf.subpel_force_stop = HALF_PEL;
     sf->tpl_sf.search_method = FAST_BIGDIA;
 
-    sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 1;
+    sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 2;
     sf->tx_sf.tx_type_search.fast_intra_tx_type_search = 1;
-    sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_3;
-    sf->tx_sf.tx_type_search.prune_tx_type_est_rd = 1;
-    // TODO(any): Experiment with enabling of this speed feature as hash state
-    // is reset during winner mode processing
-    sf->tx_sf.use_intra_txb_hash = 0;
 
     sf->rd_sf.perform_coeff_opt = is_boosted_arf2_bwd_type ? 5 : 7;
     sf->rd_sf.tx_domain_dist_thres_level = 2;
@@ -947,16 +1088,19 @@
 
     sf->lpf_sf.lpf_pick = LPF_PICK_FROM_FULL_IMAGE_NON_DUAL;
     sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL3;
-
-    sf->mv_sf.reduce_search_range = 1;
   }
 
   if (speed >= 5) {
-    sf->part_sf.simple_motion_search_prune_agg = 3;
+    sf->fp_sf.reduce_mv_step_param = 4;
+
+    sf->part_sf.simple_motion_search_prune_agg =
+        allow_screen_content_tools ? 0 : 3;
     sf->part_sf.ext_partition_eval_thresh =
         allow_screen_content_tools ? BLOCK_8X8 : BLOCK_16X16;
+    sf->part_sf.prune_sub_8x8_partition_level =
+        (allow_screen_content_tools || frame_is_intra_only(&cpi->common)) ? 0
+                                                                          : 2;
 
-    sf->inter_sf.disable_interinter_wedge_var_thresh = UINT_MAX;
     sf->inter_sf.prune_inter_modes_if_skippable = 1;
     sf->inter_sf.txfm_rd_gate_level = boosted ? 0 : 4;
     // Enable fast search for all valid compound modes.
@@ -973,13 +1117,14 @@
         frame_is_intra_only(&cpi->common) ? 0 : 1;
     sf->lpf_sf.disable_lr_filter = 1;
 
-    sf->mv_sf.prune_mesh_search = 1;
-
     sf->tpl_sf.prune_starting_mv = 3;
     sf->tpl_sf.use_y_only_rate_distortion = 1;
     sf->tpl_sf.subpel_force_stop = FULL_PEL;
+    sf->tpl_sf.gop_length_decision_method = 2;
 
     sf->winner_mode_sf.dc_blk_pred_level = 1;
+
+    sf->fp_sf.disable_recon = 1;
   }
 
   if (speed >= 6) {
@@ -988,14 +1133,19 @@
     sf->hl_sf.recode_tolerance = 55;
 
     sf->inter_sf.prune_inter_modes_based_on_tpl = boosted ? 0 : 3;
-    sf->inter_sf.prune_nearmv_using_neighbors = 1;
     sf->inter_sf.selective_ref_frame = 6;
+    sf->inter_sf.prune_ext_comp_using_neighbors = 3;
 
     sf->intra_sf.chroma_intra_pruning_with_hog = 4;
     sf->intra_sf.intra_pruning_with_hog = 4;
+    sf->intra_sf.intra_uv_mode_mask[TX_32X32] = UV_INTRA_DC;
+    sf->intra_sf.intra_uv_mode_mask[TX_64X64] = UV_INTRA_DC;
+    sf->intra_sf.intra_y_mode_mask[TX_32X32] = INTRA_DC;
+    sf->intra_sf.intra_y_mode_mask[TX_64X64] = INTRA_DC;
+    sf->intra_sf.early_term_chroma_palette_size_search = 1;
 
     sf->part_sf.prune_rectangular_split_based_on_qidx =
-        boosted || allow_screen_content_tools ? 0 : 1;
+        boosted || allow_screen_content_tools ? 0 : 2;
     sf->part_sf.prune_sub_8x8_partition_level =
         allow_screen_content_tools ? 0
                                    : frame_is_intra_only(&cpi->common) ? 1 : 2;
@@ -1004,32 +1154,26 @@
     sf->mv_sf.simple_motion_subpel_force_stop = FULL_PEL;
     sf->mv_sf.use_bsize_dependent_search_method = 1;
 
-    sf->tpl_sf.disable_gop_length_decision = 1;
+    sf->tpl_sf.gop_length_decision_method = 3;
     sf->tpl_sf.disable_filtered_key_tpl = 1;
 
-    sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 2;
-    sf->tx_sf.use_intra_txb_hash = 1;
-    sf->tx_sf.tx_type_search.prune_tx_type_est_rd = 0;
-
+    sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 4;
     sf->rd_sf.perform_coeff_opt = is_boosted_arf2_bwd_type ? 6 : 8;
 
     sf->winner_mode_sf.dc_blk_pred_level = 2;
     sf->winner_mode_sf.multi_winner_mode_type = MULTI_WINNER_MODE_OFF;
 
     sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL4;
-  }
 
-  // Intra txb hash is currently not compatible with multi-winner mode as the
-  // hashes got reset during multi-winner mode processing.
-  assert(IMPLIES(
-      sf->winner_mode_sf.multi_winner_mode_type != MULTI_WINNER_MODE_OFF,
-      !sf->tx_sf.use_intra_txb_hash));
+    sf->fp_sf.skip_zeromv_motion_search = 1;
+  }
 }
 
 static void set_rt_speed_feature_framesize_dependent(const AV1_COMP *const cpi,
                                                      SPEED_FEATURES *const sf,
                                                      int speed) {
   const AV1_COMMON *const cm = &cpi->common;
+  const int boosted = frame_is_boosted(cpi);
   const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
   const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480;
   const int is_360p_or_larger = AOMMIN(cm->width, cm->height) >= 360;
@@ -1038,6 +1182,7 @@
 
   if (!is_360p_or_larger) {
     if (speed >= 6) sf->rt_sf.force_tx_search_off = 1;
+    if (speed >= 7) sf->lpf_sf.cdef_pick_method = CDEF_PICK_FROM_Q;
     if (speed >= 8) {
       sf->rt_sf.use_modeled_non_rd_cost = 0;
       sf->rt_sf.use_nonrd_filter_search = 0;
@@ -1056,10 +1201,22 @@
 #endif
     }
   } else {
-    if (speed == 8 && !cpi->use_svc) {
+    sf->intra_sf.skip_filter_intra_in_inter_frames = 1;
+    if (speed == 5) {
+      sf->tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh =
+          boosted ? INT_MAX : 350;
+    }
+    if (speed >= 7) {
+      sf->rt_sf.use_comp_ref_nonrd = 1;
+      sf->rt_sf.ref_frame_comp_nonrd[2] = 1;  // LAST_ALTREF
+    }
+    if (speed == 8 && !cpi->ppi->use_svc) {
       sf->rt_sf.short_circuit_low_temp_var = 0;
       sf->rt_sf.use_nonrd_altref_frame = 1;
     }
+    if (speed >= 9) {
+      sf->rt_sf.skip_cdef_sb = 1;
+    }
   }
   if (!is_480p_or_larger) {
     if (speed == 7) {
@@ -1073,6 +1230,22 @@
       sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED;
       sf->rt_sf.estimate_motion_for_var_based_partition = 0;
     }
+    if (speed >= 10) {
+      sf->rt_sf.use_comp_ref_nonrd = 0;
+    }
+  }
+  if (cpi->ppi->use_svc) {
+    sf->rt_sf.use_comp_ref_nonrd = 0;
+    if (cpi->svc.ref_frame_comp[0] || cpi->svc.ref_frame_comp[1] ||
+        cpi->svc.ref_frame_comp[2]) {
+      sf->rt_sf.use_comp_ref_nonrd = 1;
+      sf->rt_sf.ref_frame_comp_nonrd[0] =
+          cpi->svc.ref_frame_comp[0] && cpi->svc.reference[GOLDEN_FRAME - 1];
+      sf->rt_sf.ref_frame_comp_nonrd[1] =
+          cpi->svc.ref_frame_comp[1] && cpi->svc.reference[LAST2_FRAME - 1];
+      sf->rt_sf.ref_frame_comp_nonrd[2] =
+          cpi->svc.ref_frame_comp[2] && cpi->svc.reference[ALTREF_FRAME - 1];
+    }
   }
 }
 
@@ -1086,163 +1259,97 @@
   AV1_COMMON *const cm = &cpi->common;
   const int boosted = frame_is_boosted(cpi);
 
-  // Speed 0 for all speed features that give neutral coding performance change.
-  sf->gm_sf.gm_search_type = GM_REDUCED_REF_SEARCH_SKIP_L2_L3;
+  // Currently, rt speed 0, 1, 2, 3, 4 are the same.
+  // TODO(any, yunqing): tune these features for real-time use cases.
+  sf->gm_sf.gm_search_type = GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2;
 
-  sf->part_sf.less_rectangular_check_level = 1;
-  sf->part_sf.ml_prune_partition = 1;
-  sf->part_sf.prune_ext_partition_types_search_level = 1;
+  sf->hl_sf.superres_auto_search_type = SUPERRES_AUTO_SOLO;
+  sf->hl_sf.recode_loop = ALLOW_RECODE_KFARFGF;
 
-  // TODO(debargha): Test, tweak and turn on either 1 or 2
   sf->inter_sf.inter_mode_rd_model_estimation = 0;
-  sf->inter_sf.disable_interintra_wedge_var_thresh = 0;
-  sf->inter_sf.disable_interinter_wedge_var_thresh = 0;
   sf->inter_sf.model_based_post_interp_filter_breakout = 1;
   sf->inter_sf.prune_compound_using_single_ref = 0;
   sf->inter_sf.prune_mode_search_simple_translation = 1;
   sf->inter_sf.prune_ref_frame_for_rect_partitions = !boosted;
   sf->inter_sf.reduce_inter_modes = 1;
-  sf->inter_sf.selective_ref_frame = 1;
-  sf->inter_sf.use_dist_wtd_comp_flag = DIST_WTD_COMP_SKIP_MV_SEARCH;
+  sf->inter_sf.reuse_inter_intra_mode = 1;
+  sf->inter_sf.disable_interinter_wedge_var_thresh = 100;
+  sf->inter_sf.fast_wedge_sign_estimate = 1;
+  sf->inter_sf.prune_comp_type_by_comp_avg = 2;
+  sf->inter_sf.use_dist_wtd_comp_flag = DIST_WTD_COMP_DISABLED;
+  sf->inter_sf.adaptive_rd_thresh = 2;
+  sf->inter_sf.mv_cost_upd_level = INTERNAL_COST_UPD_SBROW;
+  sf->inter_sf.disable_interintra_wedge_var_thresh = UINT_MAX;
+  sf->inter_sf.prune_comp_search_by_single_result = 2;
+  sf->inter_sf.selective_ref_frame = 4;
+  // TODO(any): need tuning for RT mode.
+  sf->inter_sf.alt_ref_search_fp = 1;
+  sf->inter_sf.txfm_rd_gate_level = boosted ? 0 : 4;
 
   sf->interp_sf.use_fast_interpolation_filter_search = 1;
+  sf->interp_sf.use_interp_filter = 1;
+  sf->interp_sf.adaptive_interp_filter_search = 1;
+  sf->interp_sf.cb_pred_filter_search = 0;
+  sf->interp_sf.disable_dual_filter = 1;
 
   sf->intra_sf.dv_cost_upd_level = INTERNAL_COST_UPD_OFF;
-  sf->intra_sf.intra_pruning_with_hog = 1;
+  sf->intra_sf.skip_intra_in_interframe = 4;
+
+  sf->lpf_sf.dual_sgr_penalty_level = 1;
+  sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL1;
 
   sf->mv_sf.full_pixel_search_level = 1;
   sf->mv_sf.exhaustive_searches_thresh = INT_MAX;
+  sf->mv_sf.obmc_full_pixel_search_level = 1;
+  sf->mv_sf.auto_mv_step_size = 1;
+  sf->mv_sf.subpel_iters_per_step = 1;
+  sf->mv_sf.use_accurate_subpel_search = USE_2_TAPS;
+  sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED;
+
+  sf->part_sf.ml_prune_partition = 1;
+  sf->part_sf.reuse_prev_rd_results_for_part_ab = 1;
+  sf->part_sf.use_best_rd_for_pruning = 1;
+  sf->part_sf.prune_ext_partition_types_search_level = 2;
+  sf->part_sf.less_rectangular_check_level = 2;
+  sf->part_sf.early_term_after_none_split = 1;
+  sf->part_sf.partition_search_breakout_dist_thr = (1 << 25);
+  sf->part_sf.partition_search_breakout_rate_thr = 200;
+
+  sf->rd_sf.tx_domain_dist_thres_level = 1;
+  sf->rd_sf.tx_domain_dist_level = 1;
 
   sf->rt_sf.check_intra_pred_nonrd = 1;
   sf->rt_sf.estimate_motion_for_var_based_partition = 1;
   sf->rt_sf.hybrid_intra_pickmode = 1;
-  sf->rt_sf.nonrd_prune_ref_frame_search = 0;
-  sf->rt_sf.reuse_inter_pred_nonrd = 0;
-  sf->rt_sf.use_comp_ref_nonrd = 1;
+  sf->rt_sf.use_comp_ref_nonrd = 0;
+  sf->rt_sf.ref_frame_comp_nonrd[0] = 0;
+  sf->rt_sf.ref_frame_comp_nonrd[1] = 0;
+  sf->rt_sf.ref_frame_comp_nonrd[2] = 0;
   sf->rt_sf.use_nonrd_filter_search = 1;
-  sf->rt_sf.use_nonrd_pick_mode = 0;
-  sf->rt_sf.use_real_time_ref_set = 0;
-  sf->rt_sf.check_scene_detection = 0;
-  sf->rt_sf.overshoot_detection_cbr = NO_DETECTION;
-  sf->tx_sf.adaptive_txb_search_level = 1;
+
   sf->tx_sf.intra_tx_size_search_init_depth_sqr = 1;
-  sf->tx_sf.model_based_prune_tx_search_level = 1;
   sf->tx_sf.tx_type_search.use_reduced_intra_txset = 1;
-  sf->rt_sf.fullpel_search_step_param = 0;
-  sf->rt_sf.skip_loopfilter_non_reference = 0;
+  sf->tx_sf.adaptive_txb_search_level = 2;
+  sf->tx_sf.intra_tx_size_search_init_depth_rect = 1;
+  sf->tx_sf.tx_size_search_lgr_block = 1;
+  sf->tx_sf.tx_type_search.ml_tx_split_thresh = 4000;
+  sf->tx_sf.tx_type_search.skip_tx_search = 1;
+  sf->tx_sf.inter_tx_size_search_init_depth_rect = 1;
+  sf->tx_sf.inter_tx_size_search_init_depth_sqr = 1;
+  sf->tx_sf.model_based_prune_tx_search_level = 0;
+  sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_2;
 
-  sf->hl_sf.superres_auto_search_type = SUPERRES_AUTO_SOLO;
-
-  if (speed >= 1) {
-    sf->gm_sf.gm_search_type = GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2;
-
-    sf->part_sf.prune_ext_partition_types_search_level = 2;
-    sf->part_sf.simple_motion_search_prune_rect = 1;
-
-    sf->mv_sf.obmc_full_pixel_search_level = 1;
-    sf->mv_sf.use_accurate_subpel_search = USE_4_TAPS;
-
-    sf->inter_sf.prune_comp_search_by_single_result = 1;
-    sf->inter_sf.reuse_inter_intra_mode = 1;
-    sf->inter_sf.selective_ref_frame = 2;
-    sf->inter_sf.skip_repeated_newmv = 1;
-    sf->inter_sf.disable_interintra_wedge_var_thresh = 0;
-    sf->inter_sf.disable_interinter_wedge_var_thresh = 0;
-    sf->inter_sf.prune_comp_type_by_comp_avg = 1;
-
-    sf->interp_sf.cb_pred_filter_search = 1;
-    sf->interp_sf.use_interp_filter = 1;
-
-    sf->tx_sf.adaptive_txb_search_level = 2;
-    sf->tx_sf.intra_tx_size_search_init_depth_rect = 1;
-    sf->tx_sf.tx_size_search_lgr_block = 1;
-    sf->tx_sf.tx_type_search.ml_tx_split_thresh = 4000;
-    sf->tx_sf.tx_type_search.skip_tx_search = 1;
-    sf->tx_sf.use_intra_txb_hash = 1;
-
-    sf->rd_sf.tx_domain_dist_level = boosted ? 0 : 1;
-    sf->rd_sf.tx_domain_dist_thres_level = 1;
-
-    sf->lpf_sf.dual_sgr_penalty_level = 1;
-  }
-
-  if (speed >= 2) {
-    sf->part_sf.allow_partition_search_skip = 1;
-    sf->part_sf.partition_search_breakout_rate_thr = 80;
-
-    sf->mv_sf.auto_mv_step_size = 1;
-    sf->mv_sf.subpel_iters_per_step = 1;
-
-    sf->inter_sf.adaptive_rd_thresh = 1;
-    sf->inter_sf.comp_inter_joint_search_thresh = BLOCK_SIZES_ALL;
-    sf->inter_sf.disable_interintra_wedge_var_thresh = 100;
-    sf->inter_sf.disable_interinter_wedge_var_thresh = 100;
-    sf->inter_sf.fast_wedge_sign_estimate = 1;
-    sf->inter_sf.prune_comp_type_by_comp_avg = 2;
-    sf->inter_sf.selective_ref_frame = 3;
-    sf->inter_sf.use_dist_wtd_comp_flag = DIST_WTD_COMP_DISABLED;
-
-    sf->interp_sf.adaptive_interp_filter_search = 1;
-    sf->interp_sf.cb_pred_filter_search = 0;
-    sf->interp_sf.disable_dual_filter = 1;
-
-    sf->tx_sf.inter_tx_size_search_init_depth_rect = 1;
-    sf->tx_sf.inter_tx_size_search_init_depth_sqr = 1;
-    sf->tx_sf.model_based_prune_tx_search_level = 0;
-
-    sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL1;
-  }
-
-  if (speed >= 3) {
-    sf->hl_sf.recode_loop = ALLOW_RECODE_KFARFGF;
-
-    sf->gm_sf.gm_search_type = GM_DISABLE_SEARCH;
-
-    sf->part_sf.less_rectangular_check_level = 2;
-
-    sf->mv_sf.use_accurate_subpel_search = USE_2_TAPS;
-    // adaptive_motion_search breaks encoder multi-thread tests.
-    // The values in x->pred_mv[] differ for single and multi-thread cases.
-    // See aomedia:1778.
-    // sf->mv_sf.adaptive_motion_search = 1;
-
-    sf->inter_sf.adaptive_rd_thresh = 2;
-    sf->inter_sf.mv_cost_upd_level = INTERNAL_COST_UPD_SBROW;
-    // TODO(yunqing): evaluate this speed feature for speed 1 & 2, and combine
-    // it with cpi->sf.disable_wedge_search_var_thresh.
-    sf->inter_sf.disable_interintra_wedge_var_thresh = UINT_MAX;
-    sf->inter_sf.prune_comp_search_by_single_result = 2;
-    sf->inter_sf.selective_ref_frame = 4;
-
-    sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_2;
-
-    sf->rd_sf.tx_domain_dist_level = 1;
-
-    sf->winner_mode_sf.tx_size_search_level = boosted ? 0 : 2;
-  }
-
-  if (speed >= 4) {
-    sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED;
-
-    sf->inter_sf.alt_ref_search_fp = 1;
-
-    sf->interp_sf.skip_sharp_interp_filter_search = 1;
-
-    sf->tx_sf.tx_type_search.fast_inter_tx_type_search = 1;
-    sf->tx_sf.tx_type_search.fast_intra_tx_type_search = 1;
-    sf->tx_sf.use_intra_txb_hash = 0;
-
-    sf->rd_sf.use_mb_rd_hash = 0;
-
-    sf->winner_mode_sf.tx_size_search_level = frame_is_intra_only(cm) ? 0 : 2;
-  }
+  sf->winner_mode_sf.tx_size_search_level = frame_is_intra_only(cm) ? 0 : 2;
 
   if (speed >= 5) {
     sf->inter_sf.adaptive_rd_thresh = 4;
+    sf->inter_sf.prune_inter_modes_if_skippable = 1;
+    sf->inter_sf.prune_nearmv_using_neighbors = PRUNE_NEARMV_LEVEL3;
+    sf->inter_sf.reduce_inter_modes = boosted ? 1 : 3;
+    sf->inter_sf.skip_newmv_in_drl = 4;
 
     sf->rd_sf.tx_domain_dist_level = 2;
     sf->rd_sf.tx_domain_dist_thres_level = 2;
-    sf->winner_mode_sf.tx_size_search_level = 1;
 
     sf->rt_sf.mode_search_skip_flags =
         (cm->current_frame.frame_type == KEY_FRAME)
@@ -1261,7 +1368,6 @@
 
     sf->mv_sf.search_method = FAST_DIAMOND;
     sf->mv_sf.subpel_force_stop = QUARTER_PEL;
-    sf->mv_sf.use_fullpel_costlist = 1;
     sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
 
     sf->inter_sf.inter_mode_rd_model_estimation = 2;
@@ -1270,21 +1376,30 @@
       sf->intra_sf.intra_y_mode_mask[i] = INTRA_DC;
       sf->intra_sf.intra_uv_mode_mask[i] = UV_INTRA_DC_CFL;
     }
+    sf->intra_sf.skip_intra_in_interframe = 5;
+    sf->intra_sf.disable_smooth_intra = 1;
+    sf->intra_sf.skip_filter_intra_in_inter_frames = 1;
 
     sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_3;
-    sf->tx_sf.use_inter_txb_hash = 0;
     sf->tx_sf.refine_fast_tx_search_results = 0;
+    sf->tx_sf.tx_type_search.fast_intra_tx_type_search = 1;
+    sf->tx_sf.tx_type_search.use_skip_flag_prediction = 2;
+    sf->tx_sf.tx_type_search.winner_mode_tx_type_pruning = 4;
 
     sf->rd_sf.optimize_coefficients = NO_TRELLIS_OPT;
     sf->rd_sf.simple_model_rd_from_var = 1;
 
     sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL4;
     sf->lpf_sf.lpf_pick = LPF_PICK_FROM_Q;
+    sf->lpf_sf.disable_lr_filter = 1;
+
+    sf->winner_mode_sf.dc_blk_pred_level = frame_is_intra_only(cm) ? 0 : 2;
+    sf->winner_mode_sf.enable_winner_mode_for_tx_size_srch = 1;
+    sf->winner_mode_sf.tx_size_search_level = 1;
 
     sf->rt_sf.mode_search_skip_flags |= FLAG_SKIP_INTRA_DIRMISMATCH;
     sf->rt_sf.num_inter_modes_for_tx_search = 5;
     sf->rt_sf.skip_interp_filter_search = 1;
-    sf->rt_sf.use_comp_ref_nonrd = 0;
     sf->rt_sf.use_real_time_ref_set = 1;
     sf->rt_sf.use_simple_rd_model = 1;
 
@@ -1299,11 +1414,27 @@
     if (cpi->oxcf.kf_cfg.key_freq_max != 0 &&
         cm->width * cm->height > 640 * 480)
       sf->rt_sf.use_temporal_noise_estimate = 1;
+    sf->rt_sf.skip_tx_no_split_var_based_partition = 1;
+
+    // For SVC: use better mv search on base temporal layers, and only
+    // on base spatial layer if highest resolution is above 640x360.
+    if (cpi->svc.number_temporal_layers > 1 &&
+        cpi->svc.temporal_layer_id < cpi->svc.number_temporal_layers - 1 &&
+        (cpi->svc.spatial_layer_id == 0 ||
+         cpi->oxcf.frm_dim_cfg.width * cpi->oxcf.frm_dim_cfg.height <=
+             640 * 360)) {
+      sf->mv_sf.search_method = NSTEP;
+      sf->mv_sf.subpel_search_method = SUBPEL_TREE;
+      sf->rt_sf.fullpel_search_step_param = 6;
+    }
   }
 
   if (speed >= 6) {
-    sf->part_sf.adjust_var_based_rd_partitioning = 1;
-    sf->lpf_sf.cdef_pick_method = CDEF_PICK_FROM_Q;
+    sf->mv_sf.use_fullpel_costlist = 1;
+
+    sf->tx_sf.tx_type_search.fast_inter_tx_type_prob_thresh = 0;
+    sf->inter_sf.prune_warped_prob_thresh = 8;
+    sf->inter_sf.extra_prune_warped = 1;
   }
 
   if (speed >= 7) {
@@ -1311,20 +1442,30 @@
     sf->part_sf.default_min_partition_size = BLOCK_8X8;
     sf->part_sf.partition_search_type = VAR_BASED_PARTITION;
 
+    sf->gm_sf.gm_search_type = GM_DISABLE_SEARCH;
+
     sf->mv_sf.search_method = FAST_DIAMOND;
     sf->mv_sf.subpel_force_stop = QUARTER_PEL;
     sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED;
 
     sf->inter_sf.inter_mode_rd_model_estimation = 2;
+    // This sf is not applicable in non-rd path.
+    sf->inter_sf.skip_newmv_in_drl = 0;
+
+    // Disable intra_y_mode_mask pruning since the performance at speed 7 isn't
+    // good. May need more study.
+    for (int i = 0; i < TX_SIZES; ++i) {
+      sf->intra_sf.intra_y_mode_mask[i] = INTRA_ALL;
+    }
 
     sf->lpf_sf.lpf_pick = LPF_PICK_FROM_Q;
+    sf->lpf_sf.cdef_pick_method = CDEF_FAST_SEARCH_LVL5;
 
     sf->rt_sf.mode_search_skip_flags |= FLAG_SKIP_INTRA_DIRMISMATCH;
     sf->rt_sf.nonrd_prune_ref_frame_search = 1;
     sf->rt_sf.reuse_inter_pred_nonrd = 0;
     sf->rt_sf.short_circuit_low_temp_var = 0;
     sf->rt_sf.skip_interp_filter_search = 0;
-    sf->rt_sf.use_comp_ref_nonrd = 0;
     // For spatial layers, only LAST and GOLDEN are currently used in the SVC
     // for nonrd. The flag use_nonrd_altref_frame can disable GOLDEN in the
     // get_ref_frame_flags() for some patterns, so disable it here for
@@ -1335,26 +1476,26 @@
     sf->rt_sf.nonrd_check_partition_merge_mode = 1;
     sf->rt_sf.nonrd_check_partition_split = 0;
     sf->rt_sf.skip_intra_pred_if_tx_skip = 1;
-    // For SVC: use better mv search on base temporal layer, and only
+    // For SVC: use better mv search on base temporal layers, and only
     // on base spatial layer if highest resolution is above 640x360.
-    if (cpi->svc.number_temporal_layers > 1) {
-      if (cpi->svc.temporal_layer_id == 0 &&
-          (cpi->svc.spatial_layer_id == 0 ||
-           cpi->oxcf.frm_dim_cfg.width * cpi->oxcf.frm_dim_cfg.height <=
-               640 * 360)) {
-        sf->mv_sf.search_method = NSTEP;
-        sf->mv_sf.subpel_search_method = SUBPEL_TREE;
-        sf->rt_sf.fullpel_search_step_param = 6;
-      } else if (cpi->svc.non_reference_frame) {
-        sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
-        sf->rt_sf.fullpel_search_step_param = 10;
-      }
+    if (cpi->svc.number_temporal_layers > 1 &&
+        cpi->svc.temporal_layer_id < cpi->svc.number_temporal_layers - 1 &&
+        (cpi->svc.spatial_layer_id == 0 ||
+         cpi->oxcf.frm_dim_cfg.width * cpi->oxcf.frm_dim_cfg.height <=
+             640 * 360)) {
+      sf->mv_sf.search_method = NSTEP;
+      sf->mv_sf.subpel_search_method = SUBPEL_TREE;
+      sf->rt_sf.fullpel_search_step_param = 6;
+    } else if (cpi->svc.non_reference_frame) {
+      sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
+      sf->rt_sf.fullpel_search_step_param = 10;
     }
     // TODO(marpan): Look into why enabling skip_loopfilter_non_reference is
     // not bitexact on rtc testset, its very close (< ~0.01 bdrate), but not
     // always bitexact.
-    if (cpi->use_svc && cpi->svc.non_reference_frame &&
-        sf->lpf_sf.cdef_pick_method == CDEF_PICK_FROM_Q &&
+    if (cpi->ppi->use_svc && cpi->svc.non_reference_frame &&
+        (sf->lpf_sf.cdef_pick_method == CDEF_PICK_FROM_Q ||
+         sf->lpf_sf.cdef_pick_method == CDEF_FAST_SEARCH_LVL5) &&
         sf->lpf_sf.lpf_pick == LPF_PICK_FROM_Q)
       sf->rt_sf.skip_loopfilter_non_reference = 1;
     // Set mask for intra modes.
@@ -1364,6 +1505,8 @@
       else
         // Use DC, H, V intra mode for block sizes < 32X32.
         sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC_H_V;
+
+    sf->winner_mode_sf.dc_blk_pred_level = 0;
   }
 
   if (speed >= 8) {
@@ -1372,8 +1515,13 @@
     sf->rt_sf.estimate_motion_for_var_based_partition = 1;
     sf->rt_sf.short_circuit_low_temp_var = 1;
 #if !CONFIG_AV1_TEMPORAL_DENOISING
+#if !CONFIG_REALTIME_ONLY
+    sf->rt_sf.reuse_inter_pred_nonrd =
+        (cpi->oxcf.motion_mode_cfg.enable_warped_motion == 0);
+#else
     sf->rt_sf.reuse_inter_pred_nonrd = 1;
 #endif
+#endif
     sf->rt_sf.use_nonrd_altref_frame = 0;
     sf->rt_sf.nonrd_prune_ref_frame_search = 2;
     sf->rt_sf.nonrd_check_partition_merge_mode = 0;
@@ -1384,11 +1532,19 @@
     sf->interp_sf.cb_pred_filter_search = 1;
   }
   if (speed >= 9) {
+    sf->lpf_sf.cdef_pick_method = CDEF_PICK_FROM_Q;
     sf->rt_sf.estimate_motion_for_var_based_partition = 0;
     sf->rt_sf.force_large_partition_blocks = 1;
     for (int i = 0; i < BLOCK_SIZES; ++i)
       sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC;
   }
+  if (speed >= 10) {
+    sf->rt_sf.source_metrics_sb_nonrd = 0;
+    sf->rt_sf.skip_intra_pred_if_tx_skip = 1;
+    sf->rt_sf.nonrd_agressive_skip = 1;
+    sf->rt_sf.nonrd_prune_ref_frame_search = 3;
+    sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
+  }
 }
 
 static AOM_INLINE void init_hl_sf(HIGH_LEVEL_SPEED_FEATURES *hl_sf) {
@@ -1403,8 +1559,15 @@
   hl_sf->second_alt_ref_filtering = 1;
 }
 
+static AOM_INLINE void init_fp_sf(FIRST_PASS_SPEED_FEATURES *fp_sf) {
+  fp_sf->reduce_mv_step_param = 3;
+  fp_sf->skip_motion_search_threshold = 0;
+  fp_sf->disable_recon = 0;
+  fp_sf->skip_zeromv_motion_search = 0;
+}
+
 static AOM_INLINE void init_tpl_sf(TPL_SPEED_FEATURES *tpl_sf) {
-  tpl_sf->disable_gop_length_decision = 0;
+  tpl_sf->gop_length_decision_method = 0;
   tpl_sf->prune_intra_modes = 0;
   tpl_sf->prune_starting_mv = 0;
   tpl_sf->reduce_first_step_size = 0;
@@ -1420,6 +1583,7 @@
 static AOM_INLINE void init_gm_sf(GLOBAL_MOTION_SPEED_FEATURES *gm_sf) {
   gm_sf->gm_search_type = GM_FULL_SEARCH;
   gm_sf->prune_ref_frame_for_gm_search = 0;
+  gm_sf->prune_zero_mv_with_sse = 0;
 }
 
 static AOM_INLINE void init_part_sf(PARTITION_SPEED_FEATURES *part_sf) {
@@ -1430,7 +1594,6 @@
   part_sf->default_max_partition_size = BLOCK_LARGEST;
   part_sf->default_min_partition_size = BLOCK_4X4;
   part_sf->adjust_var_based_rd_partitioning = 0;
-  part_sf->allow_partition_search_skip = 0;
   part_sf->max_intra_bsize = BLOCK_LARGEST;
   // This setting only takes effect when partition_search_type is set
   // to FIXED_PARTITION.
@@ -1451,15 +1614,19 @@
   part_sf->simple_motion_search_prune_rect = 0;
   part_sf->simple_motion_search_early_term_none = 0;
   part_sf->simple_motion_search_reduce_search_steps = 0;
-  part_sf->intra_cnn_split = 0;
+  part_sf->intra_cnn_based_part_prune_level = 0;
   part_sf->ext_partition_eval_thresh = BLOCK_8X8;
+  part_sf->rect_partition_eval_thresh = BLOCK_128X128;
   part_sf->prune_ext_part_using_split_info = 0;
   part_sf->prune_rectangular_split_based_on_qidx = 0;
   part_sf->early_term_after_none_split = 0;
   part_sf->ml_predict_breakout_level = 0;
   part_sf->prune_sub_8x8_partition_level = 0;
   part_sf->simple_motion_search_rect_split = 0;
+  part_sf->reuse_prev_rd_results_for_part_ab = 0;
   part_sf->reuse_best_prediction_for_part_ab = 0;
+  part_sf->use_best_rd_for_pruning = 0;
+  part_sf->skip_non_sq_part_based_on_none = 0;
 }
 
 static AOM_INLINE void init_mv_sf(MV_SPEED_FEATURES *mv_sf) {
@@ -1483,7 +1650,6 @@
 }
 
 static AOM_INLINE void init_inter_sf(INTER_MODE_SPEED_FEATURES *inter_sf) {
-  inter_sf->comp_inter_joint_search_thresh = BLOCK_4X4;
   inter_sf->adaptive_rd_thresh = 0;
   inter_sf->model_based_post_interp_filter_breakout = 0;
   inter_sf->reduce_inter_modes = 0;
@@ -1494,16 +1660,19 @@
   inter_sf->use_dist_wtd_comp_flag = DIST_WTD_COMP_ENABLED;
   inter_sf->reuse_inter_intra_mode = 0;
   inter_sf->mv_cost_upd_level = INTERNAL_COST_UPD_SB;
+  inter_sf->coeff_cost_upd_level = INTERNAL_COST_UPD_SB;
+  inter_sf->mode_cost_upd_level = INTERNAL_COST_UPD_SB;
   inter_sf->prune_inter_modes_based_on_tpl = 0;
-  inter_sf->prune_nearmv_using_neighbors = 0;
+  inter_sf->prune_nearmv_using_neighbors = PRUNE_NEARMV_OFF;
   inter_sf->prune_comp_search_by_single_result = 0;
   inter_sf->skip_repeated_ref_mv = 0;
-  inter_sf->skip_repeated_newmv = 0;
-  inter_sf->skip_repeated_full_newmv = 0;
+  inter_sf->skip_newmv_in_drl = 0;
   inter_sf->inter_mode_rd_model_estimation = 0;
   inter_sf->prune_compound_using_single_ref = 0;
-  inter_sf->prune_compound_using_neighbors = 0;
+  inter_sf->prune_ext_comp_using_neighbors = 0;
+  inter_sf->skip_ext_comp_nearmv_mode = 0;
   inter_sf->prune_comp_using_best_single_mode_ref = 0;
+  inter_sf->prune_nearest_near_mv_using_refmv_weight = 0;
   inter_sf->disable_onesided_comp = 0;
   inter_sf->prune_mode_search_simple_translation = 0;
   inter_sf->prune_comp_type_by_comp_avg = 0;
@@ -1522,6 +1691,10 @@
   inter_sf->disable_masked_comp = 0;
   inter_sf->enable_fast_compound_mode_search = 0;
   inter_sf->reuse_mask_search_results = 0;
+  inter_sf->enable_fast_wedge_mask_search = 0;
+  inter_sf->inter_mode_txfm_breakout = 0;
+  inter_sf->limit_inter_mode_cands = 0;
+  inter_sf->limit_txfm_eval_per_mode = 0;
 }
 
 static AOM_INLINE void init_interp_sf(INTERP_FILTER_SPEED_FEATURES *interp_sf) {
@@ -1539,15 +1712,19 @@
   intra_sf->intra_pruning_with_hog = 0;
   intra_sf->chroma_intra_pruning_with_hog = 0;
   intra_sf->prune_palette_search_level = 0;
+  intra_sf->prune_luma_palette_size_search_level = 0;
 
   for (int i = 0; i < TX_SIZES; i++) {
     intra_sf->intra_y_mode_mask[i] = INTRA_ALL;
     intra_sf->intra_uv_mode_mask[i] = UV_INTRA_ALL;
   }
   intra_sf->disable_smooth_intra = 0;
-  intra_sf->disable_filter_intra = 0;
+  intra_sf->prune_filter_intra_level = 0;
   intra_sf->prune_chroma_modes_using_luma_winner = 0;
   intra_sf->cfl_search_range = 3;
+  intra_sf->top_intra_model_count_allowed = TOP_INTRA_MODEL_COUNT;
+  intra_sf->early_term_chroma_palette_size_search = 0;
+  intra_sf->skip_filter_intra_in_inter_frames = 0;
 }
 
 static AOM_INLINE void init_tx_sf(TX_SPEED_FEATURES *tx_sf) {
@@ -1562,15 +1739,13 @@
   tx_sf->tx_type_search.use_skip_flag_prediction = 1;
   tx_sf->tx_type_search.use_reduced_intra_txset = 0;
   tx_sf->tx_type_search.fast_intra_tx_type_search = 0;
-  tx_sf->tx_type_search.fast_inter_tx_type_search = 0;
+  tx_sf->tx_type_search.fast_inter_tx_type_prob_thresh = INT_MAX;
   tx_sf->tx_type_search.skip_tx_search = 0;
   tx_sf->tx_type_search.prune_tx_type_using_stats = 0;
   tx_sf->tx_type_search.prune_tx_type_est_rd = 0;
   tx_sf->tx_type_search.winner_mode_tx_type_pruning = 0;
   tx_sf->txb_split_cap = 1;
   tx_sf->adaptive_txb_search_level = 0;
-  tx_sf->use_intra_txb_hash = 0;
-  tx_sf->use_inter_txb_hash = 1;
   tx_sf->refine_fast_tx_search_results = 1;
   tx_sf->prune_tx_size_level = 0;
 }
@@ -1597,7 +1772,7 @@
   } else {
     assert(0 && "Invalid disable_trellis_quant value");
   }
-  rd_sf->use_mb_rd_hash = 1;
+  rd_sf->use_mb_rd_hash = 0;
   rd_sf->simple_model_rd_from_var = 0;
   rd_sf->tx_domain_dist_level = 0;
   rd_sf->tx_domain_dist_thres_level = 0;
@@ -1608,7 +1783,7 @@
     WINNER_MODE_SPEED_FEATURES *winner_mode_sf) {
   winner_mode_sf->motion_mode_for_winner_cand = 0;
   // Set this at the appropriate speed levels
-  winner_mode_sf->tx_size_search_level = USE_FULL_RD;
+  winner_mode_sf->tx_size_search_level = 0;
   winner_mode_sf->enable_winner_mode_for_coeff_opt = 0;
   winner_mode_sf->enable_winner_mode_for_tx_size_srch = 0;
   winner_mode_sf->enable_winner_mode_for_use_tx_domain_dist = 0;
@@ -1618,6 +1793,7 @@
 
 static AOM_INLINE void init_lpf_sf(LOOP_FILTER_SPEED_FEATURES *lpf_sf) {
   lpf_sf->disable_loop_restoration_chroma = 0;
+  lpf_sf->disable_loop_restoration_luma = 0;
   lpf_sf->prune_wiener_based_on_src_var = 0;
   lpf_sf->prune_sgr_based_on_wiener = 0;
   lpf_sf->enable_sgr_ep_pruning = 0;
@@ -1631,14 +1807,39 @@
 }
 
 static AOM_INLINE void init_rt_sf(REAL_TIME_SPEED_FEATURES *rt_sf) {
-  rt_sf->mode_search_skip_flags = 0;
-  rt_sf->skip_interp_filter_search = 0;
-  rt_sf->force_tx_search_off = 0;
-  rt_sf->num_inter_modes_for_tx_search = INT_MAX;
-  rt_sf->use_simple_rd_model = 0;
+  rt_sf->check_intra_pred_nonrd = 0;
+  rt_sf->skip_intra_pred_if_tx_skip = 0;
+  rt_sf->estimate_motion_for_var_based_partition = 0;
   rt_sf->nonrd_check_partition_merge_mode = 0;
   rt_sf->nonrd_check_partition_split = 0;
-  rt_sf->skip_intra_pred_if_tx_skip = 0;
+  rt_sf->mode_search_skip_flags = 0;
+  rt_sf->nonrd_prune_ref_frame_search = 0;
+  rt_sf->use_nonrd_pick_mode = 0;
+  rt_sf->use_nonrd_altref_frame = 0;
+  rt_sf->use_comp_ref_nonrd = 0;
+  rt_sf->use_real_time_ref_set = 0;
+  rt_sf->short_circuit_low_temp_var = 0;
+  rt_sf->use_modeled_non_rd_cost = 0;
+  rt_sf->reuse_inter_pred_nonrd = 0;
+  rt_sf->num_inter_modes_for_tx_search = INT_MAX;
+  rt_sf->force_tx_search_off = 0;
+  rt_sf->use_nonrd_filter_search = 0;
+  rt_sf->use_simple_rd_model = 0;
+  rt_sf->skip_interp_filter_search = 0;
+  rt_sf->hybrid_intra_pickmode = 0;
+  rt_sf->source_metrics_sb_nonrd = 0;
+  rt_sf->overshoot_detection_cbr = NO_DETECTION;
+  rt_sf->check_scene_detection = 0;
+  rt_sf->force_large_partition_blocks = 0;
+  rt_sf->use_temporal_noise_estimate = 0;
+  rt_sf->fullpel_search_step_param = 0;
+  rt_sf->skip_loopfilter_non_reference = 0;
+  for (int i = 0; i < BLOCK_SIZES; ++i)
+    rt_sf->intra_y_mode_bsize_mask_nrd[i] = INTRA_ALL;
+  rt_sf->nonrd_agressive_skip = 0;
+  rt_sf->skip_cdef_sb = 0;
+  rt_sf->force_large_partition_blocks_intra = 0;
+  rt_sf->skip_tx_no_split_var_based_partition = 0;
 }
 
 void av1_set_speed_features_framesize_dependent(AV1_COMP *cpi, int speed) {
@@ -1658,8 +1859,10 @@
   }
 
   if (!cpi->ppi->seq_params_locked) {
-    cpi->common.seq_params.enable_masked_compound &=
+    cpi->common.seq_params->enable_masked_compound &=
         !sf->inter_sf.disable_masked_comp;
+    cpi->common.seq_params->enable_interintra_compound &=
+        (sf->inter_sf.disable_interintra_wedge_var_thresh != UINT_MAX);
   }
 
   // This is only used in motion vector unit test.
@@ -1668,7 +1871,7 @@
   else if (cpi->oxcf.unit_test_cfg.motion_vector_unit_test == 2)
     cpi->mv_search_params.find_fractional_mv_step = av1_return_min_sub_pixel_mv;
 
-  if ((cpi->oxcf.row_mt == 1) && (cpi->oxcf.max_threads > 1)) {
+  if ((cpi->oxcf.row_mt == 1) && (cpi->mt_info.num_workers > 1)) {
     if (sf->inter_sf.mv_cost_upd_level < INTERNAL_COST_UPD_SBROW) {
       // Set mv_cost_upd_level to use row level update.
       sf->inter_sf.mv_cost_upd_level = INTERNAL_COST_UPD_SBROW;
@@ -1683,6 +1886,7 @@
   int i;
 
   init_hl_sf(&sf->hl_sf);
+  init_fp_sf(&sf->fp_sf);
   init_tpl_sf(&sf->tpl_sf);
   init_gm_sf(&sf->gm_sf);
   init_part_sf(&sf->part_sf);
@@ -1708,12 +1912,18 @@
       break;
   }
 
-  if (!cpi->ppi->seq_params_locked) {
-    cpi->common.seq_params.enable_dual_filter &=
-        !sf->interp_sf.disable_dual_filter;
-    cpi->common.seq_params.enable_restoration &= !sf->lpf_sf.disable_lr_filter;
+  if (!oxcf->txfm_cfg.enable_tx_size_search) {
+    sf->winner_mode_sf.tx_size_search_level = 3;
+  }
 
-    cpi->common.seq_params.enable_interintra_compound &=
+  if (!cpi->ppi->seq_params_locked) {
+    cpi->common.seq_params->order_hint_info.enable_dist_wtd_comp &=
+        (sf->inter_sf.use_dist_wtd_comp_flag != DIST_WTD_COMP_DISABLED);
+    cpi->common.seq_params->enable_dual_filter &=
+        !sf->interp_sf.disable_dual_filter;
+    cpi->common.seq_params->enable_restoration &= !sf->lpf_sf.disable_lr_filter;
+
+    cpi->common.seq_params->enable_interintra_compound &=
         (sf->inter_sf.disable_interintra_wedge_var_thresh != UINT_MAX);
   }
 
@@ -1748,7 +1958,7 @@
     sf->rd_sf.optimize_coefficients = NO_TRELLIS_OPT;
 
   // No recode or trellis for 1 pass.
-  if (oxcf->pass == 0 && has_no_stats_stage(cpi))
+  if (oxcf->pass == AOM_RC_ONE_PASS && has_no_stats_stage(cpi))
     sf->hl_sf.recode_loop = DISALLOW_RECODE;
 
   MotionVectorSearchParams *const mv_search_params = &cpi->mv_search_params;
@@ -1806,7 +2016,7 @@
          predict_dc_levels[cpi->sf.winner_mode_sf.dc_blk_pred_level],
          sizeof(winner_mode_params->predict_dc_level));
 
-  if (cpi->oxcf.row_mt == 1 && (cpi->oxcf.max_threads > 1)) {
+  if (cpi->oxcf.row_mt == 1 && (cpi->mt_info.num_workers > 1)) {
     if (sf->inter_sf.inter_mode_rd_model_estimation == 1) {
       // Revert to type 2
       sf->inter_sf.inter_mode_rd_model_estimation = 2;
@@ -1816,7 +2026,7 @@
     // better parallelism when number of threads available are greater than or
     // equal to maximum number of reference frames allowed for global motion.
     if (sf->gm_sf.gm_search_type != GM_DISABLE_SEARCH &&
-        (cpi->oxcf.max_threads >=
+        (cpi->mt_info.num_workers >=
          gm_available_reference_frames[sf->gm_sf.gm_search_type]))
       sf->gm_sf.prune_ref_frame_for_gm_search = 0;
   }
@@ -1828,18 +2038,27 @@
   SPEED_FEATURES *const sf = &cpi->sf;
   WinnerModeParams *const winner_mode_params = &cpi->winner_mode_params;
   const int boosted = frame_is_boosted(cpi);
+  const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480;
   const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
   const int is_1080p_or_larger = AOMMIN(cm->width, cm->height) >= 1080;
   const int is_arf2_bwd_type =
-      cpi->gf_group.update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE;
+      cpi->ppi->gf_group.update_type[cpi->gf_frame_index] == INTNL_ARF_UPDATE;
 
-  if (cpi->oxcf.mode == REALTIME) return;
+  if (cpi->oxcf.mode == REALTIME) {
+    if (speed >= 6) {
+      const int qindex_thresh = boosted ? 190 : (is_720p_or_larger ? 120 : 150);
+      sf->part_sf.adjust_var_based_rd_partitioning =
+          frame_is_intra_only(cm)
+              ? 0
+              : cm->quant_params.base_qindex > qindex_thresh;
+    }
+    return;
+  }
 
   if (speed == 0) {
     // qindex_thresh for resolution < 720p
     const int qindex_thresh = boosted ? 70 : (is_arf2_bwd_type ? 110 : 140);
     if (!is_720p_or_larger && cm->quant_params.base_qindex <= qindex_thresh) {
-      sf->inter_sf.skip_repeated_newmv = 1;
       sf->part_sf.simple_motion_search_split =
           cm->features.allow_screen_content_tools ? 1 : 2;
       sf->part_sf.simple_motion_search_early_term_none = 1;
@@ -1856,7 +2075,6 @@
       sf->tx_sf.inter_tx_size_search_init_depth_rect = 1;
       sf->tx_sf.inter_tx_size_search_init_depth_sqr = 1;
       sf->tx_sf.intra_tx_size_search_init_depth_rect = 1;
-      sf->inter_sf.skip_repeated_newmv = 1;
       sf->tx_sf.model_based_prune_tx_search_level = 0;
 
       if (is_1080p_or_larger && cm->quant_params.base_qindex <= 108) {
@@ -1868,34 +2086,42 @@
         sf->interp_sf.cb_pred_filter_search = 0;
         sf->tx_sf.tx_type_search.prune_2d_txfm_mode = TX_TYPE_PRUNE_2;
         sf->tx_sf.tx_type_search.skip_tx_search = 1;
-        sf->tx_sf.use_intra_txb_hash = 1;
       }
     }
   }
 
-  if (speed >= 3) {
+  if (speed >= 2) {
     // Disable extended partitions for lower quantizers
-    const int qindex_thresh =
-        cm->features.allow_screen_content_tools ? 50 : 100;
-    if (cm->quant_params.base_qindex <= qindex_thresh && !boosted) {
+    const int aggr = AOMMIN(3, speed - 2);
+    const int qindex_thresh1[4] = { 50, 50, 80, 100 };
+    const int qindex_thresh2[4] = { 80, 100, 120, 160 };
+    int qindex_thresh;
+    int disable_ext_part;
+    if (aggr <= 1) {
+      const int qthresh2 =
+          (!aggr && !is_480p_or_larger) ? 70 : qindex_thresh2[aggr];
+      qindex_thresh = cm->features.allow_screen_content_tools
+                          ? qindex_thresh1[aggr]
+                          : qthresh2;
+      disable_ext_part = !boosted;
+    } else {
+      qindex_thresh = boosted ? qindex_thresh1[aggr] : qindex_thresh2[aggr];
+      disable_ext_part = !frame_is_intra_only(cm);
+    }
+    if (cm->quant_params.base_qindex <= qindex_thresh && disable_ext_part) {
       sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128;
     }
   }
 
   if (speed >= 4) {
-    // Disable extended partitions for lower quantizers
-    const int qindex_thresh = boosted ? 80 : 120;
-    if (cm->quant_params.base_qindex <= qindex_thresh &&
-        !frame_is_intra_only(&cpi->common)) {
-      sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128;
-    }
-  }
-
-  if (speed >= 5) {
-    const int qindex_thresh = boosted ? 100 : 160;
-    if (cm->quant_params.base_qindex <= qindex_thresh &&
-        !frame_is_intra_only(&cpi->common)) {
-      sf->part_sf.ext_partition_eval_thresh = BLOCK_128X128;
+    // Disable rectangular partitions for lower quantizers
+    const int aggr = AOMMIN(1, speed - 4);
+    const int qindex_thresh[2] = { 65, 80 };
+    int disable_rect_part;
+    disable_rect_part = !boosted;
+    if (cm->quant_params.base_qindex <= qindex_thresh[aggr] &&
+        disable_rect_part && is_480p_or_larger) {
+      sf->part_sf.rect_partition_eval_thresh = BLOCK_8X8;
     }
   }
 
@@ -1917,4 +2143,17 @@
       }
     }
   }
+
+  if (speed >= 4) {
+    // Disable LR search at low and high quantizers and enable only for
+    // mid-quantizer range.
+    if (!boosted && !is_arf2_bwd_type) {
+      const int qindex_low[2] = { 100, 60 };
+      const int qindex_high[2] = { 180, 160 };
+      if (cm->quant_params.base_qindex <= qindex_low[is_720p_or_larger] ||
+          cm->quant_params.base_qindex > qindex_high[is_720p_or_larger]) {
+        sf->lpf_sf.disable_loop_restoration_luma = 1;
+      }
+    }
+  }
 }
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index 52b8438..954c512 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h
@@ -160,6 +160,7 @@
   CDEF_FAST_SEARCH_LVL3, /**< Search reduced subset of secondary filters than
                               Level 2. */
   CDEF_FAST_SEARCH_LVL4, /**< Search reduced subset of filters than Level 3. */
+  CDEF_FAST_SEARCH_LVL5, /**< Search reduced subset of filters than Level 4. */
   CDEF_PICK_FROM_Q,      /**< Estimate filter strength based on quantizer. */
   CDEF_PICK_METHODS
 } CDEF_PICK_METHOD;
@@ -221,10 +222,24 @@
   MULTI_WINNER_MODE_DEFAULT = 2,
 } UENUM1BYTE(MULTI_WINNER_MODE_TYPE);
 
+enum {
+  PRUNE_NEARMV_OFF = 0,     // Turn off nearmv pruning
+  PRUNE_NEARMV_LEVEL1 = 1,  // Prune nearmv for qindex (0-85)
+  PRUNE_NEARMV_LEVEL2 = 2,  // Prune nearmv for qindex (0-170)
+  PRUNE_NEARMV_LEVEL3 = 3,  // Prune nearmv more aggressively for qindex (0-170)
+  PRUNE_NEARMV_MAX = PRUNE_NEARMV_LEVEL3,
+} UENUM1BYTE(PRUNE_NEARMV_LEVEL);
+
 typedef struct {
   TX_TYPE_PRUNE_MODE prune_2d_txfm_mode;
   int fast_intra_tx_type_search;
-  int fast_inter_tx_type_search;
+
+  // INT_MAX: Disable fast search.
+  // 1 - 1024: Probability threshold used for conditionally forcing tx type,
+  // during mode search.
+  // 0: Force tx type to be DCT_DCT unconditionally, during
+  // mode search.
+  int fast_inter_tx_type_prob_thresh;
 
   // Prune less likely chosen transforms for each intra mode. The speed
   // feature ranges from 0 to 2, for different speed / compression trade offs.
@@ -308,11 +323,9 @@
  * \brief Sequence/frame level speed vs quality features
  */
 typedef struct HIGH_LEVEL_SPEED_FEATURES {
-  /*!\cond */
-  // Frame level coding parameter update
+  /*! Frame level coding parameter update. */
   int frame_parameter_update;
 
-  /*!\endcond */
   /*!
    * Cases and frame types for which the recode loop is enabled.
    */
@@ -324,25 +337,27 @@
    */
   int recode_tolerance;
 
-  /*!\cond */
-  // Determine how motion vector precision is chosen. The possibilities are:
-  // LAST_MV_DATA: use the mv data from the last coded frame
-  // CURRENT_Q: use the current q as a threshold
-  // QTR_ONLY: use quarter pel precision only.
+  /*!
+   * Determine how motion vector precision is chosen. The possibilities are:
+   * LAST_MV_DATA: use the mv data from the last coded frame
+   * CURRENT_Q: use the current q as a threshold
+   * QTR_ONLY: use quarter pel precision only.
+   */
   MV_PREC_LOGIC high_precision_mv_usage;
 
-  // Always set to 0. If on it enables 0 cost background transmission
-  // (except for the initial transmission of the segmentation). The feature is
-  // disabled because the addition of very large block sizes make the
-  // backgrounds very to cheap to encode, and the segmentation we have
-  // adds overhead.
+  /*!
+   * Always set to 0. If on it enables 0 cost background transmission
+   * (except for the initial transmission of the segmentation). The feature is
+   * disabled because the addition of very large block sizes make the
+   * backgrounds very to cheap to encode, and the segmentation we have
+   * adds overhead.
+   */
   int static_segmentation;
 
   /*!
    * Superres-auto mode search type:
    */
   SUPERRES_AUTO_SEARCH_TYPE superres_auto_search_type;
-  /*!\endcond */
 
   /*!
    * Enable/disable extra screen content test by encoding key frame twice.
@@ -355,10 +370,44 @@
   int second_alt_ref_filtering;
 } HIGH_LEVEL_SPEED_FEATURES;
 
+/*!
+ * Speed features for the first pass.
+ */
+typedef struct FIRST_PASS_SPEED_FEATURES {
+  /*!
+   * \brief Reduces the mv search window.
+   * By default, the initial search window is around
+   * MIN(MIN(dims), MAX_FULL_PEL_VAL) = MIN(MIN(dims), 1023).
+   * Each step reduction decrease the window size by about a factor of 2.
+   */
+  int reduce_mv_step_param;
+
+  /*!
+   * \brief Skips the motion search when the zero mv has small sse.
+   */
+  int skip_motion_search_threshold;
+
+  /*!
+   * \brief Skips reconstruction by using source buffers for prediction
+   */
+  int disable_recon;
+
+  /*!
+   * \brief Skips the motion search centered on 0,0 mv.
+   */
+  int skip_zeromv_motion_search;
+} FIRST_PASS_SPEED_FEATURES;
+
 /*!\cond */
 typedef struct TPL_SPEED_FEATURES {
-  // Enable/disable GOP length adaptive decision.
-  int disable_gop_length_decision;
+  // GOP length adaptive decision.
+  // If set to 0, tpl model decides whether a shorter gf interval is better.
+  // If set to 1, tpl stats of ARFs from base layer, (base+1) layer and
+  // (base+2) layer decide whether a shorter gf interval is better.
+  // If set to 2, tpl stats of ARFs from base layer, (base+1) layer and GF boost
+  // decide whether a shorter gf interval is better.
+  // If set to 3, gop length adaptive decision is disabled.
+  int gop_length_decision_method;
   // Prune the intra modes search by tpl.
   // If set to 0, we will search all intra modes from DC_PRED to PAETH_PRED.
   // If set to 1, we only search DC_PRED, V_PRED, and H_PRED.
@@ -402,6 +451,10 @@
   // given direction(past/future), if the evaluated ref_frame in that direction
   // yields gm_type as INVALID/TRANSLATION/IDENTITY
   int prune_ref_frame_for_gm_search;
+
+  // When the current GM type is set to ZEROMV, prune ZEROMV if its performance
+  // is worse than NEWMV under SSE metric.
+  int prune_zero_mv_with_sse;
 } GLOBAL_MOTION_SPEED_FEATURES;
 
 typedef struct PARTITION_SPEED_FEATURES {
@@ -447,8 +500,9 @@
 
   // Sets level of adjustment of variance-based partitioning during
   // rd_use_partition 0 - no partition adjustment, 1 - try to merge partitions
-  // for small blocks and high QP, 2 - always try to merge leaf partitions, 3 -
-  // try to merge and split leaf partitions
+  // for small blocks and high QP, 2 - try to merge partitions, 3 - always try
+  // to merge leaf partitions for small blocks, 4 - try to merge and split leaf
+  // partitions and 0 - 4 decreasing aggressiveness in order.
   int adjust_var_based_rd_partitioning;
 
   // Partition search early breakout thresholds.
@@ -458,9 +512,6 @@
   // Thresholds for ML based partition search breakout.
   int ml_partition_search_breakout_thresh[PARTITION_BLOCK_SIZES];
 
-  // Allow skipping partition search for still image frame
-  int allow_partition_search_skip;
-
   // The aggressiveness of pruning with simple_motion_search.
   // Currently 0 is the lowest, and 2 the highest.
   int simple_motion_search_prune_agg;
@@ -492,12 +543,18 @@
   BLOCK_SIZE max_intra_bsize;
 
   // Use CNN with luma pixels on source frame on each of the 64x64 subblock to
-  // perform split/no_split decision on intra-frames.
-  int intra_cnn_split;
+  // perform partition pruning in intra frames.
+  // 0: No Pruning
+  // 1: Prune split and rectangular partitions only
+  // 2: Prune none, split and rectangular partitions
+  int intra_cnn_based_part_prune_level;
 
   // Disable extended partition search for lower block sizes.
   int ext_partition_eval_thresh;
 
+  // Disable rectangular partitions for larger block sizes.
+  int rect_partition_eval_thresh;
+
   // prune extended partition search
   // 0 : no pruning
   // 1 : prune 1:4 partition search using winner info from split partitions
@@ -505,6 +562,9 @@
   int prune_ext_part_using_split_info;
 
   // Prunt rectangular, AB and 4-way partition based on q index and block size
+  // 0 : no pruning
+  // 1 : prune sub_8x8 at very low quantizers
+  // 2 : prune all block size based on qindex
   int prune_rectangular_split_based_on_qidx;
 
   // Terminate partition search for child partition,
@@ -527,9 +587,60 @@
   // 0: disable pruning, 1: enable pruning
   int simple_motion_search_rect_split;
 
+  // The current encoder adopts a DFS search for block partitions.
+  // Therefore the mode selection and associated rdcost is ready for smaller
+  // blocks before the mode selection for some partition types.
+  // AB partition could use previous rd information and skip mode search.
+  // An example is:
+  //
+  //  current block
+  //  +---+---+
+  //  |       |
+  //  +       +
+  //  |       |
+  //  +-------+
+  //
+  //  SPLIT partition has been searched first before trying HORZ_A
+  //  +---+---+
+  //  | R | R |
+  //  +---+---+
+  //  | R | R |
+  //  +---+---+
+  //
+  //  HORZ_A
+  //  +---+---+
+  //  |   |   |
+  //  +---+---+
+  //  |       |
+  //  +-------+
+  //
+  //  With this speed feature, the top two sub blocks can directly use rdcost
+  //  searched in split partition, and the mode info is also copied from
+  //  saved info. Similarly, the bottom rectangular block can also use
+  //  the available information from previous rectangular search.
+  int reuse_prev_rd_results_for_part_ab;
+
   // Reuse the best prediction modes found in PARTITION_SPLIT and PARTITION_RECT
   // when encoding PARTITION_AB.
   int reuse_best_prediction_for_part_ab;
+
+  // The current partition search records the best rdcost so far and uses it
+  // in mode search and transform search to early skip when some criteria is
+  // met. For example, when the current rdcost is larger than the best rdcost,
+  // or the model rdcost is larger than the best rdcost times some thresholds.
+  // By default, this feature is turned on to speed up the encoder partition
+  // search.
+  // If disabling it, at speed 0, 30 frames, we could get
+  // about -0.25% quality gain (psnr, ssim, vmaf), with about 13% slowdown.
+  int use_best_rd_for_pruning;
+
+  // Skip evaluation of non-square partitions based on the corresponding NONE
+  // partition.
+  // 0: no pruning
+  // 1: prune extended partitions if NONE is skippable
+  // 2: on top of 1, prune rectangular partitions if NONE is inter, not a newmv
+  // mode and skippable
+  int skip_non_sq_part_based_on_none;
 } PARTITION_SPEED_FEATURES;
 
 typedef struct MV_SPEED_FEATURES {
@@ -640,16 +751,19 @@
 
   int alt_ref_search_fp;
 
-  // flag to skip NEWMV mode in drl if the motion search result is the same
-  int skip_repeated_newmv;
-
-  // Skip the current ref_mv in NEW_MV mode if we have already encountered
-  // another ref_mv in the drl such that:
-  //  1. The other drl has the same fullpel_mv during the SIMPLE_TRANSLATION
-  //     search process as the current fullpel_mv.
-  //  2. The rate needed to encode the current fullpel_mv is larger than that
-  //     for the other ref_mv.
-  int skip_repeated_full_newmv;
+  // Skip the current ref_mv in NEW_MV mode based on mv, rate cost, etc.
+  // This speed feature equaling 0 means no skipping.
+  // If the speed feature equals 1 or 2, skip the current ref_mv in NEW_MV mode
+  // if we have already encountered ref_mv in the drl such that:
+  //  1. The other drl has the same mv during the SIMPLE_TRANSLATION search
+  //     process as the current mv.
+  //  2. The rate needed to encode the current mv is larger than that for the
+  //     other ref_mv.
+  // The speed feature equaling 1 means using subpel mv in the comparison.
+  // The speed feature equaling 2 means using fullpel mv in the comparison.
+  // If the speed feature >= 3, skip the current ref_mv in NEW_MV mode based on
+  // known full_mv bestsme and drl cost.
+  int skip_newmv_in_drl;
 
   // This speed feature checks duplicate ref MVs among NEARESTMV, NEARMV,
   // GLOBALMV and skips NEARMV or GLOBALMV (in order) if a duplicate is found
@@ -680,12 +794,6 @@
   // same single inter mode as a group.
   int prune_comp_search_by_single_result;
 
-  // If 1 we iterate finding a best reference for 2 ref frames together - via
-  // a log search that iterates 4 times (check around mv for last for best
-  // error of combined predictor then check around mv for alt). If 0 we
-  // we just use the best motion vector found for each frame by itself.
-  BLOCK_SIZE comp_inter_joint_search_thresh;
-
   // Instead of performing a full MV search, do a simple translation first
   // and only perform a full MV search on the motion vectors that performed
   // well.
@@ -696,12 +804,17 @@
   // the single reference modes, it is one of the two best performers.
   int prune_compound_using_single_ref;
 
-  // Skip extended compound mode using ref frames of above and left neighbor
+  // Skip extended compound mode (NEAREST_NEWMV, NEW_NEARESTMV, NEAR_NEWMV,
+  // NEW_NEARMV) using ref frames of above and left neighbor
   // blocks.
   // 0 : no pruning
-  // 1 : prune extended compound mode (less aggressiveness)
-  // 2 : prune extended compound mode (high aggressiveness)
-  int prune_compound_using_neighbors;
+  // 1 : prune ext compound modes using neighbor blocks (less aggressiveness)
+  // 2 : prune ext compound modes using neighbor blocks (high aggressiveness)
+  // 3 : prune ext compound modes unconditionally (highest aggressiveness)
+  int prune_ext_comp_using_neighbors;
+
+  // Skip NEW_NEARMV and NEAR_NEWMV extended compound modes
+  int skip_ext_comp_nearmv_mode;
 
   // Skip extended compound mode when ref frame corresponding to NEWMV does not
   // have NEWMV as single mode winner.
@@ -710,6 +823,9 @@
   // 2 : prune extended compound mode (high aggressiveness)
   int prune_comp_using_best_single_mode_ref;
 
+  // Skip NEARESTMV and NEARMV using weight computed in ref mv list population
+  int prune_nearest_near_mv_using_refmv_weight;
+
   // Based on previous ref_mv_idx search result, prune the following search.
   int prune_ref_mv_idx_search;
 
@@ -744,6 +860,12 @@
   // Clip the frequency of updating the mv cost.
   INTERNAL_COST_UPDATE_TYPE mv_cost_upd_level;
 
+  // Clip the frequency of updating the coeff cost.
+  INTERNAL_COST_UPDATE_TYPE coeff_cost_upd_level;
+
+  // Clip the frequency of updating the mode cost.
+  INTERNAL_COST_UPDATE_TYPE mode_cost_upd_level;
+
   // Prune inter modes based on tpl stats
   // 0 : no pruning
   // 1 - 3 indicate increasing aggressiveness in order.
@@ -751,7 +873,7 @@
 
   // Skip NEARMV and NEAR_NEARMV modes using ref frames of above and left
   // neighbor blocks and qindex.
-  int prune_nearmv_using_neighbors;
+  PRUNE_NEARMV_LEVEL prune_nearmv_using_neighbors;
 
   // Model based breakout after interpolation filter search
   // 0: no breakout
@@ -771,6 +893,25 @@
 
   // Reuse masked compound type search results
   int reuse_mask_search_results;
+
+  // Enable/disable fast search for wedge masks
+  int enable_fast_wedge_mask_search;
+
+  // Early breakout from transform search of inter modes
+  int inter_mode_txfm_breakout;
+
+  // Limit number of inter modes for txfm search if a newmv mode gets
+  // evaluated among the top modes.
+  // 0: no pruning
+  // 1 to 3 indicate increasing order of aggressiveness
+  int limit_inter_mode_cands;
+
+  // Cap the no. of txfm searches for a given prediction mode.
+  // 0: no cap, 1: cap beyond first 4 searches, 2: cap beyond first 3 searches.
+  int limit_txfm_eval_per_mode;
+
+  // Prune warped motion search based on block size.
+  int extra_prune_warped;
 } INTER_MODE_SPEED_FEATURES;
 
 typedef struct INTERP_FILTER_SPEED_FEATURES {
@@ -820,8 +961,11 @@
   // Enable/disable smooth intra modes.
   int disable_smooth_intra;
 
-  // Enable/disable filter intra modes.
-  int disable_filter_intra;
+  // Prune filter intra modes in intra frames.
+  // 0 : No pruning
+  // 1 : Evaluate applicable filter intra modes based on best intra mode so far
+  // 2 : Do not evaluate filter intra modes
+  int prune_filter_intra_level;
 
   // prune palette search
   // 0: No pruning
@@ -832,6 +976,26 @@
   // palette colors is not the winner.
   int prune_palette_search_level;
 
+  // Terminate early in luma palette_size search. Speed feature values indicate
+  // increasing level of pruning.
+  // 0: No early termination
+  // 1: Terminate early for higher luma palette_size, if header rd cost of lower
+  // palette_size is more than 2 * best_rd. This level of pruning is more
+  // conservative when compared to sf level 2 as the cases which will get pruned
+  // with sf level 1 is a subset of the cases which will get pruned with sf
+  // level 2.
+  // 2: Terminate early for higher luma palette_size, if header rd cost of lower
+  // palette_size is more than best_rd.
+  // For allintra encode, this sf reduces instruction count by 2.49%, 1.07%,
+  // 2.76%, 2.30%, 1.84%, 2.69%, 2.04%, 2.05% and 1.44% for speed 0, 1, 2, 3, 4,
+  // 5, 6, 7 and 8 on screen content set with coding performance change less
+  // than 0.01% for speed <= 2 and less than 0.03% for speed >= 3. For AVIF
+  // image encode, this sf reduces instruction count by 1.94%, 1.13%, 1.29%,
+  // 0.93%, 0.89%, 1.03%, 1.07%, 1.20% and 0.18% for speed 0, 1, 2, 3, 4, 5, 6,
+  // 7 and 8 on a typical image dataset with coding performance change less than
+  // 0.01%.
+  int prune_luma_palette_size_search_level;
+
   // Prune chroma intra modes based on luma intra mode winner.
   // 0: No pruning
   // 1: Prune chroma intra modes other than UV_DC_PRED, UV_SMOOTH_PRED,
@@ -853,6 +1017,27 @@
   // 33: Exhaustive rd search (33 == CFL_MAGS_SIZE). This mode should only
   // be used for debugging purpose.
   int cfl_search_range;
+
+  // TOP_INTRA_MODEL_COUNT is 4 that is the number of top model rd to store in
+  // intra mode decision. Here, add a speed feature to reduce this number for
+  // higher speeds.
+  int top_intra_model_count_allowed;
+
+  // Terminate early in chroma palette_size search.
+  // 0: No early termination
+  // 1: Terminate early for higher palette_size, if header rd cost of lower
+  // palette_size is more than best_rd.
+  // For allintra encode, this sf reduces instruction count by 0.45%,
+  // 0.62%, 1.73%, 2.50%, 2.89%, 3.09% and 3.86% for speed 0 to 6 on screen
+  // content set with coding performance change less than 0.01%.
+  // For AVIF image encode, this sf reduces instruction count by 0.45%, 0.81%,
+  // 0.85%, 1.05%, 1.45%, 1.66% and 1.95% for speed 0 to 6 on a typical image
+  // dataset with no quality drop.
+  int early_term_chroma_palette_size_search;
+
+  // Skips the evaluation of filter intra modes in inter frames if rd evaluation
+  // of luma intra dc mode results in invalid rd stats.
+  int skip_filter_intra_in_inter_frames;
 } INTRA_MODE_SPEED_FEATURES;
 
 typedef struct TX_SPEED_FEATURES {
@@ -885,16 +1070,6 @@
   // 1-2: progressively increasing aggressiveness of pruning
   int model_based_prune_tx_search_level;
 
-  // Use hash table to store intra(keyframe only) txb transform search results
-  // to avoid repeated search on the same residue signal. This is currently not
-  // compatible with multi-winner mode as the hash states are reset during
-  // winner mode processing.
-  int use_intra_txb_hash;
-
-  // Use hash table to store inter txb transform search results
-  // to avoid repeated search on the same residue signal.
-  int use_inter_txb_hash;
-
   // Refine TX type after fast TX search.
   int refine_fast_tx_search_results;
 
@@ -944,6 +1119,7 @@
   // Level 0  : FULL RD     LARGEST ALL   FULL RD
   // Level 1  : FAST RD     LARGEST ALL   FULL RD
   // Level 2  : LARGEST ALL LARGEST ALL   FULL RD
+  // Level 3 :  LARGEST ALL LARGEST ALL   LARGEST ALL
   int tx_size_search_level;
 
   // Flag used to control the winner mode processing for use transform
@@ -986,6 +1162,9 @@
   // Disable loop restoration for Chroma plane
   int disable_loop_restoration_chroma;
 
+  // Disable loop restoration for luma plane
+  int disable_loop_restoration_luma;
+
   // Prune RESTORE_WIENER evaluation based on source variance
   // 0 : no pruning
   // 1 : conservative pruning
@@ -1045,6 +1224,10 @@
   // Use compound reference for non-RD mode.
   int use_comp_ref_nonrd;
 
+  // Reference frames for compound prediction for nonrd pickmode:
+  // LAST_GOLDEN (0), LAST_LAST2 (1), or LAST_ALTREF (2).
+  int ref_frame_comp_nonrd[3];
+
   // use reduced ref set for real-time mode
   int use_real_time_ref_set;
 
@@ -1110,6 +1293,18 @@
 
   // Skips mode checks more agressively in nonRD mode
   int nonrd_agressive_skip;
+
+  // Skip cdef on 64x64 blocks when NEWMV or INTRA is not picked or color
+  // sensitivity is off. When color sensitivity is on for a superblock, all
+  // 64x64 blocks within will not skip.
+  int skip_cdef_sb;
+
+  // Forces larger partition blocks in variance based partitioning for intra
+  // frames
+  int force_large_partition_blocks_intra;
+
+  // Skip evaluation of no split in tx size selection for merge partition
+  int skip_tx_no_split_var_based_partition;
 } REAL_TIME_SPEED_FEATURES;
 
 /*!\endcond */
@@ -1124,6 +1319,11 @@
   HIGH_LEVEL_SPEED_FEATURES hl_sf;
 
   /*!
+   * Speed features for the first pass.
+   */
+  FIRST_PASS_SPEED_FEATURES fp_sf;
+
+  /*!
    * Speed features related to how tpl's searches are done.
    */
   TPL_SPEED_FEATURES tpl_sf;
diff --git a/av1/encoder/superres_scale.c b/av1/encoder/superres_scale.c
index fb892de..283faab 100644
--- a/av1/encoder/superres_scale.c
+++ b/av1/encoder/superres_scale.c
@@ -80,7 +80,7 @@
   if (is_stat_generation_stage(cpi)) return SCALE_NUMERATOR;
   uint8_t new_denom = SCALE_NUMERATOR;
 
-  if (cpi->common.seq_params.reduced_still_picture_hdr) return SCALE_NUMERATOR;
+  if (cpi->common.seq_params->reduced_still_picture_hdr) return SCALE_NUMERATOR;
   switch (resize_cfg->resize_mode) {
     case RESIZE_NONE: new_denom = SCALE_NUMERATOR; break;
     case RESIZE_FIXED:
@@ -143,7 +143,7 @@
 static uint8_t get_superres_denom_for_qindex(const AV1_COMP *cpi, int qindex,
                                              int sr_kf, int sr_arf) {
   // Use superres for Key-frames and Alt-ref frames only.
-  const GF_GROUP *gf_group = &cpi->gf_group;
+  const GF_GROUP *gf_group = &cpi->ppi->gf_group;
   if (gf_group->update_type[cpi->gf_frame_index] != KF_UPDATE &&
       gf_group->update_type[cpi->gf_frame_index] != ARF_UPDATE) {
     return SCALE_NUMERATOR;
@@ -168,7 +168,7 @@
   printf("]\n");
   printf("boost = %d\n",
          (gf_group->update_type[cpi->gf_frame_index] == KF_UPDATE)
-             ? cpi->rc.kf_boost
+             ? cpi->ppi->p_rc.kf_boost
              : cpi->rc.gfu_boost);
   printf("denom = %d\n", denom);
   */
@@ -195,8 +195,8 @@
   // Make sure that superres mode of the frame is consistent with the
   // sequence-level flag.
   assert(IMPLIES(superres_cfg->superres_mode != AOM_SUPERRES_NONE,
-                 cpi->common.seq_params.enable_superres));
-  assert(IMPLIES(!cpi->common.seq_params.enable_superres,
+                 cpi->common.seq_params->enable_superres));
+  assert(IMPLIES(!cpi->common.seq_params->enable_superres,
                  superres_cfg->superres_mode == AOM_SUPERRES_NONE));
   // Make sure that superres mode for current encoding is consistent with user
   // provided superres mode.
@@ -223,8 +223,8 @@
       // Now decide the use of superres based on 'q'.
       int bottom_index, top_index;
       const int q = av1_rc_pick_q_and_bounds(
-          cpi, &cpi->rc, frm_dim_cfg->width, frm_dim_cfg->height,
-          cpi->gf_frame_index, &bottom_index, &top_index);
+          cpi, frm_dim_cfg->width, frm_dim_cfg->height, cpi->gf_frame_index,
+          &bottom_index, &top_index);
 
       const int qthresh = (frame_is_intra_only(&cpi->common))
                               ? superres_cfg->superres_kf_qthresh
@@ -244,8 +244,8 @@
       // Now decide the use of superres based on 'q'.
       int bottom_index, top_index;
       const int q = av1_rc_pick_q_and_bounds(
-          cpi, &cpi->rc, frm_dim_cfg->width, frm_dim_cfg->height,
-          cpi->gf_frame_index, &bottom_index, &top_index);
+          cpi, frm_dim_cfg->width, frm_dim_cfg->height, cpi->gf_frame_index,
+          &bottom_index, &top_index);
 
       const SUPERRES_AUTO_SEARCH_TYPE sr_search_type =
           cpi->sf.hl_sf.superres_auto_search_type;
@@ -346,7 +346,7 @@
   size_params_type rsz = { frm_dim_cfg->width, frm_dim_cfg->height,
                            SCALE_NUMERATOR };
   int resize_denom = SCALE_NUMERATOR;
-  if (has_no_stats_stage(cpi) && cpi->use_svc &&
+  if (has_no_stats_stage(cpi) && cpi->ppi->use_svc &&
       cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1) {
     rsz.resize_width = cpi->common.width;
     rsz.resize_height = cpi->common.height;
diff --git a/av1/encoder/svc_layercontext.c b/av1/encoder/svc_layercontext.c
index 8e3fbf0..4e48218 100644
--- a/av1/encoder/svc_layercontext.c
+++ b/av1/encoder/svc_layercontext.c
@@ -37,31 +37,31 @@
       int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
       LAYER_CONTEXT *const lc = &svc->layer_context[layer];
       RATE_CONTROL *const lrc = &lc->rc;
+      PRIMARY_RATE_CONTROL *const lp_rc = &lc->p_rc;
       lrc->ni_av_qi = oxcf->rc_cfg.worst_allowed_q;
-      lrc->total_actual_bits = 0;
+      lp_rc->total_actual_bits = 0;
       lrc->ni_tot_qi = 0;
-      lrc->tot_q = 0.0;
-      lrc->avg_q = 0.0;
-      lrc->ni_frames = 0;
+      lp_rc->tot_q = 0.0;
+      lp_rc->avg_q = 0.0;
+      lp_rc->ni_frames = 0;
       lrc->decimation_count = 0;
       lrc->decimation_factor = 0;
       lrc->worst_quality = av1_quantizer_to_qindex(lc->max_q);
       lrc->best_quality = av1_quantizer_to_qindex(lc->min_q);
       for (int i = 0; i < RATE_FACTOR_LEVELS; ++i) {
-        lrc->rate_correction_factors[i] = 1.0;
+        lp_rc->rate_correction_factors[i] = 1.0;
       }
       lc->target_bandwidth = lc->layer_target_bitrate;
-      lrc->last_q[INTER_FRAME] = lrc->worst_quality;
-      lrc->avg_frame_qindex[INTER_FRAME] = lrc->worst_quality;
-      lrc->avg_frame_qindex[KEY_FRAME] = lrc->worst_quality;
-      lrc->buffer_level =
+      lp_rc->last_q[INTER_FRAME] = lrc->worst_quality;
+      lp_rc->avg_frame_qindex[INTER_FRAME] = lrc->worst_quality;
+      lp_rc->avg_frame_qindex[KEY_FRAME] = lrc->worst_quality;
+      lp_rc->buffer_level =
           oxcf->rc_cfg.starting_buffer_level_ms * lc->target_bandwidth / 1000;
-      lrc->bits_off_target = lrc->buffer_level;
+      lp_rc->bits_off_target = lp_rc->buffer_level;
       // Initialize the cyclic refresh parameters. If spatial layers are used
       // (i.e., ss_number_layers > 1), these need to be updated per spatial
       // layer. Cyclic refresh is only applied on base temporal layer.
       if (svc->number_spatial_layers > 1 && tl == 0) {
-        size_t last_coded_q_map_size;
         lc->sb_index = 0;
         lc->actual_num_seg1_blocks = 0;
         lc->actual_num_seg2_blocks = 0;
@@ -70,13 +70,6 @@
         CHECK_MEM_ERROR(cm, lc->map,
                         aom_malloc(mi_rows * mi_cols * sizeof(*lc->map)));
         memset(lc->map, 0, mi_rows * mi_cols);
-        last_coded_q_map_size =
-            mi_rows * mi_cols * sizeof(*lc->last_coded_q_map);
-        if (lc->last_coded_q_map) aom_free(lc->last_coded_q_map);
-        CHECK_MEM_ERROR(cm, lc->last_coded_q_map,
-                        aom_malloc(last_coded_q_map_size));
-        assert(MAXQ <= 255);
-        memset(lc->last_coded_q_map, MAXQ, last_coded_q_map_size);
       }
     }
     svc->downsample_filter_type[sl] = BILINEAR;
@@ -85,12 +78,16 @@
   if (svc->number_spatial_layers == 3) {
     svc->downsample_filter_type[0] = EIGHTTAP_SMOOTH;
   }
+  svc->ref_frame_comp[0] = 0;
+  svc->ref_frame_comp[1] = 0;
+  svc->ref_frame_comp[2] = 0;
 }
 
 // Update the layer context from a change_config() call.
 void av1_update_layer_context_change_config(AV1_COMP *const cpi,
                                             const int64_t target_bandwidth) {
   const RATE_CONTROL *const rc = &cpi->rc;
+  const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc;
   SVC *const svc = &cpi->svc;
   int layer = 0;
   int64_t spatial_layer_target = 0;
@@ -107,17 +104,19 @@
       LAYER_CONTEXT *const lc =
           &svc->layer_context[sl * svc->number_temporal_layers + tl];
       RATE_CONTROL *const lrc = &lc->rc;
+      PRIMARY_RATE_CONTROL *const lp_rc = &lc->p_rc;
       lc->spatial_layer_target_bandwidth = spatial_layer_target;
       bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth;
-      lrc->starting_buffer_level =
-          (int64_t)(rc->starting_buffer_level * bitrate_alloc);
-      lrc->optimal_buffer_level =
-          (int64_t)(rc->optimal_buffer_level * bitrate_alloc);
-      lrc->maximum_buffer_size =
-          (int64_t)(rc->maximum_buffer_size * bitrate_alloc);
-      lrc->bits_off_target =
-          AOMMIN(lrc->bits_off_target, lrc->maximum_buffer_size);
-      lrc->buffer_level = AOMMIN(lrc->buffer_level, lrc->maximum_buffer_size);
+      lp_rc->starting_buffer_level =
+          (int64_t)(p_rc->starting_buffer_level * bitrate_alloc);
+      lp_rc->optimal_buffer_level =
+          (int64_t)(p_rc->optimal_buffer_level * bitrate_alloc);
+      lp_rc->maximum_buffer_size =
+          (int64_t)(p_rc->maximum_buffer_size * bitrate_alloc);
+      lp_rc->bits_off_target =
+          AOMMIN(lp_rc->bits_off_target, lp_rc->maximum_buffer_size);
+      lp_rc->buffer_level =
+          AOMMIN(lp_rc->buffer_level, lp_rc->maximum_buffer_size);
       lc->framerate = cpi->framerate / lc->framerate_factor;
       lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate);
       lrc->max_frame_bandwidth = rc->max_frame_bandwidth;
@@ -172,6 +171,7 @@
   const int old_frame_to_key = cpi->rc.frames_to_key;
   // Restore layer rate control.
   cpi->rc = lc->rc;
+  cpi->ppi->p_rc = lc->p_rc;
   cpi->oxcf.rc_cfg.target_bandwidth = lc->target_bandwidth;
   cpi->gf_frame_index = 0;
   cpi->mv_search_params.max_mv_magnitude = lc->max_mv_magnitude;
@@ -187,7 +187,6 @@
       svc->number_spatial_layers > 1 && svc->temporal_layer_id == 0) {
     CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
     swap_ptr(&cr->map, &lc->map);
-    swap_ptr(&cr->last_coded_q_map, &lc->last_coded_q_map);
     cr->sb_index = lc->sb_index;
     cr->actual_num_seg1_blocks = lc->actual_num_seg1_blocks;
     cr->actual_num_seg2_blocks = lc->actual_num_seg2_blocks;
@@ -215,6 +214,7 @@
   const AV1_COMMON *const cm = &cpi->common;
   LAYER_CONTEXT *lc = get_layer_context(cpi);
   lc->rc = cpi->rc;
+  lc->p_rc = cpi->ppi->p_rc;
   lc->target_bandwidth = (int)cpi->oxcf.rc_cfg.target_bandwidth;
   lc->group_index = cpi->gf_frame_index;
   lc->max_mv_magnitude = cpi->mv_search_params.max_mv_magnitude;
@@ -225,11 +225,8 @@
       cpi->svc.number_spatial_layers > 1 && svc->temporal_layer_id == 0) {
     CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
     signed char *temp = lc->map;
-    uint8_t *temp2 = lc->last_coded_q_map;
     lc->map = cr->map;
     cr->map = temp;
-    lc->last_coded_q_map = cr->last_coded_q_map;
-    cr->last_coded_q_map = temp2;
     lc->sb_index = cr->sb_index;
     lc->actual_num_seg1_blocks = cr->actual_num_seg1_blocks;
     lc->actual_num_seg2_blocks = cr->actual_num_seg2_blocks;
@@ -292,7 +289,6 @@
       int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
       LAYER_CONTEXT *const lc = &svc->layer_context[layer];
       if (lc->map) aom_free(lc->map);
-      if (lc->last_coded_q_map) aom_free(lc->last_coded_q_map);
     }
   }
 }
@@ -363,18 +359,17 @@
          svc->number_temporal_layers >= 1 && svc->number_temporal_layers <= 3);
   svc->set_ref_frame_config = 1;
   int superframe_cnt = svc->current_superframe;
-  // Set the referende map buffer idx for the 7 references:
+  // Set the reference map buffer idx for the 7 references:
   // LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2), GOLDEN_FRAME(3),
   // BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6).
   for (i = 0; i < INTER_REFS_PER_FRAME; i++) svc->ref_idx[i] = i;
   for (i = 0; i < INTER_REFS_PER_FRAME; i++) svc->reference[i] = 0;
   for (i = 0; i < REF_FRAMES; i++) svc->refresh[i] = 0;
-  // Always reference LAST, and reference GOLDEN on SL > 0 for non-ksvc.
+  // Always reference LAST, and reference GOLDEN on SL > 0.
+  // For KSVC: GOLDEN reference will be removed on INTER_FRAMES later
+  // when frame_type is set.
   svc->reference[SVC_LAST_FRAME] = 1;
-  if (svc->spatial_layer_id > 0 &&
-      (!svc->ksvc_fixed_mode ||
-       cpi->common.current_frame.frame_type == KEY_FRAME))
-    svc->reference[SVC_GOLDEN_FRAME] = 1;
+  if (svc->spatial_layer_id > 0) svc->reference[SVC_GOLDEN_FRAME] = 1;
   if (svc->temporal_layer_id == 0) {
     // Base temporal layer.
     if (svc->spatial_layer_id == 0) {
@@ -485,3 +480,29 @@
     }
   }
 }
+
+void av1_svc_check_reset_layer_rc_flag(AV1_COMP *const cpi) {
+  SVC *const svc = &cpi->svc;
+  for (int sl = 0; sl < svc->number_spatial_layers; ++sl) {
+    // Check for reset based on avg_frame_bandwidth for spatial layer sl.
+    int layer = LAYER_IDS_TO_IDX(sl, svc->number_temporal_layers - 1,
+                                 svc->number_temporal_layers);
+    LAYER_CONTEXT *lc = &svc->layer_context[layer];
+    RATE_CONTROL *lrc = &lc->rc;
+    if (lrc->avg_frame_bandwidth > (3 * lrc->prev_avg_frame_bandwidth >> 1) ||
+        lrc->avg_frame_bandwidth < (lrc->prev_avg_frame_bandwidth >> 1)) {
+      // Reset for all temporal layers with spatial layer sl.
+      for (int tl = 0; tl < svc->number_temporal_layers; ++tl) {
+        int layer2 = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
+        LAYER_CONTEXT *lc2 = &svc->layer_context[layer2];
+        RATE_CONTROL *lrc2 = &lc2->rc;
+        PRIMARY_RATE_CONTROL *lp_rc2 = &lc2->p_rc;
+        PRIMARY_RATE_CONTROL *const lp_rc = &lc2->p_rc;
+        lrc2->rc_1_frame = 0;
+        lrc2->rc_2_frame = 0;
+        lp_rc2->bits_off_target = lp_rc->optimal_buffer_level;
+        lp_rc2->buffer_level = lp_rc->optimal_buffer_level;
+      }
+    }
+  }
+}
diff --git a/av1/encoder/svc_layercontext.h b/av1/encoder/svc_layercontext.h
index 44e13d6..310d08a 100644
--- a/av1/encoder/svc_layercontext.h
+++ b/av1/encoder/svc_layercontext.h
@@ -26,6 +26,7 @@
 typedef struct {
   /*!\cond */
   RATE_CONTROL rc;
+  PRIMARY_RATE_CONTROL p_rc;
   int framerate_factor;
   int64_t layer_target_bitrate;
   int scaling_factor_num;
@@ -48,11 +49,6 @@
    */
   int8_t *map;
   /*!
-   * Segmentation map for last coded quantization paramters.
-   */
-  uint8_t *last_coded_q_map;
-
-  /*!
    * Number of blocks on segment 1
    */
   int actual_num_seg1_blocks;
@@ -98,6 +94,7 @@
   int non_reference_frame;
   int use_flexible_mode;
   int ksvc_fixed_mode;
+  int ref_frame_comp[3];
   /*!\endcond */
 
   /*!
@@ -108,6 +105,7 @@
   /*!\cond */
   int ref_idx[INTER_REFS_PER_FRAME];
   int refresh[REF_FRAMES];
+  int gld_idx_1layer;
   double base_framerate;
   unsigned int current_superframe;
   unsigned int buffer_time_index[REF_FRAMES];
@@ -276,6 +274,8 @@
 
 void av1_set_svc_fixed_mode(struct AV1_COMP *const cpi);
 
+void av1_svc_check_reset_layer_rc_flag(struct AV1_COMP *const cpi);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/av1/encoder/temporal_filter.c b/av1/encoder/temporal_filter.c
index 8e09880..b229d43 100644
--- a/av1/encoder/temporal_filter.c
+++ b/av1/encoder/temporal_filter.c
@@ -14,9 +14,14 @@
 
 #include "config/aom_config.h"
 
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/odintrin.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+#include "aom_scale/aom_scale.h"
 #include "av1/common/alloccommon.h"
 #include "av1/common/av1_common_int.h"
-#include "av1/common/odintrin.h"
 #include "av1/common/quant_common.h"
 #include "av1/common/reconinter.h"
 #include "av1/encoder/av1_quantize.h"
@@ -30,12 +35,6 @@
 #include "av1/encoder/reconinter_enc.h"
 #include "av1/encoder/segmentation.h"
 #include "av1/encoder/temporal_filter.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_mem/aom_mem.h"
-#include "aom_ports/aom_timer.h"
-#include "aom_ports/mem.h"
-#include "aom_ports/system_state.h"
-#include "aom_scale/aom_scale.h"
 
 /*!\cond */
 
@@ -155,7 +154,7 @@
     best_mv.as_mv.row = GET_MV_SUBPEL(mv_row);
     best_mv.as_mv.col = GET_MV_SUBPEL(mv_col);
     const int mv_offset = mv_row * y_stride + mv_col;
-    error = cpi->fn_ptr[block_size].vf(
+    error = cpi->ppi->fn_ptr[block_size].vf(
         ref_frame->y_buffer + y_offset + mv_offset, y_stride,
         frame_to_filter->y_buffer + y_offset, y_stride, &sse);
     block_mse = DIVIDE_AND_ROUND(error, mb_pels);
@@ -561,9 +560,16 @@
       (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor;
   // Decay factors for non-local mean approach.
   double decay_factor[MAX_MB_PLANE] = { 0 };
-  // Smaller q -> smaller filtering weight.
+  // Adjust filtering based on q.
+  // Larger q -> stronger filtering -> larger weight.
+  // Smaller q -> weaker filtering -> smaller weight.
   double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2);
   q_decay = CLIP(q_decay, 1e-5, 1);
+  if (q_factor >= TF_QINDEX_CUTOFF) {
+    // Max q_factor is 255, therefore the upper bound of q_decay is 8.
+    // We do not need a clip here.
+    q_decay = 0.5 * pow((double)q_factor / 64, 2);
+  }
   // Smaller strength -> smaller filtering weight.
   double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
   s_decay = CLIP(s_decay, 1e-5, 1);
@@ -745,10 +751,11 @@
 }
 
 int av1_get_q(const AV1_COMP *cpi) {
-  const GF_GROUP *gf_group = &cpi->gf_group;
+  const GF_GROUP *gf_group = &cpi->ppi->gf_group;
   const FRAME_TYPE frame_type = gf_group->frame_type[cpi->gf_frame_index];
-  const int q = (int)av1_convert_qindex_to_q(
-      cpi->rc.avg_frame_qindex[frame_type], cpi->common.seq_params.bit_depth);
+  const int q =
+      (int)av1_convert_qindex_to_q(cpi->ppi->p_rc.avg_frame_qindex[frame_type],
+                                   cpi->common.seq_params->bit_depth);
   return q;
 }
 
@@ -818,7 +825,6 @@
 
         // All variants of av1_apply_temporal_filter() contain floating point
         // operations. Hence, clear the system state.
-        aom_clear_system_state();
 
         // TODO(any): avx2/sse2 version should be changed to align with C
         // function before using. In particular, current avx2/sse2 function
@@ -855,23 +861,24 @@
       }
     }
     tf_normalize_filtered_frame(mbd, block_size, mb_row, mb_col, num_planes,
-                                accum, count, &cpi->alt_ref_buffer);
+                                accum, count, tf_ctx->output_frame);
 
     if (check_show_existing) {
       const int y_height = mb_height >> mbd->plane[0].subsampling_y;
       const int y_width = mb_width >> mbd->plane[0].subsampling_x;
       const int source_y_stride = frame_to_filter->y_stride;
-      const int filter_y_stride = cpi->alt_ref_buffer.y_stride;
+      const int filter_y_stride = tf_ctx->output_frame->y_stride;
       const int source_offset =
           mb_row * y_height * source_y_stride + mb_col * y_width;
       const int filter_offset =
           mb_row * y_height * filter_y_stride + mb_col * y_width;
       unsigned int sse = 0;
-      cpi->fn_ptr[block_size].vf(
+      cpi->ppi->fn_ptr[block_size].vf(
           frame_to_filter->y_buffer + source_offset, source_y_stride,
-          cpi->alt_ref_buffer.y_buffer + filter_offset, filter_y_stride, &sse);
+          tf_ctx->output_frame->y_buffer + filter_offset, filter_y_stride,
+          &sse);
       diff->sum += sse;
-      diff->sse += sse * sse;
+      diff->sse += sse * (int64_t)sse;
     }
   }
 }
@@ -939,8 +946,9 @@
   const int lookahead_depth =
       av1_lookahead_depth(cpi->ppi->lookahead, cpi->compressor_stage);
 
-  int arf_src_offset = cpi->gf_group.arf_src_offset[cpi->gf_frame_index];
-  const FRAME_TYPE frame_type = cpi->gf_group.frame_type[cpi->gf_frame_index];
+  int arf_src_offset = cpi->ppi->gf_group.arf_src_offset[cpi->gf_frame_index];
+  const FRAME_TYPE frame_type =
+      cpi->ppi->gf_group.frame_type[cpi->gf_frame_index];
 
   // Temporal filtering should not go beyond key frames
   const int key_to_curframe =
@@ -949,10 +957,10 @@
       AOMMAX(cpi->rc.frames_to_key - arf_src_offset - 1, 0);
 
   // Number of buffered frames before the to-filter frame.
-  const int max_before = AOMMIN(filter_frame_lookahead_idx, key_to_curframe);
+  int max_before = AOMMIN(filter_frame_lookahead_idx, key_to_curframe);
 
   // Number of buffered frames after the to-filter frame.
-  const int max_after =
+  int max_after =
       AOMMIN(lookahead_depth - filter_frame_lookahead_idx - 1, curframe_to_key);
 
   // Estimate noises for each plane.
@@ -964,26 +972,34 @@
   double *noise_levels = tf_ctx->noise_levels;
   for (int plane = 0; plane < num_planes; ++plane) {
     noise_levels[plane] = av1_estimate_noise_from_single_plane(
-        to_filter_frame, plane, cpi->common.seq_params.bit_depth);
+        to_filter_frame, plane, cpi->common.seq_params->bit_depth);
   }
   // Get quantization factor.
   const int q = av1_get_q(cpi);
-  // Get correlation estimates from first-pass
-  RATE_CONTROL *rc = &cpi->rc;
-  const double *coeff = rc->cor_coeff;
-  const int offset = rc->regions_offset;
-  int cur_frame_idx =
-      filter_frame_lookahead_idx + rc->frames_since_key - offset;
-
+  // Get correlation estimates from first-pass;
+  const FIRSTPASS_STATS *stats =
+      cpi->twopass_frame.stats_in - (cpi->rc.frames_since_key == 0);
   double accu_coeff0 = 1.0, accu_coeff1 = 1.0;
   for (int i = 1; i <= max_after; i++) {
-    accu_coeff1 *= coeff[cur_frame_idx + i];
+    if (stats + filter_frame_lookahead_idx + i >=
+        cpi->ppi->twopass.stats_buf_ctx->stats_in_end) {
+      max_after = i - 1;
+      break;
+    }
+    accu_coeff1 *=
+        AOMMAX(stats[filter_frame_lookahead_idx + i].cor_coeff, 0.001);
   }
   if (max_after >= 1) {
     accu_coeff1 = pow(accu_coeff1, 1.0 / (double)max_after);
   }
   for (int i = 1; i <= max_before; i++) {
-    accu_coeff0 *= coeff[cur_frame_idx - i + 1];
+    if (stats + filter_frame_lookahead_idx - i + 1 <=
+        cpi->ppi->twopass.stats_buf_ctx->stats_in_start) {
+      max_before = i - 1;
+      break;
+    }
+    accu_coeff0 *=
+        AOMMAX(stats[filter_frame_lookahead_idx - i + 1].cor_coeff, 0.001);
   }
   if (max_before >= 1) {
     accu_coeff0 = pow(accu_coeff0, 1.0 / (double)max_before);
@@ -996,19 +1012,16 @@
   int adjust_num = 6;
   if (num_frames == 1) {  // `arnr_max_frames = 1` is used to disable filtering.
     adjust_num = 0;
-  } else if ((update_type == KF_UPDATE || is_forward_keyframe) && q <= 10) {
+  } else if ((update_type == KF_UPDATE) && q <= 10) {
     adjust_num = 0;
   }
   num_frames = AOMMIN(num_frames + adjust_num, lookahead_depth);
 
-  if (frame_type == KEY_FRAME && !is_forward_keyframe) {
-    num_before = 0;
+  if (frame_type == KEY_FRAME) {
+    num_before = is_forward_keyframe ? num_frames / 2 : 0;
     num_after = AOMMIN(num_frames - 1, max_after);
-  } else if (is_forward_keyframe) {  // Key frame in one-pass mode.
-    num_before = AOMMIN(num_frames - 1, max_before);
-    num_after = 0;
   } else {
-    num_frames = AOMMIN(num_frames, cpi->rc.gfu_boost / 150);
+    num_frames = AOMMIN(num_frames, cpi->ppi->p_rc.gfu_boost / 150);
     num_frames += !(num_frames & 1);  // Make the number odd.
     // Only use 2 neighbours for the second ARF.
     if (is_second_arf) num_frames = AOMMIN(num_frames, 3);
@@ -1051,10 +1064,10 @@
   assert(frames[tf_ctx->filter_frame_idx] == to_filter_frame);
 
   av1_setup_src_planes(&cpi->td.mb, &to_filter_buf->img, 0, 0, num_planes,
-                       cpi->common.seq_params.sb_size);
+                       cpi->common.seq_params->sb_size);
   av1_setup_block_planes(&cpi->td.mb.e_mbd,
-                         cpi->common.seq_params.subsampling_x,
-                         cpi->common.seq_params.subsampling_y, num_planes);
+                         cpi->common.seq_params->subsampling_x,
+                         cpi->common.seq_params->subsampling_y, num_planes);
 }
 
 /*!\cond */
@@ -1117,12 +1130,14 @@
 //   Nothing will be returned. But the contents of cpi->tf_ctx will be modified.
 static void init_tf_ctx(AV1_COMP *cpi, int filter_frame_lookahead_idx,
                         int is_second_arf, FRAME_UPDATE_TYPE update_type,
-                        int is_forward_keyframe) {
+                        int is_forward_keyframe,
+                        YV12_BUFFER_CONFIG *output_frame) {
   TemporalFilterCtx *tf_ctx = &cpi->tf_ctx;
   // Setup frame buffer for filtering.
   YV12_BUFFER_CONFIG **frames = tf_ctx->frames;
   tf_ctx->num_frames = 0;
   tf_ctx->filter_frame_idx = -1;
+  tf_ctx->output_frame = output_frame;
   tf_setup_filtering_buffer(cpi, filter_frame_lookahead_idx, is_second_arf,
                             update_type, is_forward_keyframe);
   assert(tf_ctx->num_frames > 0);
@@ -1171,18 +1186,20 @@
 
 int av1_temporal_filter(AV1_COMP *cpi, const int filter_frame_lookahead_idx,
                         FRAME_UPDATE_TYPE update_type, int is_forward_keyframe,
-                        int *show_existing_arf) {
+                        int *show_existing_arf,
+                        YV12_BUFFER_CONFIG *output_frame) {
   MultiThreadInfo *const mt_info = &cpi->mt_info;
   // Basic informaton of the current frame.
-  const GF_GROUP *const gf_group = &cpi->gf_group;
+  const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
   const uint8_t group_idx = cpi->gf_frame_index;
   TemporalFilterCtx *tf_ctx = &cpi->tf_ctx;
   TemporalFilterData *tf_data = &cpi->td.tf_data;
   // Filter one more ARF if the lookahead index is leq 7 (w.r.t. 9-th frame).
   // This frame is ALWAYS a show existing frame.
   const int is_second_arf =
-      (update_type == INTNL_ARF_UPDATE) && (filter_frame_lookahead_idx >= 7) &&
-      (is_forward_keyframe == 0) && cpi->sf.hl_sf.second_alt_ref_filtering;
+      (update_type == INTNL_ARF_UPDATE) &&
+      (filter_frame_lookahead_idx >= TF_LOOKAHEAD_IDX_THR) &&
+      cpi->sf.hl_sf.second_alt_ref_filtering;
   // TODO(anyone): Currently, we enforce the filtering strength on internal
   // ARFs except the second ARF to be zero. We should investigate in which case
   // it is more beneficial to use non-zero strength filtering.
@@ -1190,9 +1207,14 @@
     return 0;
   }
 
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  // Only parallel level 0 frames go through temporal filtering.
+  assert(gf_group->frame_parallel_level[group_idx] == 0);
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+
   // Initialize temporal filter context structure.
   init_tf_ctx(cpi, filter_frame_lookahead_idx, is_second_arf, update_type,
-              is_forward_keyframe);
+              is_forward_keyframe, output_frame);
 
   // Set showable frame.
   if (is_forward_keyframe == 0 && update_type != KF_UPDATE) {
@@ -1229,16 +1251,15 @@
     const float mean = (float)diff->sum / num_mbs;
     const float std = (float)sqrt((float)diff->sse / num_mbs - mean * mean);
 
-    aom_clear_system_state();
     // TODO(yunqing): This can be combined with TPL q calculation later.
     cpi->rc.base_frame_target = gf_group->bit_allocation[group_idx];
     av1_set_target_rate(cpi, cpi->common.width, cpi->common.height);
     int top_index = 0;
     int bottom_index = 0;
     const int q = av1_rc_pick_q_and_bounds(
-        cpi, &cpi->rc, cpi->oxcf.frm_dim_cfg.width,
-        cpi->oxcf.frm_dim_cfg.height, group_idx, &bottom_index, &top_index);
-    const int ac_q = av1_ac_quant_QTX(q, 0, cpi->common.seq_params.bit_depth);
+        cpi, cpi->oxcf.frm_dim_cfg.width, cpi->oxcf.frm_dim_cfg.height,
+        group_idx, &bottom_index, &top_index);
+    const int ac_q = av1_ac_quant_QTX(q, 0, cpi->common.seq_params->bit_depth);
     const float threshold = 0.7f * ac_q * ac_q;
 
     if (!is_second_arf) {
diff --git a/av1/encoder/temporal_filter.h b/av1/encoder/temporal_filter.h
index 2ae7dd4..bc9ff5c 100644
--- a/av1/encoder/temporal_filter.h
+++ b/av1/encoder/temporal_filter.h
@@ -64,6 +64,14 @@
 //    then the actual threshold will be 720 * 0.1 = 72. Similarly, the threshold
 //    for 360p videos will be 360 * 0.1 = 36.
 #define TF_SEARCH_DISTANCE_THRESHOLD 0.1
+// 6. Threshold to identify if the q is in a relative high range.
+//    Above this cutoff q, a stronger filtering is applied.
+//    For a high q, the quantization throws away more information, and thus a
+//    stronger filtering is less likely to distort the encoded quality, while a
+//    stronger filtering could reduce bit rates.
+//    For a low q, more details are expected to be retained. Filtering is thus
+//    more conservative.
+#define TF_QINDEX_CUTOFF 128
 
 #define NOISE_ESTIMATION_EDGE_THRESHOLD 50
 
@@ -81,6 +89,12 @@
    * Number of frames in the frame buffer.
    */
   int num_frames;
+
+  /*!
+   * Output filtered frame
+   */
+  YV12_BUFFER_CONFIG *output_frame;
+
   /*!
    * Index of the frame to be filtered.
    */
@@ -199,21 +213,22 @@
  * -15 means to filter the 17-th frame, which is a key frame in one-pass mode.
  *
  * \ingroup src_frame_proc
- * \param[in]   cpi                        Top level encoder instance structure
- * \param[in]   filter_frame_lookahead_idx The index of the to-filter frame in
- *                                         the lookahead buffer cpi->lookahead.
- * \param[in]   update_type                This frame's update type.
- * \param[in]   is_forward_keyframe        Indicate whether this is a forward
- *                                         keyframe.
- * \param[in,out]   show_existing_arf      Whether to show existing ARF. This
- *                                         field is updated in this function.
+ * \param[in]      cpi                        Top level encoder instance structure.
+ * \param[in]      filter_frame_lookahead_idx Index of the to-filter frame in
+ *                                            the lookahead buffer cpi->lookahead.
+ * \param[in]      update_type                This frame's update type.
+ * \param[in]      is_forward_keyframe        Whether this is a forward keyframe.
+ * \param[in,out]  show_existing_arf          Whether to show existing ARF. This
+ *                                            field is updated in this function.
+ * \param[out]     output_frame               Output filtered frame.
  *
  * \return Whether temporal filtering is successfully done.
  */
 int av1_temporal_filter(struct AV1_COMP *cpi,
                         const int filter_frame_lookahead_idx,
                         FRAME_UPDATE_TYPE update_type, int is_forward_keyframe,
-                        int *show_existing_arf);
+                        int *show_existing_arf,
+                        YV12_BUFFER_CONFIG *output_frame);
 
 /*!\cond */
 // Helper function to get `q` used for encoding.
@@ -276,11 +291,6 @@
   aom_free(tf_data->pred);
 }
 
-// Helper function to compute number of blocks on either side of the frame.
-static INLINE int get_num_blocks(const int frame_length, const int mb_length) {
-  return (frame_length + mb_length - 1) / mb_length;
-}
-
 // Saves the state prior to temporal filter process.
 // Inputs:
 //   mbd: Pointer to the block for filtering.
diff --git a/av1/encoder/thirdpass.c b/av1/encoder/thirdpass.c
new file mode 100644
index 0000000..1647758
--- /dev/null
+++ b/av1/encoder/thirdpass.c
@@ -0,0 +1,302 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom/aom_codec.h"
+#include "aom/aomdx.h"
+#include "aom_mem/aom_mem.h"
+#include "av1/av1_iface_common.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/thirdpass.h"
+#include "av1/common/blockd.h"
+
+#if CONFIG_THREE_PASS
+#include "common/ivfdec.h"
+#endif
+
+#if CONFIG_THREE_PASS
+static void setup_two_pass_stream_input(
+    struct AvxInputContext **input_ctx_ptr, const char *input_file_name,
+    struct aom_internal_error_info *err_info) {
+  FILE *infile;
+  infile = fopen(input_file_name, "rb");
+  if (!infile) {
+    aom_internal_error(err_info, AOM_CODEC_INVALID_PARAM,
+                       "Failed to open input file '%s'.", input_file_name);
+  }
+  struct AvxInputContext *aom_input_ctx = aom_malloc(sizeof(*aom_input_ctx));
+  if (!aom_input_ctx) {
+    fclose(infile);
+    aom_internal_error(err_info, AOM_CODEC_MEM_ERROR,
+                       "Failed to allocate memory for third-pass context.");
+  }
+  memset(aom_input_ctx, 0, sizeof(*aom_input_ctx));
+  aom_input_ctx->filename = input_file_name;
+  aom_input_ctx->file = infile;
+
+  if (file_is_ivf(aom_input_ctx)) {
+    aom_input_ctx->file_type = FILE_TYPE_IVF;
+  } else {
+    fclose(infile);
+    aom_free(aom_input_ctx);
+    aom_internal_error(err_info, AOM_CODEC_INVALID_PARAM,
+                       "Unrecognized input file type.");
+  }
+  *input_ctx_ptr = aom_input_ctx;
+}
+
+static void init_third_pass(THIRD_PASS_DEC_CTX *ctx) {
+  if (!ctx->input_ctx) {
+    if (ctx->input_file_name == NULL) {
+      aom_internal_error(ctx->err_info, AOM_CODEC_INVALID_PARAM,
+                         "No third pass input specified.");
+    }
+    setup_two_pass_stream_input(&ctx->input_ctx, ctx->input_file_name,
+                                ctx->err_info);
+  }
+
+#if CONFIG_AV1_DECODER
+  if (!ctx->decoder.iface) {
+    aom_codec_iface_t *decoder_iface = &aom_codec_av1_inspect_algo;
+    if (aom_codec_dec_init(&ctx->decoder, decoder_iface, NULL, 0)) {
+      aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+                         "Failed to initialize decoder.");
+    }
+  }
+#else
+  aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+                     "To utilize three-pass encoding, libaom must be built "
+                     "with CONFIG_AV1_DECODER=1.");
+#endif
+}
+#endif  // CONFIG_THREE_PASS
+
+// Return 0: success
+//        1: cannot read because this is end of file
+//       -1: failure to read the frame
+static int read_frame(THIRD_PASS_DEC_CTX *ctx) {
+#if CONFIG_THREE_PASS
+  if (!ctx->input_ctx || !ctx->decoder.iface) {
+    init_third_pass(ctx);
+  }
+  if (!ctx->have_frame) {
+    if (ivf_read_frame(ctx->input_ctx->file, &ctx->buf, &ctx->bytes_in_buffer,
+                       &ctx->buffer_size, NULL) != 0) {
+      if (feof(ctx->input_ctx->file)) {
+        return 1;
+      } else {
+        return -1;
+      }
+    }
+    ctx->frame = ctx->buf;
+    ctx->end_frame = ctx->frame + ctx->bytes_in_buffer;
+    ctx->have_frame = 1;
+  }
+#else
+  aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+                     "Cannot parse bitstream without CONFIG_THREE_PASS.");
+#endif
+  Av1DecodeReturn adr;
+  if (aom_codec_decode(&ctx->decoder, ctx->frame,
+                       (unsigned int)ctx->bytes_in_buffer,
+                       &adr) != AOM_CODEC_OK) {
+    aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+                       "Failed to decode frame for third pass.");
+  }
+  ctx->frame = adr.buf;
+  ctx->bytes_in_buffer = ctx->end_frame - ctx->frame;
+  if (ctx->frame == ctx->end_frame) ctx->have_frame = 0;
+  return 0;
+}
+
+// This function gets the information needed from the recently decoded frame,
+// via various decoder APIs, and saves the info into ctx->frame_info.
+// Return 0: success
+//        1: cannot read because this is end of file
+//       -1: failure to read the frame
+static int get_frame_info(THIRD_PASS_DEC_CTX *ctx) {
+  int ret = read_frame(ctx);
+  if (ret != 0) return ret;
+  int cur = ctx->frame_info_count;
+  if (cur >= MAX_THIRD_PASS_BUF) {
+    aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+                       "Third pass frame info ran out of available slots.");
+  }
+  int frame_type_flags = 0;
+  if (aom_codec_control(&ctx->decoder, AOMD_GET_FRAME_FLAGS,
+                        &frame_type_flags) != AOM_CODEC_OK) {
+    aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+                       "Failed to read frame flags.");
+  }
+  if (frame_type_flags & AOM_FRAME_IS_KEY) {
+    ctx->frame_info[cur].frame_type = KEY_FRAME;
+  } else if (frame_type_flags & AOM_FRAME_IS_INTRAONLY) {
+    ctx->frame_info[cur].frame_type = INTRA_ONLY_FRAME;
+  } else if (frame_type_flags & AOM_FRAME_IS_SWITCH) {
+    ctx->frame_info[cur].frame_type = S_FRAME;
+  } else {
+    ctx->frame_info[cur].frame_type = INTER_FRAME;
+  }
+
+  // Get frame base q idx
+  if (aom_codec_control(&ctx->decoder, AOMD_GET_BASE_Q_IDX,
+                        &ctx->frame_info[cur].base_q_idx) != AOM_CODEC_OK) {
+    aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+                       "Failed to read base q index.");
+  }
+
+  // Get show existing frame flag
+  if (aom_codec_control(&ctx->decoder, AOMD_GET_SHOW_EXISTING_FRAME_FLAG,
+                        &ctx->frame_info[cur].is_show_existing_frame) !=
+      AOM_CODEC_OK) {
+    aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+                       "Failed to read show existing frame flag.");
+  }
+
+  // Get show frame flag
+  if (aom_codec_control(&ctx->decoder, AOMD_GET_SHOW_FRAME_FLAG,
+                        &ctx->frame_info[cur].is_show_frame) != AOM_CODEC_OK) {
+    aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+                       "Failed to read show frame flag.");
+  }
+
+  // Get order hint
+  if (aom_codec_control(&ctx->decoder, AOMD_GET_ORDER_HINT,
+                        &ctx->frame_info[cur].order_hint) != AOM_CODEC_OK) {
+    aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+                       "Failed to read order hint.");
+  }
+  ctx->frame_info_count++;
+  return 0;
+}
+
+// Parse the frames in the gop and determine the last frame of the current GOP.
+// Decode more frames if necessary. The variable max_num is the maximum static
+// GOP length if we detect an IPPP structure, and it is expected that max_num >=
+// MAX_GF_INTERVAL.
+static void get_current_gop_end(THIRD_PASS_DEC_CTX *ctx, int max_num,
+                                int *last_idx) {
+  assert(max_num >= MAX_GF_INTERVAL);
+  *last_idx = 0;
+  int cur_idx = 0;
+  int arf_order_hint = -1;
+  int num_show_frames = 0;
+  while (num_show_frames < max_num) {
+    assert(cur_idx < MAX_THIRD_PASS_BUF);
+    // Read in from bitstream if needed.
+    if (cur_idx >= ctx->frame_info_count) {
+      int ret = get_frame_info(ctx);
+      if (ret == 1) {
+        // At the end of the file, GOP ends in the prev frame.
+        if (arf_order_hint >= 0) {
+          aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+                             "Failed to derive GOP length.");
+        }
+        *last_idx = cur_idx - 1;
+        return;
+      }
+      if (ret < 0) {
+        aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+                           "Failed to read frame for third pass.");
+      }
+    }
+
+    // TODO(bohanli): verify that fwd_kf works here.
+    if (ctx->frame_info[cur_idx].frame_type == KEY_FRAME &&
+        ctx->frame_info[cur_idx].is_show_frame) {
+      if (cur_idx != 0) {
+        // If this is a key frame and is not the first kf in this kf group, we
+        // have reached the next key frame. Stop here.
+        *last_idx = cur_idx - 1;
+        return;
+      }
+    } else if (!ctx->frame_info[cur_idx].is_show_frame &&
+               arf_order_hint == -1) {
+      // If this is an arf (the first no show)
+      if (num_show_frames <= 1) {
+        // This is an arf and we should end the GOP with its overlay.
+        arf_order_hint = ctx->frame_info[cur_idx].order_hint;
+      } else {
+        // There are multiple show frames before this arf, so we treat the
+        // frames previous to this arf as a GOP.
+        *last_idx = cur_idx - 1;
+        return;
+      }
+    } else if (arf_order_hint >= 0 && ctx->frame_info[cur_idx].order_hint ==
+                                          (unsigned int)arf_order_hint) {
+      // If this is the overlay/show existing of the arf
+      assert(ctx->frame_info[cur_idx].is_show_frame);
+      *last_idx = cur_idx;
+      return;
+    } else {
+      // This frame is part of the GOP.
+      if (ctx->frame_info[cur_idx].is_show_frame) num_show_frames++;
+    }
+    cur_idx++;
+  }
+  // This is a long IPPP GOP and we will use a length of max_num here.
+  assert(arf_order_hint < 0);
+  *last_idx = max_num - 1;
+  return;
+}
+
+void av1_set_gop_third_pass(THIRD_PASS_DEC_CTX *ctx, GF_GROUP *gf_group,
+                            int order_hint_bits, int *gf_len) {
+  // Read in future frames and find the last frame in the current GOP.
+  int last_idx;
+  get_current_gop_end(ctx, MAX_GF_INTERVAL, &last_idx);
+
+  // Determine the GOP length.
+  // TODO(bohanli): Define and set the GOP structure here. Then we also
+  // don't need to store prev_gop_end here.
+  (void)gf_group;
+  if (last_idx < 0) {
+    aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+                       "Failed to derive GOP length.");
+  }
+  *gf_len = ctx->frame_info[last_idx].order_hint - ctx->prev_gop_end;
+  *gf_len = (*gf_len + (1 << order_hint_bits)) % (1 << order_hint_bits);
+
+  ctx->prev_gop_end = ctx->frame_info[last_idx].order_hint;
+}
+
+void av1_pop_third_pass_info(THIRD_PASS_DEC_CTX *ctx) {
+  if (ctx->frame_info_count == 0) {
+    aom_internal_error(ctx->err_info, AOM_CODEC_ERROR,
+                       "No available frame info for third pass.");
+  }
+  ctx->frame_info_count--;
+  for (int i = 0; i < ctx->frame_info_count; i++) {
+    ctx->frame_info[i] = ctx->frame_info[i + 1];
+  }
+}
+
+void av1_init_thirdpass_ctx(AV1_COMMON *cm, THIRD_PASS_DEC_CTX **ctx,
+                            const char *file) {
+  av1_free_thirdpass_ctx(*ctx);
+  CHECK_MEM_ERROR(cm, *ctx, aom_calloc(1, sizeof(**ctx)));
+  THIRD_PASS_DEC_CTX *ctx_ptr = *ctx;
+  ctx_ptr->input_file_name = file;
+  ctx_ptr->prev_gop_end = -1;
+  ctx_ptr->err_info = cm->error;
+}
+
+void av1_free_thirdpass_ctx(THIRD_PASS_DEC_CTX *ctx) {
+  if (ctx == NULL) return;
+  if (ctx->decoder.iface) {
+    aom_codec_destroy(&ctx->decoder);
+  }
+#if CONFIG_THREE_PASS
+  if (ctx->input_ctx && ctx->input_ctx->file) fclose(ctx->input_ctx->file);
+  aom_free(ctx->input_ctx);
+#endif
+  if (ctx->buf) free(ctx->buf);
+  aom_free(ctx);
+}
diff --git a/av1/encoder/thirdpass.h b/av1/encoder/thirdpass.h
new file mode 100644
index 0000000..4c80585
--- /dev/null
+++ b/av1/encoder/thirdpass.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_THIRDPASS_H_
+#define AOM_AV1_ENCODER_THIRDPASS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/ratectrl.h"
+
+// TODO(bohanli): optimize this number
+#define MAX_THIRD_PASS_BUF (2 * MAX_GF_INTERVAL + 1)
+
+// Struct to store useful information about a frame for the third pass.
+// The members are extracted from the decoder by function get_frame_info.
+typedef struct {
+  int base_q_idx;
+  int is_show_existing_frame;
+  int is_show_frame;
+  FRAME_TYPE frame_type;
+  unsigned int order_hint;
+} THIRD_PASS_FRAME_INFO;
+
+typedef struct {
+  /* --- Input and decoding related members --- */
+  // the input file
+  const char *input_file_name;
+#if CONFIG_THREE_PASS
+  // input context
+  struct AvxInputContext *input_ctx;
+#endif
+  // decoder codec context
+  aom_codec_ctx_t decoder;
+  // start of the frame in buf
+  const unsigned char *frame;
+  // end of the frame(s) in buf
+  const unsigned char *end_frame;
+  // whether we still have following frames in buf
+  int have_frame;
+  // pointer to buffer for the read frames
+  uint8_t *buf;
+  // size of data in buffer
+  size_t bytes_in_buffer;
+  // current buffer size
+  size_t buffer_size;
+  // error info pointer
+  struct aom_internal_error_info *err_info;
+
+  /* --- Members for third pass encoding --- */
+  // Array to store info about each frame.
+  // frame_info[0] should point to the current frame.
+  THIRD_PASS_FRAME_INFO frame_info[MAX_THIRD_PASS_BUF];
+  // number of frames available in frame_info
+  int frame_info_count;
+  // the end of the previous GOP (order hint)
+  int prev_gop_end;
+} THIRD_PASS_DEC_CTX;
+
+void av1_init_thirdpass_ctx(AV1_COMMON *cm, THIRD_PASS_DEC_CTX **ctx,
+                            const char *file);
+void av1_free_thirdpass_ctx(THIRD_PASS_DEC_CTX *ctx);
+
+// Set the GOP structure from the twopass bitstream.
+// TODO(bohanli): this is currently a skeleton and we only return the gop
+// length. This function also saves all frame information in the array
+// ctx->frame_info for this GOP.
+void av1_set_gop_third_pass(THIRD_PASS_DEC_CTX *ctx, GF_GROUP *gf_group,
+                            int order_hint_bits, int *gf_len);
+
+// Pop one frame out of the array ctx->frame_info. This function is used to make
+// sure that frame_info[0] always corresponds to the current frame.
+void av1_pop_third_pass_info(THIRD_PASS_DEC_CTX *ctx);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // AOM_AV1_ENCODER_THIRDPASS_H_
diff --git a/av1/encoder/tokenize.h b/av1/encoder/tokenize.h
index 51eb28c..f31dc96 100644
--- a/av1/encoder/tokenize.h
+++ b/av1/encoder/tokenize.h
@@ -119,8 +119,8 @@
 // Allocate memory for token related info.
 static AOM_INLINE void alloc_token_info(AV1_COMMON *cm, TokenInfo *token_info) {
   int mi_rows_aligned_to_sb =
-      ALIGN_POWER_OF_TWO(cm->mi_params.mi_rows, cm->seq_params.mib_size_log2);
-  int sb_rows = mi_rows_aligned_to_sb >> cm->seq_params.mib_size_log2;
+      ALIGN_POWER_OF_TWO(cm->mi_params.mi_rows, cm->seq_params->mib_size_log2);
+  int sb_rows = mi_rows_aligned_to_sb >> cm->seq_params->mib_size_log2;
   const int num_planes = av1_num_planes(cm);
   unsigned int tokens =
       get_token_alloc(cm->mi_params.mb_rows, cm->mi_params.mb_cols,
diff --git a/av1/encoder/tpl_model.c b/av1/encoder/tpl_model.c
index 2cf5a18..468dfe6 100644
--- a/av1/encoder/tpl_model.c
+++ b/av1/encoder/tpl_model.c
@@ -17,7 +17,6 @@
 #include "config/aom_scale_rtcd.h"
 
 #include "aom/aom_codec.h"
-#include "aom_ports/system_state.h"
 
 #include "av1/common/av1_common_int.h"
 #include "av1/common/enums.h"
@@ -35,39 +34,48 @@
 #include "av1/encoder/reconinter_enc.h"
 #include "av1/encoder/tpl_model.h"
 
-static AOM_INLINE void tpl_stats_record_txfm_block(TplTxfmStats *tpl_txfm_stats,
-                                                   const tran_low_t *coeff,
-                                                   int coeff_num) {
-  aom_clear_system_state();
+static INLINE double exp_bounded(double v) {
+  // When v > 700 or <-700, the exp function will be close to overflow
+  // For details, see the "Notes" in the following link.
+  // https://en.cppreference.com/w/c/numeric/math/exp
+  if (v > 700) {
+    return DBL_MAX;
+  } else if (v < -700) {
+    return 0;
+  }
+  return exp(v);
+}
+
+void av1_init_tpl_txfm_stats(TplTxfmStats *tpl_txfm_stats) {
+  tpl_txfm_stats->coeff_num = 256;
+  tpl_txfm_stats->txfm_block_count = 0;
+  memset(tpl_txfm_stats->abs_coeff_sum, 0,
+         sizeof(tpl_txfm_stats->abs_coeff_sum[0]) * tpl_txfm_stats->coeff_num);
+}
+
+void av1_accumulate_tpl_txfm_stats(const TplTxfmStats *sub_stats,
+                                   TplTxfmStats *accumulated_stats) {
+  accumulated_stats->txfm_block_count += sub_stats->txfm_block_count;
+  for (int i = 0; i < accumulated_stats->coeff_num; ++i) {
+    accumulated_stats->abs_coeff_sum[i] += sub_stats->abs_coeff_sum[i];
+  }
+}
+
+void av1_record_tpl_txfm_block(TplTxfmStats *tpl_txfm_stats,
+                               const tran_low_t *coeff) {
   // For transform larger than 16x16, the scale of coeff need to be adjusted.
   // It's not LOSSLESS_Q_STEP.
-  assert(coeff_num <= 256);
-  for (int i = 0; i < coeff_num; ++i) {
+  assert(tpl_txfm_stats->coeff_num <= 256);
+  for (int i = 0; i < tpl_txfm_stats->coeff_num; ++i) {
     tpl_txfm_stats->abs_coeff_sum[i] += abs(coeff[i]) / (double)LOSSLESS_Q_STEP;
   }
   ++tpl_txfm_stats->txfm_block_count;
 }
 
-static AOM_INLINE void tpl_stats_update_abs_coeff_mean(
-    TplParams *tpl_data, TplTxfmStats *tpl_txfm_stats) {
-  aom_clear_system_state();
-  TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_data->frame_idx];
-  tpl_frame->txfm_block_count = tpl_txfm_stats->txfm_block_count;
-  for (int i = 0; i < tpl_frame->coeff_num; ++i) {
-    tpl_frame->abs_coeff_sum[i] = tpl_txfm_stats->abs_coeff_sum[i];
-    tpl_frame->abs_coeff_mean[i] =
-        tpl_frame->abs_coeff_sum[i] / tpl_txfm_stats->txfm_block_count;
-  }
-}
-
-void av1_tpl_stats_init_txfm_stats(TplDepFrame *tpl_frame, int tpl_bsize_1d) {
-  aom_clear_system_state();
-  tpl_frame->txfm_block_count = 0;
-  tpl_frame->coeff_num = tpl_bsize_1d * tpl_bsize_1d;
-  memset(tpl_frame->abs_coeff_sum, 0, sizeof(tpl_frame->abs_coeff_sum));
-  assert(sizeof(tpl_frame->abs_coeff_mean) /
-             sizeof(tpl_frame->abs_coeff_mean[0]) ==
-         tpl_frame->coeff_num);
+static AOM_INLINE void av1_tpl_store_txfm_stats(
+    TplParams *tpl_data, const TplTxfmStats *tpl_txfm_stats,
+    const int frame_index) {
+  tpl_data->txfm_stats_list[frame_index] = *tpl_txfm_stats;
 }
 
 static AOM_INLINE void get_quantize_error(const MACROBLOCK *x, int plane,
@@ -119,9 +127,11 @@
   assert(*tpl_bsize_1d >= 16);
 }
 
-void av1_setup_tpl_buffers(AV1_COMMON *const cm, TplParams *const tpl_data,
-                           int lag_in_frames) {
-  CommonModeInfoParams *const mi_params = &cm->mi_params;
+void av1_setup_tpl_buffers(AV1_PRIMARY *const ppi,
+                           CommonModeInfoParams *const mi_params, int width,
+                           int height, int byte_alignment, int lag_in_frames) {
+  SequenceHeader *const seq_params = &ppi->seq_params;
+  TplParams *const tpl_data = &ppi->tpl_data;
   set_tpl_stats_block_size(&tpl_data->tpl_stats_block_mis_log2,
                            &tpl_data->tpl_bsize_1d);
   const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2;
@@ -140,7 +150,6 @@
     tpl_frame->stride = tpl_data->tpl_stats_buffer[frame].width;
     tpl_frame->mi_rows = mi_params->mi_rows;
     tpl_frame->mi_cols = mi_params->mi_cols;
-    av1_tpl_stats_init_txfm_stats(tpl_frame, tpl_data->tpl_bsize_1d);
   }
   tpl_data->tpl_frame = &tpl_data->tpl_stats_buffer[REF_FRAMES + 1];
 
@@ -148,50 +157,34 @@
   // stats buffers are not allocated.
   if (lag_in_frames <= 1) return;
 
-  // TODO(aomedia:2873): Explore the allocation of tpl buffers based on
-  // lag_in_frames.
-  for (int frame = 0; frame < MAX_LAG_BUFFERS; ++frame) {
-    CHECK_MEM_ERROR(
-        cm, tpl_data->tpl_stats_pool[frame],
+  for (int frame = 0; frame < lag_in_frames; ++frame) {
+    AOM_CHECK_MEM_ERROR(
+        &ppi->error, tpl_data->tpl_stats_pool[frame],
         aom_calloc(tpl_data->tpl_stats_buffer[frame].width *
                        tpl_data->tpl_stats_buffer[frame].height,
                    sizeof(*tpl_data->tpl_stats_buffer[frame].tpl_stats_ptr)));
-    if (aom_alloc_frame_buffer(
-            &tpl_data->tpl_rec_pool[frame], cm->width, cm->height,
-            cm->seq_params.subsampling_x, cm->seq_params.subsampling_y,
-            cm->seq_params.use_highbitdepth, tpl_data->border_in_pixels,
-            cm->features.byte_alignment))
-      aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+
+    if (aom_alloc_frame_buffer(&tpl_data->tpl_rec_pool[frame], width, height,
+                               seq_params->subsampling_x,
+                               seq_params->subsampling_y,
+                               seq_params->use_highbitdepth,
+                               tpl_data->border_in_pixels, byte_alignment))
+      aom_internal_error(&ppi->error, AOM_CODEC_MEM_ERROR,
                          "Failed to allocate frame buffer");
   }
 }
 
-static AOM_INLINE void tpl_fwd_txfm(const int16_t *src_diff, int bw,
-                                    tran_low_t *coeff, TX_SIZE tx_size,
-                                    int bit_depth, int is_hbd) {
-  TxfmParam txfm_param;
-  txfm_param.tx_type = DCT_DCT;
-  txfm_param.tx_size = tx_size;
-  txfm_param.lossless = 0;
-  txfm_param.tx_set_type = EXT_TX_SET_ALL16;
-
-  txfm_param.bd = bit_depth;
-  txfm_param.is_hbd = is_hbd;
-  av1_fwd_txfm(src_diff, coeff, bw, &txfm_param);
-}
-
-static AOM_INLINE int64_t tpl_get_satd_cost(const MACROBLOCK *x,
+static AOM_INLINE int64_t tpl_get_satd_cost(BitDepthInfo bd_info,
                                             int16_t *src_diff, int diff_stride,
                                             const uint8_t *src, int src_stride,
                                             const uint8_t *dst, int dst_stride,
                                             tran_low_t *coeff, int bw, int bh,
                                             TX_SIZE tx_size) {
-  const MACROBLOCKD *xd = &x->e_mbd;
   const int pix_num = bw * bh;
 
-  av1_subtract_block(xd, bh, bw, src_diff, diff_stride, src, src_stride, dst,
-                     dst_stride);
-  tpl_fwd_txfm(src_diff, bw, coeff, tx_size, xd->bd, is_cur_buf_hbd(xd));
+  av1_subtract_block(bd_info, bh, bw, src_diff, diff_stride, src, src_stride,
+                     dst, dst_stride);
+  av1_quick_txfm(/*use_hadamard=*/0, tx_size, bd_info, src_diff, bw, coeff);
   return aom_satd(coeff, pix_num);
 }
 
@@ -199,7 +192,6 @@
   const SCAN_ORDER *const scan_order = &av1_scan_orders[tx_size][DCT_DCT];
 
   assert((1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]) >= eob);
-  aom_clear_system_state();
   int rate_cost = 1;
 
   for (int idx = 0; idx < eob; ++idx) {
@@ -216,11 +208,11 @@
     tran_low_t *qcoeff, tran_low_t *dqcoeff, int bw, int bh, TX_SIZE tx_size,
     int *rate_cost, int64_t *recon_error, int64_t *sse) {
   const MACROBLOCKD *xd = &x->e_mbd;
+  const BitDepthInfo bd_info = get_bit_depth_info(xd);
   uint16_t eob;
-  av1_subtract_block(xd, bh, bw, src_diff, diff_stride, src, src_stride, dst,
-                     dst_stride);
-  tpl_fwd_txfm(src_diff, diff_stride, coeff, tx_size, xd->bd,
-               is_cur_buf_hbd(xd));
+  av1_subtract_block(bd_info, bh, bw, src_diff, diff_stride, src, src_stride,
+                     dst, dst_stride);
+  av1_quick_txfm(/*use_hadamard=*/0, tx_size, bd_info, src_diff, bw, coeff);
 
   get_quantize_error(x, 0, coeff, qcoeff, dqcoeff, tx_size, &eob, recon_error,
                      sse);
@@ -317,13 +309,17 @@
 }
 
 static void get_rate_distortion(
-    int *rate_cost, int64_t *recon_error, int16_t *src_diff, tran_low_t *coeff,
-    tran_low_t *qcoeff, tran_low_t *dqcoeff, AV1_COMMON *cm, MACROBLOCK *x,
+    int *rate_cost, int64_t *recon_error, int64_t *pred_error,
+    int16_t *src_diff, tran_low_t *coeff, tran_low_t *qcoeff,
+    tran_low_t *dqcoeff, AV1_COMMON *cm, MACROBLOCK *x,
     const YV12_BUFFER_CONFIG *ref_frame_ptr[2], uint8_t *rec_buffer_pool[3],
     const int rec_stride_pool[3], TX_SIZE tx_size, PREDICTION_MODE best_mode,
-    int mi_row, int mi_col, int use_y_only_rate_distortion) {
+    int mi_row, int mi_col, int use_y_only_rate_distortion,
+    TplTxfmStats *tpl_txfm_stats) {
+  const SequenceHeader *seq_params = cm->seq_params;
   *rate_cost = 0;
   *recon_error = 1;
+  *pred_error = 1;
 
   MACROBLOCKD *xd = &x->e_mbd;
   int is_compound = (best_mode == NEW_NEWMV);
@@ -357,7 +353,8 @@
     for (int ref = 0; ref < 1 + is_compound; ++ref) {
       if (!is_inter_mode(best_mode)) {
         av1_predict_intra_block(
-            cm, xd, block_size_wide[bsize_plane], block_size_high[bsize_plane],
+            xd, seq_params->sb_size, seq_params->enable_intra_edge_filter,
+            block_size_wide[bsize_plane], block_size_high[bsize_plane],
             max_txsize_rect_lookup[bsize_plane], best_mode, 0, 0,
             FILTER_INTRA_MODES, dst_buffer, dst_buffer_stride, dst_buffer,
             dst_buffer_stride, 0, 0, plane);
@@ -405,7 +402,13 @@
         block_size_high[bsize_plane], max_txsize_rect_lookup[bsize_plane],
         &this_rate, &this_recon_error, &sse);
 
+    if (plane == 0 && tpl_txfm_stats) {
+      // We only collect Y plane's transform coefficient
+      av1_record_tpl_txfm_block(tpl_txfm_stats, coeff);
+    }
+
     *recon_error += this_recon_error;
+    *pred_error += sse;
     *rate_cost += this_rate;
   }
 }
@@ -416,12 +419,13 @@
                                        BLOCK_SIZE bsize, TX_SIZE tx_size,
                                        TplDepStats *tpl_stats) {
   AV1_COMMON *cm = &cpi->common;
-  const GF_GROUP *gf_group = &cpi->gf_group;
+  const GF_GROUP *gf_group = &cpi->ppi->gf_group;
 
   (void)gf_group;
 
   MACROBLOCKD *xd = &x->e_mbd;
-  TplParams *tpl_data = &cpi->tpl_data;
+  const BitDepthInfo bd_info = get_bit_depth_info(xd);
+  TplParams *tpl_data = &cpi->ppi->tpl_data;
   TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_data->frame_idx];
   const uint8_t block_mis_log2 = tpl_data->tpl_stats_block_mis_log2;
 
@@ -473,6 +477,7 @@
   uint8_t *predictor =
       is_cur_buf_hbd(xd) ? CONVERT_TO_BYTEPTR(predictor8) : predictor8;
   int64_t recon_error = 1;
+  int64_t pred_error = 1;
 
   memset(tpl_stats, 0, sizeof(*tpl_stats));
   tpl_stats->ref_frame_index[0] = -1;
@@ -495,7 +500,6 @@
   // Pre-load the bottom left line.
   if (xd->left_available &&
       mi_row + tx_size_high_unit[tx_size] < xd->tile.mi_row_end) {
-#if CONFIG_AV1_HIGHBITDEPTH
     if (is_cur_buf_hbd(xd)) {
       uint16_t *dst = CONVERT_TO_SHORTPTR(dst_buffer);
       for (int i = 0; i < bw; ++i)
@@ -506,26 +510,24 @@
         dst_buffer[(bw + i) * dst_buffer_stride - 1] =
             dst_buffer[(bw - 1) * dst_buffer_stride - 1];
     }
-#else
-    for (int i = 0; i < bw; ++i)
-      dst_buffer[(bw + i) * dst_buffer_stride - 1] =
-          dst_buffer[(bw - 1) * dst_buffer_stride - 1];
-#endif
   }
 
   // if cpi->sf.tpl_sf.prune_intra_modes is on, then search only DC_PRED,
   // H_PRED, and V_PRED
   const PREDICTION_MODE last_intra_mode =
       cpi->sf.tpl_sf.prune_intra_modes ? D45_PRED : INTRA_MODE_END;
+  const SequenceHeader *seq_params = cm->seq_params;
   for (PREDICTION_MODE mode = INTRA_MODE_START; mode < last_intra_mode;
        ++mode) {
-    av1_predict_intra_block(cm, xd, block_size_wide[bsize],
-                            block_size_high[bsize], tx_size, mode, 0, 0,
-                            FILTER_INTRA_MODES, dst_buffer, dst_buffer_stride,
-                            predictor, bw, 0, 0, 0);
+    av1_predict_intra_block(xd, seq_params->sb_size,
+                            seq_params->enable_intra_edge_filter,
+                            block_size_wide[bsize], block_size_high[bsize],
+                            tx_size, mode, 0, 0, FILTER_INTRA_MODES, dst_buffer,
+                            dst_buffer_stride, predictor, bw, 0, 0, 0);
 
-    intra_cost = tpl_get_satd_cost(x, src_diff, bw, src_mb_buffer, src_stride,
-                                   predictor, bw, coeff, bw, bh, tx_size);
+    intra_cost =
+        tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride,
+                          predictor, bw, coeff, bw, bh, tx_size);
 
     if (intra_cost < best_intra_cost) {
       best_intra_cost = intra_cost;
@@ -609,7 +611,7 @@
       for (idx = 0; idx < refmv_count; ++idx) {
         FULLPEL_MV mv = get_fullmv_from_mv(&center_mvs[idx].mv.as_mv);
         clamp_fullmv(&mv, &x->mv_limits);
-        center_mvs[idx].sad = (int)cpi->fn_ptr[bsize].sdf(
+        center_mvs[idx].sad = (int)cpi->ppi->fn_ptr[bsize].sdf(
             src_mb_buffer, src_stride, &ref_mb[mv.row * ref_stride + mv.col],
             ref_stride);
       }
@@ -655,8 +657,9 @@
     av1_enc_build_one_inter_predictor(predictor, bw, &best_rfidx_mv.as_mv,
                                       &inter_pred_params);
 
-    inter_cost = tpl_get_satd_cost(x, src_diff, bw, src_mb_buffer, src_stride,
-                                   predictor, bw, coeff, bw, bh, tx_size);
+    inter_cost =
+        tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride,
+                          predictor, bw, coeff, bw, bh, tx_size);
     // Store inter cost for each ref frame
     tpl_stats->pred_error[rf_idx] = AOMMAX(1, inter_cost);
 
@@ -734,8 +737,9 @@
       av1_enc_build_one_inter_predictor(predictor, bw, &tmp_mv[ref].as_mv,
                                         &inter_pred_params);
     }
-    inter_cost = tpl_get_satd_cost(x, src_diff, bw, src_mb_buffer, src_stride,
-                                   predictor, bw, coeff, bw, bh, tx_size);
+    inter_cost =
+        tpl_get_satd_cost(bd_info, src_diff, bw, src_mb_buffer, src_stride,
+                          predictor, bw, coeff, bw, bh, tx_size);
     if (inter_cost < best_inter_cost) {
       best_cmp_rf_idx = cmp_rf_idx;
       best_inter_cost = inter_cost;
@@ -762,10 +766,10 @@
           : NULL,
     };
     int rate_cost = 1;
-    get_rate_distortion(&rate_cost, &recon_error, src_diff, coeff, qcoeff,
-                        dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool,
+    get_rate_distortion(&rate_cost, &recon_error, &pred_error, src_diff, coeff,
+                        qcoeff, dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool,
                         rec_stride_pool, tx_size, best_mode, mi_row, mi_col,
-                        use_y_only_rate_distortion);
+                        use_y_only_rate_distortion, NULL);
     tpl_stats->srcrf_rate = rate_cost << TPL_DEP_COST_SCALE_LOG2;
   }
 
@@ -774,7 +778,8 @@
   tpl_stats->inter_cost = best_inter_cost << TPL_DEP_COST_SCALE_LOG2;
   tpl_stats->intra_cost = best_intra_cost << TPL_DEP_COST_SCALE_LOG2;
 
-  tpl_stats->srcrf_dist = recon_error << (TPL_DEP_COST_SCALE_LOG2);
+  tpl_stats->srcrf_dist = recon_error << TPL_DEP_COST_SCALE_LOG2;
+  tpl_stats->srcrf_sse = pred_error << TPL_DEP_COST_SCALE_LOG2;
 
   // Final encode
   int rate_cost = 0;
@@ -788,18 +793,17 @@
       best_mode == NEW_NEWMV
           ? tpl_data->ref_frame[comp_ref_frames[best_cmp_rf_idx][1]]
           : NULL;
-  get_rate_distortion(&rate_cost, &recon_error, src_diff, coeff, qcoeff,
-                      dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool,
+  get_rate_distortion(&rate_cost, &recon_error, &pred_error, src_diff, coeff,
+                      qcoeff, dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool,
                       rec_stride_pool, tx_size, best_mode, mi_row, mi_col,
-                      use_y_only_rate_distortion);
-
-  tpl_stats_record_txfm_block(tpl_txfm_stats, coeff, tpl_frame->coeff_num);
+                      use_y_only_rate_distortion, tpl_txfm_stats);
 
   tpl_stats->recrf_dist = recon_error << (TPL_DEP_COST_SCALE_LOG2);
   tpl_stats->recrf_rate = rate_cost << TPL_DEP_COST_SCALE_LOG2;
   if (!is_inter_mode(best_mode)) {
     tpl_stats->srcrf_dist = recon_error << (TPL_DEP_COST_SCALE_LOG2);
     tpl_stats->srcrf_rate = rate_cost << TPL_DEP_COST_SCALE_LOG2;
+    tpl_stats->srcrf_sse = pred_error << TPL_DEP_COST_SCALE_LOG2;
   }
 
   tpl_stats->recrf_dist = AOMMAX(tpl_stats->srcrf_dist, tpl_stats->recrf_dist);
@@ -809,10 +813,10 @@
     ref_frame_ptr[0] = tpl_data->ref_frame[comp_ref_frames[best_cmp_rf_idx][0]];
     ref_frame_ptr[1] =
         tpl_data->src_ref_frame[comp_ref_frames[best_cmp_rf_idx][1]];
-    get_rate_distortion(&rate_cost, &recon_error, src_diff, coeff, qcoeff,
-                        dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool,
+    get_rate_distortion(&rate_cost, &recon_error, &pred_error, src_diff, coeff,
+                        qcoeff, dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool,
                         rec_stride_pool, tx_size, best_mode, mi_row, mi_col,
-                        use_y_only_rate_distortion);
+                        use_y_only_rate_distortion, NULL);
     tpl_stats->cmp_recrf_dist[0] = recon_error << TPL_DEP_COST_SCALE_LOG2;
     tpl_stats->cmp_recrf_rate[0] = rate_cost << TPL_DEP_COST_SCALE_LOG2;
 
@@ -830,10 +834,10 @@
     ref_frame_ptr[0] =
         tpl_data->src_ref_frame[comp_ref_frames[best_cmp_rf_idx][0]];
     ref_frame_ptr[1] = tpl_data->ref_frame[comp_ref_frames[best_cmp_rf_idx][1]];
-    get_rate_distortion(&rate_cost, &recon_error, src_diff, coeff, qcoeff,
-                        dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool,
+    get_rate_distortion(&rate_cost, &recon_error, &pred_error, src_diff, coeff,
+                        qcoeff, dqcoeff, cm, x, ref_frame_ptr, rec_buffer_pool,
                         rec_stride_pool, tx_size, best_mode, mi_row, mi_col,
-                        use_y_only_rate_distortion);
+                        use_y_only_rate_distortion, NULL);
     tpl_stats->cmp_recrf_dist[1] = recon_error << TPL_DEP_COST_SCALE_LOG2;
     tpl_stats->cmp_recrf_rate[1] = rate_cost << TPL_DEP_COST_SCALE_LOG2;
 
@@ -886,41 +890,24 @@
   return round;
 }
 
-static int get_overlap_area(int grid_pos_row, int grid_pos_col, int ref_pos_row,
-                            int ref_pos_col, int block, BLOCK_SIZE bsize) {
-  int width = 0, height = 0;
-  int bw = 4 << mi_size_wide_log2[bsize];
-  int bh = 4 << mi_size_high_log2[bsize];
-
-  switch (block) {
-    case 0:
-      width = grid_pos_col + bw - ref_pos_col;
-      height = grid_pos_row + bh - ref_pos_row;
-      break;
-    case 1:
-      width = ref_pos_col + bw - grid_pos_col;
-      height = grid_pos_row + bh - ref_pos_row;
-      break;
-    case 2:
-      width = grid_pos_col + bw - ref_pos_col;
-      height = ref_pos_row + bh - grid_pos_row;
-      break;
-    case 3:
-      width = ref_pos_col + bw - grid_pos_col;
-      height = ref_pos_row + bh - grid_pos_row;
-      break;
-    default: assert(0);
+int av1_get_overlap_area(int row_a, int col_a, int row_b, int col_b, int width,
+                         int height) {
+  int min_row = AOMMAX(row_a, row_b);
+  int max_row = AOMMIN(row_a + height, row_b + height);
+  int min_col = AOMMAX(col_a, col_b);
+  int max_col = AOMMIN(col_a + width, col_b + width);
+  if (min_row < max_row && min_col < max_col) {
+    return (max_row - min_row) * (max_col - min_col);
   }
-  int overlap_area = width * height;
-  return overlap_area;
+  return 0;
 }
 
 int av1_tpl_ptr_pos(int mi_row, int mi_col, int stride, uint8_t right_shift) {
   return (mi_row >> right_shift) * stride + (mi_col >> right_shift);
 }
 
-static int64_t delta_rate_cost(int64_t delta_rate, int64_t recrf_dist,
-                               int64_t srcrf_dist, int pix_num) {
+int64_t av1_delta_rate_cost(int64_t delta_rate, int64_t recrf_dist,
+                            int64_t srcrf_dist, int pix_num) {
   double beta = (double)srcrf_dist / recrf_dist;
   int64_t rate_cost = delta_rate;
 
@@ -951,7 +938,6 @@
 static AOM_INLINE void tpl_model_update_b(TplParams *const tpl_data, int mi_row,
                                           int mi_col, const BLOCK_SIZE bsize,
                                           int frame_idx, int ref) {
-  aom_clear_system_state();
   TplDepFrame *tpl_frame_ptr = &tpl_data->tpl_frame[frame_idx];
   TplDepStats *tpl_ptr = tpl_frame_ptr->tpl_stats_ptr;
   TplDepFrame *tpl_frame = tpl_data->tpl_frame;
@@ -997,8 +983,8 @@
                  tpl_stats_ptr->recrf_dist));
   int64_t delta_rate = tpl_stats_ptr->recrf_rate - srcrf_rate;
   int64_t mc_dep_rate =
-      delta_rate_cost(tpl_stats_ptr->mc_dep_rate, tpl_stats_ptr->recrf_dist,
-                      srcrf_dist, pix_num);
+      av1_delta_rate_cost(tpl_stats_ptr->mc_dep_rate, tpl_stats_ptr->recrf_dist,
+                          srcrf_dist, pix_num);
 
   for (block = 0; block < 4; ++block) {
     int grid_pos_row = grid_pos_row_base + bh * (block >> 1);
@@ -1006,8 +992,8 @@
 
     if (grid_pos_row >= 0 && grid_pos_row < ref_tpl_frame->mi_rows * MI_SIZE &&
         grid_pos_col >= 0 && grid_pos_col < ref_tpl_frame->mi_cols * MI_SIZE) {
-      int overlap_area = get_overlap_area(
-          grid_pos_row, grid_pos_col, ref_pos_row, ref_pos_col, block, bsize);
+      int overlap_area = av1_get_overlap_area(grid_pos_row, grid_pos_col,
+                                              ref_pos_row, ref_pos_col, bw, bh);
       int ref_mi_row = round_floor(grid_pos_row, bh) * mi_height;
       int ref_mi_col = round_floor(grid_pos_col, bw) * mi_width;
       assert((1 << block_mis_log2) == mi_height);
@@ -1042,6 +1028,7 @@
   tpl_ptr->intra_cost = AOMMAX(1, tpl_ptr->intra_cost);
   tpl_ptr->inter_cost = AOMMAX(1, tpl_ptr->inter_cost);
   tpl_ptr->srcrf_dist = AOMMAX(1, tpl_ptr->srcrf_dist);
+  tpl_ptr->srcrf_sse = AOMMAX(1, tpl_ptr->srcrf_sse);
   tpl_ptr->recrf_dist = AOMMAX(1, tpl_ptr->recrf_dist);
   tpl_ptr->srcrf_rate = AOMMAX(1, tpl_ptr->srcrf_rate);
   tpl_ptr->recrf_rate = AOMMAX(1, tpl_ptr->recrf_rate);
@@ -1067,12 +1054,12 @@
 // Initialize the mc_flow parameters used in computing tpl data.
 static AOM_INLINE void init_mc_flow_dispenser(AV1_COMP *cpi, int frame_idx,
                                               int pframe_qindex) {
-  TplParams *const tpl_data = &cpi->tpl_data;
+  TplParams *const tpl_data = &cpi->ppi->tpl_data;
   TplDepFrame *tpl_frame = &tpl_data->tpl_frame[frame_idx];
   const YV12_BUFFER_CONFIG *this_frame = tpl_frame->gf_picture;
   const YV12_BUFFER_CONFIG *ref_frames_ordered[INTER_REFS_PER_FRAME];
   uint32_t ref_frame_display_indices[INTER_REFS_PER_FRAME];
-  GF_GROUP *gf_group = &cpi->gf_group;
+  const GF_GROUP *gf_group = &cpi->ppi->gf_group;
   int ref_pruning_enabled = is_frame_eligible_for_ref_pruning(
       gf_group, cpi->sf.inter_sf.selective_ref_frame,
       cpi->sf.tpl_sf.prune_ref_frames_in_tpl, frame_idx);
@@ -1110,8 +1097,9 @@
   }
 
   // Work out which reference frame slots may be used.
-  ref_frame_flags = get_ref_frame_flags(&cpi->sf, ref_frames_ordered,
-                                        cpi->ext_flags.ref_frame_flags);
+  ref_frame_flags =
+      get_ref_frame_flags(&cpi->sf, is_one_pass_rt_params(cpi),
+                          ref_frames_ordered, cpi->ext_flags.ref_frame_flags);
 
   enforce_max_ref_frames(cpi, &ref_frame_flags, ref_frame_display_indices,
                          tpl_frame->frame_display_index);
@@ -1159,10 +1147,14 @@
   cm->quant_params.base_qindex = base_qindex;
   av1_frame_init_quantizer(cpi);
 
-  tpl_frame->base_rdmult =
-      av1_compute_rd_mult_based_on_qindex(cpi, pframe_qindex) / 6;
+  const BitDepthInfo bd_info = get_bit_depth_info(xd);
+  const FRAME_UPDATE_TYPE update_type =
+      gf_group->update_type[cpi->gf_frame_index];
+  tpl_frame->base_rdmult = av1_compute_rd_mult_based_on_qindex(
+                               bd_info.bit_depth, update_type, pframe_qindex) /
+                           6;
 
-  memset(tpl_txfm_stats, 0, sizeof(*tpl_txfm_stats));
+  av1_init_tpl_txfm_stats(tpl_txfm_stats);
 }
 
 // This function stores the motion estimation dependencies of all the blocks in
@@ -1175,7 +1167,7 @@
   AV1TplRowMultiThreadInfo *const tpl_row_mt = &mt_info->tpl_row_mt;
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
   const int mi_width = mi_size_wide[bsize];
-  TplParams *const tpl_data = &cpi->tpl_data;
+  TplParams *const tpl_data = &cpi->ppi->tpl_data;
   TplDepFrame *tpl_frame = &tpl_data->tpl_frame[tpl_data->frame_idx];
   MACROBLOCKD *xd = &x->e_mbd;
 
@@ -1214,13 +1206,14 @@
   ThreadData *td = &cpi->td;
   MACROBLOCK *x = &td->mb;
   MACROBLOCKD *xd = &x->e_mbd;
-  const BLOCK_SIZE bsize = convert_length_to_bsize(cpi->tpl_data.tpl_bsize_1d);
+  const BLOCK_SIZE bsize =
+      convert_length_to_bsize(cpi->ppi->tpl_data.tpl_bsize_1d);
   const TX_SIZE tx_size = max_txsize_lookup[bsize];
   const int mi_height = mi_size_high[bsize];
   for (int mi_row = 0; mi_row < mi_params->mi_rows; mi_row += mi_height) {
     // Motion estimation row boundary
     av1_set_mv_row_limits(mi_params, &x->mv_limits, mi_row, mi_height,
-                          cpi->tpl_data.border_in_pixels);
+                          cpi->ppi->tpl_data.border_in_pixels);
     xd->mb_to_top_edge = -GET_MV_SUBPEL(mi_row * MI_SIZE);
     xd->mb_to_bottom_edge =
         GET_MV_SUBPEL((mi_params->mi_rows - mi_height - mi_row) * MI_SIZE);
@@ -1229,21 +1222,20 @@
   }
 }
 
-static void mc_flow_synthesizer(AV1_COMP *cpi, int frame_idx) {
-  AV1_COMMON *cm = &cpi->common;
-  TplParams *const tpl_data = &cpi->tpl_data;
-
+static void mc_flow_synthesizer(TplParams *tpl_data, int frame_idx, int mi_rows,
+                                int mi_cols) {
+  if (!frame_idx) {
+    return;
+  }
   const BLOCK_SIZE bsize = convert_length_to_bsize(tpl_data->tpl_bsize_1d);
   const int mi_height = mi_size_high[bsize];
   const int mi_width = mi_size_wide[bsize];
   assert(mi_height == (1 << tpl_data->tpl_stats_block_mis_log2));
   assert(mi_width == (1 << tpl_data->tpl_stats_block_mis_log2));
 
-  for (int mi_row = 0; mi_row < cm->mi_params.mi_rows; mi_row += mi_height) {
-    for (int mi_col = 0; mi_col < cm->mi_params.mi_cols; mi_col += mi_width) {
-      if (frame_idx) {
-        tpl_model_update(tpl_data, mi_row, mi_col, frame_idx);
-      }
+  for (int mi_row = 0; mi_row < mi_rows; mi_row += mi_height) {
+    for (int mi_col = 0; mi_col < mi_cols; mi_col += mi_width) {
+      tpl_model_update(tpl_data, mi_row, mi_col, frame_idx);
     }
   }
 }
@@ -1256,16 +1248,23 @@
   int cur_frame_idx = cpi->gf_frame_index;
   *pframe_qindex = 0;
 
+#if CONFIG_FRAME_PARALLEL_ENCODE
+  RefFrameMapPair ref_frame_map_pairs[REF_FRAMES];
+  init_ref_map_pair(cpi, ref_frame_map_pairs);
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+
   RefBufferStack ref_buffer_stack = cpi->ref_buffer_stack;
+  int remapped_ref_idx[REF_FRAMES];
+
   EncodeFrameParams frame_params = *init_frame_params;
-  TplParams *const tpl_data = &cpi->tpl_data;
+  TplParams *const tpl_data = &cpi->ppi->tpl_data;
 
   int ref_picture_map[REF_FRAMES];
 
   for (int i = 0; i < REF_FRAMES; ++i) {
     if (frame_params.frame_type == KEY_FRAME) {
       tpl_data->tpl_frame[-i - 1].gf_picture = NULL;
-      tpl_data->tpl_frame[-1 - 1].rec_picture = NULL;
+      tpl_data->tpl_frame[-i - 1].rec_picture = NULL;
       tpl_data->tpl_frame[-i - 1].frame_display_index = 0;
     } else {
       tpl_data->tpl_frame[-i - 1].gf_picture = &cm->ref_frame_map[i]->buf;
@@ -1288,7 +1287,7 @@
     TplDepFrame *tpl_frame = &tpl_data->tpl_frame[gf_index];
     FRAME_UPDATE_TYPE frame_update_type = gf_group->update_type[gf_index];
     int frame_display_index = gf_index == gf_group->size
-                                  ? cpi->rc.baseline_gf_interval
+                                  ? cpi->ppi->p_rc.baseline_gf_interval
                                   : gf_group->cur_frame_idx[gf_index] +
                                         gf_group->arf_src_offset[gf_index];
 
@@ -1317,7 +1316,7 @@
     }
     if (gop_eval && cpi->rc.frames_since_key > 0 &&
         gf_group->arf_index == gf_index)
-      tpl_frame->gf_picture = &cpi->alt_ref_buffer;
+      tpl_frame->gf_picture = &cpi->ppi->alt_ref_buffer;
 
     // 'cm->current_frame.frame_number' is the display number
     // of the current frame.
@@ -1338,19 +1337,52 @@
       tpl_frame->tpl_stats_ptr = tpl_data->tpl_stats_pool[process_frame_count];
       ++process_frame_count;
     }
+#if CONFIG_FRAME_PARALLEL_ENCODE
+    const int true_disp = (int)(tpl_frame->frame_display_index);
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
 
-    av1_get_ref_frames(cpi, &ref_buffer_stack);
+    av1_get_ref_frames(&ref_buffer_stack,
+#if CONFIG_FRAME_PARALLEL_ENCODE
+                       ref_frame_map_pairs, true_disp,
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+                       cpi, gf_index, 0,
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+                       remapped_ref_idx);
+
     int refresh_mask = av1_get_refresh_frame_flags(
-        cpi, &frame_params, frame_update_type, &ref_buffer_stack);
+        cpi, &frame_params, frame_update_type, gf_index,
+#if CONFIG_FRAME_PARALLEL_ENCODE
+        true_disp, ref_frame_map_pairs,
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+        &ref_buffer_stack);
+
+#if CONFIG_FRAME_PARALLEL_ENCODE
+    // Make the frames marked as is_frame_non_ref to non-reference frames.
+    if (cpi->ppi->gf_group.is_frame_non_ref[gf_index]) refresh_mask = 0;
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
 
     int refresh_frame_map_index = av1_get_refresh_ref_frame_map(refresh_mask);
-    av1_update_ref_frame_map(cpi, frame_update_type, frame_params.frame_type,
-                             frame_params.show_existing_frame,
+#if !CONFIG_FRAME_PARALLEL_ENCODE
+    av1_update_ref_frame_map(cpi, frame_update_type,
+                             gf_group->refbuf_state[gf_index],
                              refresh_frame_map_index, &ref_buffer_stack);
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+
+#if CONFIG_FRAME_PARALLEL_ENCODE
+    if (refresh_frame_map_index < REF_FRAMES &&
+        refresh_frame_map_index != INVALID_IDX) {
+      ref_frame_map_pairs[refresh_frame_map_index].disp_order =
+          AOMMAX(0, true_disp);
+      ref_frame_map_pairs[refresh_frame_map_index].pyr_level =
+          get_true_pyr_level(gf_group->layer_depth[gf_index], true_disp,
+                             cpi->ppi->gf_group.max_layer_depth);
+    }
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
 
     for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i)
       tpl_frame->ref_map_index[i - LAST_FRAME] =
-          ref_picture_map[cm->remapped_ref_idx[i - LAST_FRAME]];
+          ref_picture_map[remapped_ref_idx[i - LAST_FRAME]];
 
     if (refresh_mask) ref_picture_map[refresh_frame_map_index] = gf_index;
 
@@ -1359,9 +1391,11 @@
 
   if (cpi->rc.frames_since_key == 0) return;
 
+  const int tpl_extend = cpi->oxcf.gf_cfg.lag_in_frames - MAX_GF_INTERVAL;
   int extend_frame_count = 0;
   int extend_frame_length = AOMMIN(
-      MAX_TPL_EXTEND, cpi->rc.frames_to_key - cpi->rc.baseline_gf_interval);
+      tpl_extend, cpi->rc.frames_to_key - cpi->ppi->p_rc.baseline_gf_interval);
+
   int frame_display_index = gf_group->cur_frame_idx[gop_length - 1] +
                             gf_group->arf_src_offset[gop_length - 1] + 1;
 
@@ -1400,18 +1434,44 @@
 
     gf_group->update_type[gf_index] = LF_UPDATE;
     gf_group->q_val[gf_index] = *pframe_qindex;
-
-    av1_get_ref_frames(cpi, &ref_buffer_stack);
+#if CONFIG_FRAME_PARALLEL_ENCODE
+    const int true_disp = (int)(tpl_frame->frame_display_index);
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+    av1_get_ref_frames(&ref_buffer_stack,
+#if CONFIG_FRAME_PARALLEL_ENCODE
+                       ref_frame_map_pairs, true_disp,
+#if CONFIG_FRAME_PARALLEL_ENCODE_2
+                       cpi, gf_index, 0,
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE_2
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+                       remapped_ref_idx);
     int refresh_mask = av1_get_refresh_frame_flags(
-        cpi, &frame_params, frame_update_type, &ref_buffer_stack);
+        cpi, &frame_params, frame_update_type, gf_index,
+#if CONFIG_FRAME_PARALLEL_ENCODE
+        true_disp, ref_frame_map_pairs,
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+        &ref_buffer_stack);
     int refresh_frame_map_index = av1_get_refresh_ref_frame_map(refresh_mask);
-    av1_update_ref_frame_map(cpi, frame_update_type, frame_params.frame_type,
-                             frame_params.show_existing_frame,
+#if !CONFIG_FRAME_PARALLEL_ENCODE
+    av1_update_ref_frame_map(cpi, frame_update_type,
+                             gf_group->refbuf_state[gf_index],
                              refresh_frame_map_index, &ref_buffer_stack);
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
+
+#if CONFIG_FRAME_PARALLEL_ENCODE
+    if (refresh_frame_map_index < REF_FRAMES &&
+        refresh_frame_map_index != INVALID_IDX) {
+      ref_frame_map_pairs[refresh_frame_map_index].disp_order =
+          AOMMAX(0, true_disp);
+      ref_frame_map_pairs[refresh_frame_map_index].pyr_level =
+          get_true_pyr_level(gf_group->layer_depth[gf_index], true_disp,
+                             cpi->ppi->gf_group.max_layer_depth);
+    }
+#endif  // CONFIG_FRAME_PARALLEL_ENCODE
 
     for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i)
       tpl_frame->ref_map_index[i - LAST_FRAME] =
-          ref_picture_map[cm->remapped_ref_idx[i - LAST_FRAME]];
+          ref_picture_map[remapped_ref_idx[i - LAST_FRAME]];
 
     tpl_frame->ref_map_index[ALTREF_FRAME - LAST_FRAME] = -1;
     tpl_frame->ref_map_index[LAST3_FRAME - LAST_FRAME] = -1;
@@ -1424,12 +1484,11 @@
     ++extend_frame_count;
     ++frame_display_index;
   }
-
-  av1_get_ref_frames(cpi, &cpi->ref_buffer_stack);
 }
 
 void av1_init_tpl_stats(TplParams *const tpl_data) {
   int frame_idx;
+  tpl_data->ready = 0;
   set_tpl_stats_block_size(&tpl_data->tpl_stats_block_mis_log2,
                            &tpl_data->tpl_bsize_1d);
   for (frame_idx = 0; frame_idx < MAX_LAG_BUFFERS; ++frame_idx) {
@@ -1440,9 +1499,58 @@
                sizeof(*tpl_frame->tpl_stats_ptr));
     tpl_frame->is_valid = 0;
   }
-  for (frame_idx = 0; frame_idx < MAX_LENGTH_TPL_FRAME_STATS; ++frame_idx) {
-    TplDepFrame *tpl_frame = &tpl_data->tpl_stats_buffer[frame_idx];
-    av1_tpl_stats_init_txfm_stats(tpl_frame, tpl_data->tpl_bsize_1d);
+#if CONFIG_BITRATE_ACCURACY
+  tpl_data->estimated_gop_bitrate = 0;
+  tpl_data->actual_gop_bitrate = 0;
+#endif
+}
+
+int av1_tpl_stats_ready(const TplParams *tpl_data, int gf_frame_index) {
+  if (tpl_data->ready == 0) {
+    return 0;
+  }
+  if (gf_frame_index >= MAX_TPL_FRAME_IDX) {
+    assert(gf_frame_index < MAX_TPL_FRAME_IDX && "Invalid gf_frame_index\n");
+    return 0;
+  }
+  return tpl_data->tpl_frame[gf_frame_index].is_valid;
+}
+
+static AOM_INLINE int eval_gop_length(double *beta, int gop_eval) {
+  switch (gop_eval) {
+    case 1:
+      // Allow larger GOP size if the base layer ARF has higher dependency
+      // factor than the intermediate ARF and both ARFs have reasonably high
+      // dependency factors.
+      return (beta[0] >= beta[1] + 0.7) && beta[0] > 8.0;
+    case 2:
+      if ((beta[0] >= beta[1] + 0.4) && beta[0] > 1.6)
+        return 1;  // Don't shorten the gf interval
+      else if ((beta[0] < beta[1] + 0.1) || beta[0] <= 1.4)
+        return 0;  // Shorten the gf interval
+      else
+        return 2;  // Cannot decide the gf interval, so redo the
+                   // tpl stats calculation.
+    case 3: return beta[0] > 1.1;
+    default: return 2;
+  }
+}
+
+// TODO(jingning): Restructure av1_rc_pick_q_and_bounds() to narrow down
+// the scope of input arguments.
+void av1_tpl_preload_rc_estimate(AV1_COMP *cpi,
+                                 const EncodeFrameParams *const frame_params) {
+  AV1_COMMON *cm = &cpi->common;
+  GF_GROUP *gf_group = &cpi->ppi->gf_group;
+  int bottom_index, top_index;
+  cm->current_frame.frame_type = frame_params->frame_type;
+  for (int gf_index = cpi->gf_frame_index; gf_index < gf_group->size;
+       ++gf_index) {
+    cm->current_frame.frame_type = gf_group->frame_type[gf_index];
+    cm->show_frame = gf_group->update_type[gf_index] != ARF_UPDATE &&
+                     gf_group->update_type[gf_index] != INTNL_ARF_UPDATE;
+    gf_group->q_val[gf_index] = av1_rc_pick_q_and_bounds(
+        cpi, cm->width, cm->height, gf_index, &bottom_index, &top_index);
   }
 }
 
@@ -1455,10 +1563,17 @@
   AV1_COMMON *cm = &cpi->common;
   MultiThreadInfo *const mt_info = &cpi->mt_info;
   AV1TplRowMultiThreadInfo *const tpl_row_mt = &mt_info->tpl_row_mt;
-  GF_GROUP *gf_group = &cpi->gf_group;
-  int bottom_index, top_index;
+  GF_GROUP *gf_group = &cpi->ppi->gf_group;
   EncodeFrameParams this_frame_params = *frame_params;
-  TplParams *const tpl_data = &cpi->tpl_data;
+  TplParams *const tpl_data = &cpi->ppi->tpl_data;
+  int approx_gop_eval = (gop_eval > 1);
+  int num_arf_layers = MAX_ARF_LAYERS;
+
+  // When gop_eval is set to 2, tpl stats calculation is done for ARFs from base
+  // layer, (base+1) layer and (base+2) layer. When gop_eval is set to 3,
+  // tpl stats calculation is limited to ARFs from base layer and (base+1)
+  // layer.
+  if (approx_gop_eval) num_arf_layers = (gop_eval == 2) ? 3 : 2;
 
   if (cpi->superres_mode != AOM_SUPERRES_NONE) {
     assert(cpi->superres_mode != AOM_SUPERRES_AUTO);
@@ -1472,17 +1587,10 @@
     cm->current_frame.frame_type = gf_group->frame_type[gf_index];
     av1_configure_buffer_updates(cpi, &this_frame_params.refresh_frame,
                                  gf_group->update_type[gf_index],
-                                 cm->current_frame.frame_type, 0);
+                                 gf_group->refbuf_state[gf_index], 0);
 
     memcpy(&cpi->refresh_frame, &this_frame_params.refresh_frame,
            sizeof(cpi->refresh_frame));
-
-    cm->show_frame = gf_group->update_type[gf_index] != ARF_UPDATE &&
-                     gf_group->update_type[gf_index] != INTNL_ARF_UPDATE;
-
-    gf_group->q_val[gf_index] =
-        av1_rc_pick_q_and_bounds(cpi, &cpi->rc, cm->width, cm->height, gf_index,
-                                 &bottom_index, &top_index);
   }
 
   int pframe_qindex;
@@ -1490,7 +1598,7 @@
   init_gop_frames_for_tpl(cpi, frame_params, gf_group, gop_eval,
                           &tpl_gf_group_frames, frame_input, &pframe_qindex);
 
-  cpi->rc.base_layer_qp = pframe_qindex;
+  cpi->ppi->p_rc.base_layer_qp = pframe_qindex;
 
   av1_init_tpl_stats(tpl_data);
 
@@ -1506,6 +1614,7 @@
   av1_fill_mv_costs(&cm->fc->nmvc, cm->features.cur_frame_force_integer_mv,
                     cm->features.allow_high_precision_mv, cpi->td.mb.mv_costs);
 
+  const int gop_length = get_gop_length(gf_group);
   // Backward propagation from tpl_group_frames to 1.
   for (int frame_idx = cpi->gf_frame_index; frame_idx < tpl_gf_group_frames;
        ++frame_idx) {
@@ -1513,6 +1622,12 @@
         gf_group->update_type[frame_idx] == OVERLAY_UPDATE)
       continue;
 
+    // When approx_gop_eval = 1, skip tpl stats calculation for higher layer
+    // frames and for frames beyond gop length.
+    if (approx_gop_eval && (gf_group->layer_depth[frame_idx] > num_arf_layers ||
+                            frame_idx >= gop_length))
+      continue;
+
     init_mc_flow_dispenser(cpi, frame_idx, pframe_qindex);
     if (mt_info->num_workers > 1) {
       tpl_row_mt->sync_read_ptr = av1_tpl_row_mt_sync_read;
@@ -1521,7 +1636,7 @@
     } else {
       mc_flow_dispenser(cpi);
     }
-    tpl_stats_update_abs_coeff_mean(tpl_data, &cpi->td.tpl_txfm_stats);
+    av1_tpl_store_txfm_stats(tpl_data, &cpi->td.tpl_txfm_stats, frame_idx);
 
     aom_extend_frame_borders(tpl_data->tpl_frame[frame_idx].rec_picture,
                              av1_num_planes(cm));
@@ -1533,12 +1648,17 @@
         gf_group->update_type[frame_idx] == OVERLAY_UPDATE)
       continue;
 
-    mc_flow_synthesizer(cpi, frame_idx);
+    if (approx_gop_eval && (gf_group->layer_depth[frame_idx] > num_arf_layers ||
+                            frame_idx >= gop_length))
+      continue;
+
+    mc_flow_synthesizer(tpl_data, frame_idx, cm->mi_params.mi_rows,
+                        cm->mi_params.mi_cols);
   }
 
   av1_configure_buffer_updates(cpi, &this_frame_params.refresh_frame,
                                gf_group->update_type[cpi->gf_frame_index],
-                               frame_params->frame_type, 0);
+                               gf_group->update_type[cpi->gf_frame_index], 0);
   cm->current_frame.frame_type = frame_params->frame_type;
   cm->show_frame = frame_params->show_frame;
 
@@ -1549,6 +1669,9 @@
     end_timing(cpi, av1_tpl_setup_stats_time);
 #endif
 
+  if (!approx_gop_eval) {
+    tpl_data->ready = 1;
+  }
   if (cpi->common.tiles.large_scale) return 0;
   if (gf_group->max_layer_depth_allowed == 0) return 1;
   if (!gop_eval) return 0;
@@ -1594,20 +1717,17 @@
 #if CONFIG_COLLECT_COMPONENT_TIMING
   end_timing(cpi, av1_tpl_setup_stats_time);
 #endif
-
-  // Allow larger GOP size if the base layer ARF has higher dependency factor
-  // than the intermediate ARF and both ARFs have reasonably high dependency
-  // factors.
-  return (beta[0] >= beta[1] + 0.7) && beta[0] > 8.0;
+  return eval_gop_length(beta, gop_eval);
 }
 
 void av1_tpl_rdmult_setup(AV1_COMP *cpi) {
   const AV1_COMMON *const cm = &cpi->common;
   const int tpl_idx = cpi->gf_frame_index;
 
-  assert(IMPLIES(cpi->gf_group.size > 0, tpl_idx < cpi->gf_group.size));
+  assert(
+      IMPLIES(cpi->ppi->gf_group.size > 0, tpl_idx < cpi->ppi->gf_group.size));
 
-  TplParams *const tpl_data = &cpi->tpl_data;
+  TplParams *const tpl_data = &cpi->ppi->tpl_data;
   const TplDepFrame *const tpl_frame = &tpl_data->tpl_frame[tpl_idx];
 
   if (!tpl_frame->is_valid) return;
@@ -1624,8 +1744,6 @@
   const double c = 1.2;
   const int step = 1 << tpl_data->tpl_stats_block_mis_log2;
 
-  aom_clear_system_state();
-
   // Loop through each 'block_size' X 'block_size' block.
   for (int row = 0; row < num_rows; row++) {
     for (int col = 0; col < num_cols; col++) {
@@ -1648,23 +1766,22 @@
       }
       const double rk = intra_cost / mc_dep_cost;
       const int index = row * num_cols + col;
-      cpi->tpl_rdmult_scaling_factors[index] = rk / cpi->rd.r0 + c;
+      cpi->ppi->tpl_rdmult_scaling_factors[index] = rk / cpi->rd.r0 + c;
     }
   }
-  aom_clear_system_state();
 }
 
 void av1_tpl_rdmult_setup_sb(AV1_COMP *cpi, MACROBLOCK *const x,
                              BLOCK_SIZE sb_size, int mi_row, int mi_col) {
   AV1_COMMON *const cm = &cpi->common;
-  GF_GROUP *gf_group = &cpi->gf_group;
-  assert(IMPLIES(cpi->gf_group.size > 0,
-                 cpi->gf_frame_index < cpi->gf_group.size));
+  GF_GROUP *gf_group = &cpi->ppi->gf_group;
+  assert(IMPLIES(cpi->ppi->gf_group.size > 0,
+                 cpi->gf_frame_index < cpi->ppi->gf_group.size));
   const int tpl_idx = cpi->gf_frame_index;
 
   if (tpl_idx >= MAX_TPL_FRAME_IDX) return;
-  TplDepFrame *tpl_frame = &cpi->tpl_data.tpl_frame[tpl_idx];
-  if (tpl_frame->is_valid == 0) return;
+  TplDepFrame *tpl_frame = &cpi->ppi->tpl_data.tpl_frame[tpl_idx];
+  if (!tpl_frame->is_valid) return;
   if (!is_frame_tpl_eligible(gf_group, cpi->gf_frame_index)) return;
   if (cpi->oxcf.q_cfg.aq_mode != NO_AQ) return;
 
@@ -1686,13 +1803,12 @@
   double base_block_count = 0.0;
   double log_sum = 0.0;
 
-  aom_clear_system_state();
   for (row = mi_row / num_mi_w;
        row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) {
     for (col = mi_col_sr / num_mi_h;
          col < num_cols && col < mi_col_sr / num_mi_h + num_bcols; ++col) {
       const int index = row * num_cols + col;
-      log_sum += log(cpi->tpl_rdmult_scaling_factors[index]);
+      log_sum += log(cpi->ppi->tpl_rdmult_scaling_factors[index]);
       base_block_count += 1.0;
     }
   }
@@ -1706,33 +1822,30 @@
   const double scaling_factor = (double)new_rdmult / (double)orig_rdmult;
 
   double scale_adj = log(scaling_factor) - log_sum / base_block_count;
-  scale_adj = exp(scale_adj);
+  scale_adj = exp_bounded(scale_adj);
 
   for (row = mi_row / num_mi_w;
        row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) {
     for (col = mi_col_sr / num_mi_h;
          col < num_cols && col < mi_col_sr / num_mi_h + num_bcols; ++col) {
       const int index = row * num_cols + col;
-      cpi->tpl_sb_rdmult_scaling_factors[index] =
-          scale_adj * cpi->tpl_rdmult_scaling_factors[index];
+      cpi->ppi->tpl_sb_rdmult_scaling_factors[index] =
+          scale_adj * cpi->ppi->tpl_rdmult_scaling_factors[index];
     }
   }
-  aom_clear_system_state();
 }
 
-#define EPSILON (0.0000001)
-
 double av1_exponential_entropy(double q_step, double b) {
-  aom_clear_system_state();
-  double z = fmax(exp(-q_step / b), EPSILON);
+  b = AOMMAX(b, TPL_EPSILON);
+  double z = fmax(exp_bounded(-q_step / b), TPL_EPSILON);
   return -log2(1 - z) - z * log2(z) / (1 - z);
 }
 
 double av1_laplace_entropy(double q_step, double b, double zero_bin_ratio) {
-  aom_clear_system_state();
   // zero bin's size is zero_bin_ratio * q_step
   // non-zero bin's size is q_step
-  double z = fmax(exp(-zero_bin_ratio / 2 * q_step / b), EPSILON);
+  b = AOMMAX(b, TPL_EPSILON);
+  double z = fmax(exp_bounded(-zero_bin_ratio / 2 * q_step / b), TPL_EPSILON);
   double h = av1_exponential_entropy(q_step, b);
   double r = -(1 - z) * log2(1 - z) - z * log2(z) + z * (h + 1);
   return r;
@@ -1741,7 +1854,6 @@
 double av1_laplace_estimate_frame_rate(int q_index, int block_count,
                                        const double *abs_coeff_mean,
                                        int coeff_num) {
-  aom_clear_system_state();
   double zero_bin_ratio = 2;
   double dc_q_step = av1_dc_quant_QTX(q_index, 0, AOM_BITS_8) / 4.;
   double ac_q_step = av1_ac_quant_QTX(q_index, 0, AOM_BITS_8) / 4.;
@@ -1756,3 +1868,354 @@
   est_rate *= block_count;
   return est_rate;
 }
+
+double av1_estimate_gop_bitrate(const int *q_index_list, const int frame_count,
+                                const TplTxfmStats *stats_list,
+                                const int *stats_valid_list,
+                                double *bitrate_byframe_list) {
+  double gop_bitrate = 0;
+  for (int frame_index = 0; frame_index < frame_count; frame_index++) {
+    if (stats_valid_list[frame_index]) {
+      int q_index = q_index_list[frame_index];
+      TplTxfmStats frame_stats = stats_list[frame_index];
+
+      /* Convert to mean absolute deviation */
+      double abs_coeff_mean[256] = { 0 };
+      for (int i = 0; i < 256; i++) {
+        abs_coeff_mean[i] =
+            frame_stats.abs_coeff_sum[i] / frame_stats.txfm_block_count;
+      }
+
+      double frame_bitrate = av1_laplace_estimate_frame_rate(
+          q_index, frame_stats.txfm_block_count, abs_coeff_mean, 256);
+      gop_bitrate += frame_bitrate;
+
+      if (bitrate_byframe_list != NULL) {
+        bitrate_byframe_list[frame_index] = frame_bitrate;
+      }
+    }
+  }
+  return gop_bitrate;
+}
+
+double av1_estimate_coeff_entropy(double q_step, double b,
+                                  double zero_bin_ratio, int qcoeff) {
+  b = AOMMAX(b, TPL_EPSILON);
+  int abs_qcoeff = abs(qcoeff);
+  double z0 = fmax(exp_bounded(-zero_bin_ratio / 2 * q_step / b), TPL_EPSILON);
+  if (abs_qcoeff == 0) {
+    double r = -log2(1 - z0);
+    return r;
+  } else {
+    double z = fmax(exp_bounded(-q_step / b), TPL_EPSILON);
+    double r = 1 - log2(z0) - log2(1 - z) - (abs_qcoeff - 1) * log2(z);
+    return r;
+  }
+}
+
+double av1_estimate_txfm_block_entropy(int q_index,
+                                       const double *abs_coeff_mean,
+                                       int *qcoeff_arr, int coeff_num) {
+  double zero_bin_ratio = 2;
+  double dc_q_step = av1_dc_quant_QTX(q_index, 0, AOM_BITS_8) / 4.;
+  double ac_q_step = av1_ac_quant_QTX(q_index, 0, AOM_BITS_8) / 4.;
+  double est_rate = 0;
+  // dc coeff
+  est_rate += av1_estimate_coeff_entropy(dc_q_step, abs_coeff_mean[0],
+                                         zero_bin_ratio, qcoeff_arr[0]);
+  // ac coeff
+  for (int i = 1; i < coeff_num; ++i) {
+    est_rate += av1_estimate_coeff_entropy(ac_q_step, abs_coeff_mean[i],
+                                           zero_bin_ratio, qcoeff_arr[i]);
+  }
+  return est_rate;
+}
+
+#if CONFIG_RD_COMMAND
+void av1_read_rd_command(const char *filepath, RD_COMMAND *rd_command) {
+  FILE *fptr = fopen(filepath, "r");
+  fscanf(fptr, "%d", &rd_command->frame_count);
+  rd_command->frame_index = 0;
+  for (int i = 0; i < rd_command->frame_count; ++i) {
+    int option;
+    fscanf(fptr, "%d", &option);
+    rd_command->option_ls[i] = (RD_OPTION)option;
+    if (option == RD_OPTION_SET_Q) {
+      fscanf(fptr, "%d", &rd_command->q_index_ls[i]);
+    } else if (option == RD_OPTION_SET_Q_RDMULT) {
+      fscanf(fptr, "%d", &rd_command->q_index_ls[i]);
+      fscanf(fptr, "%d", &rd_command->rdmult_ls[i]);
+    }
+  }
+  fclose(fptr);
+}
+#endif  // CONFIG_RD_COMMAND
+
+void get_tpl_stats_valid_list(const TplParams *tpl_data, int gop_size,
+                              int *stats_valid_list) {
+  for (int i = 0; i < gop_size; ++i) {
+    stats_valid_list[i] = av1_tpl_stats_ready(tpl_data, i);
+  }
+}
+
+/*
+ * Estimate the optimal base q index for a GOP.
+ */
+int av1_q_mode_estimate_base_q(const GF_GROUP *gf_group,
+                               const TplTxfmStats *txfm_stats_list,
+                               const int *stats_valid_list, double bit_budget,
+                               int gf_frame_index, aom_bit_depth_t bit_depth,
+                               double scale_factor,
+                               const double *qstep_ratio_list,
+                               int *q_index_list,
+                               double *estimated_bitrate_byframe) {
+  int q_max = 255;  // Maximum q value.
+  int q_min = 0;    // Minimum q value.
+  int q = (q_max + q_min) / 2;
+
+  av1_q_mode_compute_gop_q_indices(gf_frame_index, q_max, qstep_ratio_list,
+                                   bit_depth, gf_group, q_index_list);
+  double q_max_estimate = av1_estimate_gop_bitrate(
+      q_index_list, gf_group->size, txfm_stats_list, stats_valid_list, NULL);
+  av1_q_mode_compute_gop_q_indices(gf_frame_index, q_min, qstep_ratio_list,
+                                   bit_depth, gf_group, q_index_list);
+  double q_min_estimate = av1_estimate_gop_bitrate(
+      q_index_list, gf_group->size, txfm_stats_list, stats_valid_list, NULL);
+
+  while (true) {
+    av1_q_mode_compute_gop_q_indices(gf_frame_index, q, qstep_ratio_list,
+                                     bit_depth, gf_group, q_index_list);
+
+    double estimate = av1_estimate_gop_bitrate(
+        q_index_list, gf_group->size, txfm_stats_list, stats_valid_list, NULL);
+
+    estimate *= scale_factor;
+
+    // We want to find the lowest q that satisfies the bit budget constraint.
+    // A binary search narrows the result down to two values: q_min and q_max.
+    if (q_max <= q_min + 1 || estimate == bit_budget) {
+      // Pick the estimate that lands closest to the budget.
+      if (fabs(q_max_estimate - bit_budget) <
+          fabs(q_min_estimate - bit_budget)) {
+        q = q_max;
+      } else {
+        q = q_min;
+      }
+      break;
+    } else if (estimate > bit_budget) {
+      q_min = q;
+      q_min_estimate = estimate;
+      q = (q_max + q_min) / 2;
+    } else if (estimate < bit_budget) {
+      q_max = q;
+      q_max_estimate = estimate;
+      q = (q_max + q_min) / 2;
+    }
+  }
+
+  // Update q_index_list and vbr_rc_info.
+  av1_q_mode_compute_gop_q_indices(gf_frame_index, q, qstep_ratio_list,
+                                   bit_depth, gf_group, q_index_list);
+  av1_estimate_gop_bitrate(q_index_list, gf_group->size, txfm_stats_list,
+                           stats_valid_list, estimated_bitrate_byframe);
+  return q;
+}
+
+double av1_tpl_get_qstep_ratio(const TplParams *tpl_data, int gf_frame_index) {
+  if (!av1_tpl_stats_ready(tpl_data, gf_frame_index)) {
+    return 1;
+  }
+
+  const TplDepFrame *tpl_frame = &tpl_data->tpl_frame[gf_frame_index];
+  const TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
+
+  const int tpl_stride = tpl_frame->stride;
+  int64_t intra_cost_base = 0;
+  int64_t mc_dep_cost_base = 0;
+  const int step = 1 << tpl_data->tpl_stats_block_mis_log2;
+
+  for (int row = 0; row < tpl_frame->mi_rows; row += step) {
+    for (int col = 0; col < tpl_frame->mi_cols; col += step) {
+      const TplDepStats *this_stats = &tpl_stats[av1_tpl_ptr_pos(
+          row, col, tpl_stride, tpl_data->tpl_stats_block_mis_log2)];
+      const int64_t mc_dep_delta =
+          RDCOST(tpl_frame->base_rdmult, this_stats->mc_dep_rate,
+                 this_stats->mc_dep_dist);
+      intra_cost_base += (this_stats->recrf_dist << RDDIV_BITS);
+      mc_dep_cost_base += (this_stats->recrf_dist << RDDIV_BITS) + mc_dep_delta;
+    }
+  }
+  const double r0 = (double)intra_cost_base / mc_dep_cost_base;
+  return sqrt(r0);
+}
+
+int av1_get_q_index_from_qstep_ratio(int leaf_qindex, double qstep_ratio,
+                                     aom_bit_depth_t bit_depth) {
+  const double leaf_qstep = av1_dc_quant_QTX(leaf_qindex, 0, bit_depth);
+  const double target_qstep = leaf_qstep * qstep_ratio;
+  int qindex = leaf_qindex;
+  for (qindex = leaf_qindex; qindex > 0; --qindex) {
+    const double qstep = av1_dc_quant_QTX(qindex, 0, bit_depth);
+    if (qstep + 0.1 <= target_qstep) break;
+  }
+  return qindex;
+}
+
+int av1_tpl_get_q_index(const TplParams *tpl_data, int gf_frame_index,
+                        int leaf_qindex, aom_bit_depth_t bit_depth) {
+  const double qstep_ratio = av1_tpl_get_qstep_ratio(tpl_data, gf_frame_index);
+  return av1_get_q_index_from_qstep_ratio(leaf_qindex, qstep_ratio, bit_depth);
+}
+
+#if CONFIG_BITRATE_ACCURACY
+void av1_vbr_rc_update_q_index_list(VBR_RATECTRL_INFO *vbr_rc_info,
+                                    const TplParams *tpl_data,
+                                    const GF_GROUP *gf_group,
+                                    int gf_frame_index,
+                                    aom_bit_depth_t bit_depth) {
+  // We always update q_index_list when gf_frame_index is zero.
+  // This will make the q indices for the entire gop more consistent
+  if (gf_frame_index == 0) {
+    vbr_rc_info->q_index_list_ready = 1;
+    double gop_bit_budget = vbr_rc_info->gop_bit_budget;
+
+    for (int i = gf_frame_index; i < gf_group->size; i++) {
+      vbr_rc_info->qstep_ratio_list[i] = av1_tpl_get_qstep_ratio(tpl_data, i);
+    }
+
+    // We update the q indices in vbr_rc_info in vbr_rc_info->q_index_list
+    // rather than gf_group->q_val to avoid conflicts with the existing code.
+    int stats_valid_list[MAX_LENGTH_TPL_FRAME_STATS] = { 0 };
+    get_tpl_stats_valid_list(tpl_data, gf_group->size, stats_valid_list);
+
+    double mv_bits = av1_tpl_compute_mv_bits(
+        tpl_data, gf_group->size, gf_frame_index,
+        gf_group->update_type[gf_frame_index], vbr_rc_info);
+
+    mv_bits = AOMMIN(mv_bits, 0.6 * gop_bit_budget);
+    gop_bit_budget -= mv_bits;
+
+    double scale_factor =
+        vbr_rc_info->scale_factors[gf_group->update_type[gf_frame_index]];
+
+    vbr_rc_info->base_q_index = av1_q_mode_estimate_base_q(
+        gf_group, tpl_data->txfm_stats_list, stats_valid_list, gop_bit_budget,
+        gf_frame_index, bit_depth, scale_factor, vbr_rc_info->qstep_ratio_list,
+        vbr_rc_info->q_index_list, vbr_rc_info->estimated_bitrate_byframe);
+  } else if (gf_frame_index == 1) {
+    for (int i = gf_frame_index; i < gf_group->size; i++) {
+      vbr_rc_info->qstep_ratio_list[i] = av1_tpl_get_qstep_ratio(tpl_data, i);
+    }
+    av1_q_mode_compute_gop_q_indices(gf_frame_index, vbr_rc_info->base_q_index,
+                                     vbr_rc_info->qstep_ratio_list, bit_depth,
+                                     gf_group, vbr_rc_info->q_index_list);
+  }
+}
+
+/* For a GOP, calculate the bits used by motion vectors. */
+double av1_tpl_compute_mv_bits(const TplParams *tpl_data, int gf_group_size,
+                               int gf_frame_index, int gf_update_type,
+                               VBR_RATECTRL_INFO *vbr_rc_info) {
+  double total_mv_bits = 0;
+
+  // Loop through each frame.
+  for (int i = gf_frame_index; i < gf_group_size; i++) {
+    if (av1_tpl_stats_ready(tpl_data, i)) {
+      TplDepFrame *tpl_frame = &tpl_data->tpl_frame[i];
+      double frame_mv_bits = av1_tpl_compute_frame_mv_entropy(
+          tpl_frame, tpl_data->tpl_stats_block_mis_log2);
+      total_mv_bits += frame_mv_bits;
+      vbr_rc_info->estimated_mv_bitrate_byframe[i] = frame_mv_bits;
+    } else {
+      vbr_rc_info->estimated_mv_bitrate_byframe[i] = 0;
+    }
+  }
+
+  // Scale the final result by the scale factor.
+  return total_mv_bits * vbr_rc_info->mv_scale_factors[gf_update_type];
+}
+#endif  // CONFIG_BITRATE_ACCURACY
+
+// Use upper and left neighbor block as the reference MVs.
+// Compute the minimum difference between current MV and reference MV.
+int_mv av1_compute_mv_difference(const TplDepFrame *tpl_frame, int row, int col,
+                                 int step, int tpl_stride, int right_shift) {
+  const TplDepStats *tpl_stats =
+      &tpl_frame
+           ->tpl_stats_ptr[av1_tpl_ptr_pos(row, col, tpl_stride, right_shift)];
+  int_mv current_mv = tpl_stats->mv[tpl_stats->ref_frame_index[0]];
+  int current_mv_magnitude =
+      abs(current_mv.as_mv.row) + abs(current_mv.as_mv.col);
+
+  // Retrieve the up and left neighbors.
+  int up_error = INT_MAX;
+  int_mv up_mv_diff;
+  if (row - step >= 0) {
+    tpl_stats = &tpl_frame->tpl_stats_ptr[av1_tpl_ptr_pos(
+        row - step, col, tpl_stride, right_shift)];
+    up_mv_diff = tpl_stats->mv[tpl_stats->ref_frame_index[0]];
+    up_mv_diff.as_mv.row = current_mv.as_mv.row - up_mv_diff.as_mv.row;
+    up_mv_diff.as_mv.col = current_mv.as_mv.col - up_mv_diff.as_mv.col;
+    up_error = abs(up_mv_diff.as_mv.row) + abs(up_mv_diff.as_mv.col);
+  }
+
+  int left_error = INT_MAX;
+  int_mv left_mv_diff;
+  if (col - step >= 0) {
+    tpl_stats = &tpl_frame->tpl_stats_ptr[av1_tpl_ptr_pos(
+        row, col - step, tpl_stride, right_shift)];
+    left_mv_diff = tpl_stats->mv[tpl_stats->ref_frame_index[0]];
+    left_mv_diff.as_mv.row = current_mv.as_mv.row - left_mv_diff.as_mv.row;
+    left_mv_diff.as_mv.col = current_mv.as_mv.col - left_mv_diff.as_mv.col;
+    left_error = abs(left_mv_diff.as_mv.row) + abs(left_mv_diff.as_mv.col);
+  }
+
+  // Return the MV with the minimum distance from current.
+  if (up_error < left_error && up_error < current_mv_magnitude) {
+    return up_mv_diff;
+  } else if (left_error < up_error && left_error < current_mv_magnitude) {
+    return left_mv_diff;
+  }
+  return current_mv;
+}
+
+/* Compute the entropy of motion vectors for a single frame. */
+double av1_tpl_compute_frame_mv_entropy(const TplDepFrame *tpl_frame,
+                                        uint8_t right_shift) {
+  if (!tpl_frame->is_valid) {
+    return 0;
+  }
+
+  int count_row[500] = { 0 };
+  int count_col[500] = { 0 };
+  int n = 0;  // number of MVs to process
+
+  const int tpl_stride = tpl_frame->stride;
+  const int step = 1 << right_shift;
+
+  for (int row = 0; row < tpl_frame->mi_rows; row += step) {
+    for (int col = 0; col < tpl_frame->mi_cols; col += step) {
+      int_mv mv = av1_compute_mv_difference(tpl_frame, row, col, step,
+                                            tpl_stride, right_shift);
+      count_row[clamp(mv.as_mv.row, 0, 499)] += 1;
+      count_col[clamp(mv.as_mv.row, 0, 499)] += 1;
+      n += 1;
+    }
+  }
+
+  // Estimate the bits used using the entropy formula.
+  double rate_row = 0;
+  double rate_col = 0;
+  for (int i = 0; i < 500; i++) {
+    if (count_row[i] != 0) {
+      double p = count_row[i] / (double)n;
+      rate_row += count_row[i] * -log2(p);
+    }
+    if (count_col[i] != 0) {
+      double p = count_col[i] / (double)n;
+      rate_col += count_col[i] * -log2(p);
+    }
+  }
+
+  return rate_row + rate_col;
+}
diff --git a/av1/encoder/tpl_model.h b/av1/encoder/tpl_model.h
index 49f1605..1cd6a6d 100644
--- a/av1/encoder/tpl_model.h
+++ b/av1/encoder/tpl_model.h
@@ -18,11 +18,22 @@
 
 /*!\cond */
 
+struct AV1_PRIMARY;
 struct AV1_COMP;
+struct AV1_SEQ_CODING_TOOLS;
 struct EncodeFrameParams;
 struct EncodeFrameInput;
+struct GF_GROUP;
 
-#include "av1/encoder/encoder.h"
+#include "config/aom_config.h"
+
+#include "aom_scale/yv12config.h"
+
+#include "av1/common/mv.h"
+#include "av1/common/scale.h"
+#include "av1/encoder/block.h"
+#include "av1/encoder/lookahead.h"
+#include "av1/encoder/ratectrl.h"
 
 static INLINE BLOCK_SIZE convert_length_to_bsize(int length) {
   switch (length) {
@@ -79,12 +90,14 @@
 // The first REF_FRAMES + 1 buffers are reserved.
 // tpl_data->tpl_frame starts after REF_FRAMES + 1
 #define MAX_LENGTH_TPL_FRAME_STATS (MAX_TPL_FRAME_IDX + REF_FRAMES + 1)
-#define MAX_TPL_EXTEND (MAX_LAG_BUFFERS - MAX_GF_INTERVAL)
 #define TPL_DEP_COST_SCALE_LOG2 4
 
+#define TPL_EPSILON 0.0000001
+
 typedef struct TplTxfmStats {
   double abs_coeff_sum[256];  // Assume we are using 16x16 transform block
   int txfm_block_count;
+  int coeff_num;
 } TplTxfmStats;
 
 typedef struct TplDepStats {
@@ -95,6 +108,7 @@
   int64_t cmp_recrf_dist[2];
   int64_t srcrf_rate;
   int64_t recrf_rate;
+  int64_t srcrf_sse;
   int64_t cmp_recrf_rate[2];
   int64_t mc_dep_rate;
   int64_t mc_dep_dist;
@@ -116,10 +130,6 @@
   int mi_cols;
   int base_rdmult;
   uint32_t frame_display_index;
-  double abs_coeff_sum[256];  // Assume we are using 16x16 transform block
-  double abs_coeff_mean[256];
-  int coeff_num;  // number of coefficients in a transform block
-  int txfm_block_count;
 } TplDepFrame;
 
 /*!\endcond */
@@ -128,6 +138,11 @@
  */
 typedef struct TplParams {
   /*!
+   * Whether the tpl stats is ready.
+   */
+  int ready;
+
+  /*!
    * Block granularity of tpl score storage.
    */
   uint8_t tpl_stats_block_mis_log2;
@@ -152,6 +167,12 @@
   TplDepStats *tpl_stats_pool[MAX_LAG_BUFFERS];
 
   /*!
+   * Buffer to store tpl transform stats per frame.
+   * txfm_stats_list[i] stores the TplTxfmStats of the ith frame in a gf group.
+   */
+  TplTxfmStats txfm_stats_list[MAX_LENGTH_TPL_FRAME_STATS];
+
+  /*!
    * Buffer to store tpl reconstructed frame.
    * tpl_rec_pool[i] stores the reconstructed frame of ith frame in a gf group.
    */
@@ -197,12 +218,128 @@
    */
   int border_in_pixels;
 
-  /*!
-   * Skip tpl setup when tpl data from gop length decision can be reused.
+#if CONFIG_BITRATE_ACCURACY
+  /*
+   * Estimated and actual GOP bitrate.
    */
-  int skip_tpl_setup_stats;
+  double estimated_gop_bitrate;
+  double actual_gop_bitrate;
+#endif
 } TplParams;
 
+#if CONFIG_BITRATE_ACCURACY
+/*!
+ * \brief This structure stores information needed for bitrate accuracy
+ * experiment.
+ */
+typedef struct {
+  double keyframe_bitrate;
+  double total_bit_budget;  // The total bit budget of the entire video
+  int show_frame_count;     // Number of show frames in the entire video
+
+  int gop_showframe_count;  // The number of show frames in the current gop
+  double gop_bit_budget;    // The bitbudget for the current gop
+  double scale_factors[FRAME_UPDATE_TYPES];     // Scale factors to improve the
+                                                // budget estimation
+  double mv_scale_factors[FRAME_UPDATE_TYPES];  // Scale factors to improve
+                                                // MV entropy estimation
+
+  // === Below this line are GOP related data that will be updated per GOP ===
+  int base_q_index;  // Stores the base q index.
+  int q_index_list_ready;
+  int q_index_list[MAX_LENGTH_TPL_FRAME_STATS];  // q indices for the current
+                                                 // GOP
+  // Arrays to store frame level bitrate accuracy data.
+  double estimated_bitrate_byframe[MAX_LENGTH_TPL_FRAME_STATS];
+  double estimated_mv_bitrate_byframe[MAX_LENGTH_TPL_FRAME_STATS];
+  int actual_bitrate_byframe[MAX_LENGTH_TPL_FRAME_STATS];
+  int actual_mv_bitrate_byframe[MAX_LENGTH_TPL_FRAME_STATS];
+  int actual_coeff_bitrate_byframe[MAX_LENGTH_TPL_FRAME_STATS];
+
+  // Array to store qstep_ratio for each frame in a GOP
+  double qstep_ratio_list[MAX_LENGTH_TPL_FRAME_STATS];
+} VBR_RATECTRL_INFO;
+
+static INLINE void vbr_rc_reset_gop_data(VBR_RATECTRL_INFO *vbr_rc_info) {
+  vbr_rc_info->q_index_list_ready = 0;
+  av1_zero(vbr_rc_info->q_index_list);
+  av1_zero(vbr_rc_info->estimated_bitrate_byframe);
+  av1_zero(vbr_rc_info->estimated_mv_bitrate_byframe);
+  av1_zero(vbr_rc_info->actual_bitrate_byframe);
+  av1_zero(vbr_rc_info->actual_mv_bitrate_byframe);
+  av1_zero(vbr_rc_info->actual_coeff_bitrate_byframe);
+}
+
+static INLINE void vbr_rc_init(VBR_RATECTRL_INFO *vbr_rc_info,
+                               double total_bit_budget, int show_frame_count) {
+  vbr_rc_info->total_bit_budget = total_bit_budget;
+  vbr_rc_info->show_frame_count = show_frame_count;
+  vbr_rc_info->keyframe_bitrate = 0;
+  const double scale_factors[FRAME_UPDATE_TYPES] = { 0.94559, 0.12040, 1,
+                                                     1.10199, 1,       1,
+                                                     0.16393 };
+  const double mv_scale_factors[FRAME_UPDATE_TYPES] = { 3, 3, 3, 3, 3, 3, 3 };
+  memcpy(vbr_rc_info->scale_factors, scale_factors,
+         sizeof(scale_factors[0]) * FRAME_UPDATE_TYPES);
+  memcpy(vbr_rc_info->mv_scale_factors, mv_scale_factors,
+         sizeof(mv_scale_factors[0]) * FRAME_UPDATE_TYPES);
+
+  vbr_rc_reset_gop_data(vbr_rc_info);
+}
+
+static INLINE void vbr_rc_set_gop_bit_budget(VBR_RATECTRL_INFO *vbr_rc_info,
+                                             int gop_showframe_count) {
+  vbr_rc_info->gop_showframe_count = gop_showframe_count;
+  vbr_rc_info->gop_bit_budget = vbr_rc_info->total_bit_budget *
+                                gop_showframe_count /
+                                vbr_rc_info->show_frame_count;
+}
+
+static INLINE void vbr_rc_set_keyframe_bitrate(VBR_RATECTRL_INFO *vbr_rc_info,
+                                               double keyframe_bitrate) {
+  vbr_rc_info->keyframe_bitrate = keyframe_bitrate;
+}
+
+static INLINE void vbr_rc_info_log(const VBR_RATECTRL_INFO *vbr_rc_info,
+                                   int gf_frame_index, int gf_group_size,
+                                   FRAME_UPDATE_TYPE *update_type) {
+  // Add +2 here because this is the last frame this method is called at.
+  if (gf_frame_index + 2 >= gf_group_size) {
+    printf(
+        "\ni, \test_bitrate, \test_mv_bitrate, \tact_bitrate, "
+        "\tact_mv_bitrate, \tact_coeff_bitrate, \tq, \tupdate_type\n");
+    for (int i = 0; i < gf_group_size; i++) {
+      printf("%d, \t%f, \t%f, \t%d, \t%d, \t%d, \t%d, \t%d\n", i,
+             vbr_rc_info->estimated_bitrate_byframe[i],
+             vbr_rc_info->estimated_mv_bitrate_byframe[i],
+             vbr_rc_info->actual_bitrate_byframe[i],
+             vbr_rc_info->actual_mv_bitrate_byframe[i],
+             vbr_rc_info->actual_coeff_bitrate_byframe[i],
+             vbr_rc_info->q_index_list[i], update_type[i]);
+    }
+  }
+}
+
+#endif  // CONFIG_BITRATE_ACCURACY
+
+#if CONFIG_RD_COMMAND
+typedef enum {
+  RD_OPTION_NONE,
+  RD_OPTION_SET_Q,
+  RD_OPTION_SET_Q_RDMULT
+} RD_OPTION;
+
+typedef struct RD_COMMAND {
+  RD_OPTION option_ls[MAX_LENGTH_TPL_FRAME_STATS];
+  int q_index_ls[MAX_LENGTH_TPL_FRAME_STATS];
+  int rdmult_ls[MAX_LENGTH_TPL_FRAME_STATS];
+  int frame_count;
+  int frame_index;
+} RD_COMMAND;
+
+void av1_read_rd_command(const char *filepath, RD_COMMAND *rd_command);
+#endif  // CONFIG_RD_COMMAND
+
 /*!\brief Allocate buffers used by tpl model
  *
  * \param[in]    Top-level encode/decode structure
@@ -211,8 +348,9 @@
  * \param[out]   tpl_data  tpl data structure
  */
 
-void av1_setup_tpl_buffers(AV1_COMMON *const cm, TplParams *const tpl_data,
-                           int lag_in_frames);
+void av1_setup_tpl_buffers(struct AV1_PRIMARY *const ppi,
+                           CommonModeInfoParams *const mi_params, int width,
+                           int height, int byte_alignment, int lag_in_frames);
 
 /*!\brief Implements temporal dependency modelling for a GOP (GF/ARF
  * group) and selects between 16 and 32 frame GOP structure.
@@ -232,10 +370,15 @@
 
 /*!\cond */
 
+void av1_tpl_preload_rc_estimate(
+    struct AV1_COMP *cpi, const struct EncodeFrameParams *const frame_params);
+
 int av1_tpl_ptr_pos(int mi_row, int mi_col, int stride, uint8_t right_shift);
 
 void av1_init_tpl_stats(TplParams *const tpl_data);
 
+int av1_tpl_stats_ready(const TplParams *tpl_data, int gf_frame_index);
+
 void av1_tpl_rdmult_setup(struct AV1_COMP *cpi);
 
 void av1_tpl_rdmult_setup_sb(struct AV1_COMP *cpi, MACROBLOCK *const x,
@@ -277,7 +420,7 @@
 /*!\brief  Compute the frame rate using transform block stats
  *
  * Assume each position i in the transform block is of Laplace distribution
- * with maximum absolute deviation abs_coeff_mean[i]
+ * with mean absolute deviation abs_coeff_mean[i]
  *
  * Then we can use av1_laplace_entropy() to compute the expected frame
  * rate.
@@ -286,7 +429,7 @@
  *
  * \param[in]    q_index         quantizer index
  * \param[in]    block_count     number of transform blocks
- * \param[in]    abs_coeff_mean  array of maximum absolute deviation
+ * \param[in]    abs_coeff_mean  array of mean absolute deviation
  * \param[in]    coeff_num       number of coefficients per transform block
  *
  * \return expected frame rate
@@ -295,15 +438,230 @@
                                        const double *abs_coeff_mean,
                                        int coeff_num);
 
-/*!\brief  Init data structure storing transform stats
+/*
+ *!\brief Compute the number of bits needed to encode a GOP
+ *
+ * \param[in]    q_index_list      array of q_index, one per frame
+ * \param[in]    frame_count       number of frames in the GOP
+ * \param[in]    stats             array of transform stats, one per frame
+ * \param[in]    stats_valid_list  List indicates whether transform stats
+ *                                 exists
+ * \param[out]   bitrate_byframe_list    Array to keep track of frame bitrate
+ *
+ * \return The estimated GOP bitrate.
+ *
+ */
+double av1_estimate_gop_bitrate(const int *q_index_list, const int frame_count,
+                                const TplTxfmStats *stats,
+                                const int *stats_valid_list,
+                                double *bitrate_byframe_list);
+
+/*
+ *!\brief Init TplTxfmStats
+ *
+ * \param[in]    tpl_txfm_stats  a structure for storing transform stats
+ *
+ */
+void av1_init_tpl_txfm_stats(TplTxfmStats *tpl_txfm_stats);
+
+/*
+ *!\brief Accumulate TplTxfmStats
+ *
+ * \param[in]  sub_stats          a structure for storing sub transform stats
+ * \param[out] accumulated_stats  a structure for storing accumulated transform
+ *stats
+ *
+ */
+void av1_accumulate_tpl_txfm_stats(const TplTxfmStats *sub_stats,
+                                   TplTxfmStats *accumulated_stats);
+
+/*
+ *!\brief Record a transform block into  TplTxfmStats
+ *
+ * \param[in]  tpl_txfm_stats     A structure for storing transform stats
+ * \param[out] coeff              An array of transform coefficients. Its size
+ *                                should equal to tpl_txfm_stats.coeff_num.
+ *
+ */
+void av1_record_tpl_txfm_block(TplTxfmStats *tpl_txfm_stats,
+                               const tran_low_t *coeff);
+
+/*!\brief  Estimate coefficient entropy using Laplace dsitribution
  *
  *\ingroup tpl_modelling
  *
- * \param[in]    tpl_frame       pointer of tpl frame data structure
- * \param[in]    coeff_num       number of coefficients per transform block
+ * This function is equivalent to -log2(laplace_prob()), where laplace_prob() is
+ * defined in tpl_model_test.cc
+ *
+ * \param[in]    q_step          quantizer step size without any scaling
+ * \param[in]    b               mean absolute deviation of Laplace distribution
+ * \param[in]    zero_bin_ratio  zero bin's size is zero_bin_ratio * q_step
+ * \param[in]    qcoeff          quantized coefficient
+ *
+ * \return estimated coefficient entropy
  *
  */
-void av1_tpl_stats_init_txfm_stats(TplDepFrame *tpl_frame, int coeff_num);
+double av1_estimate_coeff_entropy(double q_step, double b,
+                                  double zero_bin_ratio, int qcoeff);
+
+/*!\brief  Estimate entropy of a transform block using Laplace dsitribution
+ *
+ *\ingroup tpl_modelling
+ *
+ * \param[in]    q_index         quantizer index
+ * \param[in]    abs_coeff_mean  array of mean absolute deviations
+ * \param[in]    qcoeff_arr      array of quantized coefficients
+ * \param[in]    coeff_num       number of coefficients per transform block
+ *
+ * \return estimated transform block entropy
+ *
+ */
+double av1_estimate_txfm_block_entropy(int q_index,
+                                       const double *abs_coeff_mean,
+                                       int *qcoeff_arr, int coeff_num);
+
+// TODO(angiebird): Add doxygen description here.
+int64_t av1_delta_rate_cost(int64_t delta_rate, int64_t recrf_dist,
+                            int64_t srcrf_dist, int pix_num);
+
+/*!\brief  Compute the overlap area between two blocks with the same size
+ *
+ *\ingroup tpl_modelling
+ *
+ * If there is no overlap, this function should return zero.
+ *
+ * \param[in]    row_a  row position of the first block
+ * \param[in]    col_a  column position of the first block
+ * \param[in]    row_b  row position of the second block
+ * \param[in]    col_b  column position of the second block
+ * \param[in]    width  width shared by the two blocks
+ * \param[in]    height height shared by the two blocks
+ *
+ * \return overlap area of the two blocks
+ */
+int av1_get_overlap_area(int row_a, int col_a, int row_b, int col_b, int width,
+                         int height);
+
+/*!\brief Estimate the optimal base q index for a GOP.
+ *
+ * This function picks q based on a chosen bit rate. It
+ * estimates the bit rate using the starting base q, then uses
+ * a binary search to find q to achieve the specified bit rate.
+ *
+ * \param[in]       gf_group          GOP structure
+ * \param[in]       txfm_stats_list   Transform stats struct
+ * \param[in]       stats_valid_list  List indicates whether transform stats
+ *                                    exists
+ * \param[in]       bit_budget        The specified bit budget to achieve
+ * \param[in]       gf_frame_index    current frame in the GOP
+ * \param[in]       bit_depth         bit depth
+ * \param[in]       scale_factor      Scale factor to improve budget estimation
+ * \param[in]       qstep_ratio_list  Stores the qstep_ratio for each frame
+ * \param[out]      q_index_list      array of q_index, one per frame
+ * \param[out]      estimated_bitrate_byframe  bits usage per frame in the GOP
+ *
+ * \return Returns the optimal base q index to use.
+ */
+int av1_q_mode_estimate_base_q(const struct GF_GROUP *gf_group,
+                               const TplTxfmStats *txfm_stats_list,
+                               const int *stats_valid_list, double bit_budget,
+                               int gf_frame_index, aom_bit_depth_t bit_depth,
+                               double scale_factor,
+                               const double *qstep_ratio_list,
+                               int *q_index_list,
+                               double *estimated_bitrate_byframe);
+
+/*!\brief Get current frame's q_index from tpl stats and leaf_qindex
+ *
+ * \param[in]       tpl_data          TPL struct
+ * \param[in]       gf_frame_index    current frame index in the GOP
+ * \param[in]       leaf_qindex       q index of leaf frame
+ * \param[in]       bit_depth         bit depth
+ *
+ * \return q_index
+ */
+int av1_tpl_get_q_index(const TplParams *tpl_data, int gf_frame_index,
+                        int leaf_qindex, aom_bit_depth_t bit_depth);
+
+/*!\brief Compute the ratio between arf q step and the leaf q step based on TPL
+ * stats
+ *
+ * \param[in]       tpl_data          TPL struct
+ * \param[in]       gf_frame_index    current frame index in the GOP
+ *
+ * Note: the leaf q index and bit depth are not taken as parameters.
+ *
+ * \return qstep_ratio
+ */
+double av1_tpl_get_qstep_ratio(const TplParams *tpl_data, int gf_frame_index);
+
+/*!\brief Find a q index whose step size is near qstep_ratio * leaf_qstep
+ *
+ * \param[in]       leaf_qindex       q index of leaf frame
+ * \param[in]       qstep_ratio       step ratio between target q index and leaf
+ *                                    q index
+ * \param[in]       bit_depth         bit depth
+ *
+ * \return q_index
+ */
+int av1_get_q_index_from_qstep_ratio(int leaf_qindex, double qstep_ratio,
+                                     aom_bit_depth_t bit_depth);
+
+#if CONFIG_BITRATE_ACCURACY
+/*!\brief Update q_index_list in vbr_rc_info based on tpl stats
+ *
+ * \param[out]      vbr_rc_info    Rate control info for BITRATE_ACCURACY
+ *                                 experiment
+ * \param[in]       tpl_data       TPL struct
+ * \param[in]       gf_group       GOP struct
+ * \param[in]       gf_frame_index current frame index in the GOP
+ * \param[in]       bit_depth      bit depth
+ */
+void av1_vbr_rc_update_q_index_list(VBR_RATECTRL_INFO *vbr_rc_info,
+                                    const TplParams *tpl_data,
+                                    const struct GF_GROUP *gf_group,
+                                    int gf_frame_index,
+                                    aom_bit_depth_t bit_depth);
+
+/*!\brief For a GOP, calculate the bits used by motion vectors.
+ *
+ * \param[in]       tpl_data          TPL struct
+ * \param[in]       gf_group_size     Size of the GOP
+ * \param[in]       gf_frame_index    Current frame index
+ * \param[in]       gf_update_type    Frame update type
+ * \param[in]       vbr_rc_info       Rate control info struct
+ *
+ * \return Bits used by the motion vectors for the GOP.
+ */
+double av1_tpl_compute_mv_bits(const TplParams *tpl_data, int gf_group_size,
+                               int gf_frame_index, int gf_update_type,
+                               VBR_RATECTRL_INFO *vbr_rc_info);
+#endif  // CONFIG_BITRATE_ACCURACY
+
+/*!\brief Improve the motion vector estimation by taking neighbors into account.
+ *
+ * Use the upper and left neighbor block as the reference MVs.
+ * Compute the minimum difference between current MV and reference MV.
+ *
+ * \param[in]       tpl_frame         Tpl frame struct
+ * \param[in]       row               Current row
+ * \param[in]       col               Current column
+ * \param[in]       step              Step parameter for av1_tpl_ptr_pos
+ * \param[in]       tpl_stride        Stride parameter for av1_tpl_ptr_pos
+ * \param[in]       right_shift       Right shift parameter for av1_tpl_ptr_pos
+ */
+int_mv av1_compute_mv_difference(const TplDepFrame *tpl_frame, int row, int col,
+                                 int step, int tpl_stride, int right_shift);
+
+/*!\brief Compute the entropy of motion vectors for a single frame.
+ *
+ * \param[in]       tpl_frame         TPL frame struct
+ * \param[in]       right_shift       right shift value for step
+ *
+ * \return Bits used by the motion vectors for one frame.
+ */
+double av1_tpl_compute_frame_mv_entropy(const TplDepFrame *tpl_frame,
+                                        uint8_t right_shift);
 
 /*!\endcond */
 #ifdef __cplusplus
diff --git a/av1/encoder/tune_butteraugli.c b/av1/encoder/tune_butteraugli.c
index c43cdeb..c5bbee1 100644
--- a/av1/encoder/tune_butteraugli.c
+++ b/av1/encoder/tune_butteraugli.c
@@ -14,7 +14,6 @@
 #include "av1/encoder/tune_butteraugli.h"
 
 #include "aom_dsp/butteraugli.h"
-#include "aom_ports/system_state.h"
 #include "av1/encoder/encodeframe.h"
 #include "av1/encoder/encoder_utils.h"
 #include "av1/encoder/extend.h"
@@ -27,7 +26,7 @@
                                               const YV12_BUFFER_CONFIG *recon,
                                               const double K) {
   AV1_COMMON *const cm = &cpi->common;
-  SequenceHeader *const seq_params = &cm->seq_params;
+  SequenceHeader *const seq_params = cm->seq_params;
   const CommonModeInfoParams *const mi_params = &cm->mi_params;
   const aom_color_range_t color_range =
       seq_params->color_range != 0 ? AOM_CR_FULL_RANGE : AOM_CR_STUDIO_RANGE;
@@ -42,7 +41,7 @@
   if (!aom_calc_butteraugli(source, recon, bit_depth,
                             seq_params->matrix_coefficients, color_range,
                             diffmap)) {
-    aom_internal_error(&cm->error, AOM_CODEC_ERROR,
+    aom_internal_error(cm->error, AOM_CODEC_ERROR,
                        "Failed to calculate Butteraugli distances.");
   }
 
@@ -148,7 +147,6 @@
   double num_of_mi = 0.0;
   double geom_mean_of_scale = 0.0;
 
-  aom_clear_system_state();
   for (int row = mi_row / num_mi_w;
        row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) {
     for (int col = mi_col / num_mi_h;
@@ -164,7 +162,6 @@
   *rdmult = (int)((double)(*rdmult) * geom_mean_of_scale + 0.5);
   *rdmult = AOMMAX(*rdmult, 0);
   av1_set_error_per_bit(&x->errorperbit, *rdmult);
-  aom_clear_system_state();
 }
 
 static void copy_plane(const uint8_t *src, int src_stride, uint8_t *dst,
@@ -202,7 +199,6 @@
 }
 
 void av1_setup_butteraugli_source(AV1_COMP *cpi) {
-  aom_clear_system_state();
   YV12_BUFFER_CONFIG *const dst = &cpi->butteraugli_info.source;
   AV1_COMMON *const cm = &cpi->common;
   const int width = cpi->source->y_crop_width;
@@ -212,7 +208,7 @@
   const int ss_y = cpi->source->subsampling_y;
   if (dst->buffer_alloc_sz == 0) {
     aom_alloc_frame_buffer(
-        dst, width, height, ss_x, ss_y, cm->seq_params.use_highbitdepth,
+        dst, width, height, ss_x, ss_y, cm->seq_params->use_highbitdepth,
         cpi->oxcf.border_in_pixels, cm->features.byte_alignment);
   }
   av1_copy_and_extend_frame(cpi->source, dst);
@@ -221,7 +217,7 @@
   if (resized_dst->buffer_alloc_sz == 0) {
     aom_alloc_frame_buffer(
         resized_dst, width / resize_factor, height / resize_factor, ss_x, ss_y,
-        cm->seq_params.use_highbitdepth, cpi->oxcf.border_in_pixels,
+        cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels,
         cm->features.byte_alignment);
   }
   av1_resize_and_extend_frame_nonnormative(cpi->source, resized_dst, bit_depth,
@@ -230,11 +226,9 @@
   zero_img(cpi->source);
   copy_img(resized_dst, cpi->source, width / resize_factor,
            height / resize_factor);
-  aom_clear_system_state();
 }
 
 void av1_setup_butteraugli_rdmult_and_restore_source(AV1_COMP *cpi, double K) {
-  aom_clear_system_state();
   av1_copy_and_extend_frame(&cpi->butteraugli_info.source, cpi->source);
   AV1_COMMON *const cm = &cpi->common;
   const int width = cpi->source->y_crop_width;
@@ -246,7 +240,7 @@
   memset(&resized_recon, 0, sizeof(resized_recon));
   aom_alloc_frame_buffer(
       &resized_recon, width / resize_factor, height / resize_factor, ss_x, ss_y,
-      cm->seq_params.use_highbitdepth, cpi->oxcf.border_in_pixels,
+      cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels,
       cm->features.byte_alignment);
   copy_img(&cpi->common.cur_frame->buf, &resized_recon, width / resize_factor,
            height / resize_factor);
@@ -255,7 +249,6 @@
                                     &resized_recon, K);
   cpi->butteraugli_info.recon_set = true;
   aom_free_frame_buffer(&resized_recon);
-  aom_clear_system_state();
 }
 
 void av1_setup_butteraugli_rdmult(AV1_COMP *cpi) {
@@ -263,7 +256,6 @@
   const AV1EncoderConfig *const oxcf = &cpi->oxcf;
   const QuantizationCfg *const q_cfg = &oxcf->q_cfg;
   const int q_index = 96;
-  aom_clear_system_state();
 
   // Setup necessary params for encoding, including frame source, etc.
   if (cm->current_frame.frame_type == KEY_FRAME) copy_frame_prob_info(cpi);
@@ -307,7 +299,7 @@
   av1_set_speed_features_qindex_dependent(cpi, oxcf->speed);
   if (q_cfg->deltaq_mode != NO_DELTA_Q || q_cfg->enable_chroma_deltaq)
     av1_init_quantizer(&cpi->enc_quant_dequant_params, &cm->quant_params,
-                       cm->seq_params.bit_depth);
+                       cm->seq_params->bit_depth);
 
   av1_set_variance_partition_thresholds(cpi, q_index, 0);
   av1_encode_frame(cpi);
diff --git a/av1/encoder/tune_vmaf.c b/av1/encoder/tune_vmaf.c
index 6008d64..405dd6c 100644
--- a/av1/encoder/tune_vmaf.c
+++ b/av1/encoder/tune_vmaf.c
@@ -12,7 +12,6 @@
 #include "av1/encoder/tune_vmaf.h"
 
 #include "aom_dsp/psnr.h"
-#include "aom_ports/system_state.h"
 #include "av1/encoder/extend.h"
 #include "av1/encoder/rdopt.h"
 #include "config/aom_scale_rtcd.h"
@@ -87,9 +86,9 @@
   assert(y_stride == ref->y_stride);
   const int y_offset = mb_row * mb_height * y_stride + mb_col * mb_width;
   const int mv_offset = ref_mv.row * y_stride + ref_mv.col;
-  const unsigned int var =
-      cpi->fn_ptr[block_size].vf(ref->y_buffer + y_offset + mv_offset, y_stride,
-                                 src->y_buffer + y_offset, y_stride, sse);
+  const unsigned int var = cpi->ppi->fn_ptr[block_size].vf(
+      ref->y_buffer + y_offset + mv_offset, y_stride, src->y_buffer + y_offset,
+      y_stride, sse);
   return var;
 }
 
@@ -115,7 +114,7 @@
       buf.buf = (uint8_t *)y_buffer + row_offset_y * y_stride + col_offset_y;
       buf.stride = y_stride;
 
-      if (cpi->common.seq_params.use_highbitdepth) {
+      if (cpi->common.seq_params->use_highbitdepth) {
         assert(frame->flags & YV12_FLAG_HIGHBITDEPTH);
         var += av1_high_get_sby_perpixel_variance(cpi, &buf, block_size,
                                                   bit_depth);
@@ -232,7 +231,7 @@
                                const YV12_BUFFER_CONFIG *blurred,
                                const YV12_BUFFER_CONFIG *dst, double amount) {
   const int bit_depth = cpi->td.mb.e_mbd.bd;
-  if (cpi->common.seq_params.use_highbitdepth) {
+  if (cpi->common.seq_params->use_highbitdepth) {
     assert(source->flags & YV12_FLAG_HIGHBITDEPTH);
     assert(blurred->flags & YV12_FLAG_HIGHBITDEPTH);
     assert(dst->flags & YV12_FLAG_HIGHBITDEPTH);
@@ -343,10 +342,10 @@
   const int height = source->y_height;
   YV12_BUFFER_CONFIG sharpened;
   memset(&sharpened, 0, sizeof(sharpened));
-  aom_alloc_frame_buffer(&sharpened, width, height, source->subsampling_x,
-                         source->subsampling_y, cm->seq_params.use_highbitdepth,
-                         cpi->oxcf.border_in_pixels,
-                         cm->features.byte_alignment);
+  aom_alloc_frame_buffer(
+      &sharpened, width, height, source->subsampling_x, source->subsampling_y,
+      cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels,
+      cm->features.byte_alignment);
 
   const double baseline_variance = frame_average_variance(cpi, source);
   double unsharp_amount;
@@ -380,13 +379,12 @@
 
 void av1_vmaf_neg_preprocessing(AV1_COMP *const cpi,
                                 YV12_BUFFER_CONFIG *const source) {
-  aom_clear_system_state();
   const AV1_COMMON *const cm = &cpi->common;
   const int bit_depth = cpi->td.mb.e_mbd.bd;
   const int width = source->y_width;
   const int height = source->y_height;
 
-  const GF_GROUP *const gf_group = &cpi->gf_group;
+  const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
   const int layer_depth =
       AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], MAX_ARF_LAYERS - 1);
   const double best_frame_unsharp_amount =
@@ -396,20 +394,18 @@
 
   YV12_BUFFER_CONFIG blurred;
   memset(&blurred, 0, sizeof(blurred));
-  aom_alloc_frame_buffer(&blurred, width, height, source->subsampling_x,
-                         source->subsampling_y, cm->seq_params.use_highbitdepth,
-                         cpi->oxcf.border_in_pixels,
-                         cm->features.byte_alignment);
+  aom_alloc_frame_buffer(
+      &blurred, width, height, source->subsampling_x, source->subsampling_y,
+      cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels,
+      cm->features.byte_alignment);
 
   gaussian_blur(bit_depth, source, &blurred);
   unsharp(cpi, source, &blurred, source, best_frame_unsharp_amount);
   aom_free_frame_buffer(&blurred);
-  aom_clear_system_state();
 }
 
 void av1_vmaf_frame_preprocessing(AV1_COMP *const cpi,
                                   YV12_BUFFER_CONFIG *const source) {
-  aom_clear_system_state();
   const AV1_COMMON *const cm = &cpi->common;
   const int bit_depth = cpi->td.mb.e_mbd.bd;
   const int width = source->y_width;
@@ -418,20 +414,20 @@
   YV12_BUFFER_CONFIG source_extended, blurred;
   memset(&source_extended, 0, sizeof(source_extended));
   memset(&blurred, 0, sizeof(blurred));
-  aom_alloc_frame_buffer(&source_extended, width, height, source->subsampling_x,
-                         source->subsampling_y, cm->seq_params.use_highbitdepth,
-                         cpi->oxcf.border_in_pixels,
-                         cm->features.byte_alignment);
-  aom_alloc_frame_buffer(&blurred, width, height, source->subsampling_x,
-                         source->subsampling_y, cm->seq_params.use_highbitdepth,
-                         cpi->oxcf.border_in_pixels,
-                         cm->features.byte_alignment);
+  aom_alloc_frame_buffer(
+      &source_extended, width, height, source->subsampling_x,
+      source->subsampling_y, cm->seq_params->use_highbitdepth,
+      cpi->oxcf.border_in_pixels, cm->features.byte_alignment);
+  aom_alloc_frame_buffer(
+      &blurred, width, height, source->subsampling_x, source->subsampling_y,
+      cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels,
+      cm->features.byte_alignment);
 
   av1_copy_and_extend_frame(source, &source_extended);
   gaussian_blur(bit_depth, &source_extended, &blurred);
   aom_free_frame_buffer(&source_extended);
 
-  const GF_GROUP *const gf_group = &cpi->gf_group;
+  const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
   const int layer_depth =
       AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], MAX_ARF_LAYERS - 1);
   const double last_frame_unsharp_amount =
@@ -445,12 +441,10 @@
 
   unsharp(cpi, source, &blurred, source, best_frame_unsharp_amount);
   aom_free_frame_buffer(&blurred);
-  aom_clear_system_state();
 }
 
 void av1_vmaf_blk_preprocessing(AV1_COMP *const cpi,
                                 YV12_BUFFER_CONFIG *const source) {
-  aom_clear_system_state();
   const AV1_COMMON *const cm = &cpi->common;
   const int width = source->y_width;
   const int height = source->y_height;
@@ -462,10 +456,10 @@
   memset(&blurred, 0, sizeof(blurred));
   memset(&source_extended, 0, sizeof(source_extended));
   aom_alloc_frame_buffer(
-      &blurred, width, height, ss_x, ss_y, cm->seq_params.use_highbitdepth,
+      &blurred, width, height, ss_x, ss_y, cm->seq_params->use_highbitdepth,
       cpi->oxcf.border_in_pixels, cm->features.byte_alignment);
   aom_alloc_frame_buffer(&source_extended, width, height, ss_x, ss_y,
-                         cm->seq_params.use_highbitdepth,
+                         cm->seq_params->use_highbitdepth,
                          cpi->oxcf.border_in_pixels,
                          cm->features.byte_alignment);
 
@@ -473,7 +467,7 @@
   gaussian_blur(bit_depth, &source_extended, &blurred);
   aom_free_frame_buffer(&source_extended);
 
-  const GF_GROUP *const gf_group = &cpi->gf_group;
+  const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
   const int layer_depth =
       AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], MAX_ARF_LAYERS - 1);
   const double last_frame_unsharp_amount =
@@ -499,11 +493,11 @@
   memset(&source_block, 0, sizeof(source_block));
   memset(&blurred_block, 0, sizeof(blurred_block));
   aom_alloc_frame_buffer(&source_block, block_w, block_h, ss_x, ss_y,
-                         cm->seq_params.use_highbitdepth,
+                         cm->seq_params->use_highbitdepth,
                          cpi->oxcf.border_in_pixels,
                          cm->features.byte_alignment);
   aom_alloc_frame_buffer(&blurred_block, block_w, block_h, ss_x, ss_y,
-                         cm->seq_params.use_highbitdepth,
+                         cm->seq_params->use_highbitdepth,
                          cpi->oxcf.border_in_pixels,
                          cm->features.byte_alignment);
 
@@ -515,7 +509,7 @@
       const int block_height = AOMMIN(height - row_offset_y, block_h);
       const int index = col + row * num_cols;
 
-      if (cm->seq_params.use_highbitdepth) {
+      if (cm->seq_params->use_highbitdepth) {
         assert(source->flags & YV12_FLAG_HIGHBITDEPTH);
         assert(blurred.flags & YV12_FLAG_HIGHBITDEPTH);
         uint16_t *frame_src_buf = CONVERT_TO_SHORTPTR(source->y_buffer) +
@@ -584,7 +578,7 @@
       const int block_height = AOMMIN(source->y_height - row_offset_y, block_h);
       const int index = col + row * num_cols;
 
-      if (cm->seq_params.use_highbitdepth) {
+      if (cm->seq_params->use_highbitdepth) {
         assert(source->flags & YV12_FLAG_HIGHBITDEPTH);
         assert(blurred.flags & YV12_FLAG_HIGHBITDEPTH);
         uint16_t *src_buf = CONVERT_TO_SHORTPTR(source->y_buffer) +
@@ -611,7 +605,6 @@
   aom_free_frame_buffer(&blurred_block);
   aom_free_frame_buffer(&blurred);
   aom_free(best_unsharp_amounts);
-  aom_clear_system_state();
 }
 
 void av1_set_mb_vmaf_rdmult_scaling(AV1_COMP *cpi) {
@@ -624,12 +617,11 @@
   const int ss_x = cpi->source->subsampling_x;
   const int ss_y = cpi->source->subsampling_y;
 
-  aom_clear_system_state();
   YV12_BUFFER_CONFIG resized_source;
   memset(&resized_source, 0, sizeof(resized_source));
   aom_alloc_frame_buffer(
       &resized_source, y_width / resize_factor, y_height / resize_factor, ss_x,
-      ss_y, cm->seq_params.use_highbitdepth, cpi->oxcf.border_in_pixels,
+      ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels,
       cm->features.byte_alignment);
   av1_resize_and_extend_frame_nonnormative(cpi->source, &resized_source,
                                            bit_depth, av1_num_planes(cm));
@@ -646,7 +638,7 @@
   YV12_BUFFER_CONFIG blurred;
   memset(&blurred, 0, sizeof(blurred));
   aom_alloc_frame_buffer(&blurred, resized_y_width, resized_y_height, ss_x,
-                         ss_y, cm->seq_params.use_highbitdepth,
+                         ss_y, cm->seq_params->use_highbitdepth,
                          cpi->oxcf.border_in_pixels,
                          cm->features.byte_alignment);
   gaussian_blur(bit_depth, &resized_source, &blurred);
@@ -654,7 +646,7 @@
   YV12_BUFFER_CONFIG recon;
   memset(&recon, 0, sizeof(recon));
   aom_alloc_frame_buffer(&recon, resized_y_width, resized_y_height, ss_x, ss_y,
-                         cm->seq_params.use_highbitdepth,
+                         cm->seq_params->use_highbitdepth,
                          cpi->oxcf.border_in_pixels,
                          cm->features.byte_alignment);
   aom_yv12_copy_frame(&resized_source, &recon, 1);
@@ -679,14 +671,14 @@
       uint8_t *const blurred_buf =
           blurred.y_buffer + row_offset_y * blurred.y_stride + col_offset_y;
 
-      cpi->fn_ptr[resized_block_size].vf(orig_buf, resized_source.y_stride,
-                                         blurred_buf, blurred.y_stride,
-                                         &sses[index]);
+      cpi->ppi->fn_ptr[resized_block_size].vf(orig_buf, resized_source.y_stride,
+                                              blurred_buf, blurred.y_stride,
+                                              &sses[index]);
 
       uint8_t *const recon_buf =
           recon.y_buffer + row_offset_y * recon.y_stride + col_offset_y;
       // Set recon buf
-      if (cpi->common.seq_params.use_highbitdepth) {
+      if (cpi->common.seq_params->use_highbitdepth) {
         highbd_unsharp_rect(CONVERT_TO_SHORTPTR(blurred_buf), blurred.y_stride,
                             CONVERT_TO_SHORTPTR(blurred_buf), blurred.y_stride,
                             CONVERT_TO_SHORTPTR(recon_buf), recon.y_stride,
@@ -701,7 +693,7 @@
                           index);
 
       // Restore recon buf
-      if (cpi->common.seq_params.use_highbitdepth) {
+      if (cpi->common.seq_params->use_highbitdepth) {
         highbd_unsharp_rect(
             CONVERT_TO_SHORTPTR(orig_buf), resized_source.y_stride,
             CONVERT_TO_SHORTPTR(orig_buf), resized_source.y_stride,
@@ -742,7 +734,6 @@
   aom_free_frame_buffer(&blurred);
   aom_close_vmaf_context(vmaf_context);
   aom_free(sses);
-  aom_clear_system_state();
 }
 
 void av1_set_vmaf_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x,
@@ -761,7 +752,6 @@
   double num_of_mi = 0.0;
   double geom_mean_of_scale = 0.0;
 
-  aom_clear_system_state();
   for (row = mi_row / num_mi_w;
        row < num_rows && row < mi_row / num_mi_w + num_brows; ++row) {
     for (col = mi_col / num_mi_h;
@@ -776,7 +766,6 @@
   *rdmult = (int)((double)(*rdmult) * geom_mean_of_scale + 0.5);
   *rdmult = AOMMAX(*rdmult, 0);
   av1_set_error_per_bit(&x->errorperbit, *rdmult);
-  aom_clear_system_state();
 }
 
 // TODO(sdeng): replace them with the SIMD versions.
@@ -833,15 +822,15 @@
   memset(&blurred_next, 0, sizeof(blurred_next));
 
   aom_alloc_frame_buffer(&blurred_cur, y_width, y_height, ss_x, ss_y,
-                         cm->seq_params.use_highbitdepth,
+                         cm->seq_params->use_highbitdepth,
                          cpi->oxcf.border_in_pixels,
                          cm->features.byte_alignment);
   aom_alloc_frame_buffer(&blurred_last, y_width, y_height, ss_x, ss_y,
-                         cm->seq_params.use_highbitdepth,
+                         cm->seq_params->use_highbitdepth,
                          cpi->oxcf.border_in_pixels,
                          cm->features.byte_alignment);
   aom_alloc_frame_buffer(&blurred_next, y_width, y_height, ss_x, ss_y,
-                         cm->seq_params.use_highbitdepth,
+                         cm->seq_params->use_highbitdepth,
                          cpi->oxcf.border_in_pixels,
                          cm->features.byte_alignment);
 
@@ -850,7 +839,7 @@
   if (next) gaussian_blur(bit_depth, next, &blurred_next);
 
   double motion1, motion2 = 65536.0;
-  if (cm->seq_params.use_highbitdepth) {
+  if (cm->seq_params->use_highbitdepth) {
     assert(blurred_cur.flags & YV12_FLAG_HIGHBITDEPTH);
     assert(blurred_last.flags & YV12_FLAG_HIGHBITDEPTH);
     const float scale_factor = 1.0f / (float)(1 << (bit_depth - 8));
@@ -889,7 +878,7 @@
                                            YV12_BUFFER_CONFIG **last,
                                            YV12_BUFFER_CONFIG **next) {
   const AV1_COMMON *const cm = &cpi->common;
-  const GF_GROUP *gf_group = &cpi->gf_group;
+  const GF_GROUP *gf_group = &cpi->ppi->gf_group;
   const int src_index =
       cm->show_frame != 0 ? 0 : gf_group->arf_src_offset[cpi->gf_frame_index];
   struct lookahead_entry *last_entry = av1_lookahead_peek(
@@ -908,8 +897,7 @@
   if (cm->current_frame.frame_number == 0 || cpi->oxcf.pass == 1) {
     return current_qindex;
   }
-  aom_clear_system_state();
-  const GF_GROUP *const gf_group = &cpi->gf_group;
+  const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
   const int layer_depth =
       AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], MAX_ARF_LAYERS - 1);
   const double last_frame_ysse =
@@ -947,13 +935,13 @@
   const double dsse = dvmaf * approx_sse / approx_dvmaf;
 
   const double beta = approx_sse / (dsse + approx_sse);
-  const int offset = av1_get_deltaq_offset(cpi, current_qindex, beta);
+  const int offset =
+      av1_get_deltaq_offset(cm->seq_params->bit_depth, current_qindex, beta);
   int qindex = current_qindex + offset;
 
   qindex = AOMMIN(qindex, MAXQ);
   qindex = AOMMAX(qindex, MINQ);
 
-  aom_clear_system_state();
   return qindex;
 }
 
@@ -1026,19 +1014,19 @@
   memset(&recon_blurred, 0, sizeof(recon_blurred));
   memset(&src_blurred, 0, sizeof(src_blurred));
   aom_alloc_frame_buffer(&recon_sharpened, width, height, ss_x, ss_y,
-                         cm->seq_params.use_highbitdepth,
+                         cm->seq_params->use_highbitdepth,
                          cpi->oxcf.border_in_pixels,
                          cm->features.byte_alignment);
   aom_alloc_frame_buffer(&src_sharpened, width, height, ss_x, ss_y,
-                         cm->seq_params.use_highbitdepth,
+                         cm->seq_params->use_highbitdepth,
                          cpi->oxcf.border_in_pixels,
                          cm->features.byte_alignment);
   aom_alloc_frame_buffer(&recon_blurred, width, height, ss_x, ss_y,
-                         cm->seq_params.use_highbitdepth,
+                         cm->seq_params->use_highbitdepth,
                          cpi->oxcf.border_in_pixels,
                          cm->features.byte_alignment);
   aom_alloc_frame_buffer(
-      &src_blurred, width, height, ss_x, ss_y, cm->seq_params.use_highbitdepth,
+      &src_blurred, width, height, ss_x, ss_y, cm->seq_params->use_highbitdepth,
       cpi->oxcf.border_in_pixels, cm->features.byte_alignment);
 
   gaussian_blur(bit_depth, recon, &recon_blurred);
@@ -1084,7 +1072,7 @@
   YV12_BUFFER_CONFIG *source = cpi->source;
   YV12_BUFFER_CONFIG *recon = &cpi->common.cur_frame->buf;
   const int bit_depth = cpi->td.mb.e_mbd.bd;
-  const GF_GROUP *const gf_group = &cpi->gf_group;
+  const GF_GROUP *const gf_group = &cpi->ppi->gf_group;
   const int layer_depth =
       AOMMIN(gf_group->layer_depth[cpi->gf_frame_index], MAX_ARF_LAYERS - 1);
   double base_score;
@@ -1093,7 +1081,7 @@
   aom_calc_vmaf(cpi->vmaf_info.vmaf_model, source, recon, bit_depth,
                 cal_vmaf_neg, &base_score);
   cpi->vmaf_info.last_frame_vmaf[layer_depth] = base_score;
-  if (cpi->common.seq_params.use_highbitdepth) {
+  if (cpi->common.seq_params->use_highbitdepth) {
     assert(source->flags & YV12_FLAG_HIGHBITDEPTH);
     assert(recon->flags & YV12_FLAG_HIGHBITDEPTH);
     cpi->vmaf_info.last_frame_ysse[layer_depth] =
diff --git a/av1/encoder/tx_search.c b/av1/encoder/tx_search.c
index 10a9a87..a3ac0d9 100644
--- a/av1/encoder/tx_search.c
+++ b/av1/encoder/tx_search.c
@@ -17,10 +17,13 @@
 #include "av1/encoder/model_rd.h"
 #include "av1/encoder/random.h"
 #include "av1/encoder/rdopt_utils.h"
+#include "av1/encoder/sorting_network.h"
 #include "av1/encoder/tx_prune_model_weights.h"
 #include "av1/encoder/tx_search.h"
 #include "av1/encoder/txb_rdopt.h"
 
+#define PROB_THRESH_OFFSET_TX_TYPE 100
+
 struct rdcost_block_args {
   const AV1_COMP *cpi;
   MACROBLOCK *x;
@@ -46,11 +49,6 @@
   int8_t children[4];
 } RD_RECORD_IDX_NODE;
 
-typedef struct tx_size_rd_info_node {
-  TXB_RD_INFO *rd_info_array;  // Points to array of size TX_TYPES.
-  struct tx_size_rd_info_node *children[4];
-} TXB_RD_INFO_NODE;
-
 // origin_threshold * 128 / 100
 static const uint32_t skip_pred_threshold[3][BLOCK_SIZES_ALL] = {
   {
@@ -84,256 +82,6 @@
                                                      12, 12, 23, 23, 32, 32, 8,
                                                      8,  16, 16, 23, 23 };
 
-static int find_tx_size_rd_info(TXB_RD_RECORD *cur_record,
-                                const uint32_t hash) {
-  // Linear search through the circular buffer to find matching hash.
-  for (int i = cur_record->index_start - 1; i >= 0; i--) {
-    if (cur_record->hash_vals[i] == hash) return i;
-  }
-  for (int i = cur_record->num - 1; i >= cur_record->index_start; i--) {
-    if (cur_record->hash_vals[i] == hash) return i;
-  }
-  int index;
-  // If not found - add new RD info into the buffer and return its index
-  if (cur_record->num < TX_SIZE_RD_RECORD_BUFFER_LEN) {
-    index = (cur_record->index_start + cur_record->num) %
-            TX_SIZE_RD_RECORD_BUFFER_LEN;
-    cur_record->num++;
-  } else {
-    index = cur_record->index_start;
-    cur_record->index_start =
-        (cur_record->index_start + 1) % TX_SIZE_RD_RECORD_BUFFER_LEN;
-  }
-
-  cur_record->hash_vals[index] = hash;
-  av1_zero(cur_record->tx_rd_info[index]);
-  return index;
-}
-
-static const RD_RECORD_IDX_NODE rd_record_tree_8x8[] = {
-  { 1, { 0 } },
-};
-
-static const RD_RECORD_IDX_NODE rd_record_tree_8x16[] = {
-  { 0, { 1, 2, -1, -1 } },
-  { 1, { 0, 0, 0, 0 } },
-  { 1, { 0, 0, 0, 0 } },
-};
-
-static const RD_RECORD_IDX_NODE rd_record_tree_16x8[] = {
-  { 0, { 1, 2, -1, -1 } },
-  { 1, { 0 } },
-  { 1, { 0 } },
-};
-
-static const RD_RECORD_IDX_NODE rd_record_tree_16x16[] = {
-  { 0, { 1, 2, 3, 4 } }, { 1, { 0 } }, { 1, { 0 } }, { 1, { 0 } }, { 1, { 0 } },
-};
-
-static const RD_RECORD_IDX_NODE rd_record_tree_1_2[] = {
-  { 0, { 1, 2, -1, -1 } },
-  { 0, { 3, 4, 5, 6 } },
-  { 0, { 7, 8, 9, 10 } },
-};
-
-static const RD_RECORD_IDX_NODE rd_record_tree_2_1[] = {
-  { 0, { 1, 2, -1, -1 } },
-  { 0, { 3, 4, 7, 8 } },
-  { 0, { 5, 6, 9, 10 } },
-};
-
-static const RD_RECORD_IDX_NODE rd_record_tree_sqr[] = {
-  { 0, { 1, 2, 3, 4 } },     { 0, { 5, 6, 9, 10 } },    { 0, { 7, 8, 11, 12 } },
-  { 0, { 13, 14, 17, 18 } }, { 0, { 15, 16, 19, 20 } },
-};
-
-static const RD_RECORD_IDX_NODE rd_record_tree_64x128[] = {
-  { 0, { 2, 3, 4, 5 } },     { 0, { 6, 7, 8, 9 } },
-  { 0, { 10, 11, 14, 15 } }, { 0, { 12, 13, 16, 17 } },
-  { 0, { 18, 19, 22, 23 } }, { 0, { 20, 21, 24, 25 } },
-  { 0, { 26, 27, 30, 31 } }, { 0, { 28, 29, 32, 33 } },
-  { 0, { 34, 35, 38, 39 } }, { 0, { 36, 37, 40, 41 } },
-};
-
-static const RD_RECORD_IDX_NODE rd_record_tree_128x64[] = {
-  { 0, { 2, 3, 6, 7 } },     { 0, { 4, 5, 8, 9 } },
-  { 0, { 10, 11, 18, 19 } }, { 0, { 12, 13, 20, 21 } },
-  { 0, { 14, 15, 22, 23 } }, { 0, { 16, 17, 24, 25 } },
-  { 0, { 26, 27, 34, 35 } }, { 0, { 28, 29, 36, 37 } },
-  { 0, { 30, 31, 38, 39 } }, { 0, { 32, 33, 40, 41 } },
-};
-
-static const RD_RECORD_IDX_NODE rd_record_tree_128x128[] = {
-  { 0, { 4, 5, 8, 9 } },     { 0, { 6, 7, 10, 11 } },
-  { 0, { 12, 13, 16, 17 } }, { 0, { 14, 15, 18, 19 } },
-  { 0, { 20, 21, 28, 29 } }, { 0, { 22, 23, 30, 31 } },
-  { 0, { 24, 25, 32, 33 } }, { 0, { 26, 27, 34, 35 } },
-  { 0, { 36, 37, 44, 45 } }, { 0, { 38, 39, 46, 47 } },
-  { 0, { 40, 41, 48, 49 } }, { 0, { 42, 43, 50, 51 } },
-  { 0, { 52, 53, 60, 61 } }, { 0, { 54, 55, 62, 63 } },
-  { 0, { 56, 57, 64, 65 } }, { 0, { 58, 59, 66, 67 } },
-  { 0, { 68, 69, 76, 77 } }, { 0, { 70, 71, 78, 79 } },
-  { 0, { 72, 73, 80, 81 } }, { 0, { 74, 75, 82, 83 } },
-};
-
-static const RD_RECORD_IDX_NODE rd_record_tree_1_4[] = {
-  { 0, { 1, -1, 2, -1 } },
-  { 0, { 3, 4, -1, -1 } },
-  { 0, { 5, 6, -1, -1 } },
-};
-
-static const RD_RECORD_IDX_NODE rd_record_tree_4_1[] = {
-  { 0, { 1, 2, -1, -1 } },
-  { 0, { 3, 4, -1, -1 } },
-  { 0, { 5, 6, -1, -1 } },
-};
-
-static const RD_RECORD_IDX_NODE *rd_record_tree[BLOCK_SIZES_ALL] = {
-  NULL,                    // BLOCK_4X4
-  NULL,                    // BLOCK_4X8
-  NULL,                    // BLOCK_8X4
-  rd_record_tree_8x8,      // BLOCK_8X8
-  rd_record_tree_8x16,     // BLOCK_8X16
-  rd_record_tree_16x8,     // BLOCK_16X8
-  rd_record_tree_16x16,    // BLOCK_16X16
-  rd_record_tree_1_2,      // BLOCK_16X32
-  rd_record_tree_2_1,      // BLOCK_32X16
-  rd_record_tree_sqr,      // BLOCK_32X32
-  rd_record_tree_1_2,      // BLOCK_32X64
-  rd_record_tree_2_1,      // BLOCK_64X32
-  rd_record_tree_sqr,      // BLOCK_64X64
-  rd_record_tree_64x128,   // BLOCK_64X128
-  rd_record_tree_128x64,   // BLOCK_128X64
-  rd_record_tree_128x128,  // BLOCK_128X128
-  NULL,                    // BLOCK_4X16
-  NULL,                    // BLOCK_16X4
-  rd_record_tree_1_4,      // BLOCK_8X32
-  rd_record_tree_4_1,      // BLOCK_32X8
-  rd_record_tree_1_4,      // BLOCK_16X64
-  rd_record_tree_4_1,      // BLOCK_64X16
-};
-
-static const int rd_record_tree_size[BLOCK_SIZES_ALL] = {
-  0,                                                            // BLOCK_4X4
-  0,                                                            // BLOCK_4X8
-  0,                                                            // BLOCK_8X4
-  sizeof(rd_record_tree_8x8) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_8X8
-  sizeof(rd_record_tree_8x16) / sizeof(RD_RECORD_IDX_NODE),     // BLOCK_8X16
-  sizeof(rd_record_tree_16x8) / sizeof(RD_RECORD_IDX_NODE),     // BLOCK_16X8
-  sizeof(rd_record_tree_16x16) / sizeof(RD_RECORD_IDX_NODE),    // BLOCK_16X16
-  sizeof(rd_record_tree_1_2) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_16X32
-  sizeof(rd_record_tree_2_1) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_32X16
-  sizeof(rd_record_tree_sqr) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_32X32
-  sizeof(rd_record_tree_1_2) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_32X64
-  sizeof(rd_record_tree_2_1) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_64X32
-  sizeof(rd_record_tree_sqr) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_64X64
-  sizeof(rd_record_tree_64x128) / sizeof(RD_RECORD_IDX_NODE),   // BLOCK_64X128
-  sizeof(rd_record_tree_128x64) / sizeof(RD_RECORD_IDX_NODE),   // BLOCK_128X64
-  sizeof(rd_record_tree_128x128) / sizeof(RD_RECORD_IDX_NODE),  // BLOCK_128X128
-  0,                                                            // BLOCK_4X16
-  0,                                                            // BLOCK_16X4
-  sizeof(rd_record_tree_1_4) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_8X32
-  sizeof(rd_record_tree_4_1) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_32X8
-  sizeof(rd_record_tree_1_4) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_16X64
-  sizeof(rd_record_tree_4_1) / sizeof(RD_RECORD_IDX_NODE),      // BLOCK_64X16
-};
-
-static INLINE void init_rd_record_tree(TXB_RD_INFO_NODE *tree,
-                                       BLOCK_SIZE bsize) {
-  const RD_RECORD_IDX_NODE *rd_record = rd_record_tree[bsize];
-  const int size = rd_record_tree_size[bsize];
-  for (int i = 0; i < size; ++i) {
-    if (rd_record[i].leaf) {
-      av1_zero(tree[i].children);
-    } else {
-      for (int j = 0; j < 4; ++j) {
-        const int8_t idx = rd_record[i].children[j];
-        tree[i].children[j] = idx > 0 ? &tree[idx] : NULL;
-      }
-    }
-  }
-}
-
-// Go through all TX blocks that could be used in TX size search, compute
-// residual hash values for them and find matching RD info that stores previous
-// RD search results for these TX blocks. The idea is to prevent repeated
-// rate/distortion computations that happen because of the combination of
-// partition and TX size search. The resulting RD info records are returned in
-// the form of a quadtree for easier access in actual TX size search.
-static int find_tx_size_rd_records(MACROBLOCK *x, BLOCK_SIZE bsize,
-                                   TXB_RD_INFO_NODE *dst_rd_info) {
-  TxfmSearchInfo *txfm_info = &x->txfm_search_info;
-  TXB_RD_RECORD *rd_records_table[4] = {
-    txfm_info->txb_rd_records->txb_rd_record_8X8,
-    txfm_info->txb_rd_records->txb_rd_record_16X16,
-    txfm_info->txb_rd_records->txb_rd_record_32X32,
-    txfm_info->txb_rd_records->txb_rd_record_64X64
-  };
-  const TX_SIZE max_square_tx_size = max_txsize_lookup[bsize];
-  const int bw = block_size_wide[bsize];
-  const int bh = block_size_high[bsize];
-
-  // Hashing is performed only for square TX sizes larger than TX_4X4
-  if (max_square_tx_size < TX_8X8) return 0;
-  const int diff_stride = bw;
-  const struct macroblock_plane *const p = &x->plane[0];
-  const int16_t *diff = &p->src_diff[0];
-  init_rd_record_tree(dst_rd_info, bsize);
-  // Coordinates of the top-left corner of current block within the superblock
-  // measured in pixels:
-  const int mi_row = x->e_mbd.mi_row;
-  const int mi_col = x->e_mbd.mi_col;
-  const int mi_row_in_sb = (mi_row % MAX_MIB_SIZE) << MI_SIZE_LOG2;
-  const int mi_col_in_sb = (mi_col % MAX_MIB_SIZE) << MI_SIZE_LOG2;
-  int cur_rd_info_idx = 0;
-  int cur_tx_depth = 0;
-  TX_SIZE cur_tx_size = max_txsize_rect_lookup[bsize];
-  while (cur_tx_depth <= MAX_VARTX_DEPTH) {
-    const int cur_tx_bw = tx_size_wide[cur_tx_size];
-    const int cur_tx_bh = tx_size_high[cur_tx_size];
-    if (cur_tx_bw < 8 || cur_tx_bh < 8) break;
-    const TX_SIZE next_tx_size = sub_tx_size_map[cur_tx_size];
-    const int tx_size_idx = cur_tx_size - TX_8X8;
-    for (int row = 0; row < bh; row += cur_tx_bh) {
-      for (int col = 0; col < bw; col += cur_tx_bw) {
-        if (cur_tx_bw != cur_tx_bh) {
-          // Use dummy nodes for all rectangular transforms within the
-          // TX size search tree.
-          dst_rd_info[cur_rd_info_idx].rd_info_array = NULL;
-        } else {
-          // Get spatial location of this TX block within the superblock
-          // (measured in cur_tx_bsize units).
-          const int row_in_sb = (mi_row_in_sb + row) / cur_tx_bh;
-          const int col_in_sb = (mi_col_in_sb + col) / cur_tx_bw;
-
-          int16_t hash_data[MAX_SB_SQUARE];
-          int16_t *cur_hash_row = hash_data;
-          const int16_t *cur_diff_row = diff + row * diff_stride + col;
-          for (int i = 0; i < cur_tx_bh; i++) {
-            memcpy(cur_hash_row, cur_diff_row, sizeof(*hash_data) * cur_tx_bw);
-            cur_hash_row += cur_tx_bw;
-            cur_diff_row += diff_stride;
-          }
-          const int hash = av1_get_crc32c_value(
-              &txfm_info->txb_rd_records->mb_rd_record.crc_calculator,
-              (uint8_t *)hash_data, 2 * cur_tx_bw * cur_tx_bh);
-          // Find corresponding RD info based on the hash value.
-          const int record_idx =
-              row_in_sb * (MAX_MIB_SIZE >> (tx_size_idx + 1)) + col_in_sb;
-          TXB_RD_RECORD *records = &rd_records_table[tx_size_idx][record_idx];
-          int idx = find_tx_size_rd_info(records, hash);
-          dst_rd_info[cur_rd_info_idx].rd_info_array =
-              &records->tx_rd_info[idx];
-        }
-        ++cur_rd_info_idx;
-      }
-    }
-    cur_tx_size = next_tx_size;
-    ++cur_tx_depth;
-  }
-  return 1;
-}
-
 static INLINE uint32_t get_block_residue_hash(MACROBLOCK *x, BLOCK_SIZE bsize) {
   const int rows = block_size_high[bsize];
   const int cols = block_size_wide[bsize];
@@ -421,7 +169,6 @@
   int sum = 0;
   sse = aom_sum_sse_2d_i16(diff, diff_stride, visible_cols, visible_rows, &sum);
   if (visible_cols > 0 && visible_rows > 0) {
-    aom_clear_system_state();
     double norm_factor = 1.0 / (visible_cols * visible_rows);
     int sign_sum = sum > 0 ? 1 : -1;
     // Conversion to transform domain
@@ -593,8 +340,7 @@
     TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta,
     ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left,
     RD_STATS *rd_stats, int64_t prev_level_rd, int64_t ref_best_rd,
-    int *is_cost_valid, FAST_TX_SEARCH_MODE ftxs_mode,
-    TXB_RD_INFO_NODE *rd_info_node);
+    int *is_cost_valid, FAST_TX_SEARCH_MODE ftxs_mode);
 
 // NOTE: CONFIG_COLLECT_RD_STATS has 3 possible values
 // 0: Do not collect any RD stats
@@ -618,7 +364,7 @@
     assert(bw <= 32);
     assert(bh <= 32);
     assert(((bw - 1) >> w_shift) + (((bh - 1) >> h_shift) << 2) == 15);
-    if (cpi->common.seq_params.use_highbitdepth) {
+    if (cpi->common.seq_params->use_highbitdepth) {
       const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
       const uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
       for (int i = 0; i < bh; ++i)
@@ -643,43 +389,43 @@
     const BLOCK_SIZE subsize = (BLOCK_SIZE)f_index;
     assert(block_size_wide[bsize] == 4 * block_size_wide[subsize]);
     assert(block_size_high[bsize] == 4 * block_size_high[subsize]);
-    cpi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[0]);
-    cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride,
-                            &esq[1]);
-    cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride,
-                            &esq[2]);
-    cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
-                            dst_stride, &esq[3]);
+    cpi->ppi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[0]);
+    cpi->ppi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4,
+                                 dst_stride, &esq[1]);
+    cpi->ppi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2,
+                                 dst_stride, &esq[2]);
+    cpi->ppi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
+                                 dst_stride, &esq[3]);
     src += bh / 4 * src_stride;
     dst += bh / 4 * dst_stride;
 
-    cpi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[4]);
-    cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride,
-                            &esq[5]);
-    cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride,
-                            &esq[6]);
-    cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
-                            dst_stride, &esq[7]);
+    cpi->ppi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[4]);
+    cpi->ppi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4,
+                                 dst_stride, &esq[5]);
+    cpi->ppi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2,
+                                 dst_stride, &esq[6]);
+    cpi->ppi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
+                                 dst_stride, &esq[7]);
     src += bh / 4 * src_stride;
     dst += bh / 4 * dst_stride;
 
-    cpi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[8]);
-    cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride,
-                            &esq[9]);
-    cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride,
-                            &esq[10]);
-    cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
-                            dst_stride, &esq[11]);
+    cpi->ppi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[8]);
+    cpi->ppi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4,
+                                 dst_stride, &esq[9]);
+    cpi->ppi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2,
+                                 dst_stride, &esq[10]);
+    cpi->ppi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
+                                 dst_stride, &esq[11]);
     src += bh / 4 * src_stride;
     dst += bh / 4 * dst_stride;
 
-    cpi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[12]);
-    cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride,
-                            &esq[13]);
-    cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride,
-                            &esq[14]);
-    cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
-                            dst_stride, &esq[15]);
+    cpi->ppi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[12]);
+    cpi->ppi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4,
+                                 dst_stride, &esq[13]);
+    cpi->ppi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2,
+                                 dst_stride, &esq[14]);
+    cpi->ppi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
+                                 dst_stride, &esq[15]);
   }
 
   double total = (double)esq[0] + esq[1] + esq[2] + esq[3] + esq[4] + esq[5] +
@@ -769,13 +515,13 @@
 
         if (sse_norm_arr) {
           unsigned int this_sse;
-          cpi->fn_ptr[tx_bsize_half].vf(this_src, src_stride, this_dst,
-                                        dst_stride, &this_sse);
+          cpi->ppi->fn_ptr[tx_bsize_half].vf(this_src, src_stride, this_dst,
+                                             dst_stride, &this_sse);
           sse_norm_arr[row * 2 + col] = (double)this_sse / num_samples_half;
         }
 
         if (sad_norm_arr) {
-          const unsigned int this_sad = cpi->fn_ptr[tx_bsize_half].sdf(
+          const unsigned int this_sad = cpi->ppi->fn_ptr[tx_bsize_half].sdf(
               this_src, src_stride, this_dst, dst_stride);
           sad_norm_arr[row * 2 + col] = (double)this_sad / num_samples_half;
         }
@@ -832,11 +578,11 @@
   const uint8_t *const dst =
       &pd->dst.buf[(blk_row * dst_stride + blk_col) << MI_SIZE_LOG2];
   unsigned int sse;
-  cpi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse);
+  cpi->ppi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse);
   const double sse_norm = (double)sse / num_samples;
 
   const unsigned int sad =
-      cpi->fn_ptr[tx_bsize].sdf(src, src_stride, dst, dst_stride);
+      cpi->ppi->fn_ptr[tx_bsize].sdf(src, src_stride, dst, dst_stride);
   const double sad_norm = (double)sad / num_samples;
 
   fprintf(fout, " %g %g", sse_norm, sad_norm);
@@ -905,8 +651,8 @@
 
     if (x->skip_chroma_rd && plane) continue;
 
-    cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
-                       &sse);
+    cpi->ppi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf,
+                            pd->dst.stride, &sse);
     total_sse += sse;
   }
   total_sse <<= 4;
@@ -916,7 +662,6 @@
 static int get_est_rate_dist(const TileDataEnc *tile_data, BLOCK_SIZE bsize,
                              int64_t sse, int *est_residue_cost,
                              int64_t *est_dist) {
-  aom_clear_system_state();
   const InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize];
   if (md->ready) {
     if (sse < md->dist_mean) {
@@ -1030,7 +775,7 @@
   const double sse_norm = (double)sse / num_samples;
 
   const unsigned int sad =
-      cpi->fn_ptr[plane_bsize].sdf(src, src_stride, dst, dst_stride);
+      cpi->ppi->fn_ptr[plane_bsize].sdf(src, src_stride, dst, dst_stride);
   const double sad_norm =
       (double)sad / (1 << num_pels_log2_lookup[plane_bsize]);
 
@@ -1183,7 +928,7 @@
   unsigned sse;
 
   if (txb_rows == visible_rows && txb_cols == visible_cols) {
-    cpi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse);
+    cpi->ppi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse);
     return sse;
   }
 
@@ -1276,66 +1021,10 @@
                          blk_row, blk_col, plane_bsize, tx_bsize);
 }
 
-static uint32_t get_intra_txb_hash(MACROBLOCK *x, int plane, int blk_row,
-                                   int blk_col, BLOCK_SIZE plane_bsize,
-                                   TX_SIZE tx_size) {
-  int16_t tmp_data[64 * 64];
-  const int diff_stride = block_size_wide[plane_bsize];
-  const int16_t *diff = x->plane[plane].src_diff;
-  const int16_t *cur_diff_row = diff + 4 * blk_row * diff_stride + 4 * blk_col;
-  const int txb_w = tx_size_wide[tx_size];
-  const int txb_h = tx_size_high[tx_size];
-  uint8_t *hash_data = (uint8_t *)cur_diff_row;
-  if (txb_w != diff_stride) {
-    int16_t *cur_hash_row = tmp_data;
-    for (int i = 0; i < txb_h; i++) {
-      memcpy(cur_hash_row, cur_diff_row, sizeof(*diff) * txb_w);
-      cur_hash_row += txb_w;
-      cur_diff_row += diff_stride;
-    }
-    hash_data = (uint8_t *)tmp_data;
-  }
-  CRC32C *crc =
-      &x->txfm_search_info.txb_rd_records->mb_rd_record.crc_calculator;
-  const uint32_t hash = av1_get_crc32c_value(crc, hash_data, 2 * txb_w * txb_h);
-  return (hash << 5) + tx_size;
-}
-
 // pruning thresholds for prune_txk_type and prune_txk_type_separ
 static const int prune_factors[5] = { 200, 200, 120, 80, 40 };  // scale 1000
 static const int mul_factors[5] = { 80, 80, 70, 50, 30 };       // scale 100
 
-static INLINE int is_intra_hash_match(const AV1_COMP *cpi, MACROBLOCK *x,
-                                      int plane, int blk_row, int blk_col,
-                                      BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
-                                      const TXB_CTX *const txb_ctx,
-                                      TXB_RD_INFO **intra_txb_rd_info,
-                                      const int tx_type_map_idx,
-                                      uint16_t *cur_joint_ctx) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  TxfmSearchInfo *txfm_info = &x->txfm_search_info;
-  assert(cpi->sf.tx_sf.use_intra_txb_hash &&
-         frame_is_intra_only(&cpi->common) && !is_inter_block(xd->mi[0]) &&
-         plane == 0 && tx_size_wide[tx_size] == tx_size_high[tx_size]);
-  const uint32_t intra_hash =
-      get_intra_txb_hash(x, plane, blk_row, blk_col, plane_bsize, tx_size);
-  const int intra_hash_idx = find_tx_size_rd_info(
-      &txfm_info->txb_rd_records->txb_rd_record_intra, intra_hash);
-  *intra_txb_rd_info = &txfm_info->txb_rd_records->txb_rd_record_intra
-                            .tx_rd_info[intra_hash_idx];
-  *cur_joint_ctx = (txb_ctx->dc_sign_ctx << 8) + txb_ctx->txb_skip_ctx;
-  if ((*intra_txb_rd_info)->entropy_context == *cur_joint_ctx &&
-      txfm_info->txb_rd_records->txb_rd_record_intra.tx_rd_info[intra_hash_idx]
-          .valid) {
-    xd->tx_type_map[tx_type_map_idx] = (*intra_txb_rd_info)->tx_type;
-    const TX_TYPE ref_tx_type =
-        av1_get_tx_type(xd, get_plane_type(plane), blk_row, blk_col, tx_size,
-                        cpi->common.features.reduced_tx_set_used);
-    return (ref_tx_type == (*intra_txb_rd_info)->tx_type);
-  }
-  return 0;
-}
-
 // R-D costs are sorted in ascending order.
 static INLINE void sort_rd(int64_t rds[], int txk[], int len) {
   int i, j, k;
@@ -1397,6 +1086,7 @@
                               int reduced_tx_set_used, int64_t ref_best_rd,
                               int num_sel) {
   const AV1_COMMON *cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
 
   int idx;
 
@@ -1438,6 +1128,9 @@
     tx_type = idx_map[idx];
     txfm_param.tx_type = tx_type;
 
+    av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, tx_type,
+                      &quant_param);
+
     av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param,
                     &quant_param);
 
@@ -1468,6 +1161,9 @@
     tx_type = idx_map_v[idx_v[idx] * 4];
     txfm_param.tx_type = tx_type;
 
+    av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, tx_type,
+                      &quant_param);
+
     av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param,
                     &quant_param);
 
@@ -1528,6 +1224,7 @@
                         uint16_t allowed_tx_mask, int prune_factor,
                         const TXB_CTX *const txb_ctx, int reduced_tx_set_used) {
   const AV1_COMMON *cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
   int tx_type;
 
   int64_t rds[TX_TYPES];
@@ -1552,6 +1249,9 @@
     }
     txfm_param.tx_type = tx_type;
 
+    av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, tx_type,
+                      &quant_param);
+
     // do txfm and quantization
     av1_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, &txfm_param,
                     &quant_param);
@@ -1645,32 +1345,6 @@
   NULL,
 };
 
-// Probablities are sorted in descending order.
-static INLINE void sort_probability(float prob[], int txk[], int len) {
-  int i, j, k;
-
-  for (i = 1; i <= len - 1; ++i) {
-    for (j = 0; j < i; ++j) {
-      if (prob[j] < prob[i]) {
-        float temp;
-        int tempi;
-
-        temp = prob[i];
-        tempi = txk[i];
-
-        for (k = i; k > j; k--) {
-          prob[k] = prob[k - 1];
-          txk[k] = txk[k - 1];
-        }
-
-        prob[j] = temp;
-        txk[j] = tempi;
-        break;
-      }
-    }
-  }
-}
-
 static INLINE float get_adaptive_thresholds(
     TX_SIZE tx_size, TxSetType tx_set_type,
     TX_TYPE_PRUNE_MODE prune_2d_txfm_mode) {
@@ -1752,11 +1426,25 @@
   for (i = 0; i < esq_h - 1; i++) verdist[i] *= e_recip;
 }
 
+static AOM_INLINE bool check_bit_mask(uint16_t mask, int val) {
+  return mask & (1 << val);
+}
+
+static AOM_INLINE void set_bit_mask(uint16_t *mask, int val) {
+  *mask |= (1 << val);
+}
+
+static AOM_INLINE void unset_bit_mask(uint16_t *mask, int val) {
+  *mask &= ~(1 << val);
+}
+
 static void prune_tx_2D(MACROBLOCK *x, BLOCK_SIZE bsize, TX_SIZE tx_size,
                         int blk_row, int blk_col, TxSetType tx_set_type,
                         TX_TYPE_PRUNE_MODE prune_2d_txfm_mode, int *txk_map,
                         uint16_t *allowed_tx_mask) {
-  int tx_type_table_2D[16] = {
+  // This table is used because the search order is different from the enum
+  // order.
+  static const int tx_type_table_2D[16] = {
     DCT_DCT,      DCT_ADST,      DCT_FLIPADST,      V_DCT,
     ADST_DCT,     ADST_ADST,     ADST_FLIPADST,     V_ADST,
     FLIPADST_DCT, FLIPADST_ADST, FLIPADST_FLIPADST, V_FLIPADST,
@@ -1774,11 +1462,9 @@
 #endif
   if (!nn_config_hor || !nn_config_ver) return;  // Model not established yet.
 
-  aom_clear_system_state();
   float hfeatures[16], vfeatures[16];
   float hscores[4], vscores[4];
   float scores_2D_raw[16];
-  float scores_2D[16];
   const int bw = tx_size_wide[tx_size];
   const int bh = tx_size_high[tx_size];
   const int hfeatures_num = bw <= 8 ? bw : bw / 2;
@@ -1791,10 +1477,11 @@
   const int16_t *diff = p->src_diff + 4 * blk_row * diff_stride + 4 * blk_col;
   get_energy_distribution_finer(diff, diff_stride, bw, bh, hfeatures,
                                 vfeatures);
+
   av1_get_horver_correlation_full(diff, diff_stride, bw, bh,
                                   &hfeatures[hfeatures_num - 1],
                                   &vfeatures[vfeatures_num - 1]);
-  aom_clear_system_state();
+
 #if CONFIG_NN_V2
   av1_nn_predict_v2(hfeatures, nn_config_hor, 0, hscores);
   av1_nn_predict_v2(vfeatures, nn_config_ver, 0, vscores);
@@ -1802,7 +1489,6 @@
   av1_nn_predict(hfeatures, nn_config_hor, 1, hscores);
   av1_nn_predict(vfeatures, nn_config_ver, 1, vscores);
 #endif
-  aom_clear_system_state();
 
   for (int i = 0; i < 4; i++) {
     float *cur_scores_2D = scores_2D_raw + i * 4;
@@ -1812,7 +1498,11 @@
     cur_scores_2D[3] = vscores[i] * hscores[3];
   }
 
-  av1_nn_softmax(scores_2D_raw, scores_2D, 16);
+  assert(TX_TYPES == 16);
+  // This version of the function only works when there are at most 16 classes.
+  // So we will need to change the optimization or use av1_nn_softmax instead if
+  // this ever gets changed.
+  av1_nn_fast_softmax_16(scores_2D_raw, scores_2D_raw);
 
   const float score_thresh =
       get_adaptive_thresholds(tx_size, tx_set_type, prune_2d_txfm_mode);
@@ -1825,27 +1515,54 @@
   float sum_score = 0.0;
   // Calculate sum of allowed tx type score and Populate allow bit mask based
   // on score_thresh and allowed_tx_mask
+  int allow_count = 0;
+  int tx_type_allowed[16] = { TX_TYPE_INVALID, TX_TYPE_INVALID, TX_TYPE_INVALID,
+                              TX_TYPE_INVALID, TX_TYPE_INVALID, TX_TYPE_INVALID,
+                              TX_TYPE_INVALID, TX_TYPE_INVALID, TX_TYPE_INVALID,
+                              TX_TYPE_INVALID, TX_TYPE_INVALID, TX_TYPE_INVALID,
+                              TX_TYPE_INVALID, TX_TYPE_INVALID, TX_TYPE_INVALID,
+                              TX_TYPE_INVALID };
+  float scores_2D[16] = {
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+  };
   for (int tx_idx = 0; tx_idx < TX_TYPES; tx_idx++) {
-    int allow_tx_type = *allowed_tx_mask & (1 << tx_type_table_2D[tx_idx]);
-    if (scores_2D[tx_idx] > max_score && allow_tx_type) {
-      max_score = scores_2D[tx_idx];
+    const int allow_tx_type =
+        check_bit_mask(*allowed_tx_mask, tx_type_table_2D[tx_idx]);
+    if (!allow_tx_type) {
+      continue;
+    }
+    if (scores_2D_raw[tx_idx] > max_score) {
+      max_score = scores_2D_raw[tx_idx];
       max_score_i = tx_idx;
     }
-    if (scores_2D[tx_idx] >= score_thresh && allow_tx_type) {
+    if (scores_2D_raw[tx_idx] >= score_thresh) {
       // Set allow mask based on score_thresh
-      allow_bitmask |= (1 << tx_type_table_2D[tx_idx]);
+      set_bit_mask(&allow_bitmask, tx_type_table_2D[tx_idx]);
 
       // Accumulate score of allowed tx type
-      sum_score += scores_2D[tx_idx];
+      sum_score += scores_2D_raw[tx_idx];
+
+      scores_2D[allow_count] = scores_2D_raw[tx_idx];
+      tx_type_allowed[allow_count] = tx_type_table_2D[tx_idx];
+      allow_count += 1;
     }
   }
-  if (!((allow_bitmask >> max_score_i) & 0x01)) {
-    // Set allow mask based on tx type with max score
-    allow_bitmask |= (1 << tx_type_table_2D[max_score_i]);
-    sum_score += scores_2D[max_score_i];
+  if (!check_bit_mask(allow_bitmask, tx_type_table_2D[max_score_i])) {
+    // If even the tx_type with max score is pruned, this means that no other
+    // tx_type is feasible. When this happens, we force enable max_score_i and
+    // end the search.
+    set_bit_mask(&allow_bitmask, tx_type_table_2D[max_score_i]);
+    memcpy(txk_map, tx_type_table_2D, sizeof(tx_type_table_2D));
+    *allowed_tx_mask = allow_bitmask;
+    return;
   }
+
   // Sort tx type probability of all types
-  sort_probability(scores_2D, tx_type_table_2D, TX_TYPES);
+  if (allow_count <= 8) {
+    av1_sort_fi32_8(scores_2D, tx_type_allowed);
+  } else {
+    av1_sort_fi32_16(scores_2D, tx_type_allowed);
+  }
 
   // Enable more pruning based on tx type probability and number of allowed tx
   // types
@@ -1855,26 +1572,25 @@
     int tx_idx, tx_count = 0;
     const float inv_sum_score = 100 / sum_score;
     // Get allowed tx types based on sorted probability score and tx count
-    for (tx_idx = 0; tx_idx < TX_TYPES; tx_idx++) {
+    for (tx_idx = 0; tx_idx < allow_count; tx_idx++) {
       // Skip the tx type which has more than 30% of cumulative
       // probability and allowed tx type count is more than 2
       if (score_ratio > 30.0 && tx_count >= 2) break;
 
-      // Calculate cumulative probability of allowed tx types
-      if (allow_bitmask & (1 << tx_type_table_2D[tx_idx])) {
-        // Calculate cumulative probability
-        temp_score += scores_2D[tx_idx];
+      assert(check_bit_mask(allow_bitmask, tx_type_allowed[tx_idx]));
+      // Calculate cumulative probability
+      temp_score += scores_2D[tx_idx];
 
-        // Calculate percentage of cumulative probability of allowed tx type
-        score_ratio = temp_score * inv_sum_score;
-        tx_count++;
-      }
+      // Calculate percentage of cumulative probability of allowed tx type
+      score_ratio = temp_score * inv_sum_score;
+      tx_count++;
     }
     // Set remaining tx types as pruned
-    for (; tx_idx < TX_TYPES; tx_idx++)
-      allow_bitmask &= ~(1 << tx_type_table_2D[tx_idx]);
+    for (; tx_idx < allow_count; tx_idx++)
+      unset_bit_mask(&allow_bitmask, tx_type_allowed[tx_idx]);
   }
-  memcpy(txk_map, tx_type_table_2D, sizeof(tx_type_table_2D));
+
+  memcpy(txk_map, tx_type_allowed, sizeof(tx_type_table_2D));
   *allowed_tx_mask = allow_bitmask;
 }
 
@@ -1911,7 +1627,6 @@
       total_x_sum += x_sum;
       total_x2_sum += x2_sum;
 
-      aom_clear_system_state();
       const float mean = (float)x_sum / sub_num;
       const float dev = get_dev(mean, (double)x2_sum, sub_num);
       feature[feature_idx++] = mean;
@@ -1944,14 +1659,12 @@
       x->plane[0].src_diff + 4 * blk_row * diff_stride + 4 * blk_col;
   const int bw = tx_size_wide[tx_size];
   const int bh = tx_size_high[tx_size];
-  aom_clear_system_state();
 
   float features[64] = { 0.0f };
   get_mean_dev_features(diff, diff_stride, bw, bh, features);
 
   float score = 0.0f;
   av1_nn_predict(features, nn_config, 1, &score);
-  aom_clear_system_state();
 
   int int_score = (int)(score * 10000);
   return clamp(int_score, -80000, 80000);
@@ -1972,10 +1685,38 @@
   // TX_TYPES, only that specific tx type is allowed.
   TX_TYPE txk_allowed = TX_TYPES;
 
+  const FRAME_UPDATE_TYPE update_type =
+      get_frame_update_type(&cpi->ppi->gf_group, cpi->gf_frame_index);
+  const int *tx_type_probs =
+      cpi->ppi->frame_probs.tx_type_probs[update_type][tx_size];
+
   if ((!is_inter && txfm_params->use_default_intra_tx_type) ||
-      (is_inter && txfm_params->use_default_inter_tx_type)) {
+      (is_inter && txfm_params->default_inter_tx_type_prob_thresh == 0)) {
     txk_allowed =
         get_default_tx_type(0, xd, tx_size, cpi->use_screen_content_tools);
+  } else if (is_inter &&
+             txfm_params->default_inter_tx_type_prob_thresh != INT_MAX) {
+    if (tx_type_probs[DEFAULT_INTER_TX_TYPE] >
+        txfm_params->default_inter_tx_type_prob_thresh) {
+      txk_allowed = DEFAULT_INTER_TX_TYPE;
+    } else {
+      int force_tx_type = 0;
+      int max_prob = 0;
+      const int tx_type_prob_threshold =
+          txfm_params->default_inter_tx_type_prob_thresh +
+          PROB_THRESH_OFFSET_TX_TYPE;
+      for (int i = 1; i < TX_TYPES; i++) {  // find maximum probability.
+        if (tx_type_probs[i] > max_prob) {
+          max_prob = tx_type_probs[i];
+          force_tx_type = i;
+        }
+      }
+      if (max_prob > tx_type_prob_threshold)  // force tx type with max prob.
+        txk_allowed = force_tx_type;
+      else if (x->rd_model == LOW_TXFM_RD) {
+        if (plane == 0) txk_allowed = DCT_DCT;
+      }
+    }
   } else if (x->rd_model == LOW_TXFM_RD) {
     if (plane == 0) txk_allowed = DCT_DCT;
   }
@@ -2024,10 +1765,6 @@
     assert(plane == 0);
     allowed_tx_mask = ext_tx_used_flag;
     int num_allowed = 0;
-    const FRAME_UPDATE_TYPE update_type =
-        get_frame_update_type(&cpi->gf_group, cpi->gf_frame_index);
-    const int *tx_type_probs =
-        cpi->frame_probs.tx_type_probs[update_type][tx_size];
     int i;
 
     if (cpi->sf.tx_sf.tx_type_search.prune_tx_type_using_stats) {
@@ -2098,13 +1835,8 @@
 
 #if CONFIG_RD_DEBUG
 static INLINE void update_txb_coeff_cost(RD_STATS *rd_stats, int plane,
-                                         int blk_row, int blk_col,
                                          int txb_coeff_cost) {
-  (void)blk_row;
-  (void)blk_col;
   rd_stats->txb_coeff_cost[plane] += txb_coeff_cost;
-  assert(blk_row < TXB_COEFF_COST_MAP_SIZE);
-  assert(blk_col < TXB_COEFF_COST_MAP_SIZE);
 }
 #endif
 
@@ -2250,45 +1982,6 @@
   skip_trellis |= !is_trellis_used(cpi->optimize_seg_arr[xd->mi[0]->segment_id],
                                    DRY_RUN_NORMAL);
 
-  // Hashing based speed feature for intra block. If the hash of the residue
-  // is found in the hash table, use the previous RD search results stored in
-  // the table and terminate early.
-  TXB_RD_INFO *intra_txb_rd_info = NULL;
-  uint16_t cur_joint_ctx = 0;
-  const int is_inter = is_inter_block(mbmi);
-  const int use_intra_txb_hash =
-      cpi->sf.tx_sf.use_intra_txb_hash && frame_is_intra_only(cm) &&
-      !is_inter && plane == 0 && tx_size_wide[tx_size] == tx_size_high[tx_size];
-  if (use_intra_txb_hash) {
-    const int mi_row = xd->mi_row;
-    const int mi_col = xd->mi_col;
-    const int within_border =
-        mi_row >= xd->tile.mi_row_start &&
-        (mi_row + mi_size_high[plane_bsize] < xd->tile.mi_row_end) &&
-        mi_col >= xd->tile.mi_col_start &&
-        (mi_col + mi_size_wide[plane_bsize] < xd->tile.mi_col_end);
-    if (within_border &&
-        is_intra_hash_match(cpi, x, plane, blk_row, blk_col, plane_bsize,
-                            tx_size, txb_ctx, &intra_txb_rd_info,
-                            tx_type_map_idx, &cur_joint_ctx)) {
-      best_rd_stats->rate = intra_txb_rd_info->rate;
-      best_rd_stats->dist = intra_txb_rd_info->dist;
-      best_rd_stats->sse = intra_txb_rd_info->sse;
-      best_rd_stats->skip_txfm = intra_txb_rd_info->eob == 0;
-      x->plane[plane].eobs[block] = intra_txb_rd_info->eob;
-      x->plane[plane].txb_entropy_ctx[block] =
-          intra_txb_rd_info->txb_entropy_ctx;
-      best_eob = intra_txb_rd_info->eob;
-      best_tx_type = intra_txb_rd_info->tx_type;
-      skip_trellis |= !intra_txb_rd_info->perform_block_coeff_opt;
-      update_txk_array(xd, blk_row, blk_col, tx_size, best_tx_type);
-      recon_intra(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
-                  txb_ctx, skip_trellis, best_tx_type, 1, &rate_cost, best_eob);
-      p->dqcoeff = orig_dqcoeff;
-      return;
-    }
-  }
-
   uint8_t best_txb_ctx = 0;
   // txk_allowed = TX_TYPES: >1 tx types are allowed
   // txk_allowed < TX_TYPES: only that specific tx type is allowed.
@@ -2311,7 +2004,11 @@
     predict_dc_only_block(x, plane, plane_bsize, tx_size, block, blk_row,
                           blk_col, best_rd_stats, &block_sse, &block_mse_q8,
                           &per_px_mean, &dc_only_blk);
-    if (best_rd_stats->skip_txfm == 1) return;
+    if (best_rd_stats->skip_txfm == 1) {
+      const TX_TYPE tx_type = DCT_DCT;
+      if (plane == 0) xd->tx_type_map[tx_type_map_idx] = tx_type;
+      return;
+    }
   } else {
     block_sse = pixel_diff_dist(x, plane, blk_row, blk_col, plane_bsize,
                                 txsize_to_bsize[tx_size], &block_mse_q8);
@@ -2383,7 +2080,8 @@
   // Iterate through all transform type candidates.
   for (int idx = 0; idx < TX_TYPES; ++idx) {
     const TX_TYPE tx_type = (TX_TYPE)txk_map[idx];
-    if (!(allowed_tx_mask & (1 << tx_type))) continue;
+    if (tx_type == TX_TYPE_INVALID || !check_bit_mask(allowed_tx_mask, tx_type))
+      continue;
     txfm_param.tx_type = tx_type;
     if (av1_use_qmatrix(&cm->quant_params, xd, mbmi->segment_id)) {
       av1_setup_qmatrix(&cm->quant_params, xd, plane, tx_size, tx_type,
@@ -2557,18 +2255,6 @@
     best_rd_stats->sse = block_sse;
   }
 
-  if (intra_txb_rd_info != NULL) {
-    intra_txb_rd_info->valid = 1;
-    intra_txb_rd_info->entropy_context = cur_joint_ctx;
-    intra_txb_rd_info->rate = best_rd_stats->rate;
-    intra_txb_rd_info->dist = best_rd_stats->dist;
-    intra_txb_rd_info->sse = best_rd_stats->sse;
-    intra_txb_rd_info->eob = best_eob;
-    intra_txb_rd_info->txb_entropy_ctx = best_txb_ctx;
-    intra_txb_rd_info->perform_block_coeff_opt = perform_block_coeff_opt;
-    if (plane == 0) intra_txb_rd_info->tx_type = best_tx_type;
-  }
-
   // Intra mode needs decoded pixels such that the next transform block
   // can use them for prediction.
   recon_intra(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
@@ -2583,51 +2269,13 @@
                                   int block, int plane_bsize, TXB_CTX *txb_ctx,
                                   RD_STATS *rd_stats,
                                   FAST_TX_SEARCH_MODE ftxs_mode,
-                                  int64_t ref_rdcost,
-                                  TXB_RD_INFO *rd_info_array) {
-  const struct macroblock_plane *const p = &x->plane[0];
-  const uint16_t cur_joint_ctx =
-      (txb_ctx->dc_sign_ctx << 8) + txb_ctx->txb_skip_ctx;
-  MACROBLOCKD *xd = &x->e_mbd;
-  assert(is_inter_block(xd->mi[0]));
-  const int tx_type_map_idx = blk_row * xd->tx_type_map_stride + blk_col;
-  // Look up RD and terminate early in case when we've already processed exactly
-  // the same residue with exactly the same entropy context.
-  if (rd_info_array != NULL && rd_info_array->valid &&
-      rd_info_array->entropy_context == cur_joint_ctx) {
-    xd->tx_type_map[tx_type_map_idx] = rd_info_array->tx_type;
-    const TX_TYPE ref_tx_type =
-        av1_get_tx_type(&x->e_mbd, get_plane_type(0), blk_row, blk_col, tx_size,
-                        cpi->common.features.reduced_tx_set_used);
-    if (ref_tx_type == rd_info_array->tx_type) {
-      rd_stats->rate += rd_info_array->rate;
-      rd_stats->dist += rd_info_array->dist;
-      rd_stats->sse += rd_info_array->sse;
-      rd_stats->skip_txfm &= rd_info_array->eob == 0;
-      p->eobs[block] = rd_info_array->eob;
-      p->txb_entropy_ctx[block] = rd_info_array->txb_entropy_ctx;
-      return;
-    }
-  }
-
+                                  int64_t ref_rdcost) {
   RD_STATS this_rd_stats;
   const int skip_trellis = 0;
   search_tx_type(cpi, x, 0, block, blk_row, blk_col, plane_bsize, tx_size,
                  txb_ctx, ftxs_mode, skip_trellis, ref_rdcost, &this_rd_stats);
 
   av1_merge_rd_stats(rd_stats, &this_rd_stats);
-
-  // Save RD results for possible reuse in future.
-  if (rd_info_array != NULL) {
-    rd_info_array->valid = 1;
-    rd_info_array->entropy_context = cur_joint_ctx;
-    rd_info_array->rate = this_rd_stats.rate;
-    rd_info_array->dist = this_rd_stats.dist;
-    rd_info_array->sse = this_rd_stats.sse;
-    rd_info_array->eob = p->eobs[block];
-    rd_info_array->txb_entropy_ctx = p->txb_entropy_ctx[block];
-    rd_info_array->tx_type = xd->tx_type_map[tx_type_map_idx];
-  }
 }
 
 static AOM_INLINE void try_tx_block_no_split(
@@ -2635,8 +2283,7 @@
     TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize,
     const ENTROPY_CONTEXT *ta, const ENTROPY_CONTEXT *tl,
     int txfm_partition_ctx, RD_STATS *rd_stats, int64_t ref_best_rd,
-    FAST_TX_SEARCH_MODE ftxs_mode, TXB_RD_INFO_NODE *rd_info_node,
-    TxCandidateInfo *no_split) {
+    FAST_TX_SEARCH_MODE ftxs_mode, TxCandidateInfo *no_split) {
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = xd->mi[0];
   struct macroblock_plane *const p = &x->plane[0];
@@ -2652,8 +2299,7 @@
   const int index = av1_get_txb_size_index(plane_bsize, blk_row, blk_col);
   mbmi->inter_tx_size[index] = tx_size;
   tx_type_rd(cpi, x, tx_size, blk_row, blk_col, block, plane_bsize, &txb_ctx,
-             rd_stats, ftxs_mode, ref_best_rd,
-             rd_info_node != NULL ? rd_info_node->rd_info_array : NULL);
+             rd_stats, ftxs_mode, ref_best_rd);
   assert(rd_stats->rate < INT_MAX);
 
   const int pick_skip_txfm =
@@ -2663,8 +2309,7 @@
            RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse));
   if (pick_skip_txfm) {
 #if CONFIG_RD_DEBUG
-    update_txb_coeff_cost(rd_stats, 0, blk_row, blk_col,
-                          zero_blk_rate - rd_stats->rate);
+    update_txb_coeff_cost(rd_stats, 0, zero_blk_rate - rd_stats->rate);
 #endif  // CONFIG_RD_DEBUG
     rd_stats->rate = zero_blk_rate;
     rd_stats->dist = rd_stats->sse;
@@ -2689,8 +2334,7 @@
     TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta,
     ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left,
     int txfm_partition_ctx, int64_t no_split_rd, int64_t ref_best_rd,
-    FAST_TX_SEARCH_MODE ftxs_mode, TXB_RD_INFO_NODE *rd_info_node,
-    RD_STATS *split_rd_stats) {
+    FAST_TX_SEARCH_MODE ftxs_mode, RD_STATS *split_rd_stats) {
   assert(tx_size < TX_SIZES_ALL);
   MACROBLOCKD *const xd = &x->e_mbd;
   const int max_blocks_high = max_block_high(xd, plane_bsize, 0);
@@ -2718,11 +2362,10 @@
 
       RD_STATS this_rd_stats;
       int this_cost_valid = 1;
-      select_tx_block(
-          cpi, x, offsetr, offsetc, block, sub_txs, depth + 1, plane_bsize, ta,
-          tl, tx_above, tx_left, &this_rd_stats, no_split_rd / nblks,
-          ref_best_rd - split_rd_stats->rdcost, &this_cost_valid, ftxs_mode,
-          (rd_info_node != NULL) ? rd_info_node->children[blk_idx] : NULL);
+      select_tx_block(cpi, x, offsetr, offsetc, block, sub_txs, depth + 1,
+                      plane_bsize, ta, tl, tx_above, tx_left, &this_rd_stats,
+                      no_split_rd / nblks, ref_best_rd - split_rd_stats->rdcost,
+                      &this_cost_valid, ftxs_mode);
       if (!this_cost_valid) {
         split_rd_stats->rdcost = INT64_MAX;
         return;
@@ -2770,7 +2413,6 @@
       total_x_sum += x_sum;
       total_x2_sum += x2_sum;
 
-      aom_clear_system_state();
       const float mean = (float)x_sum / sub_num;
       const float var = get_var(mean, (double)x2_sum, sub_num);
       mean_sum += mean;
@@ -2807,7 +2449,6 @@
       x->plane[0].src_diff + 4 * blk_row * diff_stride + 4 * blk_col;
   const int bw = tx_size_wide[tx_size];
   const int bh = tx_size_high[tx_size];
-  aom_clear_system_state();
   float dev_of_means = 0.0f;
   float var_of_vars = 0.0f;
 
@@ -2839,8 +2480,7 @@
     TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta,
     ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left,
     RD_STATS *rd_stats, int64_t prev_level_rd, int64_t ref_best_rd,
-    int *is_cost_valid, FAST_TX_SEARCH_MODE ftxs_mode,
-    TXB_RD_INFO_NODE *rd_info_node) {
+    int *is_cost_valid, FAST_TX_SEARCH_MODE ftxs_mode) {
   assert(tx_size < TX_SIZES_ALL);
   av1_init_rd_stats(rd_stats);
   if (ref_best_rd < 0) {
@@ -2871,11 +2511,15 @@
                             cpi->sf.tx_sf.prune_tx_size_level);
   }
 
+  if (cpi->sf.rt_sf.skip_tx_no_split_var_based_partition) {
+    if (x->try_merge_partition && try_split && p->eobs[block]) try_no_split = 0;
+  }
+
   // Try using current block as a single transform block without split.
   if (try_no_split) {
     try_tx_block_no_split(cpi, x, blk_row, blk_col, block, tx_size, depth,
                           plane_bsize, ta, tl, ctx, rd_stats, ref_best_rd,
-                          ftxs_mode, rd_info_node, &no_split);
+                          ftxs_mode, &no_split);
 
     // Speed features for early termination.
     const int search_level = cpi->sf.tx_sf.adaptive_txb_search_level;
@@ -2911,7 +2555,7 @@
     try_tx_block_split(cpi, x, blk_row, blk_col, block, tx_size, depth,
                        plane_bsize, ta, tl, tx_above, tx_left, ctx, no_split.rd,
                        AOMMIN(no_split.rd, ref_best_rd), ftxs_mode,
-                       rd_info_node, &split_rd_stats);
+                       &split_rd_stats);
   }
 
   if (no_split.rd < split_rd_stats.rdcost) {
@@ -3074,6 +2718,10 @@
     init_depth = get_search_init_depth(mi_size_wide[bs], mi_size_high[bs],
                                        is_inter_block(mbmi), &cpi->sf,
                                        txfm_params->tx_size_search_method);
+    if (init_depth == MAX_TX_DEPTH && !cpi->oxcf.txfm_cfg.enable_tx64 &&
+        txsize_sqr_up_map[start_tx] == TX_64X64) {
+      start_tx = sub_tx_size_map[start_tx];
+    }
   } else {
     const TX_SIZE chosen_tx_size =
         tx_size_from_tx_mode(bs, txfm_params->tx_mode_search_type);
@@ -3163,8 +2811,7 @@
   }
 
 #if CONFIG_RD_DEBUG
-  update_txb_coeff_cost(&this_rd_stats, plane, blk_row, blk_col,
-                        this_rd_stats.rate);
+  update_txb_coeff_cost(&this_rd_stats, plane, this_rd_stats.rate);
 #endif  // CONFIG_RD_DEBUG
   av1_set_txb_context(x, plane, block, tx_size, a, l);
 
@@ -3415,7 +3062,7 @@
             .txb_skip_cost[txb_ctx.txb_skip_ctx][1];
     rd_stats->zero_rate = zero_blk_rate;
     tx_type_rd(cpi, x, tx_size, blk_row, blk_col, block, plane_bsize, &txb_ctx,
-               rd_stats, ftxs_mode, ref_best_rd, NULL);
+               rd_stats, ftxs_mode, ref_best_rd);
     const int mi_width = mi_size_wide[plane_bsize];
     TxfmSearchInfo *txfm_info = &x->txfm_search_info;
     if (RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >=
@@ -3554,8 +3201,7 @@
 // will be saved in rd_stats. The returned value is the corresponding RD cost.
 static int64_t select_tx_size_and_type(const AV1_COMP *cpi, MACROBLOCK *x,
                                        RD_STATS *rd_stats, BLOCK_SIZE bsize,
-                                       int64_t ref_best_rd,
-                                       TXB_RD_INFO_NODE *rd_info_tree) {
+                                       int64_t ref_best_rd) {
   MACROBLOCKD *const xd = &x->e_mbd;
   const TxfmSearchParams *txfm_params = &x->txfm_search_params;
   assert(is_inter_block(xd->mi[0]));
@@ -3608,7 +3254,7 @@
       // Search for the best transform block size and type for the sub-block.
       select_tx_block(cpi, x, idy, idx, block, max_tx_size, init_depth, bsize,
                       ctxa, ctxl, tx_above, tx_left, &pn_rd_stats, INT64_MAX,
-                      best_rd_sofar, &is_cost_valid, ftxs_mode, rd_info_tree);
+                      best_rd_sofar, &is_cost_valid, ftxs_mode);
       if (!is_cost_valid || pn_rd_stats.rate == INT_MAX) {
         av1_invalid_rd_stats(rd_stats);
         return INT64_MAX;
@@ -3618,7 +3264,6 @@
       no_skip_txfm_rd =
           RDCOST(x->rdmult, rd_stats->rate + no_skip_txfm_cost, rd_stats->dist);
       block += step;
-      if (rd_info_tree != NULL) rd_info_tree += 1;
     }
   }
 
@@ -3729,19 +3374,8 @@
   ++x->txfm_search_info.tx_search_count;
 #endif  // CONFIG_SPEED_STATS
 
-  // Pre-compute residue hashes (transform block level) and find existing or
-  // add new RD records to store and reuse rate and distortion values to speed
-  // up TX size/type search.
-  TXB_RD_INFO_NODE matched_rd_info[4 + 16 + 64];
-  int found_rd_info = 0;
-  if (ref_best_rd != INT64_MAX && within_border &&
-      cpi->sf.tx_sf.use_inter_txb_hash) {
-    found_rd_info = find_tx_size_rd_records(x, bsize, matched_rd_info);
-  }
-
   const int64_t rd =
-      select_tx_size_and_type(cpi, x, rd_stats, bsize, ref_best_rd,
-                              found_rd_info ? matched_rd_info : NULL);
+      select_tx_size_and_type(cpi, x, rd_stats, bsize, ref_best_rd);
 
   if (rd == INT64_MAX) {
     // We should always find at least one candidate unless ref_best_rd is less
diff --git a/av1/encoder/txb_rdopt.c b/av1/encoder/txb_rdopt.c
index 31b86ab..77bc3cd 100644
--- a/av1/encoder/txb_rdopt.c
+++ b/av1/encoder/txb_rdopt.c
@@ -155,6 +155,7 @@
     tran_low_t abs_qc_low;
     int64_t dist_low, rd_low;
     int rate_low;
+
     if (abs_qc == 1) {
       abs_qc_low = 0;
       dqc_low = qc_low = 0;
@@ -199,11 +200,13 @@
       }
     }
 
-    if (rd_low < rd) {
-      lower_level = 1;
-      rd = rd_low;
-      rate = rate_low;
-      dist = dist_low;
+    if (sharpness == 0 || abs_qc > 1) {
+      if (rd_low < rd) {
+        lower_level = 1;
+        rd = rd_low;
+        rate = rate_low;
+        dist = dist_low;
+      }
     }
 
     if (sharpness == 0 && rd_new_eob < rd) {
@@ -238,11 +241,10 @@
 static INLINE void update_skip(int *accu_rate, int64_t accu_dist, int *eob,
                                int nz_num, int *nz_ci, int64_t rdmult,
                                int skip_cost, int non_skip_cost,
-                               tran_low_t *qcoeff, tran_low_t *dqcoeff,
-                               int sharpness) {
+                               tran_low_t *qcoeff, tran_low_t *dqcoeff) {
   const int64_t rd = RDCOST(rdmult, *accu_rate + non_skip_cost, accu_dist);
   const int64_t rd_new_eob = RDCOST(rdmult, skip_cost, 0);
-  if (sharpness == 0 && rd_new_eob < rd) {
+  if (rd_new_eob < rd) {
     for (int i = 0; i < nz_num; ++i) {
       const int ci = nz_ci[i];
       qcoeff[ci] = 0;
@@ -327,16 +329,8 @@
   const LV_MAP_EOB_COST *txb_eob_costs =
       &coeff_costs->eob_costs[eob_multi_size][plane_type];
 
-  const int rshift =
-      (sharpness +
-       (cpi->oxcf.q_cfg.aq_mode == VARIANCE_AQ && mbmi->segment_id < 4
-            ? 7 - mbmi->segment_id
-            : 2) +
-       (cpi->oxcf.q_cfg.aq_mode != VARIANCE_AQ &&
-                cpi->oxcf.q_cfg.deltaq_mode == DELTA_Q_PERCEPTUAL &&
-                cm->delta_q_info.delta_q_present_flag && x->sb_energy_level < 0
-            ? (3 - x->sb_energy_level)
-            : 0));
+  const int rshift = 2;
+
   const int64_t rdmult =
       (((int64_t)x->rdmult *
         (plane_rd_mult[is_inter][plane_type] << (2 * (xd->bd - 8)))) +
@@ -401,9 +395,9 @@
     default: assert(false);
   }
 
-  if (si == -1 && nz_num <= max_nz_num) {
+  if (si == -1 && nz_num <= max_nz_num && sharpness == 0) {
     update_skip(&accu_rate, accu_dist, &eob, nz_num, nz_ci, rdmult, skip_cost,
-                non_skip_cost, qcoeff, dqcoeff, sharpness);
+                non_skip_cost, qcoeff, dqcoeff);
   }
 
 #define UPDATE_COEFF_SIMPLE_CASE(tx_class_literal)                             \
diff --git a/av1/encoder/txb_rdopt.h b/av1/encoder/txb_rdopt.h
index e86caaa..70b322a 100644
--- a/av1/encoder/txb_rdopt.h
+++ b/av1/encoder/txb_rdopt.h
@@ -44,11 +44,11 @@
  * skip flag (tx_skip) and the sign of DC coefficient (dc_sign).
  * \param[out]   rate_cost      The entropy cost of coding the transform block
  * after adjustment of coefficients.
- * \param[in]    sharpness      When sharpness == 1, the function will be less
- * aggressive toward lowering the magnitude of coefficients.
+ * \param[in]    sharpness      When sharpness > 0, the function will be less
+ * aggressive towards lowering the magnitude of coefficients.
  * In this way, the transform block will contain more high-frequency
- coefficients
- * and therefore preserve the sharpness of the reconstructed block.
+ * coefficients and therefore will preserve the sharpness of the reconstructed
+ * block.
  */
 int av1_optimize_txb(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
                      int block, TX_SIZE tx_size, TX_TYPE tx_type,
diff --git a/av1/encoder/use_flat_gop_model_params.h b/av1/encoder/use_flat_gop_model_params.h
deleted file mode 100644
index cf07766..0000000
--- a/av1/encoder/use_flat_gop_model_params.h
+++ /dev/null
@@ -1,233 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AV1_ENCODER_USE_FLAT_GOP_MODEL_PARAMS_H_
-#define AOM_AV1_ENCODER_USE_FLAT_GOP_MODEL_PARAMS_H_
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include "av1/encoder/ml.h"
-
-// A binary classifier that returns true (score > 0) if it is better to use a
-// flat GOP structure, rather than a GOP structure that uses ALT-REFs and
-// internal ARFs.
-
-#define NUM_FEATURES 21
-#define NUM_HIDDEN_LAYERS 1
-#define NUM_HIDDEN_NODES_LAYER0 48
-#define NUM_LABELS 1
-
-static const float
-    av1_use_flat_gop_nn_weights_layer0[NUM_FEATURES *
-                                       NUM_HIDDEN_NODES_LAYER0] = {
-      0.3801f,  -2.1832f, 1.7469f,  2.0130f,  2.1264f,  -0.7293f, -0.2814f,
-      0.0692f,  -4.6589f, -1.4591f, 0.3023f,  -0.4310f, -0.1911f, -0.8284f,
-      -1.3322f, -0.4621f, -0.1148f, -0.3531f, -0.0794f, -0.3114f, -0.1664f,
-      -0.1615f, 0.2913f,  -0.0394f, -0.0620f, 0.1845f,  0.0204f,  -0.2124f,
-      -0.1233f, -0.1685f, 0.1215f,  -0.2372f, -0.2865f, -0.1976f, 0.2137f,
-      -0.1318f, -0.0324f, 0.0415f,  -0.1172f, 0.1077f,  -0.1135f, -0.2462f,
-      -0.0743f, -0.1584f, -0.3267f, -0.0566f, -0.1615f, -0.3931f, -0.5200f,
-      -0.1786f, -0.1811f, -0.2812f, -0.1986f, -0.4393f, -0.3941f, -0.2500f,
-      -0.2029f, -0.4605f, -0.4973f, -0.2238f, -0.2599f, -0.1951f, -0.2034f,
-      -0.3186f, -0.1368f, -0.5076f, -0.4718f, -0.1815f, -0.3338f, -0.0550f,
-      -0.3920f, -0.5328f, -0.1658f, -0.2194f, -0.2867f, -0.0916f, -0.1678f,
-      -0.1760f, -0.5055f, -0.2322f, -0.4668f, -0.0121f, -0.3903f, -0.2721f,
-      -0.1306f, 0.1199f,  0.2894f,  0.1098f,  -0.0155f, -0.0844f, 0.0421f,
-      -0.2364f, -0.1073f, -0.0878f, -0.2146f, -0.1713f, -0.2283f, 0.0342f,
-      0.0394f,  -0.2808f, -0.0048f, 0.2640f,  -0.1371f, 0.1709f,  0.0155f,
-      -0.3614f, -0.1843f, -0.3215f, -0.3121f, -0.2609f, -0.0254f, -0.2474f,
-      -0.4674f, -0.3674f, -0.2076f, 0.0149f,  -0.3304f, -0.2678f, -0.0465f,
-      -0.1326f, -0.4504f, -0.5101f, -0.1280f, -0.0416f, -0.4296f, -0.4568f,
-      -0.6762f, -2.8105f, 0.7249f,  1.4288f,  1.3731f,  0.3034f,  0.1841f,
-      -0.0912f, -0.1508f, 1.2637f,  -0.2009f, 0.3236f,  -0.2500f, -0.0736f,
-      0.8655f,  -0.2599f, 0.1150f,  -0.0368f, -0.1122f, -0.7650f, -0.2004f,
-      -0.0891f, -0.3832f, -0.2576f, -0.3532f, -0.1735f, -0.4018f, -0.0265f,
-      -0.2988f, 0.2555f,  -0.1041f, -0.3391f, -0.5316f, -0.0171f, -0.3232f,
-      -0.0565f, -0.3359f, -0.1842f, -0.0582f, 0.0073f,  -0.0278f, -0.5517f,
-      0.0892f,  -0.1354f, 0.0548f,  -0.0401f, -0.1697f, 0.0432f,  0.0832f,
-      -0.3538f, 0.2602f,  -0.0066f, -0.2130f, -0.3085f, 0.0025f,  0.2464f,
-      -0.0103f, -0.3082f, -0.1136f, -0.2359f, -0.3421f, 0.1335f,  -0.3016f,
-      -1.0355f, -1.0572f, -0.3316f, -0.1235f, -0.3730f, -0.1751f, -0.1921f,
-      0.0031f,  -0.6297f, -0.5179f, 0.1082f,  -0.3130f, -0.1120f, -0.5430f,
-      -0.1782f, 0.0534f,  -0.1052f, 0.1471f,  -0.7156f, -0.5453f, -0.5437f,
-      1.8709f,  1.9696f,  -1.0343f, -0.3150f, -0.8399f, -0.0052f, -0.1123f,
-      -0.1059f, 0.6755f,  1.2593f,  -0.2512f, -0.2053f, 0.0835f,  0.3261f,
-      -0.0172f, 0.1230f,  -0.3687f, 0.1993f,  0.9390f,  -0.0165f, 0.6856f,
-      -0.4372f, -0.4041f, -0.2869f, -0.3871f, -0.3587f, -0.2418f, 0.0518f,
-      0.0110f,  -1.4713f, -0.1307f, -0.3246f, -0.5091f, -0.4652f, -0.4288f,
-      -0.0763f, -0.1755f, 0.0662f,  -0.3026f, -0.4462f, -0.4123f, -0.2891f,
-      -0.2251f, -0.4925f, -0.3820f, -0.1840f, -0.2878f, -0.1973f, -0.1010f,
-      -0.1622f, -0.3108f, -0.5292f, -0.1017f, -0.0607f, -0.2426f, -0.6406f,
-      -0.3834f, -0.2313f, -0.2433f, -0.1773f, -0.1581f, -0.3295f, -0.3799f,
-      -0.4447f, -0.2389f, -0.4231f, -0.1498f, -0.0181f, -0.4429f, -0.3515f,
-      0.0425f,  -0.5280f, -0.3462f, -0.3659f, 0.0153f,  -0.1002f, -0.5057f,
-      -0.2134f, -0.2859f, -0.1988f, -0.4758f, 0.0967f,  -0.4784f, 0.1868f,
-      -0.4387f, -1.3376f, -0.4452f, 0.3837f,  0.1698f,  -0.7076f, -0.4320f,
-      0.0382f,  -1.8053f, -0.6589f, 0.1406f,  -0.4340f, 0.0641f,  -0.2558f,
-      -0.4496f, -0.5003f, -0.6241f, -0.2217f, -0.8312f, -0.6793f, -0.3563f,
-      0.5153f,  -0.7851f, 1.0570f,  0.9702f,  0.5238f,  -0.6932f, -0.4443f,
-      0.0407f,  -3.0961f, -0.8461f, 0.0562f,  -0.0642f, 0.2471f,  -0.5911f,
-      -0.7715f, -0.1574f, -0.0375f, -0.1951f, -0.3097f, -0.2040f, 0.0128f,
-      -0.0918f, -0.0698f, -0.0970f, -0.2946f, -0.1723f, -0.2569f, -0.4382f,
-      -0.5174f, -0.2058f, -0.2973f, -0.0858f, -0.2526f, -0.2648f, -0.2339f,
-      -0.3474f, 0.0607f,  0.0272f,  -0.3142f, -0.1306f, -0.4938f, -0.1894f,
-      -0.0551f, -0.1061f, -0.1613f, -0.1942f, 0.0590f,  -0.2009f, -0.1286f,
-      -0.2035f, -0.0393f, -0.0650f, -0.1110f, 0.0123f,  -0.1122f, -0.0246f,
-      -0.2042f, 0.0411f,  -0.2771f, -0.0189f, 0.0927f,  0.0286f,  -0.1559f,
-      -0.3217f, -0.1039f, 0.1471f,  0.2489f,  0.2085f,  -0.4199f, -0.2404f,
-      0.0358f,  -0.7567f, -0.2413f, -0.3437f, -0.2433f, -0.3687f, -0.1194f,
-      -0.4289f, -0.1138f, -0.0721f, -0.3461f, -0.0244f, -0.3530f, -0.2842f,
-      -0.3823f, -0.1238f, -0.5475f, -0.2688f, -0.0073f, 0.0491f,  -0.4500f,
-      0.0201f,  0.0303f,  -0.2160f, -0.4219f, -0.4831f, -0.4593f, -0.2304f,
-      -0.2082f, -0.0367f, -0.5226f, -0.0082f, -0.1867f, -0.1812f, -0.2753f,
-      2.6650f,  1.9698f,  -2.9425f, 1.2119f,  1.5000f,  0.3356f,  0.3905f,
-      -0.2006f, -1.4038f, -1.0917f, 0.1423f,  -0.3528f, 0.0888f,  0.5802f,
-      1.0977f,  0.1083f,  -0.0693f, -0.0784f, 0.4247f,  0.4108f,  0.4970f,
-      -0.7290f, -0.1659f, -0.0517f, 0.0776f,  -0.0550f, -0.2374f, -0.4245f,
-      -0.0165f, -0.6804f, -0.3211f, -0.3101f, -0.1883f, -0.0786f, -0.3971f,
-      -0.4130f, -0.0606f, 0.1432f,  -0.0518f, -0.4179f, -0.4949f, -0.3451f,
-      -0.7559f, -4.0792f, 1.5526f,  0.2824f,  0.6086f,  -0.2148f, 0.0959f,
-      0.0506f,  -5.5176f, -3.9702f, 0.1597f,  -0.1760f, -0.0627f, 0.1657f,
-      -1.2996f, -0.2899f, -0.0600f, -0.0531f, -1.5160f, -0.4837f, -1.6961f,
-      -0.1134f, -0.1838f, -0.3071f, -0.4215f, -0.4184f, 0.0192f,  -0.2128f,
-      -0.3094f, -0.2607f, -0.4855f, -0.1881f, 0.0258f,  -0.5085f, -0.3630f,
-      -0.4824f, -0.3762f, -0.3324f, -0.1134f, -0.3350f, 0.0217f,  -0.2803f,
-      -0.5669f, -0.5674f, -0.5441f, -0.5965f, -0.3062f, -0.4666f, -0.4079f,
-      -0.0065f, -0.7566f, -0.3437f, -0.2474f, -0.2360f, -0.5683f, -0.3853f,
-      -0.6670f, -0.4158f, -0.2831f, -0.3327f, -0.7419f, -0.6481f, -0.4004f,
-      -0.4025f, -0.6405f, -0.4265f, -0.0167f, 0.3195f,  -0.0822f, -0.4350f,
-      -0.0032f, -1.0448f, -0.4407f, 0.0488f,  0.0776f,  -0.3828f, -0.3380f,
-      -0.2983f, -0.2220f, -0.4105f, -0.2312f, -0.4166f, -0.3258f, -0.1424f,
-      -0.6588f, -0.9433f, 0.3402f,  0.5800f,  0.6368f,  -0.4298f, -0.5743f,
-      0.0822f,  -1.0843f, -0.1645f, -0.1990f, 0.0255f,  -0.1039f, -0.3673f,
-      0.4367f,  -0.5491f, -0.0932f, -0.0323f, -0.2405f, -0.2922f, -0.4019f,
-      -0.4936f, -1.2338f, 0.4681f,  0.7454f,  0.8181f,  -0.3680f, -0.1613f,
-      -0.0008f, -1.3326f, -0.0667f, 0.1569f,  -0.0978f, -0.3229f, -0.4222f,
-      0.0330f,  0.1064f,  -0.1325f, 0.0121f,  -0.3976f, -0.2254f, -0.3942f,
-      -0.4771f, -0.1887f, 0.1020f,  0.3331f,  0.3098f,  -0.1256f, -0.4736f,
-      0.0295f,  -0.3919f, -0.0931f, -0.2484f, -0.4629f, -0.2800f, -0.2851f,
-      -0.2243f, -0.3958f, -0.3053f, -0.6585f, -0.1159f, -0.2330f, -0.1989f,
-      0.2273f,  0.1963f,  0.0283f,  0.0198f,  -0.1298f, -0.0627f, -0.2753f,
-      -0.1552f, 0.2734f,  -0.0551f, -0.2927f, -0.3772f, -0.4522f, -0.0786f,
-      0.0079f,  0.1664f,  -0.0228f, -0.2908f, -0.1714f, 0.1223f,  -0.0680f,
-      -0.5048f, -0.0852f, -0.4653f, -0.5142f, -0.1818f, -0.1659f, 0.0678f,
-      -0.1296f, 0.0295f,  -0.3487f, -0.1224f, -0.2690f, -0.3217f, -0.1957f,
-      -0.3196f, -0.4530f, -0.1746f, -0.2307f, -0.0504f, -0.0131f, -0.4613f,
-      -0.1476f, -0.5596f, -0.3829f, -0.4302f, -0.2910f, -0.2182f, -0.0811f,
-      -0.3967f, -0.3912f, -0.0371f, -0.1109f, -0.0793f, -0.2063f, -0.0060f,
-      -0.0236f, -0.4098f, -0.0276f, -0.3352f, -0.1888f, -0.2439f, -0.3748f,
-      0.0371f,  0.8460f,  -0.5547f, -1.2680f, -1.1623f, -0.1740f, -0.4815f,
-      -0.0294f, 4.4764f,  0.3716f,  -0.2826f, -0.0549f, -0.2937f, 0.0632f,
-      0.0686f,  -0.4681f, -0.2555f, -0.2427f, -0.2261f, -0.1567f, -0.5199f,
-      -0.4079f, -0.0801f, -0.2075f, -0.3956f, -0.0307f, -0.3150f, -0.3490f,
-      -0.0379f, 0.3060f,  -0.1775f, -0.1651f, 0.0677f,  -0.1947f, 0.0032f,
-      -0.2014f, -0.1575f, -0.1289f, -0.0250f, -0.0762f, -0.2324f, -0.2895f,
-      -0.4531f, -0.4601f, -0.1718f, -0.3139f, -0.4350f, 0.0346f,  -0.0891f,
-      -0.1581f, 0.2123f,  -0.1074f, 0.0221f,  0.0951f,  0.1161f,  0.0245f,
-      -0.0701f, -0.1677f, -0.4170f, -0.2214f, -0.3419f, -0.4873f, -0.0701f,
-      -0.0613f, -0.1031f, 0.0141f,  -0.1299f, -0.3953f, -0.2182f, -0.2679f,
-      -0.0141f, 0.3392f,  -0.0722f, -0.2390f, 0.1638f,  -0.1596f, -0.1527f,
-      -0.3581f, -0.4037f, -0.0736f, 0.0397f,  -0.1288f, -0.1362f, -0.0249f,
-      -0.5099f, -0.4040f, -0.1893f, -0.0298f, -0.1332f, -0.1693f, -0.3301f,
-      -0.1058f, -0.1414f, -0.5737f, -0.2342f, -0.2560f, -0.3834f, -0.0917f,
-      -0.1334f, -0.5077f, -0.3666f, -0.2515f, -0.4824f, -0.4714f, -0.5723f,
-      -0.1361f, -0.5244f, -0.2468f, 0.0237f,  -0.1862f, -0.3124f, -0.0183f,
-      -0.4662f, -0.4444f, -0.5400f, -0.1730f, -0.0123f, -0.2134f, -0.1024f,
-      -0.0172f, -0.4430f, -0.1403f, -0.0751f, -0.2403f, -0.2100f, -0.0678f,
-      2.4232f,  1.9825f,  0.1260f,  1.9972f,  2.8061f,  0.3916f,  0.1842f,
-      -0.2603f, -1.6092f, -1.6037f, 0.1475f,  0.0516f,  -0.2593f, 0.0359f,
-      -0.1802f, 0.0159f,  -0.0529f, -0.0983f, 0.7638f,  0.5529f,  0.9662f,
-      -0.4049f, -0.6372f, 0.4907f,  0.7360f,  0.9271f,  -0.6879f, -0.1067f,
-      0.0323f,  -1.8447f, 0.2176f,  -0.1047f, -0.0048f, -0.1031f, -0.7931f,
-      -0.3059f, -0.4595f, -0.1287f, -0.4031f, 0.1441f,  -0.6651f, 0.2530f,
-      -0.4572f, -0.0614f, 0.0345f,  -0.0008f, 0.0333f,  -0.3431f, 0.0538f,
-      -0.2691f, 0.2930f,  -0.0820f, -0.0979f, -0.0307f, 0.1713f,  0.0783f,
-      -0.4337f, -0.2702f, -0.1677f, -0.1719f, -0.4669f, -0.2847f, -0.4495f,
-      -0.3692f, -0.2641f, -0.2833f, -0.1168f, -0.0523f, -0.2368f, -0.4922f,
-      -0.3453f, -0.4452f, -0.5212f, 0.0412f,  -0.3310f, -0.2656f, -0.4903f,
-      -0.3854f, -0.1009f, -0.1038f, -0.2350f, -0.4430f, -0.5097f, -0.1755f,
-      0.0110f,  -0.0712f, -0.0662f, -0.4493f, -0.2111f, -0.3402f, -0.3100f,
-      -0.2525f, -0.1856f, -0.2689f, -0.4288f, -0.3912f, -0.0754f, -0.5191f,
-      -0.0747f, -0.0626f, -0.4821f, -0.2014f, -0.3124f, -0.4858f, -0.1896f,
-      1.0673f,  -0.8529f, 13.7564f, 18.7299f, 19.0062f, -1.1047f, -0.8654f,
-      0.1089f,  -1.2958f, -0.7793f, 0.0780f,  -0.1679f, 0.0054f,  -1.2451f,
-      -0.1287f, 0.0082f,  -0.2960f, -0.0442f, 2.3817f,  0.4716f,  1.3862f,
-      -0.0782f, -0.1871f, -0.2596f, 0.0093f,  0.1451f,  -0.1124f, -0.2315f,
-      -0.2677f, -0.1086f, 0.2216f,  0.2928f,  0.0391f,  0.0372f,  -0.2551f,
-      0.0552f,  -0.1876f, -0.2361f, -0.1889f, -0.0279f, 0.1204f,  0.2016f,
-      -0.5787f, -0.5830f, 0.0530f,  -0.1452f, -0.4899f, -0.2937f, 0.1430f,
-      -0.2752f, -0.2320f, -0.1908f, -0.5538f, -0.0858f, -0.1378f, -0.1505f,
-      -0.3908f, -0.4732f, -0.3018f, 0.0244f,  -0.2392f, -0.2833f, -0.3997f,
-      -0.4495f, -0.2570f, -0.3189f, -0.1534f, -0.1040f, -0.5497f, -0.3524f,
-      -0.2053f, 0.2415f,  -0.5027f, 0.0288f,  -0.1904f, -0.2183f, -0.1062f,
-      -0.3560f, 0.0165f,  -0.4601f, -0.2144f, -0.0439f, -0.4913f, -0.3160f,
-      -0.1641f, 0.1010f,  -0.1044f, -0.4064f, -0.3580f, -0.4015f, 0.1010f,
-      -0.1973f, 0.6392f,  -0.5177f, -0.0472f, -0.1526f, 0.1533f,  -0.0819f,
-      -0.0252f, -0.0783f, 0.1301f,  0.0158f,  -0.2003f, -0.4700f, -0.2329f,
-    };
-
-static const float
-    av1_use_flat_gop_nn_biases_layer0[NUM_HIDDEN_NODES_LAYER0] = {
-      -1.113218f, 0.f,        -0.268537f, -0.268537f, 0.f,        -0.268534f,
-      -0.40681f,  -0.268537f, -0.061835f, -0.614956f, 0.984277f,  -0.280228f,
-      -0.354716f, -0.202312f, -0.772829f, -0.464005f, -0.230795f, 0.f,
-      -0.124187f, -0.265949f, 0.325168f,  -0.359008f, -2.455546f, -0.229222f,
-      -0.692233f, -0.29401f,  -0.632682f, -0.479061f, -0.166094f, 0.077291f,
-      -0.235293f, -0.268537f, 0.167899f,  -0.141991f, -0.210089f, -0.177294f,
-      -0.325401f, -0.268537f, 0.323627f,  -0.156593f, -0.218451f, -0.230792f,
-      -0.268537f, 0.833177f,  0.f,        -0.353177f, -0.260953f, -0.209537f,
-    };
-
-static const float
-    av1_use_flat_gop_nn_weights_layer1[NUM_HIDDEN_NODES_LAYER0 * NUM_LABELS] = {
-      -0.024695f, 0.146668f,  -0.02723f,  0.034577f,  -0.255426f, 0.22402f,
-      -0.112595f, -0.131262f, 0.091164f,  -0.045294f, 0.028304f,  -0.051683f,
-      0.310497f,  -0.077786f, -0.047873f, -0.057205f, -0.065119f, 0.227417f,
-      -0.051126f, -0.137241f, 0.035742f,  -0.058992f, -0.021466f, 0.107947f,
-      -0.077183f, -0.04144f,  0.003568f,  -0.027656f, 0.038196f,  0.19684f,
-      -0.128401f, 0.149629f,  0.024526f,  0.037376f,  0.090752f,  -0.061666f,
-      -0.15743f,  0.057773f,  -0.010582f, 0.120997f,  0.060368f,  0.210028f,
-      -0.192244f, -0.064764f, -0.237655f, 0.1852f,    -0.084281f, -0.010434f,
-    };
-
-static const float av1_use_flat_gop_nn_biases_layer1[NUM_LABELS] = {
-  -0.672434f,
-};
-
-static const NN_CONFIG av1_use_flat_gop_nn_config = {
-  NUM_FEATURES,
-  NUM_LABELS,
-  NUM_HIDDEN_LAYERS,
-  {
-      NUM_HIDDEN_NODES_LAYER0,
-  },
-  {
-      av1_use_flat_gop_nn_weights_layer0,
-      av1_use_flat_gop_nn_weights_layer1,
-  },
-  {
-      av1_use_flat_gop_nn_biases_layer0,
-      av1_use_flat_gop_nn_biases_layer1,
-  },
-};
-
-#undef NUM_FEATURES
-#undef NUM_HIDDEN_LAYERS
-#undef NUM_HIDDEN_NODES_LAYER0
-#undef NUM_LABELS
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // AOM_AV1_ENCODER_USE_FLAT_GOP_MODEL_PARAMS_H_
diff --git a/av1/encoder/var_based_part.c b/av1/encoder/var_based_part.c
index a42be45..74a409f 100644
--- a/av1/encoder/var_based_part.c
+++ b/av1/encoder/var_based_part.c
@@ -22,7 +22,6 @@
 #include "aom_dsp/binary_codes_writer.h"
 #include "aom_ports/mem.h"
 #include "aom_ports/aom_timer.h"
-#include "aom_ports/system_state.h"
 
 #include "av1/common/reconinter.h"
 #include "av1/common/blockd.h"
@@ -341,7 +340,7 @@
 
 static AOM_INLINE void set_vbp_thresholds(AV1_COMP *cpi, int64_t thresholds[],
                                           int q, int content_lowsumdiff,
-                                          int segment_id) {
+                                          int source_sad, int segment_id) {
   AV1_COMMON *const cm = &cpi->common;
   const int is_key_frame = frame_is_intra_only(cm);
   const int threshold_multiplier = is_key_frame ? 120 : 1;
@@ -351,14 +350,22 @@
   const int current_qindex = cm->quant_params.base_qindex;
 
   if (is_key_frame) {
+    if (cpi->sf.rt_sf.force_large_partition_blocks_intra) {
+      threshold_base <<= cpi->oxcf.speed - 7;
+    }
     thresholds[0] = threshold_base;
     thresholds[1] = threshold_base;
     if (cm->width * cm->height < 1280 * 720) {
       thresholds[2] = threshold_base / 3;
       thresholds[3] = threshold_base >> 1;
     } else {
-      thresholds[2] = threshold_base >> 2;
-      thresholds[3] = threshold_base >> 2;
+      int shift_val = 2;
+      if (cpi->sf.rt_sf.force_large_partition_blocks_intra) {
+        shift_val = 0;
+      }
+
+      thresholds[2] = threshold_base >> shift_val;
+      thresholds[3] = threshold_base >> shift_val;
     }
     thresholds[4] = threshold_base << 2;
   } else {
@@ -394,7 +401,6 @@
         scale_part_thresh_content(threshold_base, cpi->oxcf.speed, cm->width,
                                   cm->height, cpi->svc.non_reference_frame);
 #endif
-
     thresholds[0] = threshold_base >> 1;
     thresholds[1] = threshold_base;
     thresholds[3] = threshold_base << cpi->oxcf.speed;
@@ -436,20 +442,45 @@
       thresholds[2] = (5 * threshold_base) >> 1;
     }
     if (cpi->sf.rt_sf.force_large_partition_blocks) {
+      double weight;
+      const int win = 20;
+      if (current_qindex < QINDEX_LARGE_BLOCK_THR - win)
+        weight = 1.0;
+      else if (current_qindex > QINDEX_LARGE_BLOCK_THR + win)
+        weight = 0.0;
+      else
+        weight =
+            1.0 - (current_qindex - QINDEX_LARGE_BLOCK_THR + win) / (2 * win);
+      if (cm->width * cm->height > 640 * 480) {
+        for (int i = 0; i < 4; i++) {
+          thresholds[i] <<= 1;
+        }
+      }
       if (cm->width * cm->height <= 352 * 288) {
         thresholds[1] <<= 2;
         thresholds[2] <<= 5;
         thresholds[3] = INT32_MAX;
-      } else if (cm->width * cm->height > 640 * 480 && segment_id == 0) {
+        // Condition the increase of partition thresholds on the segment
+        // and the content. Avoid the increase for superblocks which have
+        // high source sad, unless the whole frame has very high motion
+        // (i.e, cpi->rc.avg_source_sad is very large, in which case all blocks
+        // have high source sad).
+      } else if (cm->width * cm->height > 640 * 480 && segment_id == 0 &&
+                 (source_sad != kHighSad || cpi->rc.avg_source_sad > 50000)) {
         thresholds[0] = (3 * thresholds[0]) >> 1;
         thresholds[3] = INT32_MAX;
-        if (current_qindex >= QINDEX_LARGE_BLOCK_THR) {
-          thresholds[1] <<= 1;
-          thresholds[2] <<= 1;
+        if (current_qindex > QINDEX_LARGE_BLOCK_THR) {
+          thresholds[1] = (int)((1 - weight) * (thresholds[1] << 1) +
+                                weight * thresholds[1]);
+          thresholds[2] = (int)((1 - weight) * (thresholds[2] << 1) +
+                                weight * thresholds[2]);
         }
-      } else if (current_qindex > QINDEX_LARGE_BLOCK_THR && segment_id == 0) {
-        thresholds[1] <<= 2;
-        thresholds[2] <<= 5;
+      } else if (current_qindex > QINDEX_LARGE_BLOCK_THR && segment_id == 0 &&
+                 (source_sad != kHighSad || cpi->rc.avg_source_sad > 50000)) {
+        thresholds[1] =
+            (int)((1 - weight) * (thresholds[1] << 2) + weight * thresholds[1]);
+        thresholds[2] =
+            (int)((1 - weight) * (thresholds[2] << 4) + weight * thresholds[2]);
         thresholds[3] = INT32_MAX;
       }
     }
@@ -605,7 +636,7 @@
         xd->mi[0]->mv[0].as_mv.col > -mv_thr &&
         xd->mi[0]->mv[0].as_mv.row < mv_thr &&
         xd->mi[0]->mv[0].as_mv.row > -mv_thr))) {
-    const int is_small_sb = (cm->seq_params.sb_size == BLOCK_64X64);
+    const int is_small_sb = (cm->seq_params->sb_size == BLOCK_64X64);
     if (is_small_sb)
       set_low_temp_var_flag_64x64(&cm->mi_params, part_info, xd,
                                   &(vt->split[0]), thresholds, mi_col, mi_row);
@@ -621,7 +652,8 @@
   if (sf->part_sf.partition_search_type != VAR_BASED_PARTITION) {
     return;
   } else {
-    set_vbp_thresholds(cpi, cpi->vbp_info.thresholds, q, content_lowsumdiff, 0);
+    set_vbp_thresholds(cpi, cpi->vbp_info.thresholds, q, content_lowsumdiff, 0,
+                       0);
     // The threshold below is not changed locally.
     cpi->vbp_info.threshold_minmax = 15 + (q >> 3);
   }
@@ -643,10 +675,17 @@
         get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
 
     if (bs != BLOCK_INVALID)
-      uv_sad = cpi->fn_ptr[bs].sdf(p->src.buf, p->src.stride, pd->dst.buf,
-                                   pd->dst.stride);
+      uv_sad = cpi->ppi->fn_ptr[bs].sdf(p->src.buf, p->src.stride, pd->dst.buf,
+                                        pd->dst.stride);
 
-    x->color_sensitivity[i - 1] = uv_sad > (y_sad >> 2);
+    if (uv_sad > (y_sad >> 1))
+      x->color_sensitivity_sb[i - 1] = 1;
+    else if (uv_sad < (y_sad >> 3))
+      x->color_sensitivity_sb[i - 1] = 0;
+    // Borderline case: to be refined at coding block level in nonrd_pickmode,
+    // for coding block size < sb_size.
+    else
+      x->color_sensitivity_sb[i - 1] = 2;
   }
 }
 
@@ -658,7 +697,7 @@
   AV1_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
   const int is_key_frame = frame_is_intra_only(cm);
-  const int is_small_sb = (cm->seq_params.sb_size == BLOCK_64X64);
+  const int is_small_sb = (cm->seq_params->sb_size == BLOCK_64X64);
   const int num_64x64_blocks = is_small_sb ? 1 : 4;
   // TODO(kyslov) Bring back compute_minmax_variance with content type detection
   const int compute_minmax_variance = 0;
@@ -772,7 +811,7 @@
   AV1_COMMON *const cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
   const int num_planes = av1_num_planes(cm);
-  const int is_small_sb = (cm->seq_params.sb_size == BLOCK_64X64);
+  const int is_small_sb = (cm->seq_params->sb_size == BLOCK_64X64);
   BLOCK_SIZE bsize = is_small_sb ? BLOCK_64X64 : BLOCK_128X128;
   // TODO(kyslov): we are assuming that the ref is LAST_FRAME! Check if it
   // is!!
@@ -783,13 +822,12 @@
 
   // For non-SVC GOLDEN is another temporal reference. Check if it should be
   // used as reference for partitioning.
-  if (!cpi->use_svc && (cpi->ref_frame_flags & AOM_GOLD_FLAG) &&
-      cpi->sf.rt_sf.use_nonrd_pick_mode) {
+  if (!cpi->ppi->use_svc && (cpi->ref_frame_flags & AOM_GOLD_FLAG)) {
     yv12_g = get_ref_frame_yv12_buf(cm, GOLDEN_FRAME);
     if (yv12_g && yv12_g != yv12) {
       av1_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
                            get_ref_scale_factors(cm, GOLDEN_FRAME), num_planes);
-      *y_sad_g = cpi->fn_ptr[bsize].sdf(
+      *y_sad_g = cpi->ppi->fn_ptr[bsize].sdf(
           x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].pre[0].buf,
           xd->plane[0].pre[0].stride);
     }
@@ -799,25 +837,24 @@
                        get_ref_scale_factors(cm, LAST_FRAME), num_planes);
   mi->ref_frame[0] = LAST_FRAME;
   mi->ref_frame[1] = NONE_FRAME;
-  mi->bsize = cm->seq_params.sb_size;
+  mi->bsize = cm->seq_params->sb_size;
   mi->mv[0].as_int = 0;
   mi->interp_filters = av1_broadcast_interp_filter(BILINEAR);
   if (cpi->sf.rt_sf.estimate_motion_for_var_based_partition) {
     if (xd->mb_to_right_edge >= 0 && xd->mb_to_bottom_edge >= 0) {
       const MV dummy_mv = { 0, 0 };
-      *y_sad = av1_int_pro_motion_estimation(cpi, x, cm->seq_params.sb_size,
+      *y_sad = av1_int_pro_motion_estimation(cpi, x, cm->seq_params->sb_size,
                                              mi_row, mi_col, &dummy_mv);
     }
   }
   if (*y_sad == UINT_MAX) {
-    *y_sad = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, x->plane[0].src.stride,
-                                    xd->plane[0].pre[0].buf,
-                                    xd->plane[0].pre[0].stride);
+    *y_sad = cpi->ppi->fn_ptr[bsize].sdf(
+        x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].pre[0].buf,
+        xd->plane[0].pre[0].stride);
   }
 
   // Pick the ref frame for partitioning, use golden frame only if its
   // lower sad.
-  aom_clear_system_state();
   if (*y_sad_g < 0.9 * *y_sad) {
     av1_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
                          get_ref_scale_factors(cm, GOLDEN_FRAME), num_planes);
@@ -834,7 +871,7 @@
 
   set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
   av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL,
-                                cm->seq_params.sb_size, AOM_PLANE_Y,
+                                cm->seq_params->sb_size, AOM_PLANE_Y,
                                 AOM_PLANE_Y);
 }
 
@@ -849,7 +886,6 @@
   VP128x128 *vt;
   VP16x16 *vt2 = NULL;
   unsigned char force_split[85];
-  int avg_32x32;
   int avg_64x64;
   int max_var_32x32[4];
   int min_var_32x32[4];
@@ -869,12 +905,12 @@
 
   int is_key_frame =
       (frame_is_intra_only(cm) ||
-       (cpi->use_svc &&
+       (cpi->ppi->use_svc &&
         cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame));
 
-  assert(cm->seq_params.sb_size == BLOCK_64X64 ||
-         cm->seq_params.sb_size == BLOCK_128X128);
-  const int is_small_sb = (cm->seq_params.sb_size == BLOCK_64X64);
+  assert(cm->seq_params->sb_size == BLOCK_64X64 ||
+         cm->seq_params->sb_size == BLOCK_128X128);
+  const int is_small_sb = (cm->seq_params->sb_size == BLOCK_64X64);
   const int num_64x64_blocks = is_small_sb ? 1 : 4;
 
   unsigned int y_sad = UINT_MAX;
@@ -900,10 +936,12 @@
       cyclic_refresh_segment_id_boosted(segment_id) &&
       cpi->sf.rt_sf.use_nonrd_pick_mode) {
     int q = av1_get_qindex(&cm->seg, segment_id, cm->quant_params.base_qindex);
-    set_vbp_thresholds(cpi, thresholds, q, x->content_state_sb.low_sumdiff, 1);
+    set_vbp_thresholds(cpi, thresholds, q, x->content_state_sb.low_sumdiff,
+                       x->content_state_sb.source_sad, 1);
   } else {
     set_vbp_thresholds(cpi, thresholds, cm->quant_params.base_qindex,
-                       x->content_state_sb.low_sumdiff, 0);
+                       x->content_state_sb.low_sumdiff,
+                       x->content_state_sb.source_sad, 0);
   }
 
   // For non keyframes, disable 4x4 average for low resolution when speed = 8
@@ -955,7 +993,6 @@
 
   avg_64x64 = 0;
   for (m = 0; m < num_64x64_blocks; ++m) {
-    avg_32x32 = 0;
     max_var_32x32[m] = 0;
     min_var_32x32[m] = INT_MAX;
     const int m2 = m << 2;
@@ -1008,7 +1045,6 @@
           force_split[m + 1] = 1;
           force_split[0] = 1;
         }
-        avg_32x32 += var_32x32;
       }
     }
     if (!force_split[1 + m]) {
@@ -1025,7 +1061,7 @@
       if (!is_key_frame &&
           (max_var_32x32[m] - min_var_32x32[m]) > 3 * (thresholds[1] >> 3) &&
           max_var_32x32[m] > thresholds[1] >> 1 &&
-          (noise_level >= kMedium || cpi->use_svc ||
+          (noise_level >= kMedium || cpi->ppi->use_svc ||
            cpi->sf.rt_sf.force_large_partition_blocks ||
            !cpi->sf.rt_sf.use_nonrd_pick_mode)) {
         force_split[1 + m] = 1;
diff --git a/av1/encoder/x86/av1_fwd_txfm2d_avx2.c b/av1/encoder/x86/av1_fwd_txfm2d_avx2.c
index 634d50b..3c9191e 100644
--- a/av1/encoder/x86/av1_fwd_txfm2d_avx2.c
+++ b/av1/encoder/x86/av1_fwd_txfm2d_avx2.c
@@ -1704,8 +1704,8 @@
     }
     fdct64_new_avx2(bufA, bufA, cos_bit_row);
     fdct64_new_avx2(bufB, bufB, cos_bit_row);
-    av1_round_shift_array_32_avx2(bufA, bufA, 32, -shift[2]);
-    av1_round_shift_array_32_avx2(bufB, bufB, 32, -shift[2]);
+    round_shift_array_32_avx2(bufA, bufA, 32, -shift[2]);
+    round_shift_array_32_avx2(bufB, bufB, 32, -shift[2]);
 
     int32_t *output8 = output + 16 * 32 * i;
     for (int j = 0; j < 4; ++j) {
@@ -1843,8 +1843,8 @@
     }
     fdct64_new_avx2(bufA, bufA, cos_bit_row);
     fdct64_new_avx2(bufB, bufB, cos_bit_row);
-    av1_round_shift_rect_array_32_avx2(bufA, bufA, 32, -shift[2], NewSqrt2);
-    av1_round_shift_rect_array_32_avx2(bufB, bufB, 32, -shift[2], NewSqrt2);
+    round_shift_rect_array_32_avx2(bufA, bufA, 32, -shift[2], NewSqrt2);
+    round_shift_rect_array_32_avx2(bufB, bufB, 32, -shift[2], NewSqrt2);
 
     int32_t *output8 = output + 16 * 32 * i;
     for (int j = 0; j < 4; ++j) {
@@ -1893,8 +1893,8 @@
     }
     fdct32_avx2(bufA, bufA, cos_bit_row);
     fdct32_avx2(bufB, bufB, cos_bit_row);
-    av1_round_shift_rect_array_32_avx2(bufA, bufA, 32, -shift[2], NewSqrt2);
-    av1_round_shift_rect_array_32_avx2(bufB, bufB, 32, -shift[2], NewSqrt2);
+    round_shift_rect_array_32_avx2(bufA, bufA, 32, -shift[2], NewSqrt2);
+    round_shift_rect_array_32_avx2(bufB, bufB, 32, -shift[2], NewSqrt2);
 
     int32_t *output8 = output + 16 * 32 * i;
     for (int j = 0; j < 4; ++j) {
diff --git a/av1/encoder/x86/av1_quantize_avx2.c b/av1/encoder/x86/av1_quantize_avx2.c
index f5f7ee1..591edd7 100644
--- a/av1/encoder/x86/av1_quantize_avx2.c
+++ b/av1/encoder/x86/av1_quantize_avx2.c
@@ -154,22 +154,18 @@
   return _mm_extract_epi16(eob, 1);
 }
 
-static INLINE void store_zero_tran_low(int16_t *a) {
-  const __m256i zero = _mm256_setzero_si256();
-  _mm256_storeu_si256((__m256i *)(a), zero);
-}
-
 void av1_quantize_lp_avx2(const int16_t *coeff_ptr, intptr_t n_coeffs,
                           const int16_t *round_ptr, const int16_t *quant_ptr,
                           int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
                           const int16_t *dequant_ptr, uint16_t *eob_ptr,
-                          const int16_t *scan) {
+                          const int16_t *scan, const int16_t *iscan) {
+  (void)scan;
   __m128i eob;
   __m256i round256, quant256, dequant256;
-  __m256i eob256, thr256;
+  __m256i eob256;
 
   coeff_ptr += n_coeffs;
-  scan += n_coeffs;
+  iscan += n_coeffs;
   qcoeff_ptr += n_coeffs;
   dqcoeff_ptr += n_coeffs;
   n_coeffs = -n_coeffs;
@@ -205,7 +201,7 @@
       _mm256_storeu_si256((__m256i *)(dqcoeff_ptr + n_coeffs), coeff256);
     }
 
-    eob256 = scan_eob_256((const __m256i *)(scan + n_coeffs), &coeff256);
+    eob256 = scan_eob_256((const __m256i *)(iscan + n_coeffs), &coeff256);
     n_coeffs += 8 * 2;
   }
 
@@ -214,30 +210,22 @@
   quant256 = _mm256_permute2x128_si256(quant256, quant256, 0x31);
   round256 = _mm256_permute2x128_si256(round256, round256, 0x31);
 
-  thr256 = _mm256_srai_epi16(dequant256, 1);
-
   // AC only loop
   while (n_coeffs < 0) {
     __m256i coeff256 =
         _mm256_loadu_si256((const __m256i *)(coeff_ptr + n_coeffs));
     __m256i qcoeff256 = _mm256_abs_epi16(coeff256);
-    int32_t nzflag =
-        _mm256_movemask_epi8(_mm256_cmpgt_epi16(qcoeff256, thr256));
 
-    if (nzflag) {
-      __m256i qtmp256;
-      qcoeff256 = _mm256_adds_epi16(qcoeff256, round256);
-      qtmp256 = _mm256_mulhi_epi16(qcoeff256, quant256);
-      qcoeff256 = _mm256_sign_epi16(qtmp256, coeff256);
-      _mm256_storeu_si256((__m256i *)(qcoeff_ptr + n_coeffs), qcoeff256);
-      coeff256 = _mm256_mullo_epi16(qcoeff256, dequant256);
-      _mm256_storeu_si256((__m256i *)(dqcoeff_ptr + n_coeffs), coeff256);
-      eob256 = _mm256_max_epi16(
-          eob256, scan_eob_256((const __m256i *)(scan + n_coeffs), &coeff256));
-    } else {
-      store_zero_tran_low(qcoeff_ptr + n_coeffs);
-      store_zero_tran_low(dqcoeff_ptr + n_coeffs);
-    }
+    __m256i qtmp256;
+    qcoeff256 = _mm256_adds_epi16(qcoeff256, round256);
+    qtmp256 = _mm256_mulhi_epi16(qcoeff256, quant256);
+    qcoeff256 = _mm256_sign_epi16(qtmp256, coeff256);
+    _mm256_storeu_si256((__m256i *)(qcoeff_ptr + n_coeffs), qcoeff256);
+    coeff256 = _mm256_mullo_epi16(qcoeff256, dequant256);
+    _mm256_storeu_si256((__m256i *)(dqcoeff_ptr + n_coeffs), coeff256);
+    eob256 = _mm256_max_epi16(
+        eob256, scan_eob_256((const __m256i *)(iscan + n_coeffs), &coeff256));
+
     n_coeffs += 8 * 2;
   }
 
diff --git a/av1/encoder/x86/av1_quantize_sse2.c b/av1/encoder/x86/av1_quantize_sse2.c
index 5497c7e..b533894 100644
--- a/av1/encoder/x86/av1_quantize_sse2.c
+++ b/av1/encoder/x86/av1_quantize_sse2.c
@@ -15,6 +15,7 @@
 #include "config/av1_rtcd.h"
 
 #include "aom/aom_integer.h"
+#include "aom_dsp/x86/quantize_x86.h"
 
 static INLINE void read_coeff(const tran_low_t *coeff, intptr_t offset,
                               __m128i *c0, __m128i *c1) {
@@ -187,3 +188,102 @@
     *eob_ptr = _mm_extract_epi16(eob, 1);
   }
 }
+
+static INLINE void quantize_lp(const int16_t *iscan_ptr,
+                               const int16_t *coeff_ptr, intptr_t n_coeffs,
+                               int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+                               const __m128i *round0, const __m128i *round1,
+                               const __m128i *quant0, const __m128i *quant1,
+                               const __m128i *dequant0, const __m128i *dequant1,
+                               __m128i *eob) {
+  const int16_t *read = coeff_ptr + n_coeffs;
+  __m128i coeff0 = _mm_load_si128((const __m128i *)read);
+  __m128i coeff1 = _mm_load_si128((const __m128i *)read + 1);
+
+  // Poor man's sign extract
+  const __m128i coeff0_sign = _mm_srai_epi16(coeff0, 15);
+  const __m128i coeff1_sign = _mm_srai_epi16(coeff1, 15);
+  __m128i qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+  __m128i qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+  qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+  qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+  qcoeff0 = _mm_adds_epi16(qcoeff0, *round0);
+  qcoeff1 = _mm_adds_epi16(qcoeff1, *round1);
+  const __m128i qtmp0 = _mm_mulhi_epi16(qcoeff0, *quant0);
+  const __m128i qtmp1 = _mm_mulhi_epi16(qcoeff1, *quant1);
+
+  // Reinsert signs
+  qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+  qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+  qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+  qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+  int16_t *addr = qcoeff_ptr + n_coeffs;
+  _mm_store_si128((__m128i *)addr, qcoeff0);
+  _mm_store_si128((__m128i *)addr + 1, qcoeff1);
+
+  coeff0 = _mm_mullo_epi16(qcoeff0, *dequant0);
+  coeff1 = _mm_mullo_epi16(qcoeff1, *dequant1);
+
+  addr = dqcoeff_ptr + n_coeffs;
+  _mm_store_si128((__m128i *)addr, coeff0);
+  _mm_store_si128((__m128i *)addr + 1, coeff1);
+
+  const __m128i zero = _mm_setzero_si128();
+  // Scan for eob
+  const __m128i zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+  const __m128i zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+  const __m128i nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+  const __m128i nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+
+  const __m128i iscan0 =
+      _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
+  const __m128i iscan1 =
+      _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
+
+  // Add one to convert from indices to counts
+  const __m128i iscan0_nz = _mm_sub_epi16(iscan0, nzero_coeff0);
+  const __m128i iscan1_nz = _mm_sub_epi16(iscan1, nzero_coeff1);
+  const __m128i eob0 = _mm_and_si128(iscan0_nz, nzero_coeff0);
+  const __m128i eob1 = _mm_and_si128(iscan1_nz, nzero_coeff1);
+  const __m128i eob2 = _mm_max_epi16(eob0, eob1);
+  *eob = _mm_max_epi16(*eob, eob2);
+}
+
+void av1_quantize_lp_sse2(const int16_t *coeff_ptr, intptr_t n_coeffs,
+                          const int16_t *round_ptr, const int16_t *quant_ptr,
+                          int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+                          const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                          const int16_t *scan, const int16_t *iscan) {
+  (void)scan;
+  coeff_ptr += n_coeffs;
+  iscan += n_coeffs;
+  qcoeff_ptr += n_coeffs;
+  dqcoeff_ptr += n_coeffs;
+  n_coeffs = -n_coeffs;
+
+  // Setup global values
+  const __m128i round0 = _mm_load_si128((const __m128i *)round_ptr);
+  const __m128i round1 = _mm_unpackhi_epi64(round0, round0);
+  const __m128i quant0 = _mm_load_si128((const __m128i *)quant_ptr);
+  const __m128i quant1 = _mm_unpackhi_epi64(quant0, quant0);
+  const __m128i dequant0 = _mm_load_si128((const __m128i *)dequant_ptr);
+  const __m128i dequant1 = _mm_unpackhi_epi64(dequant0, dequant0);
+  __m128i eob = _mm_setzero_si128();
+
+  // DC and first 15 AC
+  quantize_lp(iscan, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr, &round0,
+              &round1, &quant0, &quant1, &dequant0, &dequant1, &eob);
+  n_coeffs += 8 * 2;
+
+  // AC only loop
+  while (n_coeffs < 0) {
+    quantize_lp(iscan, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr, &round1,
+                &round1, &quant1, &quant1, &dequant1, &dequant1, &eob);
+    n_coeffs += 8 * 2;
+  }
+
+  // Accumulate EOB
+  *eob_ptr = accumulate_eob(eob);
+}
diff --git a/av1/encoder/x86/corner_match_avx2.c b/av1/encoder/x86/corner_match_avx2.c
index 8d7eb3f..033ae37 100644
--- a/av1/encoder/x86/corner_match_avx2.c
+++ b/av1/encoder/x86/corner_match_avx2.c
@@ -15,7 +15,6 @@
 #include "config/av1_rtcd.h"
 
 #include "aom_ports/mem.h"
-#include "aom_ports/system_state.h"
 #include "av1/encoder/corner_match.h"
 
 DECLARE_ALIGNED(16, static const uint8_t,
@@ -76,6 +75,5 @@
 
   int var2 = sumsq2_acc * MATCH_SZ_SQ - sum2_acc * sum2_acc;
   int cov = cross_acc * MATCH_SZ_SQ - sum1_acc * sum2_acc;
-  aom_clear_system_state();
   return cov / sqrt((double)var2);
 }
diff --git a/av1/encoder/x86/corner_match_sse4.c b/av1/encoder/x86/corner_match_sse4.c
index 5c9ca20..1a879da 100644
--- a/av1/encoder/x86/corner_match_sse4.c
+++ b/av1/encoder/x86/corner_match_sse4.c
@@ -19,7 +19,6 @@
 #include "config/av1_rtcd.h"
 
 #include "aom_ports/mem.h"
-#include "aom_ports/system_state.h"
 #include "av1/encoder/corner_match.h"
 
 DECLARE_ALIGNED(16, static const uint8_t,
@@ -100,6 +99,5 @@
 
   int var2 = sumsq2 * MATCH_SZ_SQ - sum2 * sum2;
   int cov = cross * MATCH_SZ_SQ - sum1 * sum2;
-  aom_clear_system_state();
   return cov / sqrt((double)var2);
 }
diff --git a/av1/encoder/x86/error_intrin_sse2.c b/av1/encoder/x86/error_intrin_sse2.c
new file mode 100644
index 0000000..e876db1
--- /dev/null
+++ b/av1/encoder/x86/error_intrin_sse2.c
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>  // SSE2
+
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+
+static AOM_INLINE __m128i reduce_sum_epi64(__m128i reg) {
+  __m128i reg_hi = _mm_srli_si128(reg, 8);
+  reg = _mm_add_epi64(reg, reg_hi);
+
+  return reg;
+}
+
+int64_t av1_block_error_lp_sse2(const int16_t *coeff, const int16_t *dqcoeff,
+                                intptr_t block_size) {
+  assert(block_size % 16 == 0);
+  assert(block_size >= 16);
+
+  const __m128i zero = _mm_setzero_si128();
+  __m128i accum_0 = zero;
+  __m128i accum_1 = zero;
+
+  for (int i = 0; i < block_size; i += 16) {
+    // Load 8 elements for coeff and dqcoeff.
+    const __m128i _coeff_0 = _mm_loadu_si128((const __m128i *)coeff);
+    const __m128i _coeff_1 = _mm_loadu_si128((const __m128i *)(coeff + 8));
+    const __m128i _dqcoeff_0 = _mm_loadu_si128((const __m128i *)dqcoeff);
+    const __m128i _dqcoeff_1 = _mm_loadu_si128((const __m128i *)(dqcoeff + 8));
+    // Compute the diff
+    const __m128i diff_0 = _mm_sub_epi16(_dqcoeff_0, _coeff_0);
+    const __m128i diff_1 = _mm_sub_epi16(_dqcoeff_1, _coeff_1);
+    // Compute the error
+    const __m128i error_0 = _mm_madd_epi16(diff_0, diff_0);
+    const __m128i error_1 = _mm_madd_epi16(diff_1, diff_1);
+
+    const __m128i error_lo_0 = _mm_unpacklo_epi32(error_0, zero);
+    const __m128i error_lo_1 = _mm_unpacklo_epi32(error_1, zero);
+    const __m128i error_hi_0 = _mm_unpackhi_epi32(error_0, zero);
+    const __m128i error_hi_1 = _mm_unpackhi_epi32(error_1, zero);
+
+    // Accumulate
+    accum_0 = _mm_add_epi64(accum_0, error_lo_0);
+    accum_1 = _mm_add_epi64(accum_1, error_lo_1);
+    accum_0 = _mm_add_epi64(accum_0, error_hi_0);
+    accum_1 = _mm_add_epi64(accum_1, error_hi_1);
+
+    // Advance
+    coeff += 16;
+    dqcoeff += 16;
+  }
+
+  __m128i accum = _mm_add_epi64(accum_0, accum_1);
+  // Reduce sum the register
+  accum = reduce_sum_epi64(accum);
+
+  // Store the results.
+#if ARCH_X86_64
+  return _mm_cvtsi128_si64(accum);
+#else
+  int64_t result;
+  _mm_storel_epi64((__m128i *)&result, accum);
+  return result;
+#endif  // ARCH_X86_64
+}
diff --git a/av1/encoder/x86/highbd_fwd_txfm_avx2.c b/av1/encoder/x86/highbd_fwd_txfm_avx2.c
index a81378c..1faa412 100644
--- a/av1/encoder/x86/highbd_fwd_txfm_avx2.c
+++ b/av1/encoder/x86/highbd_fwd_txfm_avx2.c
@@ -1335,7 +1335,7 @@
   row_txfm(in, out, bit, 2, 2);
   fwd_txfm_transpose_8x8_avx2(out, in, 2, 1);
   fwd_txfm_transpose_8x8_avx2(&out[1], &in[8], 2, 1);
-  av1_round_shift_rect_array_32_avx2(in, in, 16, -shift[2], NewSqrt2);
+  round_shift_rect_array_32_avx2(in, in, 16, -shift[2], NewSqrt2);
   store_buffer_avx2(in, coeff, 8, 16);
   (void)bd;
 }
@@ -1396,7 +1396,7 @@
   row_txfm(in, out, bit, 1, 1);
   fwd_txfm_transpose_8x8_avx2(out, in, 1, 2);
   fwd_txfm_transpose_8x8_avx2(&out[8], &in[1], 1, 2);
-  av1_round_shift_rect_array_32_avx2(in, in, 16, -shift[2], NewSqrt2);
+  round_shift_rect_array_32_avx2(in, in, 16, -shift[2], NewSqrt2);
   store_buffer_avx2(in, coeff, 8, 16);
   (void)bd;
 }
diff --git a/av1/encoder/x86/highbd_fwd_txfm_sse4.c b/av1/encoder/x86/highbd_fwd_txfm_sse4.c
index 9a0a36c1..73f9b44 100644
--- a/av1/encoder/x86/highbd_fwd_txfm_sse4.c
+++ b/av1/encoder/x86/highbd_fwd_txfm_sse4.c
@@ -11,16 +11,70 @@
 #include <assert.h>
 #include <smmintrin.h> /* SSE4.1 */
 
-#include "config/aom_config.h"
-#include "config/av1_rtcd.h"
-
+#include "aom_dsp/txfm_common.h"
+#include "aom_dsp/x86/transpose_sse2.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+#include "aom_ports/mem.h"
 #include "av1/common/av1_txfm.h"
 #include "av1/common/x86/highbd_txfm_utility_sse4.h"
 #include "av1/encoder/av1_fwd_txfm1d_cfg.h"
 #include "av1/encoder/x86/av1_txfm1d_sse4.h"
-#include "aom_dsp/txfm_common.h"
-#include "aom_dsp/x86/txfm_common_sse2.h"
-#include "aom_ports/mem.h"
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+void av1_fwht4x4_sse4_1(const int16_t *input, tran_low_t *output, int stride) {
+  __m128i in[4];
+  in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+  in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+  in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
+  in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
+
+  // Convert to int32_t.
+  __m128i op[4];
+  op[0] = _mm_cvtepi16_epi32(in[0]);
+  op[1] = _mm_cvtepi16_epi32(in[1]);
+  op[2] = _mm_cvtepi16_epi32(in[2]);
+  op[3] = _mm_cvtepi16_epi32(in[3]);
+
+  for (int i = 0; i < 2; ++i) {
+    __m128i a1 = op[0];
+    __m128i b1 = op[1];
+    __m128i c1 = op[2];
+    __m128i d1 = op[3];
+    __m128i e1;
+
+    a1 = _mm_add_epi32(a1, b1);  // a1 += b1
+    d1 = _mm_sub_epi32(d1, c1);  // d1 = d1 - c1
+    e1 = _mm_sub_epi32(a1, d1);  // e1 = (a1 - d1) >> 1
+    e1 = _mm_srai_epi32(e1, 1);
+    b1 = _mm_sub_epi32(e1, b1);  // b1 = e1 - b1
+    c1 = _mm_sub_epi32(e1, c1);  // c1 = e1 - c1
+    a1 = _mm_sub_epi32(a1, c1);  // a1 -= c1
+    d1 = _mm_add_epi32(d1, b1);  // d1 += b1
+
+    op[0] = a1;
+    op[1] = c1;
+    op[2] = d1;
+    op[3] = b1;
+
+    transpose_32bit_4x4(op, op);
+  }
+
+  op[0] = _mm_slli_epi32(op[0], UNIT_QUANT_SHIFT);
+  op[1] = _mm_slli_epi32(op[1], UNIT_QUANT_SHIFT);
+  op[2] = _mm_slli_epi32(op[2], UNIT_QUANT_SHIFT);
+  op[3] = _mm_slli_epi32(op[3], UNIT_QUANT_SHIFT);
+
+  _mm_storeu_si128((__m128i *)(output + 0), op[0]);
+  _mm_storeu_si128((__m128i *)(output + 4), op[1]);
+  _mm_storeu_si128((__m128i *)(output + 8), op[2]);
+  _mm_storeu_si128((__m128i *)(output + 12), op[3]);
+}
+
+void av1_highbd_fwht4x4_sse4_1(const int16_t *input, tran_low_t *output,
+                               int stride) {
+  av1_fwht4x4_sse4_1(input, output, stride);
+}
 
 static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
                                    int stride, int flipud, int fliplr,
diff --git a/av1/encoder/x86/highbd_temporal_filter_avx2.c b/av1/encoder/x86/highbd_temporal_filter_avx2.c
index b5477ec..68509fa 100644
--- a/av1/encoder/x86/highbd_temporal_filter_avx2.c
+++ b/av1/encoder/x86/highbd_temporal_filter_avx2.c
@@ -352,10 +352,16 @@
                                    TF_SEARCH_ERROR_NORM_WEIGHT);
   const double weight_factor =
       (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor;
-  // Decay factors for non-local mean approach.
-  // Smaller q -> smaller filtering weight.
+  // Adjust filtering based on q.
+  // Larger q -> stronger filtering -> larger weight.
+  // Smaller q -> weaker filtering -> smaller weight.
   double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2);
   q_decay = CLIP(q_decay, 1e-5, 1);
+  if (q_factor >= TF_QINDEX_CUTOFF) {
+    // Max q_factor is 255, therefore the upper bound of q_decay is 8.
+    // We do not need a clip here.
+    q_decay = 0.5 * pow((double)q_factor / 64, 2);
+  }
   // Smaller strength -> smaller filtering weight.
   double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
   s_decay = CLIP(s_decay, 1e-5, 1);
@@ -393,6 +399,7 @@
     const double inv_num_ref_pixels = 1.0 / num_ref_pixels;
     // Larger noise -> larger filtering weight.
     const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0);
+    // Decay factors for non-local mean approach.
     const double decay_factor = 1 / (n_decay * q_decay * s_decay);
 
     // Filter U-plane and V-plane using Y-plane. This is because motion
diff --git a/av1/encoder/x86/highbd_temporal_filter_sse2.c b/av1/encoder/x86/highbd_temporal_filter_sse2.c
index bbb3771..1bfdaf7 100644
--- a/av1/encoder/x86/highbd_temporal_filter_sse2.c
+++ b/av1/encoder/x86/highbd_temporal_filter_sse2.c
@@ -227,10 +227,16 @@
                                    TF_SEARCH_ERROR_NORM_WEIGHT);
   const double weight_factor =
       (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor;
-  // Decay factors for non-local mean approach.
-  // Smaller q -> smaller filtering weight.
+  // Adjust filtering based on q.
+  // Larger q -> stronger filtering -> larger weight.
+  // Smaller q -> weaker filtering -> smaller weight.
   double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2);
   q_decay = CLIP(q_decay, 1e-5, 1);
+  if (q_factor >= TF_QINDEX_CUTOFF) {
+    // Max q_factor is 255, therefore the upper bound of q_decay is 8.
+    // We do not need a clip here.
+    q_decay = 0.5 * pow((double)q_factor / 64, 2);
+  }
   // Smaller strength -> smaller filtering weight.
   double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
   s_decay = CLIP(s_decay, 1e-5, 1);
@@ -268,6 +274,7 @@
     const double inv_num_ref_pixels = 1.0 / num_ref_pixels;
     // Larger noise -> larger filtering weight.
     const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0);
+    // Decay factors for non-local mean approach.
     const double decay_factor = 1 / (n_decay * q_decay * s_decay);
 
     // Filter U-plane and V-plane using Y-plane. This is because motion
diff --git a/av1/encoder/x86/ml_sse3.c b/av1/encoder/x86/ml_sse3.c
index 89b1e6a..ab69088 100644
--- a/av1/encoder/x86/ml_sse3.c
+++ b/av1/encoder/x86/ml_sse3.c
@@ -242,3 +242,95 @@
   }
   if (reduce_prec) av1_nn_output_prec_reduce(output, nn_config->num_outputs);
 }
+
+// Based on N. N. Schraudolph. A Fast, Compact Approximation of the Exponential
+// Function. Neural Computation, 11(4):853–862, 1999.
+static AOM_INLINE __m128 approx_exp(__m128 y) {
+#define A ((1 << 23) / 0.69314718056f)  // (1 << 23) / ln(2)
+#define B \
+  127  // Offset for the exponent according to IEEE floating point standard.
+#define C 60801  // Magic number controls the accuracy of approximation
+  const __m128 multiplier = _mm_set1_ps(A);
+  const __m128i offset = _mm_set1_epi32(B * (1 << 23) - C);
+
+  y = _mm_mul_ps(y, multiplier);
+  y = _mm_castsi128_ps(_mm_add_epi32(_mm_cvtps_epi32(y), offset));
+  return y;
+#undef A
+#undef B
+#undef C
+}
+
+static AOM_INLINE __m128 reduce_max(__m128 reg) {
+  __m128 tmp_reg;
+
+  tmp_reg = _mm_shuffle_ps(reg, reg, 0x4e);  // 01 00 11 10
+  reg = _mm_max_ps(reg, tmp_reg);
+
+  tmp_reg = _mm_shuffle_ps(reg, reg, 0xb1);  // 10 11 00 01
+  reg = _mm_max_ps(reg, tmp_reg);
+
+  return reg;
+}
+
+static AOM_INLINE __m128 reduce_sum(__m128 reg) {
+  __m128 tmp_reg;
+
+  tmp_reg = _mm_shuffle_ps(reg, reg, 0x4e);  // 01 00 11 10
+  reg = _mm_add_ps(reg, tmp_reg);
+
+  tmp_reg = _mm_shuffle_ps(reg, reg, 0xb1);  // 10 11 00 01
+  reg = _mm_add_ps(reg, tmp_reg);
+
+  return reg;
+}
+
+void av1_nn_fast_softmax_16_sse3(const float *input, float *output) {
+  // Clips at -10 to avoid underflowing
+  const __m128 clipper = _mm_set1_ps(-10.0f);
+
+  // Load in 16 values
+  __m128 in_0 = _mm_loadu_ps(&input[0]);
+  __m128 in_1 = _mm_loadu_ps(&input[4]);
+  __m128 in_2 = _mm_loadu_ps(&input[8]);
+  __m128 in_3 = _mm_loadu_ps(&input[12]);
+
+  // Get the max
+  __m128 max_0 = _mm_max_ps(in_0, in_1);
+  __m128 max_1 = _mm_max_ps(in_2, in_3);
+
+  max_0 = _mm_max_ps(max_0, max_1);
+  max_0 = reduce_max(max_0);
+
+  // Subtract the max off and clip
+  in_0 = _mm_sub_ps(in_0, max_0);
+  in_1 = _mm_sub_ps(in_1, max_0);
+  in_2 = _mm_sub_ps(in_2, max_0);
+  in_3 = _mm_sub_ps(in_3, max_0);
+
+  in_0 = _mm_max_ps(in_0, clipper);
+  in_1 = _mm_max_ps(in_1, clipper);
+  in_2 = _mm_max_ps(in_2, clipper);
+  in_3 = _mm_max_ps(in_3, clipper);
+
+  // Exponentiate and compute the denominator
+  __m128 sum = in_0 = approx_exp(in_0);
+  in_1 = approx_exp(in_1);
+  sum = _mm_add_ps(sum, in_1);
+  in_2 = approx_exp(in_2);
+  sum = _mm_add_ps(sum, in_2);
+  in_3 = approx_exp(in_3);
+  sum = _mm_add_ps(sum, in_3);
+  sum = reduce_sum(sum);
+
+  // Divide to get the probability
+  in_0 = _mm_div_ps(in_0, sum);
+  in_1 = _mm_div_ps(in_1, sum);
+  in_2 = _mm_div_ps(in_2, sum);
+  in_3 = _mm_div_ps(in_3, sum);
+
+  _mm_storeu_ps(&output[0], in_0);
+  _mm_storeu_ps(&output[4], in_1);
+  _mm_storeu_ps(&output[8], in_2);
+  _mm_storeu_ps(&output[12], in_3);
+}
diff --git a/av1/encoder/x86/rdopt_avx2.c b/av1/encoder/x86/rdopt_avx2.c
index fefc036..3bc763c 100644
--- a/av1/encoder/x86/rdopt_avx2.c
+++ b/av1/encoder/x86/rdopt_avx2.c
@@ -13,7 +13,6 @@
 #include <immintrin.h>
 #include "aom_dsp/x86/mem_sse2.h"
 #include "aom_dsp/x86/synonyms_avx2.h"
-#include "aom_ports/system_state.h"
 
 #include "config/av1_rtcd.h"
 #include "av1/encoder/rdopt.h"
@@ -228,8 +227,6 @@
   int64_t y2_sum = x2_sum - x2_firstcol;
   int64_t z2_sum = x2_sum - x2_firstrow;
 
-  aom_clear_system_state();
-
   const float num_hor = (float)(height * (width - 1));
   const float num_ver = (float)((height - 1) * width);
 
diff --git a/av1/encoder/x86/rdopt_sse4.c b/av1/encoder/x86/rdopt_sse4.c
index 67d94b4..4c4ec1f 100644
--- a/av1/encoder/x86/rdopt_sse4.c
+++ b/av1/encoder/x86/rdopt_sse4.c
@@ -12,7 +12,6 @@
 #include <assert.h>
 #include <emmintrin.h>
 #include "aom_dsp/x86/synonyms.h"
-#include "aom_ports/system_state.h"
 
 #include "config/av1_rtcd.h"
 #include "av1/encoder/rdopt.h"
@@ -246,8 +245,6 @@
   int64_t y2_sum = x2_sum - x2_firstcol;
   int64_t z2_sum = x2_sum - x2_firstrow;
 
-  aom_clear_system_state();
-
   const float num_hor = (float)(height * (width - 1));
   const float num_ver = (float)((height - 1) * width);
 
diff --git a/av1/encoder/x86/reconinter_enc_sse2.c b/av1/encoder/x86/reconinter_enc_sse2.c
new file mode 100644
index 0000000..6455bf3
--- /dev/null
+++ b/av1/encoder/x86/reconinter_enc_sse2.c
@@ -0,0 +1,365 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>  // SSE2
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+#include "aom_dsp/x86/mem_sse2.h"
+#include "aom_dsp/x86/synonyms.h"
+
+#include "av1/common/av1_common_int.h"
+#include "av1/common/blockd.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/obmc.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/encoder/reconinter_enc.h"
+
+void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm,
+                             int mi_row, int mi_col, const MV *const mv,
+                             uint8_t *comp_pred, int width, int height,
+                             int subpel_x_q3, int subpel_y_q3,
+                             const uint8_t *ref, int ref_stride,
+                             int subpel_search) {
+  // expect xd == NULL only in tests
+  if (xd != NULL) {
+    const MB_MODE_INFO *mi = xd->mi[0];
+    const int ref_num = 0;
+    const int is_intrabc = is_intrabc_block(mi);
+    const struct scale_factors *const sf =
+        is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
+    const int is_scaled = av1_is_scaled(sf);
+
+    if (is_scaled) {
+      int plane = 0;
+      const int mi_x = mi_col * MI_SIZE;
+      const int mi_y = mi_row * MI_SIZE;
+      const struct macroblockd_plane *const pd = &xd->plane[plane];
+      const struct buf_2d *const dst_buf = &pd->dst;
+      const struct buf_2d *const pre_buf =
+          is_intrabc ? dst_buf : &pd->pre[ref_num];
+
+      InterPredParams inter_pred_params;
+      inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
+      const int_interpfilters filters =
+          av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+      av1_init_inter_params(
+          &inter_pred_params, width, height, mi_y >> pd->subsampling_y,
+          mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y,
+          xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters);
+      av1_enc_build_one_inter_predictor(comp_pred, width, mv,
+                                        &inter_pred_params);
+      return;
+    }
+  }
+
+  const InterpFilterParams *filter = av1_get_filter(subpel_search);
+  // TODO(yunqing): The 2-tap case uses 4-tap functions since there is no SIMD
+  // for 2-tap yet.
+  int filter_taps = (subpel_search <= USE_4_TAPS) ? 4 : SUBPEL_TAPS;
+
+  if (!subpel_x_q3 && !subpel_y_q3) {
+    if (width >= 16) {
+      int i;
+      assert(!(width & 15));
+      /*Read 16 pixels one row at a time.*/
+      for (i = 0; i < height; i++) {
+        int j;
+        for (j = 0; j < width; j += 16) {
+          xx_storeu_128(comp_pred, xx_loadu_128(ref));
+          comp_pred += 16;
+          ref += 16;
+        }
+        ref += ref_stride - width;
+      }
+    } else if (width >= 8) {
+      int i;
+      assert(!(width & 7));
+      assert(!(height & 1));
+      /*Read 8 pixels two rows at a time.*/
+      for (i = 0; i < height; i += 2) {
+        __m128i s0 = xx_loadl_64(ref + 0 * ref_stride);
+        __m128i s1 = xx_loadl_64(ref + 1 * ref_stride);
+        xx_storeu_128(comp_pred, _mm_unpacklo_epi64(s0, s1));
+        comp_pred += 16;
+        ref += 2 * ref_stride;
+      }
+    } else {
+      int i;
+      assert(!(width & 3));
+      assert(!(height & 3));
+      /*Read 4 pixels four rows at a time.*/
+      for (i = 0; i < height; i++) {
+        const __m128i row0 = xx_loadl_64(ref + 0 * ref_stride);
+        const __m128i row1 = xx_loadl_64(ref + 1 * ref_stride);
+        const __m128i row2 = xx_loadl_64(ref + 2 * ref_stride);
+        const __m128i row3 = xx_loadl_64(ref + 3 * ref_stride);
+        const __m128i reg = _mm_unpacklo_epi64(_mm_unpacklo_epi32(row0, row1),
+                                               _mm_unpacklo_epi32(row2, row3));
+        xx_storeu_128(comp_pred, reg);
+        comp_pred += 16;
+        ref += 4 * ref_stride;
+      }
+    }
+  } else if (!subpel_y_q3) {
+    const int16_t *const kernel =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+    aom_convolve8_horiz(ref, ref_stride, comp_pred, width, kernel, 16, NULL, -1,
+                        width, height);
+  } else if (!subpel_x_q3) {
+    const int16_t *const kernel =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+    aom_convolve8_vert(ref, ref_stride, comp_pred, width, NULL, -1, kernel, 16,
+                       width, height);
+  } else {
+    DECLARE_ALIGNED(16, uint8_t,
+                    temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
+    const int16_t *const kernel_x =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+    const int16_t *const kernel_y =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+    const uint8_t *ref_start = ref - ref_stride * ((filter_taps >> 1) - 1);
+    uint8_t *temp_start_horiz = (subpel_search <= USE_4_TAPS)
+                                    ? temp + (filter_taps >> 1) * MAX_SB_SIZE
+                                    : temp;
+    uint8_t *temp_start_vert = temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1);
+    int intermediate_height =
+        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter_taps;
+    assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
+    aom_convolve8_horiz(ref_start, ref_stride, temp_start_horiz, MAX_SB_SIZE,
+                        kernel_x, 16, NULL, -1, width, intermediate_height);
+    aom_convolve8_vert(temp_start_vert, MAX_SB_SIZE, comp_pred, width, NULL, -1,
+                       kernel_y, 16, width, height);
+  }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE void highbd_compute_dist_wtd_comp_avg(__m128i *p0, __m128i *p1,
+                                                    const __m128i *w0,
+                                                    const __m128i *w1,
+                                                    const __m128i *r,
+                                                    void *const result) {
+  assert(DIST_PRECISION_BITS <= 4);
+  __m128i mult0 = _mm_mullo_epi16(*p0, *w0);
+  __m128i mult1 = _mm_mullo_epi16(*p1, *w1);
+  __m128i sum = _mm_adds_epu16(mult0, mult1);
+  __m128i round = _mm_adds_epu16(sum, *r);
+  __m128i shift = _mm_srli_epi16(round, DIST_PRECISION_BITS);
+
+  xx_storeu_128(result, shift);
+}
+
+void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd,
+                                    const struct AV1Common *const cm,
+                                    int mi_row, int mi_col, const MV *const mv,
+                                    uint8_t *comp_pred8, int width, int height,
+                                    int subpel_x_q3, int subpel_y_q3,
+                                    const uint8_t *ref8, int ref_stride, int bd,
+                                    int subpel_search) {
+  // expect xd == NULL only in tests
+  if (xd != NULL) {
+    const MB_MODE_INFO *mi = xd->mi[0];
+    const int ref_num = 0;
+    const int is_intrabc = is_intrabc_block(mi);
+    const struct scale_factors *const sf =
+        is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
+    const int is_scaled = av1_is_scaled(sf);
+
+    if (is_scaled) {
+      int plane = 0;
+      const int mi_x = mi_col * MI_SIZE;
+      const int mi_y = mi_row * MI_SIZE;
+      const struct macroblockd_plane *const pd = &xd->plane[plane];
+      const struct buf_2d *const dst_buf = &pd->dst;
+      const struct buf_2d *const pre_buf =
+          is_intrabc ? dst_buf : &pd->pre[ref_num];
+
+      InterPredParams inter_pred_params;
+      inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd);
+      const int_interpfilters filters =
+          av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
+      av1_init_inter_params(
+          &inter_pred_params, width, height, mi_y >> pd->subsampling_y,
+          mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y,
+          xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters);
+      av1_enc_build_one_inter_predictor(comp_pred8, width, mv,
+                                        &inter_pred_params);
+      return;
+    }
+  }
+
+  const InterpFilterParams *filter = av1_get_filter(subpel_search);
+  int filter_taps = (subpel_search <= USE_4_TAPS) ? 4 : SUBPEL_TAPS;
+  if (!subpel_x_q3 && !subpel_y_q3) {
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+    uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+    if (width >= 8) {
+      int i;
+      assert(!(width & 7));
+      /*Read 8 pixels one row at a time.*/
+      for (i = 0; i < height; i++) {
+        int j;
+        for (j = 0; j < width; j += 8) {
+          __m128i s0 = _mm_loadu_si128((const __m128i *)ref);
+          _mm_storeu_si128((__m128i *)comp_pred, s0);
+          comp_pred += 8;
+          ref += 8;
+        }
+        ref += ref_stride - width;
+      }
+    } else {
+      int i;
+      assert(!(width & 3));
+      /*Read 4 pixels two rows at a time.*/
+      for (i = 0; i < height; i += 2) {
+        __m128i s0 = _mm_loadl_epi64((const __m128i *)ref);
+        __m128i s1 = _mm_loadl_epi64((const __m128i *)(ref + ref_stride));
+        __m128i t0 = _mm_unpacklo_epi64(s0, s1);
+        _mm_storeu_si128((__m128i *)comp_pred, t0);
+        comp_pred += 8;
+        ref += 2 * ref_stride;
+      }
+    }
+  } else if (!subpel_y_q3) {
+    const int16_t *const kernel =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+    aom_highbd_convolve8_horiz(ref8, ref_stride, comp_pred8, width, kernel, 16,
+                               NULL, -1, width, height, bd);
+  } else if (!subpel_x_q3) {
+    const int16_t *const kernel =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+    aom_highbd_convolve8_vert(ref8, ref_stride, comp_pred8, width, NULL, -1,
+                              kernel, 16, width, height, bd);
+  } else {
+    DECLARE_ALIGNED(16, uint16_t,
+                    temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]);
+    const int16_t *const kernel_x =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
+    const int16_t *const kernel_y =
+        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
+    const uint8_t *ref_start = ref8 - ref_stride * ((filter_taps >> 1) - 1);
+    uint16_t *temp_start_horiz = (subpel_search <= USE_4_TAPS)
+                                     ? temp + (filter_taps >> 1) * MAX_SB_SIZE
+                                     : temp;
+    uint16_t *temp_start_vert = temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1);
+    const int intermediate_height =
+        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter_taps;
+    assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
+    aom_highbd_convolve8_horiz(
+        ref_start, ref_stride, CONVERT_TO_BYTEPTR(temp_start_horiz),
+        MAX_SB_SIZE, kernel_x, 16, NULL, -1, width, intermediate_height, bd);
+    aom_highbd_convolve8_vert(CONVERT_TO_BYTEPTR(temp_start_vert), MAX_SB_SIZE,
+                              comp_pred8, width, NULL, -1, kernel_y, 16, width,
+                              height, bd);
+  }
+}
+
+void aom_highbd_comp_avg_upsampled_pred_sse2(
+    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+    int ref_stride, int bd, int subpel_search) {
+  aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
+                            height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
+                            bd, subpel_search);
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  uint16_t *comp_pred16 = CONVERT_TO_SHORTPTR(comp_pred8);
+  /*The total number of pixels must be a multiple of 8 (e.g., 4x4).*/
+  assert(!(width * height & 7));
+  int n = width * height >> 3;
+  for (int i = 0; i < n; i++) {
+    __m128i s0 = _mm_loadu_si128((const __m128i *)comp_pred16);
+    __m128i p0 = _mm_loadu_si128((const __m128i *)pred);
+    _mm_storeu_si128((__m128i *)comp_pred16, _mm_avg_epu16(s0, p0));
+    comp_pred16 += 8;
+    pred += 8;
+  }
+}
+
+void aom_highbd_dist_wtd_comp_avg_upsampled_pred_sse2(
+    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
+    int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param,
+    int subpel_search) {
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  int n;
+  int i;
+  aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
+                            height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
+                            bd, subpel_search);
+  assert(!(width * height & 7));
+  n = width * height >> 3;
+
+  const uint16_t wt0 = (uint16_t)jcp_param->fwd_offset;
+  const uint16_t wt1 = (uint16_t)jcp_param->bck_offset;
+  const __m128i w0 = _mm_set_epi16(wt0, wt0, wt0, wt0, wt0, wt0, wt0, wt0);
+  const __m128i w1 = _mm_set_epi16(wt1, wt1, wt1, wt1, wt1, wt1, wt1, wt1);
+  const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1);
+  const __m128i r =
+      _mm_set_epi16(round, round, round, round, round, round, round, round);
+
+  uint16_t *comp_pred16 = CONVERT_TO_SHORTPTR(comp_pred8);
+  for (i = 0; i < n; i++) {
+    __m128i p0 = xx_loadu_128(comp_pred16);
+    __m128i p1 = xx_loadu_128(pred);
+
+    highbd_compute_dist_wtd_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred16);
+
+    comp_pred16 += 8;
+    pred += 8;
+  }
+}
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+void aom_comp_avg_upsampled_pred_sse2(
+    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+    int ref_stride, int subpel_search) {
+  int n;
+  int i;
+  aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+                     subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search);
+  /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/
+  assert(!(width * height & 15));
+  n = width * height >> 4;
+  for (i = 0; i < n; i++) {
+    __m128i s0 = xx_loadu_128(comp_pred);
+    __m128i p0 = xx_loadu_128(pred);
+    xx_storeu_128(comp_pred, _mm_avg_epu8(s0, p0));
+    comp_pred += 16;
+    pred += 16;
+  }
+}
+
+void aom_comp_mask_upsampled_pred_sse2(
+    MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+    int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
+    int subpel_search) {
+  if (subpel_x_q3 | subpel_y_q3) {
+    aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+                       subpel_x_q3, subpel_y_q3, ref, ref_stride,
+                       subpel_search);
+    ref = comp_pred;
+    ref_stride = width;
+  }
+  aom_comp_mask_pred(comp_pred, pred, width, height, ref, ref_stride, mask,
+                     mask_stride, invert_mask);
+}
diff --git a/av1/encoder/x86/reconinter_enc_ssse3.c b/av1/encoder/x86/reconinter_enc_ssse3.c
new file mode 100644
index 0000000..7ac0f0d
--- /dev/null
+++ b/av1/encoder/x86/reconinter_enc_ssse3.c
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>  // SSE2
+#include <tmmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/x86/synonyms.h"
+
+static INLINE void compute_dist_wtd_avg(__m128i *p0, __m128i *p1,
+                                        const __m128i *w, const __m128i *r,
+                                        void *const result) {
+  __m128i p_lo = _mm_unpacklo_epi8(*p0, *p1);
+  __m128i mult_lo = _mm_maddubs_epi16(p_lo, *w);
+  __m128i round_lo = _mm_add_epi16(mult_lo, *r);
+  __m128i shift_lo = _mm_srai_epi16(round_lo, DIST_PRECISION_BITS);
+
+  __m128i p_hi = _mm_unpackhi_epi8(*p0, *p1);
+  __m128i mult_hi = _mm_maddubs_epi16(p_hi, *w);
+  __m128i round_hi = _mm_add_epi16(mult_hi, *r);
+  __m128i shift_hi = _mm_srai_epi16(round_hi, DIST_PRECISION_BITS);
+
+  xx_storeu_128(result, _mm_packus_epi16(shift_lo, shift_hi));
+}
+
+void aom_dist_wtd_comp_avg_upsampled_pred_ssse3(
+    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+    const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
+    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
+    int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search) {
+  int n;
+  int i;
+  aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
+                     subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search);
+  /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/
+  assert(!(width * height & 15));
+  n = width * height >> 4;
+
+  const uint8_t w0 = (uint8_t)jcp_param->fwd_offset;
+  const uint8_t w1 = (uint8_t)jcp_param->bck_offset;
+  const __m128i w = _mm_set_epi8(w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, w1, w0,
+                                 w1, w0, w1, w0);
+  const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1);
+  const __m128i r =
+      _mm_set_epi16(round, round, round, round, round, round, round, round);
+
+  for (i = 0; i < n; i++) {
+    __m128i p0 = xx_loadu_128(comp_pred);
+    __m128i p1 = xx_loadu_128(pred);
+
+    compute_dist_wtd_avg(&p0, &p1, &w, &r, comp_pred);
+
+    comp_pred += 16;
+    pred += 16;
+  }
+}
diff --git a/av1/encoder/x86/temporal_filter_avx2.c b/av1/encoder/x86/temporal_filter_avx2.c
index 72914e1..8aa0764 100644
--- a/av1/encoder/x86/temporal_filter_avx2.c
+++ b/av1/encoder/x86/temporal_filter_avx2.c
@@ -238,10 +238,16 @@
                                    TF_SEARCH_ERROR_NORM_WEIGHT);
   const double weight_factor =
       (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor;
-  // Decay factors for non-local mean approach.
-  // Smaller q -> smaller filtering weight.
+  // Adjust filtering based on q.
+  // Larger q -> stronger filtering -> larger weight.
+  // Smaller q -> weaker filtering -> smaller weight.
   double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2);
   q_decay = CLIP(q_decay, 1e-5, 1);
+  if (q_factor >= TF_QINDEX_CUTOFF) {
+    // Max q_factor is 255, therefore the upper bound of q_decay is 8.
+    // We do not need a clip here.
+    q_decay = 0.5 * pow((double)q_factor / 64, 2);
+  }
   // Smaller strength -> smaller filtering weight.
   double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
   s_decay = CLIP(s_decay, 1e-5, 1);
@@ -277,6 +283,7 @@
     const double inv_num_ref_pixels = 1.0 / num_ref_pixels;
     // Larger noise -> larger filtering weight.
     const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0);
+    // Decay factors for non-local mean approach.
     const double decay_factor = 1 / (n_decay * q_decay * s_decay);
 
     // Filter U-plane and V-plane using Y-plane. This is because motion
diff --git a/av1/encoder/x86/temporal_filter_sse2.c b/av1/encoder/x86/temporal_filter_sse2.c
index d70792c..26c3926 100644
--- a/av1/encoder/x86/temporal_filter_sse2.c
+++ b/av1/encoder/x86/temporal_filter_sse2.c
@@ -215,10 +215,16 @@
                                    TF_SEARCH_ERROR_NORM_WEIGHT);
   const double weight_factor =
       (double)TF_WINDOW_BLOCK_BALANCE_WEIGHT * inv_factor;
-  // Decay factors for non-local mean approach.
-  // Smaller q -> smaller filtering weight.
+  // Adjust filtering based on q.
+  // Larger q -> stronger filtering -> larger weight.
+  // Smaller q -> weaker filtering -> smaller weight.
   double q_decay = pow((double)q_factor / TF_Q_DECAY_THRESHOLD, 2);
   q_decay = CLIP(q_decay, 1e-5, 1);
+  if (q_factor >= TF_QINDEX_CUTOFF) {
+    // Max q_factor is 255, therefore the upper bound of q_decay is 8.
+    // We do not need a clip here.
+    q_decay = 0.5 * pow((double)q_factor / 64, 2);
+  }
   // Smaller strength -> smaller filtering weight.
   double s_decay = pow((double)filter_strength / TF_STRENGTH_THRESHOLD, 2);
   s_decay = CLIP(s_decay, 1e-5, 1);
@@ -254,6 +260,7 @@
     const double inv_num_ref_pixels = 1.0 / num_ref_pixels;
     // Larger noise -> larger filtering weight.
     const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0);
+    // Decay factors for non-local mean approach.
     const double decay_factor = 1 / (n_decay * q_decay * s_decay);
 
     // Filter U-plane and V-plane using Y-plane. This is because motion
diff --git a/av1/exports_dec b/av1/exports_dec
index daabf67..e9e0865 100644
--- a/av1/exports_dec
+++ b/av1/exports_dec
@@ -1,3 +1,3 @@
 data aom_codec_av1_dx_algo
 text aom_codec_av1_dx
-text av1_add_film_grain
+text aom_add_film_grain
diff --git a/build/cmake/aom_config_defaults.cmake b/build/cmake/aom_config_defaults.cmake
index 89516e5..fde8a45 100644
--- a/build/cmake/aom_config_defaults.cmake
+++ b/build/cmake/aom_config_defaults.cmake
@@ -71,6 +71,11 @@
 set_aom_config_var(CONFIG_AV1_DECODER 1 "Enable AV1 decoder.")
 set_aom_config_var(CONFIG_AV1_ENCODER 1 "Enable AV1 encoder.")
 set_aom_config_var(CONFIG_BIG_ENDIAN 0 "Internal flag.")
+set_aom_config_var(CONFIG_FRAME_PARALLEL_ENCODE 0
+                   "Enable frame parallelism during encode.")
+set_aom_config_var(
+  CONFIG_FRAME_PARALLEL_ENCODE_2 0
+  "Enable frame parallelism during encode for frames in lower layer depths.")
 set_aom_config_var(CONFIG_GCC 0 "Building with GCC (detect).")
 set_aom_config_var(CONFIG_GCOV 0 "Enable gcov support.")
 set_aom_config_var(CONFIG_GPROF 0 "Enable gprof support.")
@@ -120,30 +125,39 @@
 set_aom_config_var(STATIC_LINK_JXL 0 "Statically link the JPEG-XL library.")
 
 # AV1 experiment flags.
-set_aom_config_var(CONFIG_SPEED_STATS 0 "AV1 experiment flag.")
-set_aom_config_var(CONFIG_COLLECT_RD_STATS 0 "AV1 experiment flag.")
-set_aom_config_var(CONFIG_DIST_8X8 0 "AV1 experiment flag.")
-set_aom_config_var(CONFIG_ENTROPY_STATS 0 "AV1 experiment flag.")
-set_aom_config_var(CONFIG_INTER_STATS_ONLY 0 "AV1 experiment flag.")
+set_aom_config_var(CONFIG_SPEED_STATS 0 "AV1 experiment.")
+set_aom_config_var(CONFIG_COLLECT_RD_STATS 0 "AV1 experiment.")
+set_aom_config_var(CONFIG_DIST_8X8 0 "AV1 experiment.")
+set_aom_config_var(CONFIG_ENTROPY_STATS 0 "AV1 experiment.")
+set_aom_config_var(CONFIG_INTER_STATS_ONLY 0 "AV1 experiment.")
 set_aom_config_var(CONFIG_BITSTREAM_DEBUG 0
-                   "AV1 experiment flag for bitstream debugging.")
-set_aom_config_var(CONFIG_RD_DEBUG 0 "AV1 experiment flag.")
-set_aom_config_var(CONFIG_SHARP_SETTINGS 0 "AV1 experiment flag.")
-set_aom_config_var(CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8 1
-                   "Disable full_pixel_motion_search_based_split on BLOCK_8X8.")
-set_aom_config_var(CONFIG_COLLECT_PARTITION_STATS 0
-                   "Collect partition timing stats. Can be 1 or 2.")
-set_aom_config_var(CONFIG_COLLECT_COMPONENT_TIMING 0
-                   "Collect encoding component timing information.")
-set_aom_config_var(CONFIG_LPF_MASK 0
-                   "Enable the use loop filter bitmasks for optimizations.")
-set_aom_config_var(CONFIG_NN_V2 0 "Fully-connected neural nets ver.2.")
+                   "AV1 experiment: Bitstream debugging.")
+set_aom_config_var(CONFIG_RD_DEBUG 0 "AV1 experiment.")
+set_aom_config_var(CONFIG_RD_COMMAND 0
+                   "AV1 experiment: Use external rdmult and q_index.")
+set_aom_config_var(CONFIG_SHARP_SETTINGS 0 "AV1 experiment.")
+set_aom_config_var(
+  CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8 1
+  "AV1 experiment: Disable full_pixel_motion_search_based_split on BLOCK_8X8.")
+set_aom_config_var(
+  CONFIG_COLLECT_PARTITION_STATS 0
+  "AV1 experiment: Collect partition timing stats. Can be 1 or 2.")
+set_aom_config_var(
+  CONFIG_COLLECT_COMPONENT_TIMING 0
+  "AV1 experiment: Collect encoding component timing information.")
+set_aom_config_var(CONFIG_NN_V2 0
+                   "AV1 experiment: Fully-connected neural nets ver.2.")
 set_aom_config_var(CONFIG_OPTICAL_FLOW_API 0
-                   "AV1 experiment flag for optical flow API.")
-set_aom_config_var(CONFIG_RT_ML_PARTITIONING 0
-                   "Build with ML-based partitioning for Real Time.")
+                   "AV1 experiment: for optical flow API.")
+set_aom_config_var(
+  CONFIG_RT_ML_PARTITIONING 0
+  "AV1 experiment: Build with ML-based partitioning for Real Time.")
 set_aom_config_var(CONFIG_PARTITION_SEARCH_ORDER 0
-                   "AV1 experiment with alternative partition search order.")
+                   "AV1 experiment: Use alternative partition search order.")
+set_aom_config_var(CONFIG_BITRATE_ACCURACY 0
+                   "AV1 experiment: Improve bitrate accuracy.")
+set_aom_config_var(CONFIG_THREE_PASS 0
+                   "AV1 experiment: Enable three-pass encoding.")
 
 #
 # Variables in this section control optional features of the build system.
diff --git a/build/cmake/aom_install.cmake b/build/cmake/aom_install.cmake
index 0bd2bf0..7b5cbbc 100644
--- a/build/cmake/aom_install.cmake
+++ b/build/cmake/aom_install.cmake
@@ -19,7 +19,8 @@
 
 if(CONFIG_AV1_ENCODER)
   list(APPEND AOM_INSTALL_INCS "${AOM_ROOT}/aom/aomcx.h"
-              "${AOM_ROOT}/aom/aom_encoder.h")
+              "${AOM_ROOT}/aom/aom_encoder.h"
+              "${AOM_ROOT}/aom/aom_external_partition.h")
 endif()
 
 # Generate aom.pc and setup dependencies to ensure it is created when necessary.
diff --git a/build/cmake/aom_optimization.cmake b/build/cmake/aom_optimization.cmake
index e4b29de..9c2afe9 100644
--- a/build/cmake/aom_optimization.cmake
+++ b/build/cmake/aom_optimization.cmake
@@ -44,6 +44,7 @@
   endif()
   set(target_name ${target_to_update}_${opt_name}_intrinsics)
   add_library(${target_name} OBJECT ${${sources}})
+  set_property(TARGET ${target_name} PROPERTY FOLDER ${AOM_TARGET_CPU})
 
   if(MSVC)
     get_msvc_intrinsic_flag(${flag} "flag")
@@ -142,6 +143,7 @@
   # targets, make this OBJECT instead of STATIC to hide the target from
   # consumers of the AOM cmake build.
   add_library(${lib_name} STATIC ${${asm_sources}})
+  set_property(TARGET ${lib_name} PROPERTY FOLDER ${AOM_TARGET_CPU})
 
   foreach(asm_source ${${asm_sources}})
     get_filename_component(asm_source_name "${asm_source}" NAME)
@@ -238,5 +240,5 @@
     WORKING_DIRECTORY ${AOM_CONFIG_DIR}
     VERBATIM)
   set_property(SOURCE ${source} PROPERTY OBJECT_DEPENDS ${output})
-  set_property(SOURCE ${output} PROPERTY GENERATED)
+  set_property(SOURCE ${output} PROPERTY GENERATED TRUE)
 endfunction()
diff --git a/common/ivf_dec.cmake b/common/ivf_dec.cmake
new file mode 100644
index 0000000..fedeea7
--- /dev/null
+++ b/common/ivf_dec.cmake
@@ -0,0 +1,28 @@
+#
+# Copyright (c) 2021, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_COMMON_IVF_DEC_CMAKE_)
+  return()
+endif() # AOM_COMMON_IVF_DEC_CMAKE_
+set(AOM_COMMON_IVF_DEC_CMAKE_ 1)
+
+list(APPEND IVF_DEC_SOURCES "${AOM_ROOT}/common/ivfdec.c"
+            "${AOM_ROOT}/common/ivfdec.h")
+
+# Creates the ivf_dec build target and makes libaom depend on it. The libaom
+# target must exist before this function is called.
+function(setup_ivf_dec_targets)
+  add_library(ivf_dec OBJECT ${IVF_DEC_SOURCES})
+  set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} ivf_dec PARENT_SCOPE)
+  target_sources(aom PRIVATE $<TARGET_OBJECTS:ivf_dec>)
+  if(BUILD_SHARED_LIBS)
+    target_sources(aom_static PRIVATE $<TARGET_OBJECTS:ivf_dec>)
+  endif()
+endfunction()
diff --git a/common/ivfdec.c b/common/ivfdec.c
index 80d73b0..18f053e 100644
--- a/common/ivfdec.c
+++ b/common/ivfdec.c
@@ -39,7 +39,7 @@
       if (mem_get_le16(raw_hdr + 4) != 0) {
         fprintf(stderr,
                 "Error: Unrecognized IVF version! This file may not"
-                " decode properly.");
+                " decode properly.\n");
       }
 
       input_ctx->fourcc = mem_get_le32(raw_hdr + 8);
@@ -67,12 +67,13 @@
   size_t frame_size = 0;
 
   if (fread(raw_header, IVF_FRAME_HDR_SZ, 1, infile) != 1) {
-    if (!feof(infile)) warn("Failed to read frame size");
+    if (!feof(infile)) fprintf(stderr, "Warning: Failed to read frame size\n");
   } else {
     frame_size = mem_get_le32(raw_header);
 
     if (frame_size > 256 * 1024 * 1024) {
-      warn("Read invalid frame size (%u)", (unsigned int)frame_size);
+      fprintf(stderr, "Warning: Read invalid frame size (%u)\n",
+              (unsigned int)frame_size);
       frame_size = 0;
     }
 
@@ -83,7 +84,7 @@
         *buffer = new_buffer;
         *buffer_size = 2 * frame_size;
       } else {
-        warn("Failed to allocate compressed data buffer");
+        fprintf(stderr, "Warning: Failed to allocate compressed data buffer\n");
         frame_size = 0;
       }
     }
@@ -97,7 +98,7 @@
   if (!feof(infile)) {
     ASAN_UNPOISON_MEMORY_REGION(*buffer, *buffer_size);
     if (fread(*buffer, 1, frame_size, infile) != frame_size) {
-      warn("Failed to read full frame");
+      fprintf(stderr, "Warning: Failed to read full frame\n");
       return 1;
     }
 
diff --git a/common/tools_common.c b/common/tools_common.c
index 2b199a5..280d49a 100644
--- a/common/tools_common.c
+++ b/common/tools_common.c
@@ -66,7 +66,7 @@
   exit(EXIT_FAILURE);
 }
 
-void warn(const char *fmt, ...) { LOG_ERROR("Warning"); }
+void aom_tools_warn(const char *fmt, ...) { LOG_ERROR("Warning"); }
 
 void die_codec(aom_codec_ctx_t *ctx, const char *s) {
   const char *detail = aom_codec_error_detail(ctx);
diff --git a/common/tools_common.h b/common/tools_common.h
index 025bca2..f5b5b19 100644
--- a/common/tools_common.h
+++ b/common/tools_common.h
@@ -125,6 +125,8 @@
 
 #if defined(__GNUC__)
 #define AOM_NO_RETURN __attribute__((noreturn))
+#elif defined(_MSC_VER)
+#define AOM_NO_RETURN __declspec(noreturn)
 #else
 #define AOM_NO_RETURN
 #endif
@@ -132,14 +134,14 @@
 /* Sets a stdio stream into binary mode */
 FILE *set_binary_mode(FILE *stream);
 
-void die(const char *fmt, ...) AOM_NO_RETURN;
-void fatal(const char *fmt, ...) AOM_NO_RETURN;
-void warn(const char *fmt, ...);
+AOM_NO_RETURN void die(const char *fmt, ...);
+AOM_NO_RETURN void fatal(const char *fmt, ...);
+void aom_tools_warn(const char *fmt, ...);
 
-void die_codec(aom_codec_ctx_t *ctx, const char *s) AOM_NO_RETURN;
+AOM_NO_RETURN void die_codec(aom_codec_ctx_t *ctx, const char *s);
 
 /* The tool including this file must define usage_exit() */
-void usage_exit(void) AOM_NO_RETURN;
+AOM_NO_RETURN void usage_exit(void);
 
 #undef AOM_NO_RETURN
 
diff --git a/common/warnings.c b/common/warnings.c
index 2facee2..308cecd 100644
--- a/common/warnings.c
+++ b/common/warnings.c
@@ -86,7 +86,7 @@
   /* Count and print warnings. */
   for (warning = warning_list.warning_node; warning != NULL;
        warning = warning->next_warning, ++num_warnings) {
-    warn(warning->warning_string);
+    aom_tools_warn(warning->warning_string);
   }
 
   free_warning_list(&warning_list);
diff --git a/common/y4minput.c b/common/y4minput.c
index abd4d0b..8e20b49 100644
--- a/common/y4minput.c
+++ b/common/y4minput.c
@@ -896,8 +896,9 @@
     return -1;
   }
   if (csp == AOM_CSP_COLOCATED) {
-    fprintf(stderr, "Colocated chroma sample position not supported in Y4M\n");
-    return -1;
+    // TODO(any): check the right way to handle this in y4m
+    fprintf(stderr,
+            "Ignoring colocated chroma sample position for reading in Y4M\n");
   }
   y4m_ctx->aom_fmt = AOM_IMG_FMT_I420;
   y4m_ctx->bps = 12;
diff --git a/doc/dev_guide/av1_encoder.dox b/doc/dev_guide/av1_encoder.dox
index 9079d5c..2f0c9e3 100644
--- a/doc/dev_guide/av1_encoder.dox
+++ b/doc/dev_guide/av1_encoder.dox
@@ -103,15 +103,18 @@
 The following are the main high level data structures used by the libaom AV1
 encoder and referenced elsewhere in this overview document:
 
+- \ref AV1_PRIMARY
+    - \ref AV1_PRIMARY.gf_group (\ref GF_GROUP)
+    - \ref AV1_PRIMARY.lap_enabled
+    - \ref AV1_PRIMARY.twopass (\ref TWO_PASS)
+    - \ref AV1_PRIMARY.p_rc (\ref PRIMARY_RATE_CONTROL)
+    - \ref AV1_PRIMARY.alt_ref_buffer (\ref yv12_buffer_config)
+
 - \ref AV1_COMP
     - \ref AV1_COMP.oxcf (\ref AV1EncoderConfig)
-    - \ref AV1_COMP.alt_ref_buffer (\ref yv12_buffer_config)
     - \ref AV1_COMP.rc (\ref RATE_CONTROL)
-    - \ref AV1_COMP.twopass (\ref TWO_PASS)
-    - \ref AV1_COMP.gf_group (\ref GF_GROUP)
     - \ref AV1_COMP.speed
     - \ref AV1_COMP.sf (\ref SPEED_FEATURES)
-    - \ref AV1_COMP.lap_enabled
 
 - \ref AV1EncoderConfig (Encoder configuration parameters)
     - \ref AV1EncoderConfig.pass
@@ -141,10 +144,12 @@
     - \ref RateControlCfg.vbrmin_section
     - \ref RateControlCfg.vbrmax_section
 
+- \ref PRIMARY_RATE_CONTROL (Primary Rate control status)
+    - \ref PRIMARY_RATE_CONTROL.gf_intervals[]
+    - \ref PRIMARY_RATE_CONTROL.cur_gf_index
+
 - \ref RATE_CONTROL (Rate control status)
     - \ref RATE_CONTROL.intervals_till_gf_calculate_due
-    - \ref RATE_CONTROL.gf_intervals[]
-    - \ref RATE_CONTROL.cur_gf_index
     - \ref RATE_CONTROL.frames_till_gf_update_due
     - \ref RATE_CONTROL.frames_to_key
 
@@ -322,9 +327,11 @@
 The following are the main data structures referenced in this section
 (see also \ref architecture_enc_data_structures):
 
+- \ref AV1_PRIMARY ppi (the primary compressor instance data structure)
+    - \ref AV1_PRIMARY.alt_ref_buffer (\ref yv12_buffer_config)
+
 - \ref AV1_COMP cpi (the main compressor instance data structure)
     - \ref AV1_COMP.oxcf (\ref AV1EncoderConfig)
-    - \ref AV1_COMP.alt_ref_buffer (\ref yv12_buffer_config)
 
 - \ref AV1EncoderConfig (Encoder configuration parameters)
     - \ref AV1EncoderConfig.algo_cfg (\ref AlgoCfg)
@@ -446,7 +453,7 @@
 The main entry point for temporal filtering is \ref av1_temporal_filter().
 This function returns 1 if temporal filtering is successful, otherwise 0.
 When temporal filtering is applied, the filtered frame will be held in
-the frame buffer \ref AV1_COMP.alt_ref_buffer, which is the frame to be
+the frame buffer \ref AV1_PRIMARY.alt_ref_buffer, which is the frame to be
 encoded in the following encoding process.
 
 Almost all temporal filter related code is in av1/encoder/temporal_filter.c
@@ -482,10 +489,12 @@
 The following are the main data structures referenced in this section
 (see also \ref architecture_enc_data_structures):
 
+ - \ref AV1_PRIMARY ppi (the primary compressor instance data structure)
+    - \ref AV1_PRIMARY.twopass (\ref TWO_PASS)
+
  - \ref AV1_COMP cpi (the main compressor instance data structure)
     - \ref AV1_COMP.oxcf (\ref AV1EncoderConfig)
     - \ref AV1_COMP.rc (\ref RATE_CONTROL)
-    - \ref AV1_COMP.twopass (\ref TWO_PASS)
     - \ref AV1_COMP.sf (\ref SPEED_FEATURES)
 
  - \ref AV1EncoderConfig (Encoder configuration parameters)
@@ -647,7 +656,7 @@
 As well as \ref RATE_CONTROL.active_worst_quality, the two pass code also
 maintains a record of the actual Q value used to encode previous frames
 at each level in the current pyramid hierarchy
-(\ref RATE_CONTROL.active_best_quality). The function
+(\ref PRIMARY_RATE_CONTROL.active_best_quality). The function
 \ref rc_pick_q_and_bounds(), uses these values to set a permitted Q range
 for each frame.
 
@@ -662,7 +671,7 @@
 few frames.  When using this method, full sequence level statistics are not
 available, but it is possible to collect and use frame or group of frame level
 data to help in the allocation of bits and in defining ARF/GF coding
-hierarchies.  The reader is referred to the \ref AV1_COMP.lap_enabled field
+hierarchies.  The reader is referred to the \ref AV1_PRIMARY.lap_enabled field
 in the main compressor instance (where <b>lap</b> stands for
 <b>look ahead processing</b>). This encoding mode for the most part uses the
 same rate control pathways as two pass VBR encoding.
@@ -731,7 +740,7 @@
 -# Fixed Q with adaptive qp offsets: same qp offset for each pyramid level
    in a given video, but these offsets are adaptive based on video content.
 -# Fixed Q with fixed qp offsets: content-independent fixed qp offsets for
-   each pyramid level. (see \ref get_q_using_fixed_offsets()).
+   each pyramid level.
 
 The reader is also refered to the following functions:
 - \ref av1_rc_pick_q_and_bounds()
@@ -790,7 +799,7 @@
   current and the following GF groups (a total number of MAX_NUM_GF_INTERVALS
   groups) to be the maximum value allowed.</li>
 
-  <li><b>Single pass with look-ahead enabled (\ref AV1_COMP.lap_enabled):</b>
+  <li><b>Single pass with look-ahead enabled (\ref AV1_PRIMARY.lap_enabled):</b>
   look-ahead processing is enabled for single pass, therefore there is a
   limited amount of information available regarding future frames. In this
   case the function will determine the length based on \ref FIRSTPASS_STATS
@@ -829,9 +838,9 @@
 As mentioned, for two-pass encoding, the function \ref
 calculate_gf_length() tries to determine the length of as many as
 MAX_NUM_GF_INTERVALS groups. The decisions are stored in
-\ref RATE_CONTROL.gf_intervals[]. The variables
+\ref PRIMARY_RATE_CONTROL.gf_intervals[]. The variables
 \ref RATE_CONTROL.intervals_till_gf_calculate_due and
-\ref RATE_CONTROL.cur_gf_index help with managing and updating the stored
+\ref PRIMARY_RATE_CONTROL.cur_gf_index help with managing and updating the stored
 decisions. In the function \ref define_gf_group(), the corresponding
 stored length decision will be used to define the current GF group.
 
@@ -855,7 +864,7 @@
 \ref RATE_CONTROL.intervals_till_gf_calculate_due is zero. If it is, as
 discussed above, \ref calculate_gf_length() is called with original
 maximum length. If it is not zero, then the GF group length value stored
-in \ref RATE_CONTROL.gf_intervals[\ref RATE_CONTROL.cur_gf_index] is used
+in \ref PRIMARY_RATE_CONTROL.gf_intervals[\ref PRIMARY_RATE_CONTROL.cur_gf_index] is used
 (subject to change as discussed above).
 
 \subsection architecture_enc_gf_structure Defining a GF Group's Structure
@@ -868,7 +877,7 @@
 accumulate various stats, using accumulate_this_frame_stats() and
 accumulate_next_frame_stats(). The accumulated statistics are then used to
 determine the use of the use of ALTREF frame along with other properties of the
-GF group. The values of \ref RATE_CONTROL.cur_gf_index, \ref
+GF group. The values of \ref PRIMARY_RATE_CONTROL.cur_gf_index, \ref
 RATE_CONTROL.intervals_till_gf_calculate_due and \ref
 RATE_CONTROL.frames_till_gf_update_due are also updated accordingly.
 
@@ -899,7 +908,7 @@
 zero.
 
 For single pass encodes where look-ahead processing is disabled
-(\ref AV1_COMP.lap_enabled = 0), \ref define_gf_group_pass0() is used
+(\ref AV1_PRIMARY.lap_enabled = 0), \ref define_gf_group_pass0() is used
 instead of \ref define_gf_group().
 
 \subsection architecture_enc_kf_groups Key Frame Groups
diff --git a/docs.cmake b/docs.cmake
index 7973524..b2d3083 100644
--- a/docs.cmake
+++ b/docs.cmake
@@ -25,6 +25,7 @@
     "${AOM_ROOT}/aom/aom_codec.h"
     "${AOM_ROOT}/aom/aom_decoder.h"
     "${AOM_ROOT}/aom/aom_encoder.h"
+    "${AOM_ROOT}/aom/aom_external_partition.h"
     "${AOM_ROOT}/aom/aom_frame_buffer.h"
     "${AOM_ROOT}/aom/aom_image.h"
     "${AOM_ROOT}/aom/aom_integer.h"
diff --git a/examples/aom_cx_set_ref.c b/examples/aom_cx_set_ref.c
index 3aea2cf..da36d9f 100644
--- a/examples/aom_cx_set_ref.c
+++ b/examples/aom_cx_set_ref.c
@@ -271,7 +271,11 @@
 
   printf("Using %s\n", aom_codec_iface_name(encoder));
 
+#if CONFIG_REALTIME_ONLY
+  res = aom_codec_enc_config_default(encoder, &cfg, 1);
+#else
   res = aom_codec_enc_config_default(encoder, &cfg, 0);
+#endif
   if (res) die_codec(&ecodec, "Failed to get default codec config.");
 
   cfg.g_w = info.frame_width;
@@ -334,6 +338,12 @@
         die_codec(&ecodec, "Failed to set encoder reference frame");
       printf(" <SET_REF>");
 
+#if CONFIG_REALTIME_ONLY
+      // Set cpu speed in encoder.
+      if (aom_codec_control(&ecodec, AOME_SET_CPUUSED, 7))
+        die_codec(&ecodec, "Failed to set cpu speed");
+#endif
+
       // If set_reference in decoder is commented out, the enc/dec mismatch
       // would be seen.
       if (test_decode) {
diff --git a/examples/av1_dec_fuzzer.dict b/examples/av1_dec_fuzzer.dict
new file mode 100644
index 0000000..fb16388
--- /dev/null
+++ b/examples/av1_dec_fuzzer.dict
@@ -0,0 +1,5 @@
+# IVF Signature + version (bytes 0-5)
+kw1="DKIF\x00\x00"
+
+# AV1 codec fourCC (bytes 8-11)
+kw2="AV01"
diff --git a/examples/build_av1_dec_fuzzer.sh b/examples/build_av1_dec_fuzzer.sh
index 2ceb652..40355ea 100755
--- a/examples/build_av1_dec_fuzzer.sh
+++ b/examples/build_av1_dec_fuzzer.sh
@@ -50,7 +50,7 @@
 EXTRA_C_FLAGS='-UNDEBUG -DDO_RANGE_CHECK_CLAMP=1 -DAOM_MAX_ALLOCABLE_MEMORY=1073741824'
 cd "${BUILD_DIR}"
 cmake "${AOM_DIR}" -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCONFIG_PIC=1 \
-  -DCONFIG_SCALABILITY=0 -DFORCE_HIGHBITDEPTH_DECODING=0 \
+  -DFORCE_HIGHBITDEPTH_DECODING=0 \
   -DCONFIG_AV1_ENCODER=0 -DENABLE_EXAMPLES=0 -DENABLE_DOCS=0 -DENABLE_TESTS=0 \
   -DCONFIG_SIZE_LIMIT=1 -DDECODE_HEIGHT_LIMIT=12288 -DDECODE_WIDTH_LIMIT=12288 \
   -DAOM_EXTRA_C_FLAGS="${EXTRA_C_FLAGS}" \
@@ -60,10 +60,10 @@
 make -j$(nproc)
 
 # Build the av1 fuzzer
-$CXX -std=c++11 -DDECODER=av1 -I${AOM_DIR} -I${BUILD_DIR} \
-    -g -fsanitize=fuzzer,address -Wl,--start-group \
+$CXX -std=c++11 -I${AOM_DIR} -I${BUILD_DIR} \
+    -g -fsanitize=fuzzer,address \
     ${AOM_DIR}/examples/av1_dec_fuzzer.cc -o ${BUILD_DIR}/av1_dec_fuzzer \
-    ${BUILD_DIR}/libaom.a -Wl,--end-group
+    ${BUILD_DIR}/libaom.a
 
 echo "Fuzzer built at ${BUILD_DIR}/av1_dec_fuzzer."
 echo "Create a corpus directory, copy IVF files in there, and run:"
diff --git a/examples/noise_model.c b/examples/noise_model.c
index d07443f..5ff4c04 100644
--- a/examples/noise_model.c
+++ b/examples/noise_model.c
@@ -183,8 +183,8 @@
   grain->bit_depth = raw->bit_depth;
   aom_img_alloc(&renoised, raw->fmt, raw->w, raw->h, 1);
 
-  if (av1_add_film_grain(grain, denoised, &renoised)) {
-    fprintf(stderr, "Internal failure in av1_add_film_grain().\n");
+  if (aom_add_film_grain(grain, denoised, &renoised)) {
+    fprintf(stderr, "Internal failure in aom_add_film_grain().\n");
     aom_img_free(&renoised);
     return;
   }
diff --git a/examples/photon_noise_table.c b/examples/photon_noise_table.c
new file mode 100644
index 0000000..d3a21a4
--- /dev/null
+++ b/examples/photon_noise_table.c
@@ -0,0 +1,398 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// This tool creates a film grain table, for use in stills and videos,
+// representing the noise that one would get by shooting with a digital camera
+// at a given light level. Much of the noise in digital images is photon shot
+// noise, which is due to the characteristics of photon arrival and grows in
+// standard deviation as the square root of the expected number of photons
+// captured.
+// https://www.photonstophotos.net/Emil%20Martinec/noise.html#shotnoise
+//
+// The proxy used by this tool for the amount of light captured is the ISO value
+// such that the focal plane exposure at the time of capture would have been
+// mapped by a 35mm camera to the output lightness observed in the image. That
+// is, if one were to shoot on a 35mm camera (36×24mm sensor) at the nominal
+// exposure for that ISO setting, the resulting image should contain noise of
+// the same order of magnitude as generated by this tool.
+//
+// Example usage:
+//
+//     ./photon_noise_table --width=3840 --height=2160 --iso=25600 -o noise.tbl
+//     # Then, for example:
+//     aomenc --film-grain-table=noise.tbl ...
+//     # Or:
+//     avifenc -c aom -a film-grain-table=noise.tbl ...
+//
+// The (mostly) square-root relationship between light intensity and noise
+// amplitude holds in linear light, but AV1 streams are most often encoded
+// non-linearly, and the film grain is applied to those non-linear values.
+// Therefore, this tool must account for the non-linearity, and this is
+// controlled by the optional `--transfer-function` (or `-t`) parameter, which
+// specifies the tone response curve that will be used when encoding the actual
+// image. The default for this tool is sRGB, which is approximately similar to
+// an encoding gamma of 1/2.2 (i.e. a decoding gamma of 2.2) though not quite
+// identical.
+//
+// As alluded to above, the tool assumes that the image is taken from the
+// entirety of a 36×24mm (“35mm format”) sensor. If that assumption does not
+// hold, then a “35mm-equivalent ISO value” that can be passed to the tool can
+// be obtained by multiplying the true ISO value by the ratio of 36×24mm to the
+// area that was actually used. For formats that approximately share the same
+// aspect ratio, this is often expressed as the square of the “equivalence
+// ratio” which is the ratio of their diagonals. For example, APS-C (often
+// ~24×16mm) is said to have an equivalence ratio of 1.5 relative to the 35mm
+// format, and therefore ISO 1000 on APS-C and ISO 1000×1.5² = 2250 on 35mm
+// produce an image of the same lightness from the same amount of light spread
+// onto their respective surface areas (resulting in different focal plane
+// exposures), and those images will thus have similar amounts of noise if the
+// cameras are of similar technology. https://doi.org/10.1117/1.OE.57.11.110801
+//
+// The tool needs to know the resolution of the images to which its grain tables
+// will be applied so that it can know how the light on the sensor was shared
+// between its pixels. As a general rule, while a higher pixel count will lead
+// to more noise per pixel, when the final image is viewed at the same physical
+// size, that noise will tend to “average out” to the same amount over a given
+// area, since there will be more pixels in it which, in aggregate, will have
+// received essentially as much light. Put differently, the amount of noise
+// depends on the scale at which it is measured, and the decision for this tool
+// was to make that scale relative to the image instead of its constituent
+// samples. For more on this, see:
+//
+// https://www.photonstophotos.net/Emil%20Martinec/noise-p3.html#pixelsize
+// https://www.dpreview.com/articles/5365920428/the-effect-of-pixel-and-sensor-sizes-on-noise/2
+// https://www.dpreview.com/videos/7940373140/dpreview-tv-why-lower-resolution-sensors-are-not-better-in-low-light
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom_dsp/grain_table.h"
+#include "common/args.h"
+#include "common/tools_common.h"
+
+static const char *exec_name;
+
+static const struct arg_enum_list transfer_functions[] = {
+  { "bt470m", AOM_CICP_TC_BT_470_M }, { "bt470bg", AOM_CICP_TC_BT_470_B_G },
+  { "srgb", AOM_CICP_TC_SRGB },       { "smpte2084", AOM_CICP_TC_SMPTE_2084 },
+  { "hlg", AOM_CICP_TC_HLG },         ARG_ENUM_LIST_END
+};
+
+static arg_def_t help_arg =
+    ARG_DEF("h", "help", 0, "Show the available options");
+static arg_def_t width_arg =
+    ARG_DEF("w", "width", 1, "Width of the image in pixels (required)");
+static arg_def_t height_arg =
+    ARG_DEF("l", "height", 1, "Height of the image in pixels (required)");
+static arg_def_t iso_arg = ARG_DEF(
+    "i", "iso", 1, "ISO setting indicative of the light level (required)");
+static arg_def_t output_arg =
+    ARG_DEF("o", "output", 1,
+            "Output file to which to write the film grain table (required)");
+static arg_def_t transfer_function_arg =
+    ARG_DEF_ENUM("t", "transfer-function", 1,
+                 "Transfer function used by the encoded image (default = sRGB)",
+                 transfer_functions);
+
+void usage_exit(void) {
+  fprintf(stderr,
+          "Usage: %s [--transfer-function=<tf>] --width=<width> "
+          "--height=<height> --iso=<iso> --output=<output.tbl>\n",
+          exec_name);
+  exit(EXIT_FAILURE);
+}
+
+typedef struct {
+  float (*to_linear)(float);
+  float (*from_linear)(float);
+  // In linear output light. This would typically be 0.18 for SDR (this matches
+  // the definition of Standard Output Sensitivity from ISO 12232:2019), but in
+  // HDR, we certainly do not want to consider 18% of the maximum output a
+  // “mid-tone”, as it would be e.g. 1800 cd/m² for SMPTE ST 2084 (PQ).
+  float mid_tone;
+} transfer_function_t;
+
+static const transfer_function_t *find_transfer_function(
+    aom_transfer_characteristics_t tc);
+
+typedef struct {
+  int width;
+  int height;
+  int iso_setting;
+
+  const transfer_function_t *transfer_function;
+
+  const char *output_filename;
+} photon_noise_args_t;
+
+static void parse_args(int argc, char **argv,
+                       photon_noise_args_t *photon_noise_args) {
+  static const arg_def_t *args[] = { &help_arg,   &width_arg,
+                                     &height_arg, &iso_arg,
+                                     &output_arg, &transfer_function_arg,
+                                     NULL };
+  struct arg arg;
+  int width_set = 0, height_set = 0, iso_set = 0, output_set = 0, i;
+
+  photon_noise_args->transfer_function =
+      find_transfer_function(AOM_CICP_TC_SRGB);
+
+  for (i = 1; i < argc; i += arg.argv_step) {
+    arg.argv_step = 1;
+    if (arg_match(&arg, &help_arg, argv + i)) {
+      arg_show_usage(stdout, args);
+      exit(EXIT_SUCCESS);
+    } else if (arg_match(&arg, &width_arg, argv + i)) {
+      photon_noise_args->width = arg_parse_int(&arg);
+      width_set = 1;
+    } else if (arg_match(&arg, &height_arg, argv + i)) {
+      photon_noise_args->height = arg_parse_int(&arg);
+      height_set = 1;
+    } else if (arg_match(&arg, &iso_arg, argv + i)) {
+      photon_noise_args->iso_setting = arg_parse_int(&arg);
+      iso_set = 1;
+    } else if (arg_match(&arg, &output_arg, argv + i)) {
+      photon_noise_args->output_filename = arg.val;
+      output_set = 1;
+    } else if (arg_match(&arg, &transfer_function_arg, argv + i)) {
+      const aom_transfer_characteristics_t tc = arg_parse_enum(&arg);
+      photon_noise_args->transfer_function = find_transfer_function(tc);
+    } else {
+      fatal("unrecognized argument \"%s\", see --help for available options",
+            argv[i]);
+    }
+  }
+
+  if (!width_set) {
+    fprintf(stderr, "Missing required parameter --width\n");
+    exit(EXIT_FAILURE);
+  }
+
+  if (!height_set) {
+    fprintf(stderr, "Missing required parameter --height\n");
+    exit(EXIT_FAILURE);
+  }
+
+  if (!iso_set) {
+    fprintf(stderr, "Missing required parameter --iso\n");
+    exit(EXIT_FAILURE);
+  }
+
+  if (!output_set) {
+    fprintf(stderr, "Missing required parameter --output\n");
+    exit(EXIT_FAILURE);
+  }
+}
+
+static float maxf(float a, float b) { return a > b ? a : b; }
+static float minf(float a, float b) { return a < b ? a : b; }
+
+static float gamma22_to_linear(float g) { return powf(g, 2.2f); }
+static float gamma22_from_linear(float l) { return powf(l, 1 / 2.2f); }
+static float gamma28_to_linear(float g) { return powf(g, 2.8f); }
+static float gamma28_from_linear(float l) { return powf(l, 1 / 2.8f); }
+
+static float srgb_to_linear(float srgb) {
+  return srgb <= 0.04045f ? srgb / 12.92f
+                          : powf((srgb + 0.055f) / 1.055f, 2.4f);
+}
+static float srgb_from_linear(float linear) {
+  return linear <= 0.0031308f ? 12.92f * linear
+                              : 1.055f * powf(linear, 1 / 2.4f) - 0.055f;
+}
+
+static const float kPqM1 = 2610.f / 16384;
+static const float kPqM2 = 128 * 2523.f / 4096;
+static const float kPqC1 = 3424.f / 4096;
+static const float kPqC2 = 32 * 2413.f / 4096;
+static const float kPqC3 = 32 * 2392.f / 4096;
+static float pq_to_linear(float pq) {
+  const float pq_pow_inv_m2 = powf(pq, 1.f / kPqM2);
+  return powf(maxf(0, pq_pow_inv_m2 - kPqC1) / (kPqC2 - kPqC3 * pq_pow_inv_m2),
+              1.f / kPqM1);
+}
+static float pq_from_linear(float linear) {
+  const float linear_pow_m1 = powf(linear, kPqM1);
+  return powf((kPqC1 + kPqC2 * linear_pow_m1) / (1 + kPqC3 * linear_pow_m1),
+              kPqM2);
+}
+
+// Note: it is perhaps debatable whether “linear” for HLG should be scene light
+// or display light. Here, it is implemented in terms of display light assuming
+// a nominal peak display luminance of 1000 cd/m², hence the system γ of 1.2. To
+// make it scene light instead, the OOTF (powf(x, 1.2f)) and its inverse should
+// be removed from the functions below, and the .mid_tone should be replaced
+// with powf(26.f / 1000, 1 / 1.2f).
+static const float kHlgA = 0.17883277f;
+static const float kHlgB = 0.28466892f;
+static const float kHlgC = 0.55991073f;
+static float hlg_to_linear(float hlg) {
+  // EOTF = OOTF ∘ OETF⁻¹
+  const float linear =
+      hlg <= 0.5f ? hlg * hlg / 3 : (expf((hlg - kHlgC) / kHlgA) + kHlgB) / 12;
+  return powf(linear, 1.2f);
+}
+static float hlg_from_linear(float linear) {
+  // EOTF⁻¹ = OETF ∘ OOTF⁻¹
+  linear = powf(linear, 1.f / 1.2f);
+  return linear <= 1.f / 12 ? sqrtf(3 * linear)
+                            : kHlgA * logf(12 * linear - kHlgB) + kHlgC;
+}
+
+static const transfer_function_t *find_transfer_function(
+    aom_transfer_characteristics_t tc) {
+  static const transfer_function_t
+      kGamma22TransferFunction = { .to_linear = &gamma22_to_linear,
+                                   .from_linear = &gamma22_from_linear,
+                                   .mid_tone = 0.18f },
+      kGamma28TransferFunction = { .to_linear = &gamma28_to_linear,
+                                   .from_linear = &gamma28_from_linear,
+                                   .mid_tone = 0.18f },
+      kSRgbTransferFunction = { .to_linear = &srgb_to_linear,
+                                .from_linear = &srgb_from_linear,
+                                .mid_tone = 0.18f },
+      kPqTransferFunction = { .to_linear = &pq_to_linear,
+                              .from_linear = &pq_from_linear,
+                              // https://www.itu.int/pub/R-REP-BT.2408-4-2021
+                              // page 6 (PDF page 8)
+                              .mid_tone = 26.f / 10000 },
+      kHlgTransferFunction = { .to_linear = &hlg_to_linear,
+                               .from_linear = &hlg_from_linear,
+                               .mid_tone = 26.f / 1000 };
+
+  switch (tc) {
+    case AOM_CICP_TC_BT_470_M: return &kGamma22TransferFunction;
+    case AOM_CICP_TC_BT_470_B_G: return &kGamma28TransferFunction;
+    case AOM_CICP_TC_SRGB: return &kSRgbTransferFunction;
+    case AOM_CICP_TC_SMPTE_2084: return &kPqTransferFunction;
+    case AOM_CICP_TC_HLG: return &kHlgTransferFunction;
+
+    default: fatal("unimplemented transfer function %d", tc);
+  }
+}
+
+static void generate_photon_noise(const photon_noise_args_t *photon_noise_args,
+                                  aom_film_grain_t *film_grain) {
+  // Assumes a daylight-like spectrum.
+  // https://www.strollswithmydog.com/effective-quantum-efficiency-of-sensor/#:~:text=11%2C260%20photons/um%5E2/lx-s
+  static const float kPhotonsPerLxSPerUm2 = 11260;
+
+  // Order of magnitude for cameras in the 2010-2020 decade, taking the CFA into
+  // account.
+  static const float kEffectiveQuantumEfficiency = 0.20f;
+
+  // Also reasonable values for current cameras. The read noise is typically
+  // higher than this at low ISO settings but it matters less there.
+  static const float kPhotoResponseNonUniformity = 0.005f;
+  static const float kInputReferredReadNoise = 1.5f;
+
+  // Focal plane exposure for a mid-tone (typically a 18% reflectance card), in
+  // lx·s.
+  const float mid_tone_exposure = 10.f / photon_noise_args->iso_setting;
+
+  // In microns. Assumes a 35mm sensor (36mm × 24mm).
+  const float pixel_area_um2 = (36000 * 24000.f) / (photon_noise_args->width *
+                                                    photon_noise_args->height);
+
+  const float mid_tone_electrons_per_pixel = kEffectiveQuantumEfficiency *
+                                             kPhotonsPerLxSPerUm2 *
+                                             mid_tone_exposure * pixel_area_um2;
+  const float max_electrons_per_pixel =
+      mid_tone_electrons_per_pixel /
+      photon_noise_args->transfer_function->mid_tone;
+
+  int i;
+
+  film_grain->num_y_points = 14;
+  for (i = 0; i < film_grain->num_y_points; ++i) {
+    float x = i / (film_grain->num_y_points - 1.f);
+    const float linear = photon_noise_args->transfer_function->to_linear(x);
+    const float electrons_per_pixel = max_electrons_per_pixel * linear;
+    // Quadrature sum of the relevant sources of noise, in electrons rms. Photon
+    // shot noise is sqrt(electrons) so we can skip the square root and the
+    // squaring.
+    // https://en.wikipedia.org/wiki/Addition_in_quadrature
+    // https://doi.org/10.1117/3.725073
+    const float noise_in_electrons =
+        sqrtf(kInputReferredReadNoise * kInputReferredReadNoise +
+              electrons_per_pixel +
+              (kPhotoResponseNonUniformity * kPhotoResponseNonUniformity *
+               electrons_per_pixel * electrons_per_pixel));
+    const float linear_noise = noise_in_electrons / max_electrons_per_pixel;
+    const float linear_range_start = maxf(0.f, linear - 2 * linear_noise);
+    const float linear_range_end = minf(1.f, linear + 2 * linear_noise);
+    const float tf_slope =
+        (photon_noise_args->transfer_function->from_linear(linear_range_end) -
+         photon_noise_args->transfer_function->from_linear(
+             linear_range_start)) /
+        (linear_range_end - linear_range_start);
+    float encoded_noise = linear_noise * tf_slope;
+
+    x = roundf(255 * x);
+    encoded_noise = minf(255.f, roundf(255 * 7.88f * encoded_noise));
+
+    film_grain->scaling_points_y[i][0] = (int)x;
+    film_grain->scaling_points_y[i][1] = (int)encoded_noise;
+  }
+
+  film_grain->apply_grain = 1;
+  film_grain->update_parameters = 1;
+  film_grain->num_cb_points = 0;
+  film_grain->num_cr_points = 0;
+  film_grain->scaling_shift = 8;
+  film_grain->ar_coeff_lag = 0;
+  film_grain->ar_coeffs_cb[0] = 0;
+  film_grain->ar_coeffs_cr[0] = 0;
+  film_grain->ar_coeff_shift = 6;
+  film_grain->cb_mult = 0;
+  film_grain->cb_luma_mult = 0;
+  film_grain->cb_offset = 0;
+  film_grain->cr_mult = 0;
+  film_grain->cr_luma_mult = 0;
+  film_grain->cr_offset = 0;
+  film_grain->overlap_flag = 1;
+  film_grain->random_seed = 7391;
+  film_grain->chroma_scaling_from_luma = 0;
+}
+
+int main(int argc, char **argv) {
+  photon_noise_args_t photon_noise_args;
+  aom_film_grain_table_t film_grain_table;
+  aom_film_grain_t film_grain;
+  struct aom_internal_error_info error_info;
+  memset(&photon_noise_args, 0, sizeof(photon_noise_args));
+  memset(&film_grain_table, 0, sizeof(film_grain_table));
+  memset(&film_grain, 0, sizeof(film_grain));
+  memset(&error_info, 0, sizeof(error_info));
+
+  exec_name = argv[0];
+  parse_args(argc, argv, &photon_noise_args);
+
+  generate_photon_noise(&photon_noise_args, &film_grain);
+  aom_film_grain_table_append(&film_grain_table, 0, 9223372036854775807ull,
+                              &film_grain);
+  if (aom_film_grain_table_write(&film_grain_table,
+                                 photon_noise_args.output_filename,
+                                 &error_info) != AOM_CODEC_OK) {
+    aom_film_grain_table_free(&film_grain_table);
+    fprintf(stderr, "Failed to write film grain table");
+    if (error_info.has_detail) {
+      fprintf(stderr, ": %s", error_info.detail);
+    }
+    fprintf(stderr, "\n");
+    return EXIT_FAILURE;
+  }
+  aom_film_grain_table_free(&film_grain_table);
+
+  return EXIT_SUCCESS;
+}
diff --git a/examples/set_maps.c b/examples/set_maps.c
index 69b4bcc..5a84faa 100644
--- a/examples/set_maps.c
+++ b/examples/set_maps.c
@@ -129,6 +129,14 @@
   const int fps = 2;  // TODO(dkovalev) add command line argument
   const double bits_per_pixel_per_frame = 0.067;
 
+#if CONFIG_REALTIME_ONLY
+  const int usage = 1;
+  const int speed = 7;
+#else
+  const int usage = 0;
+  const int speed = 2;
+#endif
+
   exec_name = argv[0];
   if (argc != 6) die("Invalid number of arguments");
 
@@ -157,7 +165,7 @@
 
   printf("Using %s\n", aom_codec_iface_name(encoder));
 
-  res = aom_codec_enc_config_default(encoder, &cfg, 0);
+  res = aom_codec_enc_config_default(encoder, &cfg, usage);
   if (res) die_codec(&codec, "Failed to get default codec config.");
 
   cfg.g_w = info.frame_width;
@@ -177,7 +185,7 @@
   if (aom_codec_enc_init(&codec, encoder, &cfg, 0))
     die("Failed to initialize encoder");
 
-  if (aom_codec_control(&codec, AOME_SET_CPUUSED, 2))
+  if (aom_codec_control(&codec, AOME_SET_CPUUSED, speed))
     die_codec(&codec, "Failed to set cpu-used");
 
   // Encode frames.
diff --git a/examples/simple_encoder.c b/examples/simple_encoder.c
index 682fe98..c026706 100644
--- a/examples/simple_encoder.c
+++ b/examples/simple_encoder.c
@@ -163,6 +163,13 @@
   const char *infile_arg = NULL;
   const char *outfile_arg = NULL;
   const char *keyframe_interval_arg = NULL;
+#if CONFIG_REALTIME_ONLY
+  const int usage = 1;
+  const int speed = 7;
+#else
+  const int usage = 0;
+  const int speed = 2;
+#endif
 
   exec_name = argv[0];
 
@@ -204,7 +211,7 @@
 
   printf("Using %s\n", aom_codec_iface_name(encoder));
 
-  res = aom_codec_enc_config_default(encoder, &cfg, 0);
+  res = aom_codec_enc_config_default(encoder, &cfg, usage);
   if (res) die_codec(&codec, "Failed to get default codec config.");
 
   cfg.g_w = info.frame_width;
@@ -223,7 +230,7 @@
   if (aom_codec_enc_init(&codec, encoder, &cfg, 0))
     die("Failed to initialize encoder");
 
-  if (aom_codec_control(&codec, AOME_SET_CPUUSED, 2))
+  if (aom_codec_control(&codec, AOME_SET_CPUUSED, speed))
     die_codec(&codec, "Failed to set cpu-used");
 
   // Encode frames.
diff --git a/examples/svc_encoder_rtc.c b/examples/svc_encoder_rtc.c
index 4eaab5d..e2da3ae 100644
--- a/examples/svc_encoder_rtc.c
+++ b/examples/svc_encoder_rtc.c
@@ -285,7 +285,7 @@
     } else if (arg_match(&arg, &speed_arg, argi)) {
       app_input->speed = arg_parse_uint(&arg);
       if (app_input->speed > 9) {
-        warn("Mapping speed %d to speed 9.\n", app_input->speed);
+        aom_tools_warn("Mapping speed %d to speed 9.\n", app_input->speed);
       }
     } else if (arg_match(&arg, &aqmode_arg, argi)) {
       app_input->aq_mode = arg_parse_uint(&arg);
@@ -556,11 +556,11 @@
 }
 
 // Layer pattern configuration.
-static void set_layer_pattern(int layering_mode, int superframe_cnt,
-                              aom_svc_layer_id_t *layer_id,
-                              aom_svc_ref_frame_config_t *ref_frame_config,
-                              int *use_svc_control, int spatial_layer_id,
-                              int is_key_frame, int ksvc_mode) {
+static void set_layer_pattern(
+    int layering_mode, int superframe_cnt, aom_svc_layer_id_t *layer_id,
+    aom_svc_ref_frame_config_t *ref_frame_config,
+    aom_svc_ref_frame_comp_pred_t *ref_frame_comp_pred, int *use_svc_control,
+    int spatial_layer_id, int is_key_frame, int ksvc_mode, int speed) {
   int i;
   int enable_longterm_temporal_ref = 1;
   int shift = (layering_mode == 8) ? 2 : 0;
@@ -568,7 +568,10 @@
   layer_id->spatial_layer_id = spatial_layer_id;
   int lag_index = 0;
   int base_count = superframe_cnt >> 2;
-  // Set the referende map buffer idx for the 7 references:
+  ref_frame_comp_pred->use_comp_pred[0] = 0;  // GOLDEN_LAST
+  ref_frame_comp_pred->use_comp_pred[1] = 0;  // LAST2_LAST
+  ref_frame_comp_pred->use_comp_pred[2] = 0;  // ALTREF_LAST
+  // Set the reference map buffer idx for the 7 references:
   // LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2), GOLDEN_FRAME(3),
   // BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6).
   for (i = 0; i < INTER_REFS_PER_FRAME; i++) ref_frame_config->ref_idx[i] = i;
@@ -643,8 +646,12 @@
 
       // Keep golden fixed at slot 3.
       ref_frame_config->ref_idx[SVC_GOLDEN_FRAME] = 3;
-      // Cyclically refresh slots 4, 5, 6, 7, for lag altref.
-      lag_index = 4 + (base_count % 4);
+      // Cyclically refresh slots 5, 6, 7, for lag altref.
+      lag_index = 5;
+      if (base_count > 0) {
+        lag_index = 5 + (base_count % 3);
+        if (superframe_cnt % 4 != 0) lag_index = 5 + ((base_count + 1) % 3);
+      }
       // Set the altref slot to lag_index.
       ref_frame_config->ref_idx[SVC_ALTREF_FRAME] = lag_index;
       if (superframe_cnt % 4 == 0) {
@@ -678,6 +685,8 @@
       // Every frame can reference GOLDEN AND ALTREF.
       ref_frame_config->reference[SVC_GOLDEN_FRAME] = 1;
       ref_frame_config->reference[SVC_ALTREF_FRAME] = 1;
+      // Allow for compound prediction using LAST and ALTREF.
+      if (speed >= 7) ref_frame_comp_pred->use_comp_pred[2] = 1;
       break;
     case 4:
       // 3-temporal layer: but middle layer updates GF, so 2nd TL2 will
@@ -1071,6 +1080,7 @@
   aom_svc_layer_id_t layer_id;
   aom_svc_params_t svc_params;
   aom_svc_ref_frame_config_t ref_frame_config;
+  aom_svc_ref_frame_comp_pred_t ref_frame_comp_pred;
 
 #if CONFIG_INTERNAL_STATS
   FILE *stats_file = fopen("opsnr.stt", "a");
@@ -1105,11 +1115,12 @@
   app_input.input_ctx.framerate.denominator = 1;
   app_input.input_ctx.only_i420 = 1;
   app_input.input_ctx.bit_depth = 0;
+  app_input.speed = 7;
   exec_name = argv[0];
 
   // start with default encoder configuration
-  aom_codec_err_t res =
-      aom_codec_enc_config_default(aom_codec_av1_cx(), &cfg, 0);
+  aom_codec_err_t res = aom_codec_enc_config_default(aom_codec_av1_cx(), &cfg,
+                                                     AOM_USAGE_REALTIME);
   if (res) {
     die("Failed to get config: %s\n", aom_codec_err_to_string(res));
   }
@@ -1237,12 +1248,16 @@
   aom_codec_control(&codec, AV1E_SET_AQ_MODE, app_input.aq_mode ? 3 : 0);
   aom_codec_control(&codec, AV1E_SET_GF_CBR_BOOST_PCT, 0);
   aom_codec_control(&codec, AV1E_SET_ENABLE_CDEF, 1);
+  aom_codec_control(&codec, AV1E_SET_ENABLE_WARPED_MOTION, 0);
+  aom_codec_control(&codec, AV1E_SET_ENABLE_OBMC, 0);
+  aom_codec_control(&codec, AV1E_SET_ENABLE_GLOBAL_MOTION, 0);
   aom_codec_control(&codec, AV1E_SET_ENABLE_ORDER_HINT, 0);
   aom_codec_control(&codec, AV1E_SET_ENABLE_TPL_MODEL, 0);
   aom_codec_control(&codec, AV1E_SET_DELTAQ_MODE, 0);
   aom_codec_control(&codec, AV1E_SET_COEFF_COST_UPD_FREQ, 3);
   aom_codec_control(&codec, AV1E_SET_MODE_COST_UPD_FREQ, 3);
   aom_codec_control(&codec, AV1E_SET_MV_COST_UPD_FREQ, 3);
+  aom_codec_control(&codec, AV1E_SET_DV_COST_UPD_FREQ, 3);
   aom_codec_control(&codec, AV1E_SET_CDF_UPDATE_MODE, 1);
   aom_codec_control(&codec, AV1E_SET_TILE_COLUMNS,
                     cfg.g_threads ? get_msb(cfg.g_threads) : 0);
@@ -1296,12 +1311,16 @@
         // Set the reference/update flags, layer_id, and reference_map
         // buffer index.
         set_layer_pattern(app_input.layering_mode, frame_cnt, &layer_id,
-                          &ref_frame_config, &use_svc_control, slx,
-                          is_key_frame, (app_input.layering_mode == 10));
+                          &ref_frame_config, &ref_frame_comp_pred,
+                          &use_svc_control, slx, is_key_frame,
+                          (app_input.layering_mode == 10), app_input.speed);
         aom_codec_control(&codec, AV1E_SET_SVC_LAYER_ID, &layer_id);
-        if (use_svc_control)
+        if (use_svc_control) {
           aom_codec_control(&codec, AV1E_SET_SVC_REF_FRAME_CONFIG,
                             &ref_frame_config);
+          aom_codec_control(&codec, AV1E_SET_SVC_REF_FRAME_COMP_PRED,
+                            &ref_frame_comp_pred);
+        }
       } else {
         // Only up to 3 temporal layers supported in fixed mode.
         // Only need to set spatial and temporal layer_id: reference
diff --git a/test/active_map_test.cc b/test/active_map_test.cc
index 4e30f55..2bbc3b6 100644
--- a/test/active_map_test.cc
+++ b/test/active_map_test.cc
@@ -38,6 +38,9 @@
                                   ::libaom_test::Encoder *encoder) {
     if (video->frame() == 0) {
       encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+      encoder->Control(AV1E_SET_ALLOW_WARPED_MOTION, 0);
+      encoder->Control(AV1E_SET_ENABLE_GLOBAL_MOTION, 0);
+      encoder->Control(AV1E_SET_ENABLE_OBMC, 0);
     } else if (video->frame() == 3) {
       aom_active_map_t map = aom_active_map_t();
       /* clang-format off */
@@ -87,14 +90,6 @@
 
 TEST_P(ActiveMapTest, Test) { DoTest(); }
 
-class ActiveMapTestLarge : public ActiveMapTest {};
-
-TEST_P(ActiveMapTestLarge, Test) { DoTest(); }
-
-AV1_INSTANTIATE_TEST_SUITE(ActiveMapTestLarge,
-                           ::testing::Values(::libaom_test::kRealTime),
-                           ::testing::Range(0, 5));
-
 AV1_INSTANTIATE_TEST_SUITE(ActiveMapTest,
                            ::testing::Values(::libaom_test::kRealTime),
                            ::testing::Range(5, 9));
diff --git a/test/altref_test.cc b/test/altref_test.cc
index 1334b4a..002a206 100644
--- a/test/altref_test.cc
+++ b/test/altref_test.cc
@@ -133,9 +133,7 @@
   { ::libaom_test::kTwoPassGood, 5, 10 },
   { ::libaom_test::kTwoPassGood, 8, 16 },
   { ::libaom_test::kTwoPassGood, 16, 32 },
-  // disabled below test case because it causes failure
-  // TODO(anyone): enable below test case once issue is fixed.
-  // { ::libaom_test::kTwoPassGood, 20, 32 },
+  { ::libaom_test::kTwoPassGood, 20, 32 },
 };
 
 // This class is used to test if the gf interval bounds configured by the user
diff --git a/test/aom_image_test.cc b/test/aom_image_test.cc
new file mode 100644
index 0000000..7ff6f61
--- /dev/null
+++ b/test/aom_image_test.cc
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom/aom_image.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+TEST(AomImageTest, AomImgWrapInvalidAlign) {
+  const int kWidth = 128;
+  const int kHeight = 128;
+  unsigned char buf[kWidth * kHeight * 3];
+
+  aom_image_t img;
+  // Set img_data and img_data_owner to junk values. aom_img_wrap() should
+  // not read these values on failure.
+  img.img_data = (unsigned char *)"";
+  img.img_data_owner = 1;
+
+  aom_img_fmt_t format = AOM_IMG_FMT_I444;
+  // 'align' must be a power of 2 but is not. This causes the aom_img_wrap()
+  // call to fail. The test verifies we do not read the junk values in 'img'.
+  unsigned int align = 31;
+  EXPECT_EQ(aom_img_wrap(&img, format, kWidth, kHeight, align, buf), nullptr);
+}
+
+TEST(AomImageTest, AomImgSetRectOverflow) {
+  const int kWidth = 128;
+  const int kHeight = 128;
+  unsigned char buf[kWidth * kHeight * 3];
+
+  aom_image_t img;
+  aom_img_fmt_t format = AOM_IMG_FMT_I444;
+  unsigned int align = 32;
+  EXPECT_EQ(aom_img_wrap(&img, format, kWidth, kHeight, align, buf), &img);
+
+  EXPECT_EQ(aom_img_set_rect(&img, 0, 0, kWidth, kHeight, 0), 0);
+  // This would result in overflow because -1 is cast to UINT_MAX.
+  EXPECT_NE(aom_img_set_rect(&img, -1, -1, kWidth, kHeight, 0), 0);
+}
diff --git a/test/aomdec.sh b/test/aomdec.sh
index eda18bb..b03c42a 100755
--- a/test/aomdec.sh
+++ b/test/aomdec.sh
@@ -147,12 +147,16 @@
 }
 
 aomdec_tests="aomdec_av1_ivf
-              aomdec_av1_ivf_error_resilient
               aomdec_av1_ivf_multithread
               aomdec_av1_ivf_multithread_row_mt
-              aomdec_aom_ivf_pipe_input
-              aomdec_av1_obu_annexb
-              aomdec_av1_obu_section5
-              aomdec_av1_webm"
+              aomdec_aom_ivf_pipe_input"
+
+if [ ! "$(realtime_only_build)" = "yes" ]; then
+  aomdec_tests="${aomdec_tests}
+                aomdec_av1_ivf_error_resilient
+                aomdec_av1_obu_annexb
+                aomdec_av1_obu_section5
+                aomdec_av1_webm"
+fi
 
 run_tests aomdec_verify_environment "${aomdec_tests}"
diff --git a/test/aomenc.sh b/test/aomenc.sh
index 86fe5f6..ed98313 100755
--- a/test/aomenc.sh
+++ b/test/aomenc.sh
@@ -98,6 +98,24 @@
   fi
 }
 
+aomenc_av1_ivf_rt() {
+  if [ "$(aomenc_can_encode_av1)" = "yes" ]; then
+    local output="${AV1_IVF_FILE}"
+    if [ -e "${AV1_IVF_FILE}" ]; then
+      output="${AOM_TEST_OUTPUT_DIR}/av1_test.ivf"
+    fi
+    aomenc $(yuv_raw_input) \
+      $(aomenc_encode_test_rt_params) \
+      --ivf \
+      --output="${output}" || return 1
+
+    if [ ! -e "${output}" ]; then
+      elog "Output file does not exist."
+      return 1
+    fi
+  fi
+}
+
 aomenc_av1_ivf_use_16bit_internal() {
   if [ "$(aomenc_can_encode_av1)" = "yes" ]; then
     local output="${AV1_IVF_FILE}"
@@ -274,16 +292,21 @@
   fi
 }
 
-aomenc_tests="aomenc_av1_ivf
-              aomenc_av1_obu_annexb
-              aomenc_av1_obu_section5
-              aomenc_av1_webm
-              aomenc_av1_webm_1pass
-              aomenc_av1_ivf_lossless
-              aomenc_av1_ivf_minq0_maxq0
-              aomenc_av1_ivf_use_16bit_internal
-              aomenc_av1_webm_lag5_frames10
-              aomenc_av1_webm_non_square_par
-              aomenc_av1_webm_cdf_update_mode"
+if [ "$(realtime_only_build)" = "yes" ]; then
+  aomenc_tests="aomenc_av1_ivf_rt"
+else
+  aomenc_tests="aomenc_av1_ivf
+                aomenc_av1_ivf_rt
+                aomenc_av1_obu_annexb
+                aomenc_av1_obu_section5
+                aomenc_av1_webm
+                aomenc_av1_webm_1pass
+                aomenc_av1_ivf_lossless
+                aomenc_av1_ivf_minq0_maxq0
+                aomenc_av1_ivf_use_16bit_internal
+                aomenc_av1_webm_lag5_frames10
+                aomenc_av1_webm_non_square_par
+                aomenc_av1_webm_cdf_update_mode"
+fi
 
 run_tests aomenc_verify_environment "${aomenc_tests}"
diff --git a/test/aq_segment_test.cc b/test/aq_segment_test.cc
index 4e52b55..b4a8b61 100644
--- a/test/aq_segment_test.cc
+++ b/test/aq_segment_test.cc
@@ -19,6 +19,13 @@
 
 namespace {
 
+const libaom_test::TestMode kTestModeParams[] =
+#if CONFIG_REALTIME_ONLY
+    { ::libaom_test::kRealTime };
+#else
+    { ::libaom_test::kRealTime, ::libaom_test::kOnePassGood };
+#endif
+
 class AqSegmentTest
     : public ::libaom_test::CodecTestWith3Params<libaom_test::TestMode, int,
                                                  int>,
@@ -40,6 +47,11 @@
       encoder->Control(AV1E_SET_AQ_MODE, aq_mode_);
       encoder->Control(AV1E_SET_DELTAQ_MODE, deltaq_mode_);
       encoder->Control(AOME_SET_MAX_INTRA_BITRATE_PCT, 100);
+      if (mode_ == ::libaom_test::kRealTime) {
+        encoder->Control(AV1E_SET_ALLOW_WARPED_MOTION, 0);
+        encoder->Control(AV1E_SET_ENABLE_GLOBAL_MOTION, 0);
+        encoder->Control(AV1E_SET_ENABLE_OBMC, 0);
+      }
     }
   }
 
@@ -69,10 +81,7 @@
 // 3-cyclic_refresh_aq) encodes and decodes without a mismatch.
 TEST_P(AqSegmentTest, TestNoMisMatch) { DoTest(GET_PARAM(3)); }
 
-class AqSegmentTestLarge : public AqSegmentTest {};
-
-TEST_P(AqSegmentTestLarge, TestNoMisMatch) { DoTest(GET_PARAM(3)); }
-
+#if !CONFIG_REALTIME_ONLY
 // Validate that this delta q mode
 // encodes and decodes without a mismatch.
 TEST_P(AqSegmentTest, TestNoMisMatchExtDeltaQ) {
@@ -84,13 +93,18 @@
 
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 }
+#endif
 
-AV1_INSTANTIATE_TEST_SUITE(AqSegmentTest,
-                           ::testing::Values(::libaom_test::kRealTime,
-                                             ::libaom_test::kOnePassGood),
+AV1_INSTANTIATE_TEST_SUITE(AqSegmentTest, ::testing::ValuesIn(kTestModeParams),
                            ::testing::Range(5, 9), ::testing::Range(0, 4));
+
+#if !CONFIG_REALTIME_ONLY
+class AqSegmentTestLarge : public AqSegmentTest {};
+
+TEST_P(AqSegmentTestLarge, TestNoMisMatch) { DoTest(GET_PARAM(3)); }
+
 AV1_INSTANTIATE_TEST_SUITE(AqSegmentTestLarge,
-                           ::testing::Values(::libaom_test::kRealTime,
-                                             ::libaom_test::kOnePassGood),
+                           ::testing::Values(::libaom_test::kOnePassGood),
                            ::testing::Range(3, 5), ::testing::Range(0, 4));
+#endif
 }  // namespace
diff --git a/test/arf_freq_test.cc b/test/arf_freq_test.cc
index 0bf47e6..d12f5cc 100644
--- a/test/arf_freq_test.cc
+++ b/test/arf_freq_test.cc
@@ -56,9 +56,13 @@
 };
 
 const TestEncodeParam kEncodeVectors[] = {
-  { ::libaom_test::kOnePassGood, 2 }, { ::libaom_test::kOnePassGood, 5 },
-  { ::libaom_test::kTwoPassGood, 1 }, { ::libaom_test::kTwoPassGood, 2 },
-  { ::libaom_test::kTwoPassGood, 5 }, { ::libaom_test::kRealTime, 5 },
+#if CONFIG_REALTIME_ONLY
+  { ::libaom_test::kRealTime, 5 },
+#else
+  { ::libaom_test::kRealTime, 5 },    { ::libaom_test::kOnePassGood, 2 },
+  { ::libaom_test::kOnePassGood, 5 }, { ::libaom_test::kTwoPassGood, 1 },
+  { ::libaom_test::kTwoPassGood, 2 }, { ::libaom_test::kTwoPassGood, 5 },
+#endif
 };
 
 const int kMinArfVectors[] = {
diff --git a/test/av1_convolve_scale_test.cc b/test/av1_convolve_scale_test.cc
index a1c5746..818ca74 100644
--- a/test/av1_convolve_scale_test.cc
+++ b/test/av1_convolve_scale_test.cc
@@ -18,7 +18,6 @@
 
 #include "aom_ports/aom_timer.h"
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 
@@ -258,7 +257,7 @@
  public:
   ConvolveScaleTestBase() : image_(NULL) {}
   virtual ~ConvolveScaleTestBase() { delete image_; }
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
+  virtual void TearDown() {}
 
   // Implemented by subclasses (SetUp depends on the parameters passed
   // in and RunOne depends on the function to be tested. These can't
@@ -293,8 +292,8 @@
       convolve_params_.do_average = do_average;
     } else {
       convolve_params_.use_dist_wtd_comp_avg = use_dist_wtd_comp_avg;
-      convolve_params_.fwd_offset = quant_dist_lookup_table[i][j][0];
-      convolve_params_.bck_offset = quant_dist_lookup_table[i][j][1];
+      convolve_params_.fwd_offset = quant_dist_lookup_table[j][i];
+      convolve_params_.bck_offset = quant_dist_lookup_table[j][1 - i];
       convolve_params_.is_compound = is_compound;
       convolve_params_.do_average = do_average;
     }
diff --git a/test/av1_convolve_test.cc b/test/av1_convolve_test.cc
index 0c90280..accee58 100644
--- a/test/av1_convolve_test.cc
+++ b/test/av1_convolve_test.cc
@@ -15,7 +15,6 @@
 #include "config/av1_rtcd.h"
 #include "config/aom_dsp_rtcd.h"
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "aom_ports/aom_timer.h"
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
@@ -190,7 +189,7 @@
     rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
   }
 
-  virtual void TearDown() override { libaom_test::ClearSystemState(); }
+  virtual void TearDown() override {}
 
   // Randomizes the 8-bit input buffer and returns a pointer to it. Note that
   // the pointer is safe to use with an 8-tap filter. The stride can range
@@ -1172,8 +1171,8 @@
   result.push_back(CompoundParam(false, 0, 0));
   for (int k = 0; k < 2; ++k) {
     for (int l = 0; l < 4; ++l) {
-      result.push_back(CompoundParam(true, quant_dist_lookup_table[k][l][0],
-                                     quant_dist_lookup_table[k][l][1]));
+      result.push_back(CompoundParam(true, quant_dist_lookup_table[l][k],
+                                     quant_dist_lookup_table[l][1 - k]));
     }
   }
   return result;
diff --git a/test/av1_external_partition_test.cc b/test/av1_external_partition_test.cc
new file mode 100644
index 0000000..177c9d9
--- /dev/null
+++ b/test/av1_external_partition_test.cc
@@ -0,0 +1,696 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <fstream>
+#include <new>
+#include <sstream>
+#include <string>
+
+#include "aom/aom_codec.h"
+#include "aom/aom_external_partition.h"
+#include "av1/common/blockd.h"
+#include "av1/encoder/encodeframe_utils.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/y4m_video_source.h"
+#include "test/util.h"
+
+#if CONFIG_AV1_ENCODER
+#if !CONFIG_REALTIME_ONLY
+namespace {
+
+constexpr int kFrameNum = 8;
+constexpr int kVersion = 1;
+
+typedef struct TestData {
+  int version = kVersion;
+} TestData;
+
+typedef struct ToyModel {
+  TestData *data;
+  aom_ext_part_config_t config;
+  aom_ext_part_funcs_t funcs;
+  int mi_row;
+  int mi_col;
+  int frame_width;
+  int frame_height;
+  BLOCK_SIZE block_size;
+} ToyModel;
+
+// Note:
+// if CONFIG_PARTITION_SEARCH_ORDER = 0, we test APIs designed for the baseline
+// encoder's DFS partition search workflow.
+// if CONFIG_PARTITION_SEARCH_ORDER = 1, we test APIs designed for the new
+// ML model's partition search workflow.
+#if CONFIG_PARTITION_SEARCH_ORDER
+aom_ext_part_status_t ext_part_create_model(
+    void *priv, const aom_ext_part_config_t *part_config,
+    aom_ext_part_model_t *ext_part_model) {
+  TestData *received_data = reinterpret_cast<TestData *>(priv);
+  EXPECT_EQ(received_data->version, kVersion);
+  ToyModel *toy_model = new (std::nothrow) ToyModel;
+  EXPECT_NE(toy_model, nullptr);
+  toy_model->data = received_data;
+  *ext_part_model = toy_model;
+  EXPECT_EQ(part_config->superblock_size, BLOCK_64X64);
+  return AOM_EXT_PART_OK;
+}
+
+aom_ext_part_status_t ext_part_send_features(
+    aom_ext_part_model_t ext_part_model,
+    const aom_partition_features_t *part_features) {
+  ToyModel *toy_model = static_cast<ToyModel *>(ext_part_model);
+  toy_model->mi_row = part_features->mi_row;
+  toy_model->mi_col = part_features->mi_col;
+  toy_model->frame_width = part_features->frame_width;
+  toy_model->frame_height = part_features->frame_height;
+  toy_model->block_size = static_cast<BLOCK_SIZE>(part_features->block_size);
+  return AOM_EXT_PART_OK;
+}
+
+// The model provide the whole decision tree to the encoder.
+aom_ext_part_status_t ext_part_get_partition_decision_whole_tree(
+    aom_ext_part_model_t ext_part_model,
+    aom_partition_decision_t *ext_part_decision) {
+  ToyModel *toy_model = static_cast<ToyModel *>(ext_part_model);
+  // A toy model that always asks the encoder to encode with
+  // 4x4 blocks (the smallest).
+  ext_part_decision->is_final_decision = 1;
+  // Note: super block size is fixed to BLOCK_64X64 for the
+  // input video. It is determined inside the encoder, see the
+  // check in "ext_part_create_model".
+  const int is_last_sb_col =
+      toy_model->mi_col * 4 + 64 > toy_model->frame_width;
+  const int is_last_sb_row =
+      toy_model->mi_row * 4 + 64 > toy_model->frame_height;
+  if (is_last_sb_row && is_last_sb_col) {
+    // 64x64: 1 node
+    // 32x32: 4 nodes (only the first one will further split)
+    // 16x16: 4 nodes
+    // 8x8:   4 * 4 nodes
+    // 4x4:   4 * 4 * 4 nodes
+    const int num_blocks = 1 + 4 + 4 + 4 * 4 + 4 * 4 * 4;
+    const int num_4x4_blocks = 4 * 4 * 4;
+    ext_part_decision->num_nodes = num_blocks;
+    // 64x64
+    ext_part_decision->partition_decision[0] = PARTITION_SPLIT;
+    // 32x32, only the first one will split, the other three are
+    // out of frame boundary.
+    ext_part_decision->partition_decision[1] = PARTITION_SPLIT;
+    ext_part_decision->partition_decision[2] = PARTITION_NONE;
+    ext_part_decision->partition_decision[3] = PARTITION_NONE;
+    ext_part_decision->partition_decision[4] = PARTITION_NONE;
+    // The rest blocks inside the top-left 32x32 block.
+    for (int i = 5; i < num_blocks - num_4x4_blocks; ++i) {
+      ext_part_decision->partition_decision[i] = PARTITION_SPLIT;
+    }
+    for (int i = num_blocks - num_4x4_blocks; i < num_blocks; ++i) {
+      ext_part_decision->partition_decision[i] = PARTITION_NONE;
+    }
+  } else if (is_last_sb_row) {
+    // 64x64: 1 node
+    // 32x32: 4 nodes (only the first two will further split)
+    // 16x16: 2 * 4 nodes
+    // 8x8:   2 * 4 * 4 nodes
+    // 4x4:   2 * 4 * 4 * 4 nodes
+    const int num_blocks = 1 + 4 + 2 * 4 + 2 * 4 * 4 + 2 * 4 * 4 * 4;
+    const int num_4x4_blocks = 2 * 4 * 4 * 4;
+    ext_part_decision->num_nodes = num_blocks;
+    // 64x64
+    ext_part_decision->partition_decision[0] = PARTITION_SPLIT;
+    // 32x32, only the first two will split, the other two are out
+    // of frame boundary.
+    ext_part_decision->partition_decision[1] = PARTITION_SPLIT;
+    ext_part_decision->partition_decision[2] = PARTITION_SPLIT;
+    ext_part_decision->partition_decision[3] = PARTITION_NONE;
+    ext_part_decision->partition_decision[4] = PARTITION_NONE;
+    // The rest blocks.
+    for (int i = 5; i < num_blocks - num_4x4_blocks; ++i) {
+      ext_part_decision->partition_decision[i] = PARTITION_SPLIT;
+    }
+    for (int i = num_blocks - num_4x4_blocks; i < num_blocks; ++i) {
+      ext_part_decision->partition_decision[i] = PARTITION_NONE;
+    }
+  } else if (is_last_sb_col) {
+    // 64x64: 1 node
+    // 32x32: 4 nodes (only the top-left and bottom-left will further split)
+    // 16x16: 2 * 4 nodes
+    // 8x8:   2 * 4 * 4 nodes
+    // 4x4:   2 * 4 * 4 * 4 nodes
+    const int num_blocks = 1 + 4 + 2 * 4 + 2 * 4 * 4 + 2 * 4 * 4 * 4;
+    const int num_4x4_blocks = 2 * 4 * 4 * 4;
+    ext_part_decision->num_nodes = num_blocks;
+    // 64x64
+    ext_part_decision->partition_decision[0] = PARTITION_SPLIT;
+    // 32x32, only the top-left and bottom-left will split, the other two are
+    // out of frame boundary.
+    ext_part_decision->partition_decision[1] = PARTITION_SPLIT;
+    ext_part_decision->partition_decision[2] = PARTITION_NONE;
+    ext_part_decision->partition_decision[3] = PARTITION_SPLIT;
+    ext_part_decision->partition_decision[4] = PARTITION_NONE;
+    // The rest blocks.
+    for (int i = 5; i < num_blocks - num_4x4_blocks; ++i) {
+      ext_part_decision->partition_decision[i] = PARTITION_SPLIT;
+    }
+    for (int i = num_blocks - num_4x4_blocks; i < num_blocks; ++i) {
+      ext_part_decision->partition_decision[i] = PARTITION_NONE;
+    }
+  } else {
+    // 64x64: 1 node
+    // 32x32: 4 nodes
+    // 16x16: 4 * 4 nodes
+    // 8x8:   4 * 4 * 4 nodes
+    // 4x4:   4 * 4 * 4 * 4 nodes
+    const int num_blocks = 1 + 4 + 4 * 4 + 4 * 4 * 4 + 4 * 4 * 4 * 4;
+    const int num_4x4_blocks = 4 * 4 * 4 * 4;
+    ext_part_decision->num_nodes = num_blocks;
+    for (int i = 0; i < num_blocks - num_4x4_blocks; ++i) {
+      ext_part_decision->partition_decision[i] = PARTITION_SPLIT;
+    }
+    for (int i = num_blocks - num_4x4_blocks; i < num_blocks; ++i) {
+      ext_part_decision->partition_decision[i] = PARTITION_NONE;
+    }
+  }
+
+  return AOM_EXT_PART_OK;
+}
+
+aom_ext_part_status_t ext_part_get_partition_decision_recursive(
+    aom_ext_part_model_t ext_part_model,
+    aom_partition_decision_t *ext_part_decision) {
+  ext_part_decision->current_decision = PARTITION_NONE;
+  ext_part_decision->is_final_decision = 1;
+  ToyModel *toy_model = static_cast<ToyModel *>(ext_part_model);
+  // Note: super block size is fixed to BLOCK_64X64 for the
+  // input video. It is determined inside the encoder, see the
+  // check in "ext_part_create_model".
+  const int is_last_sb_col =
+      toy_model->mi_col * 4 + 64 > toy_model->frame_width;
+  const int is_last_sb_row =
+      toy_model->mi_row * 4 + 64 > toy_model->frame_height;
+  if (is_last_sb_row && is_last_sb_col) {
+    if (block_size_wide[toy_model->block_size] == 64) {
+      ext_part_decision->current_decision = PARTITION_SPLIT;
+    } else {
+      ext_part_decision->current_decision = PARTITION_NONE;
+    }
+  } else if (is_last_sb_row) {
+    if (block_size_wide[toy_model->block_size] == 64) {
+      ext_part_decision->current_decision = PARTITION_SPLIT;
+    } else {
+      ext_part_decision->current_decision = PARTITION_NONE;
+    }
+  } else if (is_last_sb_col) {
+    if (block_size_wide[toy_model->block_size] == 64) {
+      ext_part_decision->current_decision = PARTITION_SPLIT;
+    } else {
+      ext_part_decision->current_decision = PARTITION_NONE;
+    }
+  } else {
+    ext_part_decision->current_decision = PARTITION_NONE;
+  }
+  return AOM_EXT_PART_OK;
+}
+
+aom_ext_part_status_t ext_part_send_partition_stats(
+    aom_ext_part_model_t ext_part_model,
+    const aom_partition_stats_t *ext_part_stats) {
+  (void)ext_part_model;
+  (void)ext_part_stats;
+  return AOM_EXT_PART_OK;
+}
+
+aom_ext_part_status_t ext_part_delete_model(
+    aom_ext_part_model_t ext_part_model) {
+  ToyModel *toy_model = static_cast<ToyModel *>(ext_part_model);
+  EXPECT_EQ(toy_model->data->version, kVersion);
+  delete toy_model;
+  return AOM_EXT_PART_OK;
+}
+
+class ExternalPartitionTestAPI
+    : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  ExternalPartitionTestAPI()
+      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+        cpu_used_(GET_PARAM(2)), psnr_(0.0), nframes_(0) {}
+  virtual ~ExternalPartitionTestAPI() {}
+
+  virtual void SetUp() {
+    InitializeConfig(encoding_mode_);
+    const aom_rational timebase = { 1, 30 };
+    cfg_.g_timebase = timebase;
+    cfg_.rc_end_usage = AOM_VBR;
+    cfg_.g_threads = 1;
+    cfg_.g_lag_in_frames = 4;
+    cfg_.rc_target_bitrate = 400;
+    init_flags_ = AOM_CODEC_USE_PSNR;
+  }
+
+  virtual bool DoDecode() const { return false; }
+
+  virtual void BeginPassHook(unsigned int) {
+    psnr_ = 0.0;
+    nframes_ = 0;
+  }
+
+  virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+    psnr_ += pkt->data.psnr.psnr[0];
+    nframes_++;
+  }
+
+  double GetAveragePsnr() const {
+    if (nframes_) return psnr_ / nframes_;
+    return 0.0;
+  }
+
+  void SetExternalPartition(bool use_external_partition) {
+    use_external_partition_ = use_external_partition;
+  }
+
+  void SetPartitionControlMode(int mode) { partition_control_mode_ = mode; }
+
+  void SetDecisionMode(aom_ext_part_decision_mode_t mode) {
+    decision_mode_ = mode;
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      if (decision_mode_ == AOM_EXT_PART_WHOLE_TREE) {
+        aom_ext_part_funcs_t ext_part_funcs;
+        ext_part_funcs.priv = reinterpret_cast<void *>(&test_data_);
+        ext_part_funcs.decision_mode = AOM_EXT_PART_WHOLE_TREE;
+        ext_part_funcs.create_model = ext_part_create_model;
+        ext_part_funcs.send_features = ext_part_send_features;
+        ext_part_funcs.get_partition_decision =
+            ext_part_get_partition_decision_whole_tree;
+        ext_part_funcs.send_partition_stats = ext_part_send_partition_stats;
+        ext_part_funcs.delete_model = ext_part_delete_model;
+
+        encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+        encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+        if (use_external_partition_) {
+          encoder->Control(AV1E_SET_EXTERNAL_PARTITION, &ext_part_funcs);
+        }
+        if (partition_control_mode_ == -1) {
+          encoder->Control(AV1E_SET_MAX_PARTITION_SIZE, 128);
+          encoder->Control(AV1E_SET_MIN_PARTITION_SIZE, 4);
+        } else {
+          switch (partition_control_mode_) {
+            case 1:
+              encoder->Control(AV1E_SET_MAX_PARTITION_SIZE, 64);
+              encoder->Control(AV1E_SET_MIN_PARTITION_SIZE, 64);
+              break;
+            case 2:
+              encoder->Control(AV1E_SET_MAX_PARTITION_SIZE, 4);
+              encoder->Control(AV1E_SET_MIN_PARTITION_SIZE, 4);
+              break;
+            default: assert(0 && "Invalid partition control mode."); break;
+          }
+        }
+      } else if (decision_mode_ == AOM_EXT_PART_RECURSIVE) {
+        aom_ext_part_funcs_t ext_part_funcs;
+        ext_part_funcs.priv = reinterpret_cast<void *>(&test_data_);
+        ext_part_funcs.decision_mode = AOM_EXT_PART_RECURSIVE;
+        ext_part_funcs.create_model = ext_part_create_model;
+        ext_part_funcs.send_features = ext_part_send_features;
+        ext_part_funcs.get_partition_decision =
+            ext_part_get_partition_decision_recursive;
+        ext_part_funcs.send_partition_stats = ext_part_send_partition_stats;
+        ext_part_funcs.delete_model = ext_part_delete_model;
+
+        encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+        encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+        if (use_external_partition_) {
+          encoder->Control(AV1E_SET_EXTERNAL_PARTITION, &ext_part_funcs);
+        }
+        if (partition_control_mode_ == -1) {
+          encoder->Control(AV1E_SET_MAX_PARTITION_SIZE, 128);
+          encoder->Control(AV1E_SET_MIN_PARTITION_SIZE, 4);
+        } else {
+          switch (partition_control_mode_) {
+            case 1:
+              encoder->Control(AV1E_SET_MAX_PARTITION_SIZE, 64);
+              encoder->Control(AV1E_SET_MIN_PARTITION_SIZE, 64);
+              break;
+            case 2:
+              encoder->Control(AV1E_SET_MAX_PARTITION_SIZE, 4);
+              encoder->Control(AV1E_SET_MIN_PARTITION_SIZE, 4);
+              break;
+            default: assert(0 && "Invalid partition control mode."); break;
+          }
+        }
+      } else {
+        assert(0 && "Invalid decision mode.");
+      }
+    }
+  }
+
+ private:
+  libaom_test::TestMode encoding_mode_;
+  int cpu_used_;
+  double psnr_;
+  unsigned int nframes_;
+  bool use_external_partition_ = false;
+  TestData test_data_;
+  int partition_control_mode_ = -1;
+  aom_ext_part_decision_mode_t decision_mode_;
+};
+
+// Encode twice and expect the same psnr value.
+// The first run is a normal encoding run with restricted partition types,
+// i.e., we use control flags to force the encoder to encode with the
+// 4x4 block size.
+// The second run is to get partition decisions from a toy model that we
+// built, which will ask the encoder to encode with the 4x4 blocks.
+// We expect the encoding results are the same.
+TEST_P(ExternalPartitionTestAPI, WholePartitionTree4x4Block) {
+  ::libaom_test::Y4mVideoSource video("paris_352_288_30.y4m", 0, kFrameNum);
+  SetExternalPartition(false);
+  SetPartitionControlMode(2);
+  SetDecisionMode(AOM_EXT_PART_WHOLE_TREE);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  const double psnr = GetAveragePsnr();
+
+  SetExternalPartition(true);
+  SetPartitionControlMode(2);
+  SetDecisionMode(AOM_EXT_PART_WHOLE_TREE);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  const double psnr2 = GetAveragePsnr();
+
+  EXPECT_DOUBLE_EQ(psnr, psnr2);
+}
+
+TEST_P(ExternalPartitionTestAPI, RecursivePartition) {
+  ::libaom_test::Y4mVideoSource video("paris_352_288_30.y4m", 0, kFrameNum);
+  SetExternalPartition(false);
+  SetPartitionControlMode(1);
+  SetDecisionMode(AOM_EXT_PART_RECURSIVE);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  const double psnr = GetAveragePsnr();
+
+  SetExternalPartition(true);
+  SetPartitionControlMode(1);
+  SetDecisionMode(AOM_EXT_PART_RECURSIVE);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  const double psnr2 = GetAveragePsnr();
+
+  const double psnr_thresh = 0.02;
+  EXPECT_NEAR(psnr, psnr2, psnr_thresh);
+}
+
+AV1_INSTANTIATE_TEST_SUITE(ExternalPartitionTestAPI,
+                           ::testing::Values(::libaom_test::kTwoPassGood),
+                           ::testing::Values(4));  // cpu_used
+
+#else   // !CONFIG_PARTITION_SEARCH_ORDER
+// Feature files written during encoding, as defined in partition_strategy.c.
+std::string feature_file_names[] = {
+  "feature_before_partition_none",
+  "feature_before_partition_none_prune_rect",
+  "feature_after_partition_none_prune",
+  "feature_after_partition_none_terminate",
+  "feature_after_partition_split_terminate",
+  "feature_after_partition_split_prune_rect",
+  "feature_after_partition_rect",
+  "feature_after_partition_ab",
+};
+
+// Files written here in the test, where the feature data is received
+// from the API.
+std::string test_feature_file_names[] = {
+  "test_feature_before_partition_none",
+  "test_feature_before_partition_none_prune_rect",
+  "test_feature_after_partition_none_prune",
+  "test_feature_after_partition_none_terminate",
+  "test_feature_after_partition_split_terminate",
+  "test_feature_after_partition_split_prune_rect",
+  "test_feature_after_partition_rect",
+  "test_feature_after_partition_ab",
+};
+
+static void write_features_to_file(const float *features,
+                                   const int feature_size, const int id) {
+  if (!WRITE_FEATURE_TO_FILE) return;
+  char filename[256];
+  snprintf(filename, sizeof(filename), "%s",
+           test_feature_file_names[id].c_str());
+  FILE *pfile = fopen(filename, "a");
+  ASSERT_NE(pfile, nullptr);
+  for (int i = 0; i < feature_size; ++i) {
+    fprintf(pfile, "%.6f", features[i]);
+    if (i < feature_size - 1) fprintf(pfile, ",");
+  }
+  fprintf(pfile, "\n");
+  fclose(pfile);
+}
+
+aom_ext_part_status_t ext_part_create_model(
+    void *priv, const aom_ext_part_config_t *part_config,
+    aom_ext_part_model_t *ext_part_model) {
+  TestData *received_data = reinterpret_cast<TestData *>(priv);
+  EXPECT_EQ(received_data->version, kVersion);
+  ToyModel *toy_model = new (std::nothrow) ToyModel;
+  EXPECT_NE(toy_model, nullptr);
+  toy_model->data = received_data;
+  *ext_part_model = toy_model;
+  EXPECT_EQ(part_config->superblock_size, BLOCK_64X64);
+  return AOM_EXT_PART_OK;
+}
+
+aom_ext_part_status_t ext_part_create_model_test(
+    void *priv, const aom_ext_part_config_t *part_config,
+    aom_ext_part_model_t *ext_part_model) {
+  (void)priv;
+  (void)ext_part_model;
+  EXPECT_EQ(part_config->superblock_size, BLOCK_64X64);
+  // Return status indicates it's an encoder test. It lets the encoder
+  // set a flag and write partition features to text files.
+  return AOM_EXT_PART_TEST;
+}
+
+aom_ext_part_status_t ext_part_send_features(
+    aom_ext_part_model_t ext_part_model,
+    const aom_partition_features_t *part_features) {
+  (void)ext_part_model;
+  (void)part_features;
+  return AOM_EXT_PART_OK;
+}
+
+aom_ext_part_status_t ext_part_send_features_test(
+    aom_ext_part_model_t ext_part_model,
+    const aom_partition_features_t *part_features) {
+  (void)ext_part_model;
+  if (part_features->id == AOM_EXT_PART_FEATURE_BEFORE_NONE) {
+    write_features_to_file(part_features->before_part_none.f,
+                           AOM_EXT_PART_SIZE_DIRECT_SPLIT, 0);
+  } else if (part_features->id == AOM_EXT_PART_FEATURE_BEFORE_NONE_PART2) {
+    write_features_to_file(part_features->before_part_none.f_part2,
+                           AOM_EXT_PART_SIZE_PRUNE_PART, 1);
+  } else if (part_features->id == AOM_EXT_PART_FEATURE_AFTER_NONE) {
+    write_features_to_file(part_features->after_part_none.f,
+                           AOM_EXT_PART_SIZE_PRUNE_NONE, 2);
+  } else if (part_features->id == AOM_EXT_PART_FEATURE_AFTER_NONE_PART2) {
+    write_features_to_file(part_features->after_part_none.f_terminate,
+                           AOM_EXT_PART_SIZE_TERM_NONE, 3);
+  } else if (part_features->id == AOM_EXT_PART_FEATURE_AFTER_SPLIT) {
+    write_features_to_file(part_features->after_part_split.f_terminate,
+                           AOM_EXT_PART_SIZE_TERM_SPLIT, 4);
+  } else if (part_features->id == AOM_EXT_PART_FEATURE_AFTER_SPLIT_PART2) {
+    write_features_to_file(part_features->after_part_split.f_prune_rect,
+                           AOM_EXT_PART_SIZE_PRUNE_RECT, 5);
+  } else if (part_features->id == AOM_EXT_PART_FEATURE_AFTER_RECT) {
+    write_features_to_file(part_features->after_part_rect.f,
+                           AOM_EXT_PART_SIZE_PRUNE_AB, 6);
+  } else if (part_features->id == AOM_EXT_PART_FEATURE_AFTER_AB) {
+    write_features_to_file(part_features->after_part_ab.f,
+                           AOM_EXT_PART_SIZE_PRUNE_4_WAY, 7);
+  }
+  return AOM_EXT_PART_TEST;
+}
+
+aom_ext_part_status_t ext_part_get_partition_decision(
+    aom_ext_part_model_t ext_part_model,
+    aom_partition_decision_t *ext_part_decision) {
+  (void)ext_part_model;
+  (void)ext_part_decision;
+  // Return an invalid decision such that the encoder doesn't take any
+  // partition decision from the ml model.
+  return AOM_EXT_PART_ERROR;
+}
+
+aom_ext_part_status_t ext_part_send_partition_stats(
+    aom_ext_part_model_t ext_part_model,
+    const aom_partition_stats_t *ext_part_stats) {
+  (void)ext_part_model;
+  (void)ext_part_stats;
+  return AOM_EXT_PART_OK;
+}
+
+aom_ext_part_status_t ext_part_delete_model(
+    aom_ext_part_model_t ext_part_model) {
+  ToyModel *toy_model = static_cast<ToyModel *>(ext_part_model);
+  EXPECT_EQ(toy_model->data->version, kVersion);
+  delete toy_model;
+  return AOM_EXT_PART_OK;
+}
+
+class ExternalPartitionTestDfsAPI
+    : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  ExternalPartitionTestDfsAPI()
+      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+        cpu_used_(GET_PARAM(2)), psnr_(0.0), nframes_(0) {}
+  virtual ~ExternalPartitionTestDfsAPI() {}
+
+  virtual void SetUp() {
+    InitializeConfig(encoding_mode_);
+    const aom_rational timebase = { 1, 30 };
+    cfg_.g_timebase = timebase;
+    cfg_.rc_end_usage = AOM_VBR;
+    cfg_.g_threads = 1;
+    cfg_.g_lag_in_frames = 4;
+    cfg_.rc_target_bitrate = 400;
+    init_flags_ = AOM_CODEC_USE_PSNR;
+  }
+
+  virtual bool DoDecode() const { return false; }
+
+  virtual void BeginPassHook(unsigned int) {
+    psnr_ = 0.0;
+    nframes_ = 0;
+  }
+
+  virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+    psnr_ += pkt->data.psnr.psnr[0];
+    nframes_++;
+  }
+
+  double GetAveragePsnr() const {
+    if (nframes_) return psnr_ / nframes_;
+    return 0.0;
+  }
+
+  void SetExternalPartition(bool use_external_partition) {
+    use_external_partition_ = use_external_partition;
+  }
+
+  void SetTestSendFeatures(int test_send_features) {
+    test_send_features_ = test_send_features;
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      aom_ext_part_funcs_t ext_part_funcs;
+      ext_part_funcs.priv = reinterpret_cast<void *>(&test_data_);
+      if (use_external_partition_) {
+        ext_part_funcs.create_model = ext_part_create_model;
+        ext_part_funcs.send_features = ext_part_send_features;
+      }
+      if (test_send_features_ == 1) {
+        ext_part_funcs.create_model = ext_part_create_model;
+        ext_part_funcs.send_features = ext_part_send_features_test;
+      } else if (test_send_features_ == 0) {
+        ext_part_funcs.create_model = ext_part_create_model_test;
+        ext_part_funcs.send_features = ext_part_send_features;
+      }
+      ext_part_funcs.get_partition_decision = ext_part_get_partition_decision;
+      ext_part_funcs.send_partition_stats = ext_part_send_partition_stats;
+      ext_part_funcs.delete_model = ext_part_delete_model;
+
+      encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+      encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+      if (use_external_partition_) {
+        encoder->Control(AV1E_SET_EXTERNAL_PARTITION, &ext_part_funcs);
+      }
+    }
+  }
+
+ private:
+  libaom_test::TestMode encoding_mode_;
+  int cpu_used_;
+  double psnr_;
+  unsigned int nframes_;
+  bool use_external_partition_ = false;
+  int test_send_features_ = -1;
+  TestData test_data_;
+};
+
+// Encode twice and expect the same psnr value.
+// The first run is the baseline without external partition.
+// The second run is to get partition decisions from the toy model we defined.
+// Here, we let the partition decision return invalid for all stages.
+// In this case, the external partition doesn't alter the original encoder
+// behavior. So we expect the same encoding results.
+TEST_P(ExternalPartitionTestDfsAPI, EncodeMatch) {
+  ::libaom_test::Y4mVideoSource video("paris_352_288_30.y4m", 0, kFrameNum);
+  SetExternalPartition(false);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  const double psnr = GetAveragePsnr();
+
+  SetExternalPartition(true);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  const double psnr2 = GetAveragePsnr();
+
+  EXPECT_DOUBLE_EQ(psnr, psnr2);
+}
+
+// Encode twice to compare generated feature files.
+// The first run let the encoder write partition features to file.
+// The second run calls send partition features function to send features to
+// the external model, and we write them to file.
+// The generated files should match each other.
+TEST_P(ExternalPartitionTestDfsAPI, SendFeatures) {
+  ::libaom_test::Y4mVideoSource video("paris_352_288_30.y4m", 0, kFrameNum);
+  SetExternalPartition(true);
+  SetTestSendFeatures(0);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  SetExternalPartition(true);
+  SetTestSendFeatures(1);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  if (!WRITE_FEATURE_TO_FILE) return;
+
+  // Compare feature files by reading them into strings.
+  for (int i = 0; i < 8; ++i) {
+    std::ifstream base_file(feature_file_names[i]);
+    ASSERT_TRUE(base_file.good());
+    std::stringstream base_stream;
+    base_stream << base_file.rdbuf();
+    std::string base_string = base_stream.str();
+
+    std::ifstream test_file(test_feature_file_names[i]);
+    ASSERT_TRUE(test_file.good());
+    std::stringstream test_stream;
+    test_stream << test_file.rdbuf();
+    std::string test_string = test_stream.str();
+
+    EXPECT_STREQ(base_string.c_str(), test_string.c_str());
+  }
+
+  // Remove files.
+  std::string command("rm -f feature_* test_feature_*");
+  system(command.c_str());
+}
+
+AV1_INSTANTIATE_TEST_SUITE(ExternalPartitionTestDfsAPI,
+                           ::testing::Values(::libaom_test::kTwoPassGood),
+                           ::testing::Values(4));  // cpu_used
+#endif  // CONFIG_PARTITION_SEARCH_ORDER
+
+}  // namespace
+#endif  // !CONFIG_REALTIME_ONLY
+#endif  // CONFIG_AV1_ENCODER
diff --git a/test/av1_fwd_txfm2d_test.cc b/test/av1_fwd_txfm2d_test.cc
index 0e7eb09..d124330 100644
--- a/test/av1_fwd_txfm2d_test.cc
+++ b/test/av1_fwd_txfm2d_test.cc
@@ -362,6 +362,78 @@
 TEST_P(AV1FwdTxfm2dTest, DISABLED_Speed) {
   AV1FwdTxfm2dSpeedTest(GET_PARAM(0), GET_PARAM(1));
 }
+TEST(AV1FwdTxfm2dTest, DCTScaleTest) {
+  BitDepthInfo bd_info;
+  bd_info.bit_depth = 8;
+  bd_info.use_highbitdepth_buf = 0;
+  DECLARE_ALIGNED(32, int16_t, src_diff[1024]);
+  DECLARE_ALIGNED(32, tran_low_t, coeff[1024]);
+
+  const TX_SIZE tx_size_list[4] = { TX_4X4, TX_8X8, TX_16X16, TX_32X32 };
+  const int stride_list[4] = { 4, 8, 16, 32 };
+  const int ref_scale_list[4] = { 64, 64, 64, 16 };
+
+  for (int i = 0; i < 4; i++) {
+    TX_SIZE tx_size = tx_size_list[i];
+    int stride = stride_list[i];
+    int array_size = stride * stride;
+
+    for (int j = 0; j < array_size; j++) {
+      src_diff[j] = 8;
+      coeff[j] = 0;
+    }
+
+    av1_quick_txfm(/*use_hadamard=*/0, tx_size, bd_info, src_diff, stride,
+                   coeff);
+
+    double input_sse = 0;
+    double output_sse = 0;
+    for (int j = 0; j < array_size; j++) {
+      input_sse += pow(src_diff[j], 2);
+      output_sse += pow(coeff[j], 2);
+    }
+
+    double scale = output_sse / input_sse;
+
+    EXPECT_NEAR(scale, ref_scale_list[i], 5);
+  }
+}
+TEST(AV1FwdTxfm2dTest, HadamardScaleTest) {
+  BitDepthInfo bd_info;
+  bd_info.bit_depth = 8;
+  bd_info.use_highbitdepth_buf = 0;
+  DECLARE_ALIGNED(32, int16_t, src_diff[1024]);
+  DECLARE_ALIGNED(32, tran_low_t, coeff[1024]);
+
+  const TX_SIZE tx_size_list[4] = { TX_4X4, TX_8X8, TX_16X16, TX_32X32 };
+  const int stride_list[4] = { 4, 8, 16, 32 };
+  const int ref_scale_list[4] = { 1, 64, 64, 16 };
+
+  for (int i = 0; i < 4; i++) {
+    TX_SIZE tx_size = tx_size_list[i];
+    int stride = stride_list[i];
+    int array_size = stride * stride;
+
+    for (int j = 0; j < array_size; j++) {
+      src_diff[j] = 8;
+      coeff[j] = 0;
+    }
+
+    av1_quick_txfm(/*use_hadamard=*/1, tx_size, bd_info, src_diff, stride,
+                   coeff);
+
+    double input_sse = 0;
+    double output_sse = 0;
+    for (int j = 0; j < array_size; j++) {
+      input_sse += pow(src_diff[j], 2);
+      output_sse += pow(coeff[j], 2);
+    }
+
+    double scale = output_sse / input_sse;
+
+    EXPECT_NEAR(scale, ref_scale_list[i], 5);
+  }
+}
 using ::testing::Combine;
 using ::testing::Values;
 using ::testing::ValuesIn;
@@ -580,8 +652,10 @@
 #if HAVE_SSE4_1
 static TX_SIZE Highbd_fwd_txfm_for_sse4_1[] = {
   TX_4X4,  TX_8X8,  TX_16X16, TX_32X32, TX_64X64, TX_4X8,   TX_8X4,
-  TX_8X16, TX_16X8, TX_16X32, TX_32X16, TX_32X64, TX_64X32, TX_4X16,
-  TX_16X4, TX_8X32, TX_32X8,  TX_16X64, TX_64X16,
+  TX_8X16, TX_16X8, TX_16X32, TX_32X16, TX_32X64, TX_64X32,
+#if !CONFIG_REALTIME_ONLY
+  TX_4X16, TX_16X4, TX_8X32,  TX_32X8,  TX_16X64, TX_64X16,
+#endif
 };
 
 INSTANTIATE_TEST_SUITE_P(SSE4_1, AV1HighbdFwdTxfm2dTest,
diff --git a/test/av1_highbd_iht_test.cc b/test/av1_highbd_iht_test.cc
index a576c0f..29d7f89 100644
--- a/test/av1_highbd_iht_test.cc
+++ b/test/av1_highbd_iht_test.cc
@@ -17,7 +17,6 @@
 
 #include "test/acm_random.h"
 #include "test/av1_txfm_test.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 #include "av1/common/enums.h"
@@ -94,7 +93,6 @@
     aom_free(coeffs_);
     aom_free(output_);
     aom_free(output_ref_);
-    libaom_test::ClearSystemState();
   }
 
  protected:
@@ -146,7 +144,7 @@
 
     txfm_ref_(input_, coeffs_, stride, tx_type_, bit_depth_);
     inv_txfm_ref_(coeffs_, output_ref_, stride, tx_type_, bit_depth_);
-    ASM_REGISTER_STATE_CHECK(
+    API_REGISTER_STATE_CHECK(
         inv_txfm_(coeffs_, output_, stride, tx_type_, bit_depth_));
 
     for (int j = 0; j < num_coeffs_; ++j) {
@@ -210,6 +208,11 @@
 void AV1HighbdInvTxfm2d::RunAV1InvTxfm2dTest(TX_TYPE tx_type_, TX_SIZE tx_size_,
                                              int run_times, int bit_depth_,
                                              int gt_int16) {
+#if CONFIG_REALTIME_ONLY
+  if (tx_size_ >= TX_4X16) {
+    return;
+  }
+#endif
   FwdTxfm2dFunc fwd_func_ = libaom_test::fwd_txfm_func_ls[tx_size_];
   TxfmParam txfm_param;
   const int BLK_WIDTH = 64;
diff --git a/test/av1_horz_only_frame_superres_test.cc b/test/av1_horz_only_frame_superres_test.cc
index 115fc84..2816226 100644
--- a/test/av1_horz_only_frame_superres_test.cc
+++ b/test/av1_horz_only_frame_superres_test.cc
@@ -20,7 +20,6 @@
 #include "av1/common/convolve.h"
 #include "av1/common/resize.h"
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 
@@ -164,7 +163,7 @@
  public:
   ConvolveHorizRSTestBase() : image_(NULL) {}
   virtual ~ConvolveHorizRSTestBase() {}
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
+  virtual void TearDown() {}
 
   // Implemented by subclasses (SetUp depends on the parameters passed
   // in and RunOne depends on the function to be tested. These can't
diff --git a/test/av1_k_means_test.cc b/test/av1_k_means_test.cc
index d55c692..ed0c6a6 100644
--- a/test/av1_k_means_test.cc
+++ b/test/av1_k_means_test.cc
@@ -23,7 +23,6 @@
 #include "aom_ports/mem.h"
 #include "test/acm_random.h"
 #include "av1/encoder/palette.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
@@ -86,7 +85,7 @@
   }
 }
 
-void AV1KmeansTest1::TearDown() { libaom_test::ClearSystemState(); }
+void AV1KmeansTest1::TearDown() {}
 
 void AV1KmeansTest1::RunCheckOutput(av1_calc_indices_dim1_func test_impl,
                                     BLOCK_SIZE bsize, int k) {
@@ -194,7 +193,7 @@
   }
 }
 
-void AV1KmeansTest2::TearDown() { libaom_test::ClearSystemState(); }
+void AV1KmeansTest2::TearDown() {}
 
 void AV1KmeansTest2::RunCheckOutput(av1_calc_indices_dim2_func test_impl,
                                     BLOCK_SIZE bsize, int k) {
diff --git a/test/av1_key_value_api_test.cc b/test/av1_key_value_api_test.cc
index 3d06d2d..058b8ce 100644
--- a/test/av1_key_value_api_test.cc
+++ b/test/av1_key_value_api_test.cc
@@ -29,10 +29,15 @@
 #if CONFIG_AV1_ENCODER
     aom_codec_iface_t *iface_cx = aom_codec_av1_cx();
     aom_codec_enc_cfg_t enc_cfg;
-
+#if CONFIG_REALTIME_ONLY
+    const int usage = 1;
+#else
+    const int usage = 0;
+#endif
     EXPECT_EQ(AOM_CODEC_OK,
-              aom_codec_enc_config_default(iface_cx, &enc_cfg, 0));
-    EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc_, iface_cx, &enc_cfg, 0));
+              aom_codec_enc_config_default(iface_cx, &enc_cfg, usage));
+    EXPECT_EQ(AOM_CODEC_OK,
+              aom_codec_enc_init(&enc_, iface_cx, &enc_cfg, usage));
 #endif
 #if CONFIG_AV1_DECODER
     aom_codec_iface_t *iface_dx = aom_codec_av1_dx();
diff --git a/test/av1_nn_predict_test.cc b/test/av1_nn_predict_test.cc
index 2a88027..7a3067d 100644
--- a/test/av1_nn_predict_test.cc
+++ b/test/av1_nn_predict_test.cc
@@ -22,7 +22,6 @@
 #include "test/util.h"
 #include "test/register_state_check.h"
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 
 namespace {
 typedef void (*NnPredict_Func)(const float *const input_nodes,
@@ -73,7 +72,6 @@
 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(NnPredictTest);
 
 void NnPredictTest::RunNnPredictTest(const NN_CONFIG *const shape) {
-  libaom_test::ClearSystemState();
   float inputs[NN_MAX_NODES_PER_LAYER] = { 0 };
   float outputs_test[NN_MAX_NODES_PER_LAYER] = { 0 };
   float outputs_ref[NN_MAX_NODES_PER_LAYER] = { 0 };
@@ -120,7 +118,6 @@
 
     av1_nn_predict_c(inputs, &nn_config, 0, outputs_ref);
     target_func_(inputs, &nn_config, 0, outputs_test);
-    libaom_test::ClearSystemState();
 
     for (int node = 0; node < shape->num_outputs; node++) {
       if (outputs_ref[node] < epsilon) {
@@ -140,7 +137,6 @@
 
 void NnPredictTest::RunNnPredictSpeedTest(const NN_CONFIG *const shape,
                                           const int run_times) {
-  libaom_test::ClearSystemState();
   float inputs[NN_MAX_NODES_PER_LAYER] = { 0 };
   float outputs_test[NN_MAX_NODES_PER_LAYER] = { 0 };
   float outputs_ref[NN_MAX_NODES_PER_LAYER] = { 0 };
@@ -167,7 +163,6 @@
     target_func_(inputs, &nn_config, 0, outputs_test);
   }
   aom_usec_timer_mark(&timer);
-  libaom_test::ClearSystemState();
   const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
 
   printf("%d", shape->num_inputs);
diff --git a/test/av1_quantize_test.cc b/test/av1_quantize_test.cc
index f0882c7..ce1311d 100644
--- a/test/av1_quantize_test.cc
+++ b/test/av1_quantize_test.cc
@@ -16,9 +16,9 @@
 #include "config/av1_rtcd.h"
 
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "av1/common/scan.h"
+#include "av1/encoder/av1_quantize.h"
 
 namespace {
 
@@ -97,7 +97,7 @@
                   quant_shift_ptr, ref_qcoeff_ptr, ref_dqcoeff_ptr, dequant_ptr,
                   &ref_eob, scanOrder.scan, scanOrder.iscan, log_scale);
 
-      ASM_REGISTER_STATE_CHECK(
+      API_REGISTER_STATE_CHECK(
           quanFunc(coeff_ptr, count, zbin_ptr, round_ptr, quant_ptr,
                    quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, &eob,
                    scanOrder.scan, scanOrder.iscan, log_scale));
@@ -174,7 +174,7 @@
                   quant_shift_ptr, ref_qcoeff_ptr, ref_dqcoeff_ptr, dequant_ptr,
                   &ref_eob, scanOrder.scan, scanOrder.iscan, log_scale);
 
-      ASM_REGISTER_STATE_CHECK(
+      API_REGISTER_STATE_CHECK(
           quanFunc(coeff_ptr, count, zbin_ptr, round_ptr, quant_ptr,
                    quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, &eob,
                    scanOrder.scan, scanOrder.iscan, log_scale));
@@ -185,7 +185,7 @@
 
   virtual void SetUp() { params_ = GetParam(); }
 
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
+  virtual void TearDown() {}
 
   virtual ~AV1QuantizeTest() {}
 
@@ -207,6 +207,32 @@
 TEST_P(AV1QuantizeTest, BitExactCheck) { RunQuantizeTest(); }
 TEST_P(AV1QuantizeTest, EobVerify) { RunEobTest(); }
 
+TEST(AV1QuantizeTest, QuantizeFpNoQmatrix) {
+  // Here we use a uniform quantizer as an example
+  const int16_t dequant_ptr[2] = { 78, 93 };  // quantize step
+  const int16_t round_ptr[2] = { 39, 46 };    // round ~= dequant / 2
+
+  // quant ~= 2^16 / dequant. This is a 16-bit fixed point representation of the
+  // inverse of quantize step.
+  const int16_t quant_ptr[2] = { 840, 704 };
+  int log_scale = 0;
+  int coeff_count = 4;
+  const tran_low_t coeff_ptr[4] = { -449, 624, -14, 24 };
+  const tran_low_t ref_qcoeff_ptr[4] = { -6, 7, 0, 0 };
+  const tran_low_t ref_dqcoeff_ptr[4] = { -468, 651, 0, 0 };
+  const int16_t scan[4] = { 0, 1, 2, 3 };
+  tran_low_t qcoeff_ptr[4];
+  tran_low_t dqcoeff_ptr[4];
+  int eob = av1_quantize_fp_no_qmatrix(quant_ptr, dequant_ptr, round_ptr,
+                                       log_scale, scan, coeff_count, coeff_ptr,
+                                       qcoeff_ptr, dqcoeff_ptr);
+  EXPECT_EQ(eob, 2);
+  for (int i = 0; i < coeff_count; ++i) {
+    EXPECT_EQ(qcoeff_ptr[i], ref_qcoeff_ptr[i]);
+    EXPECT_EQ(dqcoeff_ptr[i], ref_dqcoeff_ptr[i]);
+  }
+}
+
 #if HAVE_SSE4_1
 const QuantizeFuncParams qfps[4] = {
   QuantizeFuncParams(&av1_highbd_quantize_fp_sse4_1, &av1_highbd_quantize_fp_c,
diff --git a/test/av1_round_shift_array_test.cc b/test/av1_round_shift_array_test.cc
index 07f6b56..effac7e 100644
--- a/test/av1_round_shift_array_test.cc
+++ b/test/av1_round_shift_array_test.cc
@@ -20,7 +20,6 @@
 #include "aom_ports/aom_timer.h"
 #include "aom_ports/mem.h"
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/util.h"
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
@@ -43,7 +42,7 @@
   ~AV1CompRoundShiftTest();
 
   void SetUp() { rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed()); }
-  void TearDown() { libaom_test::ClearSystemState(); }
+  void TearDown() {}
 
  protected:
   void RunCheckOutput(comp_round_shift_array_func test_impl, BLOCK_SIZE bsize,
diff --git a/test/av1_softmax_test.cc b/test/av1_softmax_test.cc
new file mode 100644
index 0000000..8a9d820
--- /dev/null
+++ b/test/av1_softmax_test.cc
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tuple>
+
+#include "aom/aom_integer.h"
+#include "aom_ports/aom_timer.h"
+#include "av1/encoder/ml.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+using FastSoftmaxFn = void (*)(const float *const input, float *output);
+using FastSoftmaxTestParams = std::tuple<const FastSoftmaxFn, int>;
+
+// Error thresholds for functional equivalence
+constexpr float kRelEpsilon = 5e-2f;
+constexpr float kAbsEpsilon = 5e-3f;
+
+class FastSoftmaxTest : public ::testing::TestWithParam<FastSoftmaxTestParams> {
+ public:
+  FastSoftmaxTest()
+      : target_fn_{ GET_PARAM(0) }, num_classes_(GET_PARAM(1)),
+        ref_buf_(new float[num_classes_]()),
+        dst_buf_(new float[num_classes_]()), input_(new float[num_classes_]()) {
+  }
+  void RunSoftmaxTest();
+  void RunSoftmaxSpeedTest(const int run_times);
+  void FillInputBuf();
+
+ private:
+  const FastSoftmaxFn target_fn_;
+  const int num_classes_;
+  std::unique_ptr<float[]> ref_buf_, dst_buf_, input_;
+  libaom_test::ACMRandom rng_;
+};
+
+void FastSoftmaxTest::FillInputBuf() {
+  for (int idx = 0; idx < num_classes_; idx++) {
+    input_[idx] = ((float)rng_.Rand31() - (1 << 30)) / (1u << 30);
+  }
+}
+
+void FastSoftmaxTest::RunSoftmaxTest() {
+  av1_nn_softmax(input_.get(), ref_buf_.get(), num_classes_);
+  target_fn_(input_.get(), dst_buf_.get());
+
+  for (int idx = 0; idx < num_classes_; idx++) {
+    if (ref_buf_[idx] < kAbsEpsilon) {
+      ASSERT_LE(dst_buf_[idx], kAbsEpsilon)
+          << "Reference output was near-zero, test output was not" << std::endl;
+    } else {
+      const float error = dst_buf_[idx] - ref_buf_[idx];
+      const float relative_error = fabsf(error / ref_buf_[idx]);
+      ASSERT_LE(relative_error, kRelEpsilon)
+          << "Excessive relative error between reference and test output"
+          << std::endl;
+      ASSERT_LE(error, kAbsEpsilon)
+          << "Excessive absolute error between reference and test output"
+          << std::endl;
+    }
+  }
+}
+
+void FastSoftmaxTest::RunSoftmaxSpeedTest(const int run_times) {
+  aom_usec_timer timer;
+  aom_usec_timer_start(&timer);
+  for (int idx = 0; idx < run_times; idx++) {
+    target_fn_(input_.get(), dst_buf_.get());
+  }
+  aom_usec_timer_mark(&timer);
+  const int64_t time = aom_usec_timer_elapsed(&timer);
+  std::cout << "Test with " << num_classes_ << " classes took " << time
+            << " us." << std::endl;
+}
+
+TEST_P(FastSoftmaxTest, RandomValues) {
+  FillInputBuf();
+  RunSoftmaxTest();
+}
+
+TEST_P(FastSoftmaxTest, DISABLED_Speed) {
+  constexpr int kNumTimes = 1000000;
+  RunSoftmaxSpeedTest(kNumTimes);
+}
+
+void AnchorSoftmax16Fn(const float *input, float *output) {
+  av1_nn_softmax(input, output, 16);
+}
+
+const FastSoftmaxTestParams kArrayParams_c[] = {
+  FastSoftmaxTestParams(AnchorSoftmax16Fn, 16),
+  FastSoftmaxTestParams(av1_nn_fast_softmax_16_c, 16)
+};
+INSTANTIATE_TEST_SUITE_P(C, FastSoftmaxTest,
+                         ::testing::ValuesIn(kArrayParams_c));
+
+#if HAVE_SSE3 && !CONFIG_EXCLUDE_SIMD_MISMATCH
+INSTANTIATE_TEST_SUITE_P(
+    SSE3, FastSoftmaxTest,
+    ::testing::Values(FastSoftmaxTestParams(av1_nn_fast_softmax_16_sse3, 16)));
+#endif
+}  // namespace
diff --git a/test/av1_temporal_denoiser_test.cc b/test/av1_temporal_denoiser_test.cc
index 235a31c..571fd92 100644
--- a/test/av1_temporal_denoiser_test.cc
+++ b/test/av1_temporal_denoiser_test.cc
@@ -17,7 +17,6 @@
 #include "config/av1_rtcd.h"
 
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/util.h"
 #include "test/register_state_check.h"
 
@@ -48,7 +47,7 @@
 
   virtual void SetUp() { bs_ = GET_PARAM(1); }
 
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
+  virtual void TearDown() {}
 
  protected:
   BLOCK_SIZE bs_;
@@ -85,11 +84,11 @@
       mc_avg_block[j] = (temp < 0) ? 0 : ((temp > 255) ? 255 : temp);
     }
 
-    ASM_REGISTER_STATE_CHECK(
+    API_REGISTER_STATE_CHECK(
         av1_denoiser_filter_c(sig_block, 128, mc_avg_block, 128, avg_block_c,
                               128, 0, bs_, motion_magnitude_random));
 
-    ASM_REGISTER_STATE_CHECK(GET_PARAM(0)(sig_block, 128, mc_avg_block, 128,
+    API_REGISTER_STATE_CHECK(GET_PARAM(0)(sig_block, 128, mc_avg_block, 128,
                                           avg_block_sse2, 128, 0, bs_,
                                           motion_magnitude_random));
 
diff --git a/test/av1_wedge_utils_test.cc b/test/av1_wedge_utils_test.cc
index 69280b4..a51ce12 100644
--- a/test/av1_wedge_utils_test.cc
+++ b/test/av1_wedge_utils_test.cc
@@ -182,7 +182,7 @@
 
     const uint64_t ref_res = params_.ref_func(r1, d, m, N);
     uint64_t tst_res;
-    ASM_REGISTER_STATE_CHECK(tst_res = params_.tst_func(r1, d, m, N));
+    API_REGISTER_STATE_CHECK(tst_res = params_.tst_func(r1, d, m, N));
 
     ASSERT_EQ(ref_res, tst_res);
   }
@@ -212,7 +212,7 @@
 
     const uint64_t ref_res = params_.ref_func(r1, d, m, N);
     uint64_t tst_res;
-    ASM_REGISTER_STATE_CHECK(tst_res = params_.tst_func(r1, d, m, N));
+    API_REGISTER_STATE_CHECK(tst_res = params_.tst_func(r1, d, m, N));
 
     ASSERT_EQ(ref_res, tst_res);
   }
@@ -259,7 +259,7 @@
 
     const int ref_res = params_.ref_func(ds, m, N, limit);
     int tst_res;
-    ASM_REGISTER_STATE_CHECK(tst_res = params_.tst_func(ds, m, N, limit));
+    API_REGISTER_STATE_CHECK(tst_res = params_.tst_func(ds, m, N, limit));
 
     ASSERT_EQ(ref_res, tst_res);
   }
@@ -314,7 +314,7 @@
 
     const int ref_res = params_.ref_func(ds, m, N, limit);
     int tst_res;
-    ASM_REGISTER_STATE_CHECK(tst_res = params_.tst_func(ds, m, N, limit));
+    API_REGISTER_STATE_CHECK(tst_res = params_.tst_func(ds, m, N, limit));
 
     ASSERT_EQ(ref_res, tst_res);
   }
@@ -351,7 +351,7 @@
     memset(&d_tst, INT16_MAX, sizeof(d_tst));
 
     params_.ref_func(d_ref, a, b, N);
-    ASM_REGISTER_STATE_CHECK(params_.tst_func(d_tst, a, b, N));
+    API_REGISTER_STATE_CHECK(params_.tst_func(d_tst, a, b, N));
 
     for (int i = 0; i < MAX_SB_SQUARE; ++i) ASSERT_EQ(d_ref[i], d_tst[i]);
   }
diff --git a/test/avg_test.cc b/test/avg_test.cc
index f5c9212..11ddafc 100644
--- a/test/avg_test.cc
+++ b/test/avg_test.cc
@@ -16,7 +16,6 @@
 #include "config/aom_dsp_rtcd.h"
 
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 
@@ -34,7 +33,6 @@
   virtual void TearDown() {
     aom_free(source_data_);
     source_data_ = NULL;
-    libaom_test::ClearSystemState();
   }
 
  protected:
@@ -110,7 +108,7 @@
     }
 
     unsigned int actual;
-    ASM_REGISTER_STATE_CHECK(
+    API_REGISTER_STATE_CHECK(
         actual = GET_PARAM(4)(source_data_ + GET_PARAM(2), source_stride_));
 
     EXPECT_EQ(expected, actual);
@@ -173,8 +171,8 @@
   }
 
   void RunComparison() {
-    ASM_REGISTER_STATE_CHECK(c_func_(hbuf_c_, source_data_, 0, height_));
-    ASM_REGISTER_STATE_CHECK(asm_func_(hbuf_asm_, source_data_, 0, height_));
+    API_REGISTER_STATE_CHECK(c_func_(hbuf_c_, source_data_, 0, height_));
+    API_REGISTER_STATE_CHECK(asm_func_(hbuf_asm_, source_data_, 0, height_));
     EXPECT_EQ(0, memcmp(hbuf_c_, hbuf_asm_, sizeof(*hbuf_c_) * 16))
         << "Output mismatch\n";
   }
@@ -232,8 +230,8 @@
 
  protected:
   void RunComparison() {
-    ASM_REGISTER_STATE_CHECK(sum_c_ = c_func_(source_data_, width_));
-    ASM_REGISTER_STATE_CHECK(sum_asm_ = asm_func_(source_data_, width_));
+    API_REGISTER_STATE_CHECK(sum_c_ = c_func_(source_data_, width_));
+    API_REGISTER_STATE_CHECK(sum_asm_ = asm_func_(source_data_, width_));
     EXPECT_EQ(sum_c_, sum_asm_) << "Output mismatch";
   }
   void RunSpeedTest() {
@@ -338,7 +336,6 @@
     ref_vector = NULL;
     aom_free(src_vector);
     src_vector = NULL;
-    libaom_test::ClearSystemState();
   }
 
   void FillConstant(int16_t fill_constant_ref, int16_t fill_constant_src) {
@@ -524,25 +521,35 @@
 #endif
 
 typedef int (*SatdFunc)(const tran_low_t *coeffs, int length);
-typedef ::testing::tuple<int, SatdFunc, SatdFunc> SatdTestParam;
-class SatdTest : public ::testing::Test,
-                 public ::testing::WithParamInterface<SatdTestParam> {
- protected:
-  virtual void SetUp() {
-    satd_size_ = GET_PARAM(0);
-    satd_func_ref_ = GET_PARAM(1);
-    satd_func_simd_ = GET_PARAM(2);
+typedef int (*SatdLpFunc)(const int16_t *coeffs, int length);
 
+template <typename SatdFuncType>
+struct SatdTestParam {
+  SatdTestParam(int s, SatdFuncType f1, SatdFuncType f2)
+      : satd_size(s), func_ref(f1), func_simd(f2) {}
+  int satd_size;
+  SatdFuncType func_ref;
+  SatdFuncType func_simd;
+};
+
+template <typename CoeffType, typename SatdFuncType>
+class SatdTestBase
+    : public ::testing::Test,
+      public ::testing::WithParamInterface<SatdTestParam<SatdFuncType>> {
+ protected:
+  explicit SatdTestBase(const SatdTestParam<SatdFuncType> &func_param) {
+    satd_size_ = func_param.satd_size;
+    satd_func_ref_ = func_param.func_ref;
+    satd_func_simd_ = func_param.func_simd;
+  }
+  virtual void SetUp() {
     rnd_.Reset(ACMRandom::DeterministicSeed());
-    src_ = reinterpret_cast<tran_low_t *>(
+    src_ = reinterpret_cast<CoeffType *>(
         aom_memalign(32, sizeof(*src_) * satd_size_));
     ASSERT_TRUE(src_ != NULL);
   }
-  virtual void TearDown() {
-    libaom_test::ClearSystemState();
-    aom_free(src_);
-  }
-  void FillConstant(const tran_low_t val) {
+  virtual void TearDown() { aom_free(src_); }
+  void FillConstant(const CoeffType val) {
     for (int i = 0; i < satd_size_; ++i) src_[i] = val;
   }
   void FillRandom() {
@@ -552,19 +559,19 @@
   }
   void Check(int expected) {
     int total_ref;
-    ASM_REGISTER_STATE_CHECK(total_ref = satd_func_ref_(src_, satd_size_));
+    API_REGISTER_STATE_CHECK(total_ref = satd_func_ref_(src_, satd_size_));
     EXPECT_EQ(expected, total_ref);
 
     int total_simd;
-    ASM_REGISTER_STATE_CHECK(total_simd = satd_func_simd_(src_, satd_size_));
+    API_REGISTER_STATE_CHECK(total_simd = satd_func_simd_(src_, satd_size_));
     EXPECT_EQ(expected, total_simd);
   }
   void RunComparison() {
     int total_ref;
-    ASM_REGISTER_STATE_CHECK(total_ref = satd_func_ref_(src_, satd_size_));
+    API_REGISTER_STATE_CHECK(total_ref = satd_func_ref_(src_, satd_size_));
 
     int total_simd;
-    ASM_REGISTER_STATE_CHECK(total_simd = satd_func_simd_(src_, satd_size_));
+    API_REGISTER_STATE_CHECK(total_simd = satd_func_simd_(src_, satd_size_));
 
     EXPECT_EQ(total_ref, total_simd);
   }
@@ -603,12 +610,17 @@
   int satd_size_;
 
  private:
-  tran_low_t *src_;
-  SatdFunc satd_func_ref_;
-  SatdFunc satd_func_simd_;
+  CoeffType *src_;
+  SatdFuncType satd_func_ref_;
+  SatdFuncType satd_func_simd_;
   ACMRandom rnd_;
 };
 
+class SatdTest : public SatdTestBase<tran_low_t, SatdFunc> {
+ public:
+  SatdTest() : SatdTestBase(GetParam()) {}
+};
+
 TEST_P(SatdTest, MinValue) {
   const int kMin = -32640;
   const int expected = -kMin * satd_size_;
@@ -645,13 +657,21 @@
 }
 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(SatdTest);
 
+INSTANTIATE_TEST_SUITE_P(
+    C, SatdTest,
+    ::testing::Values(SatdTestParam<SatdFunc>(16, &aom_satd_c, &aom_satd_c),
+                      SatdTestParam<SatdFunc>(64, &aom_satd_c, &aom_satd_c),
+                      SatdTestParam<SatdFunc>(256, &aom_satd_c, &aom_satd_c),
+                      SatdTestParam<SatdFunc>(1024, &aom_satd_c, &aom_satd_c)));
+
 #if HAVE_NEON
 INSTANTIATE_TEST_SUITE_P(
     NEON, SatdTest,
-    ::testing::Values(make_tuple(16, &aom_satd_c, &aom_satd_neon),
-                      make_tuple(64, &aom_satd_c, &aom_satd_neon),
-                      make_tuple(256, &aom_satd_c, &aom_satd_neon),
-                      make_tuple(1024, &aom_satd_c, &aom_satd_neon)));
+    ::testing::Values(SatdTestParam<SatdFunc>(16, &aom_satd_c, &aom_satd_neon),
+                      SatdTestParam<SatdFunc>(64, &aom_satd_c, &aom_satd_neon),
+                      SatdTestParam<SatdFunc>(256, &aom_satd_c, &aom_satd_neon),
+                      SatdTestParam<SatdFunc>(1024, &aom_satd_c,
+                                              &aom_satd_neon)));
 INSTANTIATE_TEST_SUITE_P(
     NEON, VectorVarTest,
     ::testing::Values(make_tuple(2, &aom_vector_var_c, &aom_vector_var_neon),
@@ -660,4 +680,104 @@
                       make_tuple(5, &aom_vector_var_c, &aom_vector_var_neon)));
 #endif
 
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, SatdTest,
+    ::testing::Values(SatdTestParam<SatdFunc>(16, &aom_satd_c, &aom_satd_avx2),
+                      SatdTestParam<SatdFunc>(64, &aom_satd_c, &aom_satd_avx2),
+                      SatdTestParam<SatdFunc>(256, &aom_satd_c, &aom_satd_avx2),
+                      SatdTestParam<SatdFunc>(1024, &aom_satd_c,
+                                              &aom_satd_avx2)));
+#endif
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, SatdTest,
+    ::testing::Values(SatdTestParam<SatdFunc>(16, &aom_satd_c, &aom_satd_sse2),
+                      SatdTestParam<SatdFunc>(64, &aom_satd_c, &aom_satd_sse2),
+                      SatdTestParam<SatdFunc>(256, &aom_satd_c, &aom_satd_sse2),
+                      SatdTestParam<SatdFunc>(1024, &aom_satd_c,
+                                              &aom_satd_sse2)));
+#endif
+
+class SatdLpTest : public SatdTestBase<int16_t, SatdLpFunc> {
+ public:
+  SatdLpTest() : SatdTestBase(GetParam()) {}
+};
+
+TEST_P(SatdLpTest, MinValue) {
+  const int kMin = -32640;
+  const int expected = -kMin * satd_size_;
+  FillConstant(kMin);
+  Check(expected);
+}
+TEST_P(SatdLpTest, MaxValue) {
+  const int kMax = 32640;
+  const int expected = kMax * satd_size_;
+  FillConstant(kMax);
+  Check(expected);
+}
+TEST_P(SatdLpTest, Random) {
+  int expected;
+  switch (satd_size_) {
+    case 16: expected = 205298; break;
+    case 64: expected = 1113950; break;
+    case 256: expected = 4268415; break;
+    case 1024: expected = 16954082; break;
+    default:
+      FAIL() << "Invalid satd size (" << satd_size_
+             << ") valid: 16/64/256/1024";
+  }
+  FillRandom();
+  Check(expected);
+}
+TEST_P(SatdLpTest, Match) {
+  FillRandom();
+  RunComparison();
+}
+TEST_P(SatdLpTest, DISABLED_Speed) {
+  FillRandom();
+  RunSpeedTest();
+}
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(SatdLpTest);
+
+// Add the following c test to avoid gtest uninitialized warning.
+INSTANTIATE_TEST_SUITE_P(
+    C, SatdLpTest,
+    ::testing::Values(
+        SatdTestParam<SatdLpFunc>(16, &aom_satd_lp_c, &aom_satd_lp_c),
+        SatdTestParam<SatdLpFunc>(64, &aom_satd_lp_c, &aom_satd_lp_c),
+        SatdTestParam<SatdLpFunc>(256, &aom_satd_lp_c, &aom_satd_lp_c),
+        SatdTestParam<SatdLpFunc>(1024, &aom_satd_lp_c, &aom_satd_lp_c)));
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, SatdLpTest,
+    ::testing::Values(
+        SatdTestParam<SatdLpFunc>(16, &aom_satd_lp_c, &aom_satd_lp_neon),
+        SatdTestParam<SatdLpFunc>(64, &aom_satd_lp_c, &aom_satd_lp_neon),
+        SatdTestParam<SatdLpFunc>(256, &aom_satd_lp_c, &aom_satd_lp_neon),
+        SatdTestParam<SatdLpFunc>(1024, &aom_satd_lp_c, &aom_satd_lp_neon)));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, SatdLpTest,
+    ::testing::Values(
+        SatdTestParam<SatdLpFunc>(16, &aom_satd_lp_c, &aom_satd_lp_avx2),
+        SatdTestParam<SatdLpFunc>(64, &aom_satd_lp_c, &aom_satd_lp_avx2),
+        SatdTestParam<SatdLpFunc>(256, &aom_satd_lp_c, &aom_satd_lp_avx2),
+        SatdTestParam<SatdLpFunc>(1024, &aom_satd_lp_c, &aom_satd_lp_avx2)));
+#endif
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, SatdLpTest,
+    ::testing::Values(
+        SatdTestParam<SatdLpFunc>(16, &aom_satd_lp_c, &aom_satd_lp_sse2),
+        SatdTestParam<SatdLpFunc>(64, &aom_satd_lp_c, &aom_satd_lp_sse2),
+        SatdTestParam<SatdLpFunc>(256, &aom_satd_lp_c, &aom_satd_lp_sse2),
+        SatdTestParam<SatdLpFunc>(1024, &aom_satd_lp_c, &aom_satd_lp_sse2)));
+#endif
+
 }  // namespace
diff --git a/test/best_encode.sh b/test/best_encode.sh
index fe31a01..d29fdae 100755
--- a/test/best_encode.sh
+++ b/test/best_encode.sh
@@ -29,7 +29,7 @@
     -p 2 \
     --pass=2 \
     --fpf=$f.fpf \
-    --best \
+    --good \
     --cpu-used=0 \
     --target-bitrate=$b \
     --auto-alt-ref=1 \
@@ -48,8 +48,7 @@
     --maxsection-pct=800 \
     --psnr \
     --arnr-maxframes=7 \
-    --arnr-strength=3 \
-    --arnr-type=3
+    --arnr-strength=3
 else
   # No first-pass file found, do 2-pass encode
   aomenc \
@@ -58,7 +57,7 @@
     -p 2 \
     --pass=1 \
     --fpf=$f.fpf \
-    --best \
+    --good \
     --cpu-used=0 \
     --target-bitrate=$b \
     --auto-alt-ref=1 \
@@ -79,7 +78,7 @@
     -p 2 \
     --pass=2 \
     --fpf=$f.fpf \
-    --best \
+    --good \
     --cpu-used=0 \
     --target-bitrate=$b \
     --auto-alt-ref=1 \
@@ -98,6 +97,5 @@
     --maxsection-pct=800 \
     --psnr \
     --arnr-maxframes=7 \
-    --arnr-strength=3 \
-    --arnr-type=3
+    --arnr-strength=3
 fi
diff --git a/test/blend_a64_mask_1d_test.cc b/test/blend_a64_mask_1d_test.cc
index 1b6350c..9a95987 100644
--- a/test/blend_a64_mask_1d_test.cc
+++ b/test/blend_a64_mask_1d_test.cc
@@ -125,7 +125,7 @@
     params_.ref_func(dst_ref_ + dst_offset_, dst_stride_, p_src0 + src0_offset_,
                      src0_stride_, p_src1 + src1_offset_, src1_stride_, mask_,
                      w_, h_);
-    ASM_REGISTER_STATE_CHECK(params_.tst_func(
+    API_REGISTER_STATE_CHECK(params_.tst_func(
         dst_tst_ + dst_offset_, dst_stride_, p_src0 + src0_offset_,
         src0_stride_, p_src1 + src1_offset_, src1_stride_, mask_, w_, h_));
   }
@@ -232,7 +232,7 @@
                      CONVERT_TO_BYTEPTR(p_src0 + src0_offset_), src0_stride_,
                      CONVERT_TO_BYTEPTR(p_src1 + src1_offset_), src1_stride_,
                      mask_, w_, h_, bit_depth_);
-    ASM_REGISTER_STATE_CHECK(params_.tst_func(
+    API_REGISTER_STATE_CHECK(params_.tst_func(
         CONVERT_TO_BYTEPTR(dst_tst_ + dst_offset_), dst_stride_,
         CONVERT_TO_BYTEPTR(p_src0 + src0_offset_), src0_stride_,
         CONVERT_TO_BYTEPTR(p_src1 + src1_offset_), src1_stride_, mask_, w_, h_,
diff --git a/test/block_test.cc b/test/block_test.cc
index 9cf5b02..74deee3 100644
--- a/test/block_test.cc
+++ b/test/block_test.cc
@@ -191,9 +191,17 @@
       << "Failed for SB size " << superblock_size_;
 }
 
+const ::libaom_test::TestMode kTestModes[] = {
+#if CONFIG_REALTIME_ONLY
+  ::libaom_test::kRealTime
+#else
+  ::libaom_test::kRealTime, ::libaom_test::kOnePassGood,
+  ::libaom_test::kTwoPassGood
+#endif
+};
+
 AV1_INSTANTIATE_TEST_SUITE(SuperBlockSizeTestLarge,
-                           ::testing::Values(::libaom_test::kOnePassGood,
-                                             ::libaom_test::kTwoPassGood),
+                           ::testing::ValuesIn(kTestModes),
                            ::testing::Values(AOM_SUPERBLOCK_SIZE_64X64,
                                              AOM_SUPERBLOCK_SIZE_128X128),
                            ::testing::Values(AOM_Q, AOM_VBR, AOM_CBR, AOM_CQ));
diff --git a/test/cdef_test.cc b/test/cdef_test.cc
index 14fa12b..a0e3bcb 100644
--- a/test/cdef_test.cc
+++ b/test/cdef_test.cc
@@ -21,7 +21,6 @@
 #include "aom_ports/aom_timer.h"
 #include "av1/common/cdef_block.h"
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 
@@ -44,7 +43,7 @@
     depth = GET_PARAM(4);
   }
 
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
+  virtual void TearDown() {}
 
  protected:
   int bsize;
@@ -123,7 +122,7 @@
                   // If cdef and ref_cdef are the same, we're just testing
                   // speed
                   if (cdef != ref_cdef)
-                    ASM_REGISTER_STATE_CHECK(
+                    API_REGISTER_STATE_CHECK(
                         cdef(depth == 8 ? (uint8_t *)d : 0, d, size,
                              s + CDEF_HBORDER + CDEF_VBORDER * CDEF_BSTRIDE,
                              pristrength, secstrength, dir, pridamping,
@@ -198,7 +197,7 @@
     ref_finddir = GET_PARAM(1);
   }
 
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
+  virtual void TearDown() {}
 
  protected:
   find_dir_t finddir;
@@ -233,7 +232,7 @@
           for (int c = 0; c < 1 + 9 * (finddir == ref_finddir); c++)
             ref_res = ref_finddir(s, size, &ref_var, depth - 8);
           if (finddir != ref_finddir)
-            ASM_REGISTER_STATE_CHECK(res = finddir(s, size, &var, depth - 8));
+            API_REGISTER_STATE_CHECK(res = finddir(s, size, &var, depth - 8));
           if (ref_finddir != finddir) {
             if (res != ref_res || var != ref_var) error = 1;
             errdepth = depth;
diff --git a/test/clear_system_state.h b/test/clear_system_state.h
deleted file mode 100644
index d38ff5d..0000000
--- a/test/clear_system_state.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-#ifndef AOM_TEST_CLEAR_SYSTEM_STATE_H_
-#define AOM_TEST_CLEAR_SYSTEM_STATE_H_
-
-#include "config/aom_config.h"
-
-#if ARCH_X86 || ARCH_X86_64
-#include "aom_ports/x86.h"
-#endif
-
-namespace libaom_test {
-
-// Reset system to a known state. This function should be used for all non-API
-// test cases.
-inline void ClearSystemState() {
-#if ARCH_X86 || ARCH_X86_64
-  aom_reset_mmx_state();
-#endif
-}
-
-}  // namespace libaom_test
-#endif  // AOM_TEST_CLEAR_SYSTEM_STATE_H_
diff --git a/test/coding_path_sync.cc b/test/coding_path_sync.cc
index 4c613dc..0eaa9da 100644
--- a/test/coding_path_sync.cc
+++ b/test/coding_path_sync.cc
@@ -31,7 +31,11 @@
     aom_codec_iface_t *algo = aom_codec_av1_cx();
 
     aom_codec_enc_cfg_t cfg;
+#if CONFIG_REALTIME_ONLY
+    aom_codec_enc_config_default(algo, &cfg, 1);
+#else
     aom_codec_enc_config_default(algo, &cfg, 0);
+#endif
 
     // force the quantizer, to reduce the sensitivity on encoding choices.
     // e.g, we don't want this test to break when the rate control is modified.
diff --git a/test/comp_avg_pred_test.h b/test/comp_avg_pred_test.h
index 7f73312..312f3d4 100644
--- a/test/comp_avg_pred_test.h
+++ b/test/comp_avg_pred_test.h
@@ -15,11 +15,11 @@
 #include <tuple>
 
 #include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 #include "test/acm_random.h"
 #include "test/util.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "av1/common/common_data.h"
 #include "aom_ports/aom_timer.h"
@@ -92,7 +92,6 @@
  public:
   ~AV1DISTWTDCOMPAVGTest() {}
   void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
-  void TearDown() { libaom_test::ClearSystemState(); }
 
  protected:
   void RunCheckOutput(distwtdcompavg_func test_impl) {
@@ -117,8 +116,8 @@
 
     for (int ii = 0; ii < 2; ii++) {
       for (int jj = 0; jj < 4; jj++) {
-        dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
-        dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
+        dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[jj][ii];
+        dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[jj][1 - ii];
 
         const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7);
         const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7);
@@ -160,8 +159,8 @@
     DIST_WTD_COMP_PARAMS dist_wtd_comp_params;
     dist_wtd_comp_params.use_dist_wtd_comp_avg = 1;
 
-    dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0][0];
-    dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][0][1];
+    dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0];
+    dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][1];
 
     const int num_loops = 1000000000 / (in_w + in_h);
     aom_usec_timer timer;
@@ -196,7 +195,6 @@
  public:
   ~AV1DISTWTDCOMPAVGUPSAMPLEDTest() {}
   void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
-  void TearDown() { libaom_test::ClearSystemState(); }
 
  protected:
   void RunCheckOutput(distwtdcompavgupsampled_func test_impl) {
@@ -226,10 +224,9 @@
         for (sub_y_q3 = 0; sub_y_q3 < 8; ++sub_y_q3) {
           for (int ii = 0; ii < 2; ii++) {
             for (int jj = 0; jj < 4; jj++) {
-              dist_wtd_comp_params.fwd_offset =
-                  quant_dist_lookup_table[ii][jj][0];
+              dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[jj][ii];
               dist_wtd_comp_params.bck_offset =
-                  quant_dist_lookup_table[ii][jj][1];
+                  quant_dist_lookup_table[jj][1 - ii];
 
               const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7);
               const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7);
@@ -282,8 +279,8 @@
     DIST_WTD_COMP_PARAMS dist_wtd_comp_params;
     dist_wtd_comp_params.use_dist_wtd_comp_avg = 1;
 
-    dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0][0];
-    dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][0][1];
+    dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0];
+    dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][1];
 
     int sub_x_q3 = 0;
     int sub_y_q3 = 0;
@@ -326,8 +323,6 @@
   ~AV1HighBDDISTWTDCOMPAVGTest() {}
   void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
 
-  void TearDown() { libaom_test::ClearSystemState(); }
-
  protected:
   void RunCheckOutput(distwtdcompavg_func test_impl) {
     const int w = kMaxSize, h = kMaxSize;
@@ -351,8 +346,8 @@
 
     for (int ii = 0; ii < 2; ii++) {
       for (int jj = 0; jj < 4; jj++) {
-        dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
-        dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
+        dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[jj][ii];
+        dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[jj][1 - ii];
 
         const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7);
         const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7);
@@ -398,8 +393,8 @@
     DIST_WTD_COMP_PARAMS dist_wtd_comp_params;
     dist_wtd_comp_params.use_dist_wtd_comp_avg = 1;
 
-    dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0][0];
-    dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][0][1];
+    dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0];
+    dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][1];
 
     const int num_loops = 1000000000 / (in_w + in_h);
     aom_usec_timer timer;
@@ -436,7 +431,6 @@
  public:
   ~AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest() {}
   void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
-  void TearDown() { libaom_test::ClearSystemState(); }
 
  protected:
   void RunCheckOutput(highbddistwtdcompavgupsampled_func test_impl) {
@@ -466,10 +460,9 @@
         for (sub_y_q3 = 0; sub_y_q3 < 8; ++sub_y_q3) {
           for (int ii = 0; ii < 2; ii++) {
             for (int jj = 0; jj < 4; jj++) {
-              dist_wtd_comp_params.fwd_offset =
-                  quant_dist_lookup_table[ii][jj][0];
+              dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[jj][ii];
               dist_wtd_comp_params.bck_offset =
-                  quant_dist_lookup_table[ii][jj][1];
+                  quant_dist_lookup_table[jj][1 - ii];
 
               const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7);
               const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7);
@@ -524,8 +517,8 @@
     DIST_WTD_COMP_PARAMS dist_wtd_comp_params;
     dist_wtd_comp_params.use_dist_wtd_comp_avg = 1;
 
-    dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0][0];
-    dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][0][1];
+    dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0];
+    dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][1];
     int sub_x_q3 = 0;
     int sub_y_q3 = 0;
     const int num_loops = 1000000000 / (in_w + in_h);
diff --git a/test/comp_mask_variance_test.cc b/test/comp_mask_variance_test.cc
index b8ee74c..6b18194 100644
--- a/test/comp_mask_variance_test.cc
+++ b/test/comp_mask_variance_test.cc
@@ -23,8 +23,8 @@
 #include "aom_ports/aom_timer.h"
 #include "aom_ports/mem.h"
 #include "av1/common/reconinter.h"
+#include "av1/encoder/reconinter_enc.h"
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
@@ -102,7 +102,6 @@
   aom_free(comp_pred2_);
   aom_free(pred_);
   aom_free(ref_buffer_);
-  libaom_test::ClearSystemState();
 }
 
 void AV1CompMaskVarianceTest::RunCheckOutput(comp_mask_pred_func test_impl,
@@ -344,7 +343,6 @@
   aom_free(comp_pred2_);
   aom_free(pred_);
   aom_free(ref_buffer_);
-  libaom_test::ClearSystemState();
 }
 
 void AV1HighbdCompMaskVarianceTest::RunCheckOutput(
diff --git a/test/convolve_round_test.cc b/test/convolve_round_test.cc
index 4f17b54..ec9fd6d 100644
--- a/test/convolve_round_test.cc
+++ b/test/convolve_round_test.cc
@@ -17,7 +17,6 @@
 #include "aom/aom_integer.h"
 #include "aom_ports/aom_timer.h"
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
@@ -114,7 +113,7 @@
       GenerateBufferWithRandom(src_, src_stride, bits, w, h);
 
       func_ref_(src_, src_stride, dst_ref, dst_stride, w, h, bits);
-      ASM_REGISTER_STATE_CHECK(
+      API_REGISTER_STATE_CHECK(
           func_(src_, src_stride, dst, dst_stride, w, h, bits));
 
       if (data_path_ == LOWBITDEPTH_TEST) {
diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index f4f8f39..097a826 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -24,7 +24,6 @@
 #include "aom_ports/mem.h"
 #include "av1/common/filter.h"
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 
@@ -290,7 +289,7 @@
         aom_memalign(kDataAlignment, (kOutputBufferSize) * sizeof(uint16_t)));
   }
 
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
+  virtual void TearDown() {}
 
   static void TearDownTestSuite() {
     aom_free(input_ - 1);
@@ -536,11 +535,11 @@
           if (filter_x && filter_y)
             continue;
           else if (filter_y)
-            ASM_REGISTER_STATE_CHECK(
+            API_REGISTER_STATE_CHECK(
                 UUT_->v8_(in, kInputStride, out, kOutputStride, kInvalidFilter,
                           16, filters[filter_y], 16, Width(), Height()));
           else if (filter_x)
-            ASM_REGISTER_STATE_CHECK(UUT_->h8_(
+            API_REGISTER_STATE_CHECK(UUT_->h8_(
                 in, kInputStride, out, kOutputStride, filters[filter_x], 16,
                 kInvalidFilter, 16, Width(), Height()));
           else
@@ -618,11 +617,11 @@
               if (filter_x && filter_y)
                 continue;
               else if (filter_y)
-                ASM_REGISTER_STATE_CHECK(UUT_->v8_(
+                API_REGISTER_STATE_CHECK(UUT_->v8_(
                     in, kInputStride, out, kOutputStride, kInvalidFilter, 16,
                     filters[filter_y], 16, Width(), Height()));
               else if (filter_x)
-                ASM_REGISTER_STATE_CHECK(UUT_->h8_(
+                API_REGISTER_STATE_CHECK(UUT_->h8_(
                     in, kInputStride, out, kOutputStride, filters[filter_x], 16,
                     kInvalidFilter, 16, Width(), Height()));
               else
@@ -688,11 +687,11 @@
         for (int filter_y = 0; filter_y < kNumFilters; ++filter_y) {
           if (filter_x && filter_y) continue;
           if (filter_y)
-            ASM_REGISTER_STATE_CHECK(
+            API_REGISTER_STATE_CHECK(
                 UUT_->v8_(in, kInputStride, out, kOutputStride, kInvalidFilter,
                           16, filters[filter_y], 16, Width(), Height()));
           else if (filter_x)
-            ASM_REGISTER_STATE_CHECK(UUT_->h8_(
+            API_REGISTER_STATE_CHECK(UUT_->h8_(
                 in, kInputStride, out, kOutputStride, filters[filter_x], 16,
                 kInvalidFilter, 16, Width(), Height()));
         }
diff --git a/test/corner_match_test.cc b/test/corner_match_test.cc
index 9c3a2b9..dc82055 100644
--- a/test/corner_match_test.cc
+++ b/test/corner_match_test.cc
@@ -15,7 +15,6 @@
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 #include "test/acm_random.h"
 #include "test/util.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 
 #include "av1/encoder/corner_match.h"
@@ -54,7 +53,7 @@
   rnd_.Reset(ACMRandom::DeterministicSeed());
   target_func = GET_PARAM(1);
 }
-void AV1CornerMatchTest::TearDown() { libaom_test::ClearSystemState(); }
+void AV1CornerMatchTest::TearDown() {}
 
 void AV1CornerMatchTest::RunCheckOutput(int run_times) {
   const int w = 128, h = 128;
diff --git a/test/cpu_used_firstpass_test.cc b/test/cpu_used_firstpass_test.cc
new file mode 100644
index 0000000..c970c19
--- /dev/null
+++ b/test/cpu_used_firstpass_test.cc
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+namespace {
+
+const double kPsnrDiffThreshold = 0.1;
+const int kFirstPassCpuUsed[] = { 2, 4, 6 };
+
+class CpuUsedFirstpassTest : public ::libaom_test::CodecTestWithParam<int>,
+                             public ::libaom_test::EncoderTest {
+ protected:
+  CpuUsedFirstpassTest()
+      : EncoderTest(GET_PARAM(0)), second_pass_cpu_used_(GET_PARAM(1)) {}
+  virtual ~CpuUsedFirstpassTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig(::libaom_test::kTwoPassGood);
+    const aom_rational timebase = { 1, 30 };
+    cfg_.g_timebase = timebase;
+    cfg_.rc_end_usage = AOM_VBR;
+    cfg_.rc_target_bitrate = 1000;
+    cfg_.g_lag_in_frames = 19;
+    cfg_.g_threads = 0;
+    init_flags_ = AOM_CODEC_USE_PSNR;
+  }
+
+  virtual void BeginPassHook(unsigned int pass) {
+    psnr_ = 0.0;
+    nframes_ = 0;
+
+    if (pass == 0)
+      cpu_used_ = first_pass_cpu_used_;
+    else
+      cpu_used_ = second_pass_cpu_used_;
+  }
+
+  virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+    psnr_ += pkt->data.psnr.psnr[0];
+    nframes_++;
+  }
+
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+      encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+      encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+      encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+    }
+  }
+
+  double GetAveragePsnr() const {
+    if (nframes_) return psnr_ / nframes_;
+    return 0.0;
+  }
+
+  double GetPsnrDiffThreshold() { return kPsnrDiffThreshold; }
+
+  void DoTest() {
+    libaom_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480,
+                                       cfg_.g_timebase.den, cfg_.g_timebase.num,
+                                       0, 30);
+    const int size = sizeof(kFirstPassCpuUsed) / sizeof(kFirstPassCpuUsed[0]);
+    double ref_psnr;
+    double psnr_diff;
+
+    first_pass_cpu_used_ = second_pass_cpu_used_;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));  // same preset case ref_psnr
+    ref_psnr = GetAveragePsnr();
+
+    for (int i = 0; i < size; i++) {
+      first_pass_cpu_used_ = kFirstPassCpuUsed[i];
+      if (first_pass_cpu_used_ == second_pass_cpu_used_) continue;
+      ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+      psnr_diff = abs(ref_psnr - GetAveragePsnr());
+      EXPECT_LT(psnr_diff, GetPsnrDiffThreshold())
+          << "first pass cpu used = " << first_pass_cpu_used_
+          << ", second pass cpu used = " << second_pass_cpu_used_;
+    }
+  }
+
+  int cpu_used_;
+  int first_pass_cpu_used_;
+  int second_pass_cpu_used_;
+  unsigned int nframes_;
+  double psnr_;
+};
+
+TEST_P(CpuUsedFirstpassTest, FirstPassTest) { DoTest(); }
+
+class CpuUsedFirstpassTestLarge : public CpuUsedFirstpassTest {};
+
+TEST_P(CpuUsedFirstpassTestLarge, FirstPassTest) { DoTest(); }
+
+AV1_INSTANTIATE_TEST_SUITE(CpuUsedFirstpassTestLarge,
+                           ::testing::Values(2));  // cpu_used
+
+AV1_INSTANTIATE_TEST_SUITE(CpuUsedFirstpassTest,
+                           ::testing::Values(4, 6));  // cpu_used
+}  // namespace
diff --git a/test/datarate_test.cc b/test/datarate_test.cc
index 2ff074f..71f8b0f 100644
--- a/test/datarate_test.cc
+++ b/test/datarate_test.cc
@@ -57,7 +57,9 @@
     ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
     ASSERT_GE(effective_datarate_, cfg_.rc_target_bitrate * 0.7)
         << " The datarate for the file is lower than target by too much!";
-    ASSERT_LE(effective_datarate_, cfg_.rc_target_bitrate * 1.4)
+    // FIXME(jingning): Lower this test threshold after vbr mode can render
+    // sufficiently accurate bit rate.
+    ASSERT_LE(effective_datarate_, cfg_.rc_target_bitrate * 1.45)
         << " The datarate for the file is greater than target by too much!";
   }
 
diff --git a/test/datarate_test.h b/test/datarate_test.h
index 0396034..1b0d515 100644
--- a/test/datarate_test.h
+++ b/test/datarate_test.h
@@ -63,6 +63,7 @@
         encoder->Control(AV1E_SET_COEFF_COST_UPD_FREQ, 2);
         encoder->Control(AV1E_SET_MODE_COST_UPD_FREQ, 2);
         encoder->Control(AV1E_SET_MV_COST_UPD_FREQ, 2);
+        encoder->Control(AV1E_SET_DV_COST_UPD_FREQ, 2);
       }
     }
 
diff --git a/test/decode_scalability_test.cc b/test/decode_scalability_test.cc
index 6dea8bf..4817beb 100644
--- a/test/decode_scalability_test.cc
+++ b/test/decode_scalability_test.cc
@@ -10,7 +10,6 @@
  */
 
 #include <ostream>
-#include <string>
 
 #include "test/codec_factory.h"
 #include "test/decode_test_driver.h"
@@ -63,8 +62,7 @@
   void RunTest() {
     const DecodeParam input = GET_PARAM(1);
     aom_codec_dec_cfg_t cfg = { 1, 0, 0, !FORCE_HIGHBITDEPTH_DECODING };
-    const std::string filename = input.filename;
-    libaom_test::IVFVideoSource decode_video(filename);
+    libaom_test::IVFVideoSource decode_video(input.filename);
     decode_video.Init();
 
     ASSERT_NO_FATAL_FAILURE(RunLoop(&decode_video, cfg));
diff --git a/test/divu_small_test.cc b/test/divu_small_test.cc
index f4d0846..496fbc1 100644
--- a/test/divu_small_test.cc
+++ b/test/divu_small_test.cc
@@ -14,7 +14,7 @@
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
 #include "test/acm_random.h"
-#include "av1/common/odintrin.h"
+#include "aom_dsp/odintrin.h"
 
 using libaom_test::ACMRandom;
 
diff --git a/test/dr_prediction_test.cc b/test/dr_prediction_test.cc
index 3e09912..2294656 100644
--- a/test/dr_prediction_test.cc
+++ b/test/dr_prediction_test.cc
@@ -22,7 +22,6 @@
 #include "av1/common/pred_common.h"
 #include "av1/common/reconintra.h"
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 
@@ -199,7 +198,7 @@
     if (params_.tst_fn) {
       aom_usec_timer_start(&timer);
       for (int k = 0; k < kNumTests; ++k) {
-        ASM_REGISTER_STATE_CHECK(params_.tst_fn(dst_tst_, dst_stride_, bw_, bh_,
+        API_REGISTER_STATE_CHECK(params_.tst_fn(dst_tst_, dst_stride_, bw_, bh_,
                                                 above_, left_, upsample_above_,
                                                 upsample_left_, dx_, dy_, bd_));
       }
diff --git a/test/dump_obu.sh b/test/dump_obu.sh
index 7dcab94..933db64 100755
--- a/test/dump_obu.sh
+++ b/test/dump_obu.sh
@@ -45,14 +45,21 @@
 encode_test_file() {
   if [ "$(aomenc_available)" = "yes" ]; then
     local encoder="$(aom_tool_path aomenc)"
-
-    eval "${encoder}" \
-      $(aomenc_encode_test_fast_params) \
-      $(yuv_raw_input) \
-      --ivf \
-      --output=${dump_obu_test_file} \
-      ${devnull} || return 1
-
+    if [ "$(realtime_only_build)" = "yes" ]; then
+      eval "${encoder}" \
+        $(aomenc_encode_test_rt_params) \
+        $(yuv_raw_input) \
+        --ivf \
+        --output=${dump_obu_test_file} \
+        ${devnull} || return 1
+    else
+      eval "${encoder}" \
+        $(aomenc_encode_test_fast_params) \
+        $(yuv_raw_input) \
+        --ivf \
+        --output=${dump_obu_test_file} \
+        ${devnull} || return 1
+    fi
     if [ ! -e "${dump_obu_test_file}" ]; then
       elog "dump_obu test input encode failed."
       return 1
diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc
index eb91846..70b0612 100644
--- a/test/encode_api_test.cc
+++ b/test/encode_api_test.cc
@@ -20,6 +20,12 @@
 
 namespace {
 
+#if CONFIG_REALTIME_ONLY
+const int kUsage = 1;
+#else
+const int kUsage = 0;
+#endif
+
 TEST(EncodeAPI, InvalidParams) {
   uint8_t buf[1] = { 0 };
   aom_image_t img;
@@ -45,7 +51,7 @@
   EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_enc_init(&enc, iface, NULL, 0));
   EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
             aom_codec_enc_config_default(iface, &cfg, 3));
-  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, kUsage));
   EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0));
   EXPECT_EQ(NULL, aom_codec_get_global_headers(NULL));
 
@@ -63,13 +69,14 @@
   aom_codec_iface_t *iface = aom_codec_av1_cx();
   aom_codec_ctx_t enc;
   aom_codec_enc_cfg_t cfg;
-  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, kUsage));
   EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, iface, &cfg, 0));
   EXPECT_EQ(AOM_CODEC_ERROR, aom_codec_control(&enc, -1, 0));
   EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_control(&enc, 0, 0));
   EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
 }
 
+#if !CONFIG_REALTIME_ONLY
 TEST(EncodeAPI, AllIntraMode) {
   aom_codec_iface_t *iface = aom_codec_av1_cx();
   aom_codec_ctx_t enc;
@@ -93,5 +100,6 @@
   cfg.kf_max_dist = 1;
   EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_enc_init(&enc, iface, &cfg, 0));
 }
+#endif
 
 }  // namespace
diff --git a/test/encode_small_width_height_test.cc b/test/encode_small_width_height_test.cc
index 9bd9471..9ce4fec 100644
--- a/test/encode_small_width_height_test.cc
+++ b/test/encode_small_width_height_test.cc
@@ -19,12 +19,18 @@
 
 #include "aom/aomcx.h"
 #include "aom/aom_encoder.h"
+#include "config/aom_config.h"
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
 namespace {
 
 // Dummy buffer of zero samples.
 constexpr unsigned char kBuffer[256 * 512 + 2 * 128 * 256] = { 0 };
+#if CONFIG_REALTIME_ONLY
+const int kUsage = 1;
+#else
+const int kUsage = 0;
+#endif
 
 TEST(EncodeSmallWidthHeight, SmallWidthMultiThreaded) {
   // The image has only one tile and the tile is two AV1 superblocks wide.
@@ -38,7 +44,7 @@
 
   aom_codec_iface_t *iface = aom_codec_av1_cx();
   aom_codec_enc_cfg_t cfg;
-  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, kUsage));
   cfg.g_threads = 2;
   cfg.g_w = kWidth;
   cfg.g_h = kHeight;
@@ -50,6 +56,7 @@
   EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
 }
 
+#if !CONFIG_REALTIME_ONLY
 TEST(EncodeSmallWidthHeight, SmallWidthMultiThreadedSpeed0) {
   // The image has only one tile and the tile is two AV1 superblocks wide.
   // For speed 0, superblock size is 128x128 (see av1_select_sb_size()).
@@ -73,6 +80,7 @@
   EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, NULL, 0, 0, 0));
   EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
 }
+#endif
 
 TEST(EncodeSmallWidthHeight, SmallHeightMultiThreaded) {
   // The image has only one tile and the tile is one AV1 superblock tall.
@@ -86,7 +94,7 @@
 
   aom_codec_iface_t *iface = aom_codec_av1_cx();
   aom_codec_enc_cfg_t cfg;
-  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, kUsage));
   cfg.g_threads = 2;
   cfg.g_w = kWidth;
   cfg.g_h = kHeight;
@@ -98,6 +106,7 @@
   EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
 }
 
+#if !CONFIG_REALTIME_ONLY
 TEST(EncodeSmallWidthHeight, SmallHeightMultiThreadedSpeed0) {
   // The image has only one tile and the tile is one AV1 superblock tall.
   // For speed 0, superblock size is 128x128 (see av1_select_sb_size()).
@@ -121,6 +130,7 @@
   EXPECT_EQ(AOM_CODEC_OK, aom_codec_encode(&enc, NULL, 0, 0, 0));
   EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
 }
+#endif
 
 // A reproducer test for aomedia:3113. The test should complete without any
 // memory errors.
@@ -161,7 +171,7 @@
 
   aom_codec_iface_t *iface = aom_codec_av1_cx();
   aom_codec_enc_cfg_t cfg;
-  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, 0));
+  EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(iface, &cfg, kUsage));
   cfg.g_w = kWidth;
   cfg.g_h = kHeight;
   aom_codec_ctx_t enc;
diff --git a/test/encode_test_driver.cc b/test/encode_test_driver.cc
index 058e08e..4a8801f 100644
--- a/test/encode_test_driver.cc
+++ b/test/encode_test_driver.cc
@@ -226,18 +226,18 @@
         encoder->EncodeFrame(video, frame_flags_);
 
         CxDataIterator iter = encoder->GetCxData();
+        bool has_cxdata = false;
 
 #if CONFIG_AV1_DECODER
-        bool has_cxdata = false;
         bool has_dxdata = false;
 #endif
         while (const aom_codec_cx_pkt_t *pkt = iter.Next()) {
           pkt = MutateEncoderOutputHook(pkt);
           again = true;
           switch (pkt->kind) {
-            case AOM_CODEC_CX_FRAME_PKT:
-#if CONFIG_AV1_DECODER
+            case AOM_CODEC_CX_FRAME_PKT:  //
               has_cxdata = true;
+#if CONFIG_AV1_DECODER
               if (decoder.get() != NULL && DoDecode()) {
                 aom_codec_err_t res_dec;
                 if (DoDecodeInvisible()) {
@@ -267,21 +267,27 @@
             default: break;
           }
         }
-#if CONFIG_AV1_DECODER
-        if (has_dxdata && has_cxdata) {
+        if (has_cxdata) {
           const aom_image_t *img_enc = encoder->GetPreviewFrame();
-          DxDataIterator dec_iter = decoder->GetDxData();
-          const aom_image_t *img_dec = dec_iter.Next();
-          if (img_enc && img_dec) {
-            const bool res =
-                compare_img(img_enc, img_dec, NULL, NULL, NULL, NULL, NULL);
-            if (!res) {  // Mismatch
-              MismatchHook(img_enc, img_dec);
-            }
+          if (img_enc) {
+            CalculateFrameLevelSSIM(video->img(), img_enc, cfg_.g_bit_depth,
+                                    cfg_.g_input_bit_depth);
           }
-          if (img_dec) DecompressedFrameHook(*img_dec, video->pts());
-        }
+#if CONFIG_AV1_DECODER
+          if (has_dxdata) {
+            DxDataIterator dec_iter = decoder->GetDxData();
+            const aom_image_t *img_dec = dec_iter.Next();
+            if (img_enc && img_dec) {
+              const bool res =
+                  compare_img(img_enc, img_dec, NULL, NULL, NULL, NULL, NULL);
+              if (!res) {  // Mismatch
+                MismatchHook(img_enc, img_dec);
+              }
+            }
+            if (img_dec) DecompressedFrameHook(*img_dec, video->pts());
+          }
 #endif
+        }
         if (!Continue()) break;
       }  // Loop over spatial layers
     }
diff --git a/test/encode_test_driver.h b/test/encode_test_driver.h
index 5da3ac5..8c265d9 100644
--- a/test/encode_test_driver.h
+++ b/test/encode_test_driver.h
@@ -129,11 +129,21 @@
     ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError();
   }
 
+  void Control(int ctrl_id, struct aom_svc_ref_frame_comp_pred *arg) {
+    const aom_codec_err_t res = aom_codec_control(&encoder_, ctrl_id, arg);
+    ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError();
+  }
+
   void Control(int ctrl_id, struct aom_svc_params *arg) {
     const aom_codec_err_t res = aom_codec_control(&encoder_, ctrl_id, arg);
     ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError();
   }
 
+  void Control(int ctrl_id, struct aom_ext_part_funcs *arg) {
+    const aom_codec_err_t res = aom_codec_control(&encoder_, ctrl_id, arg);
+    ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError();
+  }
+
 #if CONFIG_AV1_ENCODER
   void Control(int ctrl_id, aom_active_map_t *arg) {
     const aom_codec_err_t res = aom_codec_control(&encoder_, ctrl_id, arg);
@@ -216,6 +226,12 @@
   // Hook to be called on every first pass stats packet.
   virtual void StatsPktHook(const aom_codec_cx_pkt_t * /*pkt*/) {}
 
+  // Calculates SSIM at frame level.
+  virtual void CalculateFrameLevelSSIM(const aom_image_t * /*img_src*/,
+                                       const aom_image_t * /*img_enc*/,
+                                       aom_bit_depth_t /*bit_depth*/,
+                                       unsigned int /*input_bit_depth*/) {}
+
   // Hook to determine whether the encode loop should continue.
   virtual bool Continue() const {
     return !(::testing::Test::HasFatalFailure() || abort_);
diff --git a/test/encodemb_test.cc b/test/encodemb_test.cc
new file mode 100644
index 0000000..4c725c7
--- /dev/null
+++ b/test/encodemb_test.cc
@@ -0,0 +1,245 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdint.h>
+#include <vector>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "av1/encoder/block.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/common/scan.h"
+
+namespace {
+
+// Reorders 'qcoeff_lexico', which is in lexicographic order (row by row), into
+// scan order (zigzag) in 'qcoeff_scan'.
+void ToScanOrder(TX_SIZE tx_size, TX_TYPE tx_type, tran_low_t *qcoeff_lexico,
+                 tran_low_t *qcoeff_scan) {
+  const int max_eob = av1_get_max_eob(tx_size);
+  const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
+  for (int i = 0; i < max_eob; ++i) {
+    qcoeff_scan[i] = qcoeff_lexico[scan_order->scan[i]];
+  }
+}
+
+// Reorders 'qcoeff_scan', which is in scan order (zigzag), into lexicographic
+// order (row by row) in 'qcoeff_lexico'.
+void ToLexicoOrder(TX_SIZE tx_size, TX_TYPE tx_type, tran_low_t *qcoeff_scan,
+                   tran_low_t *qcoeff_lexico) {
+  const int max_eob = av1_get_max_eob(tx_size);
+  const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
+  for (int i = 0; i < max_eob; ++i) {
+    qcoeff_lexico[scan_order->scan[i]] = qcoeff_scan[i];
+  }
+}
+
+// Runs coefficient dropout on 'qcoeff_scan'.
+void Dropout(TX_SIZE tx_size, TX_TYPE tx_type, int dropout_num_before,
+             int dropout_num_after, tran_low_t *qcoeff_scan) {
+  tran_low_t qcoeff[MAX_TX_SQUARE];
+  // qcoeff_scan is assumed to be in scan order, since tests are easier to
+  // understand this way, but av1_dropout_qcoeff expects coeffs in lexico order
+  // so we convert to lexico then back to scan afterwards.
+  ToLexicoOrder(tx_size, tx_type, qcoeff_scan, qcoeff);
+
+  const int max_eob = av1_get_max_eob(tx_size);
+  const int kDequantFactor = 10;
+  tran_low_t dqcoeff[MAX_TX_SQUARE];
+  for (int i = 0; i < max_eob; ++i) {
+    dqcoeff[i] = qcoeff[i] * kDequantFactor;
+  }
+
+  uint16_t eob = max_eob;
+  while (eob > 0 && qcoeff_scan[eob - 1] == 0) --eob;
+
+  MACROBLOCK mb;
+  const int kPlane = 0;
+  const int kBlock = 0;
+  memset(&mb, 0, sizeof(mb));
+  uint16_t eobs[] = { eob };
+  mb.plane[kPlane].eobs = eobs;
+  mb.plane[kPlane].qcoeff = qcoeff;
+  mb.plane[kPlane].dqcoeff = dqcoeff;
+  uint8_t txb_entropy_ctx[1];
+  mb.plane[kPlane].txb_entropy_ctx = txb_entropy_ctx;
+
+  av1_dropout_qcoeff_num(&mb, kPlane, kBlock, tx_size, tx_type,
+                         dropout_num_before, dropout_num_after);
+
+  ToScanOrder(tx_size, tx_type, qcoeff, qcoeff_scan);
+
+  // Check updated eob value is valid.
+  uint16_t new_eob = max_eob;
+  while (new_eob > 0 && qcoeff_scan[new_eob - 1] == 0) --new_eob;
+  EXPECT_EQ(new_eob, mb.plane[kPlane].eobs[0]);
+
+  // Check qqcoeff is still valid.
+  for (int i = 0; i < max_eob; ++i) {
+    EXPECT_EQ(qcoeff[i] * kDequantFactor, dqcoeff[i]);
+  }
+}
+
+void ExpectArrayEq(tran_low_t *actual, std::vector<tran_low_t> expected) {
+  for (size_t i = 0; i < expected.size(); ++i) {
+    EXPECT_EQ(expected[i], actual[i]) << "Arrays differ at index " << i;
+  }
+}
+
+static constexpr TX_TYPE kTxType = DCT_DCT;
+
+TEST(DropoutTest, KeepsLargeCoeffs) {
+  const TX_SIZE tx_size = TX_8X4;
+  const uint32_t dropout_num_before = 4;
+  const uint32_t dropout_num_after = 6;
+  // Large isolated coeffs should be preserved.
+  tran_low_t qcoeff_scan[] = { 0, 0, 0, 0, 0, 0, 42, 0,    // should be kept
+                               0, 0, 0, 0, 0, 0, 0,  0,    //
+                               0, 0, 0, 0, 0, 0, 0,  -30,  // should be kept
+                               0, 0, 0, 0, 0, 0, 0,  0 };
+  Dropout(tx_size, kTxType, dropout_num_before, dropout_num_after, qcoeff_scan);
+  ExpectArrayEq(qcoeff_scan, { 0, 0, 0, 0, 0, 0, 42, 0,    //
+                               0, 0, 0, 0, 0, 0, 0,  0,    //
+                               0, 0, 0, 0, 0, 0, 0,  -30,  //
+                               0, 0, 0, 0, 0, 0, 0,  0 });
+}
+
+TEST(DropoutTest, RemovesSmallIsolatedCoeffs) {
+  const TX_SIZE tx_size = TX_8X4;
+  const uint32_t dropout_num_before = 4;
+  const uint32_t dropout_num_after = 6;
+  // Small isolated coeffs should be removed.
+  tran_low_t qcoeff_scan[] = { 0, 0, 0, 0, 1,  0, 0, 0,  // should be removed
+                               0, 0, 0, 0, 0,  0, 0, 0,  //
+                               0, 0, 0, 0, -2, 0, 0, 0,  // should be removed
+                               0, 0, 0, 0, 0,  0, 0, 0 };
+  Dropout(tx_size, kTxType, dropout_num_before, dropout_num_after, qcoeff_scan);
+  ExpectArrayEq(qcoeff_scan, { 0, 0, 0, 0, 0, 0, 0, 0,  //
+                               0, 0, 0, 0, 0, 0, 0, 0,  //
+                               0, 0, 0, 0, 0, 0, 0, 0,  //
+                               0, 0, 0, 0, 0, 0, 0, 0 });
+}
+
+TEST(DropoutTest, KeepsSmallCoeffsAmongLargeOnes) {
+  const TX_SIZE tx_size = TX_8X4;
+  const uint32_t dropout_num_before = 4;
+  const uint32_t dropout_num_after = 6;
+  // Small coeffs that are not isolated (not enough zeros before/after should be
+  // kept).
+  tran_low_t qcoeff_scan[] = {
+    1, 0,  0, 0,  -5, 0, 0, -1,  // should be kept
+    0, 0,  0, 10, 0,  0, 2, 0,   // should be kept
+    0, 0,  0, 0,  0,  0, 0, 0,   //
+    0, -2, 0, 0,  0,  0, 0, 0    // should be removed
+  };                             // should be removed
+  Dropout(tx_size, kTxType, dropout_num_before, dropout_num_after, qcoeff_scan);
+  ExpectArrayEq(qcoeff_scan, { 1, 0, 0, 0,  -5, 0, 0, -1,  //
+                               0, 0, 0, 10, 0,  0, 2, 0,   //
+                               0, 0, 0, 0,  0,  0, 0, 0,   //
+                               0, 0, 0, 0,  0,  0, 0, 0 });
+}
+
+TEST(DropoutTest, KeepsSmallCoeffsCloseToStartOrEnd) {
+  const TX_SIZE tx_size = TX_8X4;
+  const uint32_t dropout_num_before = 4;
+  const uint32_t dropout_num_after = 6;
+  // Small coeffs that are too close to the beginning or end of the block
+  // should also be kept (not enough zeroes before/after).
+  tran_low_t qcoeff_scan[] = { 0, 0, -1, 0,  0, 0, 0,  0,  // should be kept
+                               0, 0, 0,  10, 0, 0, 0,  0,  // should be kept
+                               0, 0, 0,  2,  0, 0, 0,  0,  // should be removed
+                               0, 0, 0,  0,  0, 0, -1, 0 };  // should be kept
+  Dropout(tx_size, kTxType, dropout_num_before, dropout_num_after, qcoeff_scan);
+  ExpectArrayEq(qcoeff_scan, { 0, 0, -1, 0,  0, 0, 0,  0,  //
+                               0, 0, 0,  10, 0, 0, 0,  0,  //
+                               0, 0, 0,  0,  0, 0, 0,  0,  //
+                               0, 0, 0,  0,  0, 0, -1, 0 });
+}
+
+TEST(DropoutTest, RemovesSmallClusterOfCoeffs) {
+  const TX_SIZE tx_size = TX_8X4;
+  const uint32_t dropout_num_before = 4;
+  const uint32_t dropout_num_after = 6;
+  // Small clusters (<= kDropoutContinuityMax) of small coeffs should be
+  // removed.
+  tran_low_t qcoeff_scan_two[] = {
+    0, 0, 0, 0, 1, 0, 0, -1,  // should be removed
+    0, 0, 0, 0, 0, 0, 0, 0,   //
+    0, 0, 0, 0, 0, 0, 1, 0,   // should be removed
+    0, 0, 0, 0, 0, 0, 0, 0
+  };
+  Dropout(tx_size, kTxType, dropout_num_before, dropout_num_after,
+          qcoeff_scan_two);
+  ExpectArrayEq(qcoeff_scan_two, { 0, 0, 0, 0, 0, 0, 0, 0,  //
+                                   0, 0, 0, 0, 0, 0, 0, 0,  //
+                                   0, 0, 0, 0, 0, 0, 0, 0,  //
+                                   0, 0, 0, 0, 0, 0, 0, 0 });
+}
+
+TEST(DropoutTest, KeepsLargeClusterOfCoeffs) {
+  const TX_SIZE tx_size = TX_8X4;
+  const uint32_t dropout_num_before = 4;
+  const uint32_t dropout_num_after = 6;
+  // Large clusters (> kDropoutContinuityMax) of small coeffs should be kept.
+  tran_low_t qcoeff_scan[] = { 0, 0, 0, 0, 1, 0,  1, -1,  // should be kept
+                               0, 0, 0, 0, 0, 0,  0, 0,   //
+                               0, 0, 0, 0, 0, -2, 0, 0,   // should be removed
+                               0, 0, 0, 0, 0, 0,  0, 0 };
+  Dropout(tx_size, kTxType, dropout_num_before, dropout_num_after, qcoeff_scan);
+  ExpectArrayEq(qcoeff_scan, { 0, 0, 0, 0, 1, 0, 1, -1,  //
+                               0, 0, 0, 0, 0, 0, 0, 0,   //
+                               0, 0, 0, 0, 0, 0, 0, 0,   //
+                               0, 0, 0, 0, 0, 0, 0, 0 });
+}
+
+TEST(DropoutTest, NumBeforeLargerThanNumAfter) {
+  const TX_SIZE tx_size = TX_8X4;
+  const uint32_t dropout_num_before = 4;
+  const uint32_t dropout_num_after = 2;
+  // The second coeff (-2) doesn't seem to meet the dropout_num_before
+  // criteria. But since the first coeff (1) will be dropped, it will meet
+  // the criteria and should be dropped too.
+  tran_low_t qcoeff_scan[] = { 0,  0, 0, 0, 1, 0, 0, 0,  // should be removed
+                               -2, 0, 0, 0, 0, 0, 0, 0,  // should be removed
+                               0,  0, 0, 0, 0, 0, 0, 0,  //
+                               0,  0, 0, 0, 0, 0, 0, 0 };
+  Dropout(tx_size, kTxType, dropout_num_before, dropout_num_after, qcoeff_scan);
+  ExpectArrayEq(qcoeff_scan, { 0, 0, 0, 0, 0, 0, 0, 0,  //
+                               0, 0, 0, 0, 0, 0, 0, 0,  //
+                               0, 0, 0, 0, 0, 0, 0, 0,  //
+                               0, 0, 0, 0, 0, 0, 0, 0 });
+}
+
+// More complex test combining other test cases.
+TEST(DropoutTest, ComplexTest) {
+  const TX_SIZE tx_size = TX_8X8;
+  const uint32_t dropout_num_before = 4;
+  const uint32_t dropout_num_after = 2;
+  tran_low_t qcoeff_scan[] = { 1, 12, 0,  0,   0, 0, 1,  0,   //
+                               0, 0,  0,  -12, 0, 0, 0,  1,   //
+                               0, 0,  -2, 0,   1, 0, 0,  1,   //
+                               0, 0,  0,  0,   5, 0, -1, 0,   //
+                               0, 0,  0,  1,   0, 0, 0,  -1,  //
+                               0, 0,  0,  0,   2, 0, 0,  0,   //
+                               0, 1,  0,  0,   0, 5, 0,  0,   //
+                               0, 0,  1,  1,   0, 0, 0,  -2 };
+  Dropout(tx_size, kTxType, dropout_num_before, dropout_num_after, qcoeff_scan);
+  ExpectArrayEq(qcoeff_scan, { 1, 12, 0,  0,   0, 0, 0,  0,  //
+                               0, 0,  0,  -12, 0, 0, 0,  1,  //
+                               0, 0,  -2, 0,   1, 0, 0,  1,  //
+                               0, 0,  0,  0,   5, 0, -1, 0,  //
+                               0, 0,  0,  0,   0, 0, 0,  0,  //
+                               0, 0,  0,  0,   0, 0, 0,  0,  //
+                               0, 0,  0,  0,   0, 5, 0,  0,  //
+                               0, 0,  0,  0,   0, 0, 0,  -2 });
+}
+
+}  // namespace
diff --git a/test/encodetxb_test.cc b/test/encodetxb_test.cc
index 61838ca..ecb9f73 100644
--- a/test/encodetxb_test.cc
+++ b/test/encodetxb_test.cc
@@ -26,7 +26,6 @@
 #include "av1/common/scan.h"
 #include "av1/common/txb_common.h"
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 
@@ -57,7 +56,6 @@
   virtual void TearDown() {
     aom_free(coeff_contexts_ref_);
     aom_free(coeff_contexts_);
-    libaom_test::ClearSystemState();
   }
 
   void GetNzMapContextsRun() {
@@ -214,7 +212,7 @@
     : public ::testing::TestWithParam<TxbInitLevelParam> {
  public:
   virtual ~EncodeTxbInitLevelTest() {}
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
+  virtual void TearDown() {}
   void RunTest(av1_txb_init_levels_func test_func, int tx_size, int is_speed);
 };
 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(EncodeTxbInitLevelTest);
diff --git a/test/end_to_end_test.cc b/test/end_to_end_psnr_test.cc
similarity index 78%
rename from test/end_to_end_test.cc
rename to test/end_to_end_psnr_test.cc
index 4c224a8..5574c1a 100644
--- a/test/end_to_end_test.cc
+++ b/test/end_to_end_psnr_test.cc
@@ -27,23 +27,14 @@
 const unsigned int kFramerate = 50;
 const unsigned int kFrames = 10;
 const int kBitrate = 500;
-// List of psnr thresholds for speed settings 0-7 and 5 encoding modes
-const double kPsnrThreshold[][5] = {
-// Note:
-// AV1 HBD average PSNR is slightly lower than AV1.
-// We make two cases here to enable the testing and
-// guard picture quality.
-#if CONFIG_AV1_ENCODER
-  { 36.0, 37.0, 37.0, 37.0, 37.0 }, { 31.0, 36.0, 36.0, 36.0, 36.0 },
-  { 31.0, 35.0, 35.0, 35.0, 35.0 }, { 31.0, 34.0, 34.0, 34.0, 34.0 },
-  { 31.0, 33.0, 33.0, 33.0, 33.0 }, { 31.0, 32.0, 32.0, 32.0, 32.0 },
-  { 30.0, 31.0, 31.0, 31.0, 31.0 }, { 29.0, 30.0, 30.0, 30.0, 30.0 },
-#else
-  { 36.0, 37.0, 37.0, 37.0, 37.0 }, { 35.0, 36.0, 36.0, 36.0, 36.0 },
-  { 34.0, 35.0, 35.0, 35.0, 35.0 }, { 33.0, 34.0, 34.0, 34.0, 34.0 },
-  { 32.0, 33.0, 33.0, 33.0, 33.0 }, { 31.0, 32.0, 32.0, 32.0, 32.0 },
-  { 30.0, 31.0, 31.0, 31.0, 31.0 }, { 29.0, 30.0, 30.0, 30.0, 30.0 },
-#endif  // CONFIG_AV1_ENCODER
+const unsigned int kCqLevel = 18;
+// List of psnr thresholds for speed settings 0-8 and 4 encoding modes
+const double kPsnrThreshold[][4] = {
+  { 35.7, 44.4, 39.5, 41.9 }, { 35.7, 44.4, 39.5, 41.9 },
+  { 35.7, 44.4, 39.4, 41.9 }, { 35.7, 44.4, 39.1, 41.8 },
+  { 35.6, 44.4, 39.1, 41.8 }, { 35.0, 44.3, 38.7, 41.8 },
+  { 35.0, 44.3, 38.7, 41.3 }, { 35.0, 44.3, 38.7, 40.8 },
+  { 35.0, 44.3, 38.7, 40.8 }
 };
 
 typedef struct {
@@ -107,9 +98,10 @@
 
   virtual void SetUp() {
     InitializeConfig(encoding_mode_);
-    if (encoding_mode_ != ::libaom_test::kRealTime) {
+    if (encoding_mode_ == ::libaom_test::kOnePassGood ||
+        encoding_mode_ == ::libaom_test::kTwoPassGood) {
       cfg_.g_lag_in_frames = 5;
-    } else {
+    } else if (encoding_mode_ == ::libaom_test::kRealTime) {
       cfg_.rc_buf_sz = 1000;
       cfg_.rc_buf_initial_sz = 500;
       cfg_.rc_buf_optimal_sz = 600;
@@ -137,10 +129,13 @@
         encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_SCREEN);
       else
         encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_DEFAULT);
-      if (encoding_mode_ != ::libaom_test::kRealTime) {
+      if (encoding_mode_ == ::libaom_test::kOnePassGood ||
+          encoding_mode_ == ::libaom_test::kTwoPassGood) {
         encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
         encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
         encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+      } else if (encoding_mode_ == ::libaom_test::kAllIntra) {
+        encoder->Control(AOME_SET_CQ_LEVEL, kCqLevel);
       }
     }
   }
@@ -191,17 +186,35 @@
 
 class EndToEndTestLarge : public EndToEndTest {};
 
+class EndToEndAllIntraTestLarge : public EndToEndTest {};
+
+class EndToEndAllIntraTest : public EndToEndTest {};
+
 TEST_P(EndToEndTestLarge, EndtoEndPSNRTest) { DoTest(); }
 
 TEST_P(EndToEndTest, EndtoEndPSNRTest) { DoTest(); }
 
+TEST_P(EndToEndAllIntraTestLarge, EndtoEndPSNRTest) { DoTest(); }
+
+TEST_P(EndToEndAllIntraTest, EndtoEndPSNRTest) { DoTest(); }
+
 AV1_INSTANTIATE_TEST_SUITE(EndToEndTestLarge,
                            ::testing::ValuesIn(kEncodingModeVectors),
                            ::testing::ValuesIn(kTestVectors),
                            ::testing::ValuesIn(kCpuUsedVectors));
 
 AV1_INSTANTIATE_TEST_SUITE(EndToEndTest,
-                           ::testing::Values(kEncodingModeVectors[0]),
+                           ::testing::Values(::libaom_test::kTwoPassGood),
                            ::testing::Values(kTestVectors[2]),  // 444
-                           ::testing::Values(kCpuUsedVectors[2]));
+                           ::testing::Values(3));               // cpu_used
+
+AV1_INSTANTIATE_TEST_SUITE(EndToEndAllIntraTestLarge,
+                           ::testing::Values(::libaom_test::kAllIntra),
+                           ::testing::ValuesIn(kTestVectors),
+                           ::testing::Values(2, 4, 6, 8));  // cpu_used
+
+AV1_INSTANTIATE_TEST_SUITE(EndToEndAllIntraTest,
+                           ::testing::Values(::libaom_test::kAllIntra),
+                           ::testing::Values(kTestVectors[0]),  // 420
+                           ::testing::Values(6));               // cpu_used
 }  // namespace
diff --git a/test/end_to_end_ssim_test.cc b/test/end_to_end_ssim_test.cc
new file mode 100644
index 0000000..1e638d7
--- /dev/null
+++ b/test/end_to_end_ssim_test.cc
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_ports/mem.h"
+#include "aom_dsp/ssim.h"
+#include "av1/common/blockd.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+const unsigned int kFrames = 10;
+const unsigned int kCqLevel = 18;
+// List of ssim thresholds for speed settings 0-8 with all intra encoding mode.
+const double kSsimThreshold[] = { 83.4, 83.4, 83.4, 83.3, 83.3,
+                                  83.0, 82.3, 81.1, 81.1 };
+
+typedef struct {
+  const char *filename;
+  unsigned int input_bit_depth;
+  aom_img_fmt fmt;
+  aom_bit_depth_t bit_depth;
+  unsigned int profile;
+} TestVideoParam;
+
+std::ostream &operator<<(std::ostream &os, const TestVideoParam &test_arg) {
+  return os << "TestVideoParam { filename:" << test_arg.filename
+            << " input_bit_depth:" << test_arg.input_bit_depth
+            << " fmt:" << test_arg.fmt << " bit_depth:" << test_arg.bit_depth
+            << " profile:" << test_arg.profile << " }";
+}
+
+const TestVideoParam kTestVectors[] = {
+  { "park_joy_90p_8_420.y4m", 8, AOM_IMG_FMT_I420, AOM_BITS_8, 0 },
+  { "park_joy_90p_8_422.y4m", 8, AOM_IMG_FMT_I422, AOM_BITS_8, 2 },
+  { "park_joy_90p_8_444.y4m", 8, AOM_IMG_FMT_I444, AOM_BITS_8, 1 },
+#if CONFIG_AV1_HIGHBITDEPTH
+  { "park_joy_90p_10_420.y4m", 10, AOM_IMG_FMT_I42016, AOM_BITS_10, 0 },
+  { "park_joy_90p_10_422.y4m", 10, AOM_IMG_FMT_I42216, AOM_BITS_10, 2 },
+  { "park_joy_90p_10_444.y4m", 10, AOM_IMG_FMT_I44416, AOM_BITS_10, 1 },
+  { "park_joy_90p_12_420.y4m", 12, AOM_IMG_FMT_I42016, AOM_BITS_12, 2 },
+  { "park_joy_90p_12_422.y4m", 12, AOM_IMG_FMT_I42216, AOM_BITS_12, 2 },
+  { "park_joy_90p_12_444.y4m", 12, AOM_IMG_FMT_I44416, AOM_BITS_12, 2 },
+#endif
+};
+
+// This class is used to check adherence to given ssim value.
+class EndToEndSSIMTest
+    : public ::libaom_test::CodecTestWith3Params<libaom_test::TestMode,
+                                                 TestVideoParam, int>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  EndToEndSSIMTest()
+      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+        test_video_param_(GET_PARAM(2)), cpu_used_(GET_PARAM(3)), nframes_(0),
+        ssim_(0.0) {}
+
+  ~EndToEndSSIMTest() override {}
+
+  void SetUp() override { InitializeConfig(encoding_mode_); }
+
+  void BeginPassHook(unsigned int) override {
+    nframes_ = 0;
+    ssim_ = 0.0;
+  }
+
+  void CalculateFrameLevelSSIM(const aom_image_t *img_src,
+                               const aom_image_t *img_enc,
+                               aom_bit_depth_t bit_depth,
+                               unsigned int input_bit_depth) override {
+    double frame_ssim;
+    double plane_ssim[MAX_MB_PLANE] = { 0.0, 0.0, 0.0 };
+    int crop_widths[PLANE_TYPES];
+    int crop_heights[PLANE_TYPES];
+    crop_widths[PLANE_TYPE_Y] = img_src->d_w;
+    crop_heights[PLANE_TYPE_Y] = img_src->d_h;
+    // Width of UV planes calculated based on chroma_shift values.
+    crop_widths[PLANE_TYPE_UV] =
+        img_src->x_chroma_shift == 1 ? (img_src->w + 1) >> 1 : img_src->w;
+    crop_heights[PLANE_TYPE_UV] =
+        img_src->y_chroma_shift == 1 ? (img_src->h + 1) >> 1 : img_src->h;
+    nframes_++;
+
+#if CONFIG_AV1_HIGHBITDEPTH
+    uint8_t is_hbd = bit_depth > AOM_BITS_8;
+    if (is_hbd) {
+      // HBD ssim calculation.
+      uint8_t shift = bit_depth - input_bit_depth;
+      for (int i = AOM_PLANE_Y; i < MAX_MB_PLANE; ++i) {
+        const int is_uv = i > AOM_PLANE_Y;
+        plane_ssim[i] = aom_highbd_ssim2(
+            CONVERT_TO_BYTEPTR(img_src->planes[i]),
+            CONVERT_TO_BYTEPTR(img_enc->planes[i]),
+            img_src->stride[is_uv] >> is_hbd, img_enc->stride[is_uv] >> is_hbd,
+            crop_widths[is_uv], crop_heights[is_uv], input_bit_depth, shift);
+      }
+      frame_ssim = plane_ssim[AOM_PLANE_Y] * .8 +
+                   .1 * (plane_ssim[AOM_PLANE_U] + plane_ssim[AOM_PLANE_V]);
+      // Accumulate to find sequence level ssim value.
+      ssim_ += frame_ssim;
+      return;
+    }
+#else
+    (void)bit_depth;
+    (void)input_bit_depth;
+#endif  // CONFIG_AV1_HIGHBITDEPTH
+
+    // LBD ssim calculation.
+    for (int i = AOM_PLANE_Y; i < MAX_MB_PLANE; ++i) {
+      const int is_uv = i > AOM_PLANE_Y;
+      plane_ssim[i] = aom_ssim2(img_src->planes[i], img_enc->planes[i],
+                                img_src->stride[is_uv], img_enc->stride[is_uv],
+                                crop_widths[is_uv], crop_heights[is_uv]);
+    }
+    frame_ssim = plane_ssim[AOM_PLANE_Y] * .8 +
+                 .1 * (plane_ssim[AOM_PLANE_U] + plane_ssim[AOM_PLANE_V]);
+    // Accumulate to find sequence level ssim value.
+    ssim_ += frame_ssim;
+  }
+
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
+    if (video->frame() == 0) {
+      encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
+      encoder->Control(AV1E_SET_TILE_COLUMNS, 4);
+      encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+      encoder->Control(AOME_SET_TUNING, AOM_TUNE_SSIM);
+      encoder->Control(AOME_SET_CQ_LEVEL, kCqLevel);
+    }
+  }
+
+  double GetAverageSsim() const {
+    if (nframes_) return 100 * pow(ssim_ / nframes_, 8.0);
+    return 0.0;
+  }
+
+  double GetSsimThreshold() { return kSsimThreshold[cpu_used_]; }
+
+  void DoTest() {
+    cfg_.g_profile = test_video_param_.profile;
+    cfg_.g_input_bit_depth = test_video_param_.input_bit_depth;
+    cfg_.g_bit_depth = test_video_param_.bit_depth;
+    if (cfg_.g_bit_depth > 8) init_flags_ |= AOM_CODEC_USE_HIGHBITDEPTH;
+
+    std::unique_ptr<libaom_test::VideoSource> video(
+        new libaom_test::Y4mVideoSource(test_video_param_.filename, 0,
+                                        kFrames));
+    ASSERT_TRUE(video.get() != NULL);
+    ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+    const double ssim = GetAverageSsim();
+    EXPECT_GT(ssim, GetSsimThreshold())
+        << "encoding mode = " << encoding_mode_ << ", cpu used = " << cpu_used_;
+  }
+
+ private:
+  const libaom_test::TestMode encoding_mode_;
+  const TestVideoParam test_video_param_;
+  const int cpu_used_;
+  unsigned int nframes_;
+  double ssim_;
+};
+
+class EndToEndSSIMTestLarge : public EndToEndSSIMTest {};
+
+TEST_P(EndToEndSSIMTestLarge, EndtoEndSSIMTest) { DoTest(); }
+
+TEST_P(EndToEndSSIMTest, EndtoEndSSIMTest) { DoTest(); }
+
+AV1_INSTANTIATE_TEST_SUITE(EndToEndSSIMTestLarge,
+                           ::testing::Values(::libaom_test::kAllIntra),
+                           ::testing::ValuesIn(kTestVectors),
+                           ::testing::Values(2, 4, 6, 8));  // cpu_used
+
+AV1_INSTANTIATE_TEST_SUITE(EndToEndSSIMTest,
+                           ::testing::Values(::libaom_test::kAllIntra),
+                           ::testing::Values(kTestVectors[0]),  // 420
+                           ::testing::Values(6));               // cpu_used
+}  // namespace
diff --git a/test/error_block_test.cc b/test/error_block_test.cc
index ea0acf4..e4befd5 100644
--- a/test/error_block_test.cc
+++ b/test/error_block_test.cc
@@ -20,7 +20,6 @@
 #include "config/av1_rtcd.h"
 
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 #include "av1/common/entropy.h"
@@ -32,16 +31,20 @@
 namespace {
 const int kNumIterations = 1000;
 
-typedef int64_t (*ErrorBlockFunc)(const tran_low_t *coeff,
-                                  const tran_low_t *dqcoeff,
-                                  intptr_t block_size, int64_t *ssz, int bps);
+using ErrorBlockFunc = int64_t (*)(const tran_low_t *coeff,
+                                   const tran_low_t *dqcoeff,
+                                   intptr_t block_size, int64_t *ssz, int bps);
 
-typedef int64_t (*ErrorBlockFunc8Bits)(const tran_low_t *coeff,
-                                       const tran_low_t *dqcoeff,
-                                       intptr_t block_size, int64_t *ssz);
+using ErrorBlockFunc8Bits = int64_t (*)(const tran_low_t *coeff,
+                                        const tran_low_t *dqcoeff,
+                                        intptr_t block_size, int64_t *ssz);
 
-typedef std::tuple<ErrorBlockFunc, ErrorBlockFunc, aom_bit_depth_t>
-    ErrorBlockParam;
+using ErrorBlockLpFunc = int64_t (*)(const int16_t *coeff,
+                                     const int16_t *dqcoeff,
+                                     intptr_t block_size);
+
+using ErrorBlockParam =
+    std::tuple<ErrorBlockFunc, ErrorBlockFunc, aom_bit_depth_t>;
 
 template <ErrorBlockFunc8Bits fn>
 int64_t BlockError8BitWrapper(const tran_low_t *coeff,
@@ -51,6 +54,15 @@
   return fn(coeff, dqcoeff, block_size, ssz);
 }
 
+template <ErrorBlockLpFunc fn>
+int64_t BlockErrorLpWrapper(const tran_low_t *coeff, const tran_low_t *dqcoeff,
+                            intptr_t block_size, int64_t *ssz, int bps) {
+  EXPECT_EQ(bps, 8);
+  *ssz = -1;
+  return fn(reinterpret_cast<const int16_t *>(coeff),
+            reinterpret_cast<const int16_t *>(dqcoeff), block_size);
+}
+
 class ErrorBlockTest : public ::testing::TestWithParam<ErrorBlockParam> {
  public:
   virtual ~ErrorBlockTest() {}
@@ -60,7 +72,7 @@
     bit_depth_ = GET_PARAM(2);
   }
 
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
+  virtual void TearDown() {}
 
  protected:
   aom_bit_depth_t bit_depth_;
@@ -99,7 +111,7 @@
     }
     ref_ret =
         ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz, bit_depth_);
-    ASM_REGISTER_STATE_CHECK(
+    API_REGISTER_STATE_CHECK(
         ret = error_block_op_(coeff, dqcoeff, block_size, &ssz, bit_depth_));
     err_count += (ref_ret != ret) | (ref_ssz != ssz);
     if (err_count && !err_count_total) {
@@ -157,7 +169,7 @@
     }
     ref_ret =
         ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz, bit_depth_);
-    ASM_REGISTER_STATE_CHECK(
+    API_REGISTER_STATE_CHECK(
         ret = error_block_op_(coeff, dqcoeff, block_size, &ssz, bit_depth_));
     err_count += (ref_ret != ret) | (ref_ssz != ssz);
     if (err_count && !err_count_total) {
@@ -247,7 +259,9 @@
              AOM_BITS_8),
 #endif
   make_tuple(&BlockError8BitWrapper<av1_block_error_sse2>,
-             &BlockError8BitWrapper<av1_block_error_c>, AOM_BITS_8)
+             &BlockError8BitWrapper<av1_block_error_c>, AOM_BITS_8),
+  make_tuple(&BlockErrorLpWrapper<av1_block_error_lp_sse2>,
+             &BlockErrorLpWrapper<av1_block_error_lp_c>, AOM_BITS_8)
 };
 
 INSTANTIATE_TEST_SUITE_P(SSE2, ErrorBlockTest,
@@ -265,7 +279,9 @@
              AOM_BITS_8),
 #endif
   make_tuple(&BlockError8BitWrapper<av1_block_error_avx2>,
-             &BlockError8BitWrapper<av1_block_error_c>, AOM_BITS_8)
+             &BlockError8BitWrapper<av1_block_error_c>, AOM_BITS_8),
+  make_tuple(&BlockErrorLpWrapper<av1_block_error_lp_avx2>,
+             &BlockErrorLpWrapper<av1_block_error_lp_c>, AOM_BITS_8)
 };
 
 INSTANTIATE_TEST_SUITE_P(AVX2, ErrorBlockTest,
@@ -281,10 +297,14 @@
 #endif  // HAVE_MSA
 
 #if (HAVE_NEON)
-INSTANTIATE_TEST_SUITE_P(
-    NEON, ErrorBlockTest,
-    ::testing::Values(make_tuple(&BlockError8BitWrapper<av1_block_error_neon>,
-                                 &BlockError8BitWrapper<av1_block_error_c>,
-                                 AOM_BITS_8)));
+const ErrorBlockParam kErrorBlockTestParamsNeon[] = {
+  make_tuple(&BlockError8BitWrapper<av1_block_error_neon>,
+             &BlockError8BitWrapper<av1_block_error_c>, AOM_BITS_8),
+  make_tuple(&BlockErrorLpWrapper<av1_block_error_lp_neon>,
+             &BlockErrorLpWrapper<av1_block_error_lp_c>, AOM_BITS_8)
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, ErrorBlockTest,
+                         ::testing::ValuesIn(kErrorBlockTestParamsNeon));
 #endif  // HAVE_NEON
 }  // namespace
diff --git a/test/ethread_test.cc b/test/ethread_test.cc
index 5bf8762..78811b6 100644
--- a/test/ethread_test.cc
+++ b/test/ethread_test.cc
@@ -21,6 +21,9 @@
 #include "av1/encoder/firstpass.h"
 
 namespace {
+const unsigned int kCqLevel = 18;
+
+#if !CONFIG_REALTIME_ONLY
 const size_t kFirstPassStatsSz = sizeof(FIRSTPASS_STATS);
 class AVxFirstPassEncoderThreadTest
     : public ::libaom_test::CodecTestWith4Params<libaom_test::TestMode, int,
@@ -196,6 +199,7 @@
   // Comparison 4 (between threads=4 and threads=8).
   compare_fp_stats_md5(&firstpass_stats);
 }
+#endif  // !CONFIG_REALTIME_ONLY
 
 class AVxEncoderThreadTest
     : public ::libaom_test::CodecTestWith5Params<libaom_test::TestMode, int,
@@ -227,11 +231,12 @@
   virtual void SetUp() {
     InitializeConfig(encoding_mode_);
 
-    if (encoding_mode_ != ::libaom_test::kRealTime) {
+    if (encoding_mode_ == ::libaom_test::kOnePassGood ||
+        encoding_mode_ == ::libaom_test::kTwoPassGood) {
       cfg_.g_lag_in_frames = 6;
       cfg_.rc_2pass_vbr_minsection_pct = 5;
       cfg_.rc_2pass_vbr_maxsection_pct = 2000;
-    } else {
+    } else if (encoding_mode_ == ::libaom_test::kRealTime) {
       cfg_.g_error_resilient = 1;
     }
     cfg_.rc_max_quantizer = 56;
@@ -248,18 +253,22 @@
       SetTileSize(encoder);
       encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
       encoder->Control(AV1E_SET_ROW_MT, row_mt_);
-      if (encoding_mode_ != ::libaom_test::kRealTime) {
+      if (encoding_mode_ == ::libaom_test::kOnePassGood ||
+          encoding_mode_ == ::libaom_test::kTwoPassGood) {
         encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
         encoder->Control(AOME_SET_ARNR_MAXFRAMES, 5);
         encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
         encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 0);
         encoder->Control(AV1E_SET_MAX_GF_INTERVAL, 4);
-      } else {
+      } else if (encoding_mode_ == ::libaom_test::kRealTime) {
         encoder->Control(AOME_SET_ENABLEAUTOALTREF, 0);
         encoder->Control(AV1E_SET_AQ_MODE, 3);
         encoder->Control(AV1E_SET_COEFF_COST_UPD_FREQ, 2);
         encoder->Control(AV1E_SET_MODE_COST_UPD_FREQ, 2);
         encoder->Control(AV1E_SET_MV_COST_UPD_FREQ, 3);
+        encoder->Control(AV1E_SET_DV_COST_UPD_FREQ, 3);
+      } else {
+        encoder->Control(AOME_SET_CQ_LEVEL, kCqLevel);
       }
       encoder_initialized_ = true;
     }
@@ -423,12 +432,6 @@
   std::vector<std::string> md5_dec_;
 };
 
-TEST_P(AVxEncoderThreadTest, EncoderResultTest) {
-  cfg_.large_scale_tile = 0;
-  decoder_->Control(AV1_SET_TILE_MODE, 0);
-  DoTest();
-}
-
 class AVxEncoderThreadRTTest : public AVxEncoderThreadTest {};
 
 TEST_P(AVxEncoderThreadRTTest, EncoderResultTest) {
@@ -437,6 +440,18 @@
   DoTest();
 }
 
+// For real time mode, test speed 6, 7, 8, 9.
+AV1_INSTANTIATE_TEST_SUITE(AVxEncoderThreadRTTest,
+                           ::testing::Values(::libaom_test::kRealTime),
+                           ::testing::Values(6, 7, 8, 9),
+                           ::testing::Values(0, 2), ::testing::Values(0, 2),
+                           ::testing::Values(0, 1));
+
+#if !CONFIG_REALTIME_ONLY
+
+// The AVxEncoderThreadTestLarge takes up ~14% of total run-time of the
+// Valgrind long tests. Exclude it; the smaller tests are still run.
+#if !AOM_VALGRIND_BUILD
 class AVxEncoderThreadTestLarge : public AVxEncoderThreadTest {};
 
 TEST_P(AVxEncoderThreadTestLarge, EncoderResultTest) {
@@ -445,9 +460,32 @@
   DoTest();
 }
 
-class AVxEncoderThreadRTTestLarge : public AVxEncoderThreadTest {};
+// Test cpu_used 0, 1, 3 and 5.
+AV1_INSTANTIATE_TEST_SUITE(AVxEncoderThreadTestLarge,
+                           ::testing::Values(::libaom_test::kTwoPassGood,
+                                             ::libaom_test::kOnePassGood),
+                           ::testing::Values(0, 1, 3, 5),
+                           ::testing::Values(1, 6), ::testing::Values(1, 6),
+                           ::testing::Values(0, 1));
+#endif  // !AOM_VALGRIND_BUILD
 
-TEST_P(AVxEncoderThreadRTTestLarge, EncoderResultTest) {
+TEST_P(AVxEncoderThreadTest, EncoderResultTest) {
+  cfg_.large_scale_tile = 0;
+  decoder_->Control(AV1_SET_TILE_MODE, 0);
+  DoTest();
+}
+
+class AVxEncoderThreadAllIntraTest : public AVxEncoderThreadTest {};
+
+TEST_P(AVxEncoderThreadAllIntraTest, EncoderResultTest) {
+  cfg_.large_scale_tile = 0;
+  decoder_->Control(AV1_SET_TILE_MODE, 0);
+  DoTest();
+}
+
+class AVxEncoderThreadAllIntraTestLarge : public AVxEncoderThreadTest {};
+
+TEST_P(AVxEncoderThreadAllIntraTestLarge, EncoderResultTest) {
   cfg_.large_scale_tile = 0;
   decoder_->Control(AV1_SET_TILE_MODE, 0);
   DoTest();
@@ -466,26 +504,20 @@
                            ::testing::Values(2), ::testing::Values(0, 2),
                            ::testing::Values(0, 2), ::testing::Values(0, 1));
 
-// Test cpu_used 7, 8, 9 here.
-AV1_INSTANTIATE_TEST_SUITE(AVxEncoderThreadRTTest,
-                           ::testing::Values(::libaom_test::kRealTime),
-                           ::testing::Values(7, 8, 9), ::testing::Values(0, 2),
+// For all intra mode, test speed 0, 2, 4, 6, 8.
+// Only test cpu_used 6 here.
+AV1_INSTANTIATE_TEST_SUITE(AVxEncoderThreadAllIntraTest,
+                           ::testing::Values(::libaom_test::kAllIntra),
+                           ::testing::Values(6), ::testing::Values(0, 2),
                            ::testing::Values(0, 2), ::testing::Values(0, 1));
 
-// Test cpu_used 0, 1, 3 and 5.
-AV1_INSTANTIATE_TEST_SUITE(AVxEncoderThreadTestLarge,
-                           ::testing::Values(::libaom_test::kTwoPassGood,
-                                             ::libaom_test::kOnePassGood),
-                           ::testing::Values(0, 1, 3, 5),
+// Test cpu_used 0, 2, 4 and 8.
+AV1_INSTANTIATE_TEST_SUITE(AVxEncoderThreadAllIntraTestLarge,
+                           ::testing::Values(::libaom_test::kAllIntra),
+                           ::testing::Values(0, 2, 4, 8),
                            ::testing::Values(1, 6), ::testing::Values(1, 6),
                            ::testing::Values(0, 1));
-
-// Test cpu_used 0, 2, 4 and 6.
-AV1_INSTANTIATE_TEST_SUITE(AVxEncoderThreadRTTestLarge,
-                           ::testing::Values(::libaom_test::kRealTime),
-                           ::testing::Values(0, 2, 4, 6),
-                           ::testing::Values(1, 6), ::testing::Values(1, 6),
-                           ::testing::Values(0, 1));
+#endif  // !CONFIG_REALTIME_ONLY
 
 class AVxEncoderThreadLSTest : public AVxEncoderThreadTest {
   virtual void SetTileSize(libaom_test::Encoder *encoder) {
@@ -512,6 +544,10 @@
   DoTest();
 }
 
+// AVxEncoderThreadLSTestLarge takes up about 2% of total run-time of
+// the Valgrind long tests. Since we already run AVxEncoderThreadLSTest,
+// skip this one for Valgrind.
+#if !CONFIG_REALTIME_ONLY && !AOM_VALGRIND_BUILD
 class AVxEncoderThreadLSTestLarge : public AVxEncoderThreadLSTest {};
 
 TEST_P(AVxEncoderThreadLSTestLarge, EncoderResultTest) {
@@ -526,4 +562,5 @@
                                              ::libaom_test::kOnePassGood),
                            ::testing::Values(1, 3), ::testing::Values(0, 6),
                            ::testing::Values(0, 6), ::testing::Values(1));
+#endif  // !CONFIG_REALTIME_ONLY && !AOM_VALGRIND_BUILD
 }  // namespace
diff --git a/test/examples.sh b/test/examples.sh
index 2cdb89d..87d8c2b 100755
--- a/test/examples.sh
+++ b/test/examples.sh
@@ -17,6 +17,10 @@
 # List of script names to exclude.
 exclude_list="best_encode examples run_encodes tools_common"
 
+if [ "$(realtime_only_build)" = "yes" ]; then
+  exclude_list="${exclude_list} twopass_encoder simple_decoder lightfield_test"
+fi
+
 # Filter out the scripts in $exclude_list.
 for word in ${exclude_list}; do
   example_tests=$(filter_strings "${example_tests}" "${word}" exclude)
diff --git a/test/external_frame_buffer_test.cc b/test/external_frame_buffer_test.cc
index 5006b5b..b060ee3 100644
--- a/test/external_frame_buffer_test.cc
+++ b/test/external_frame_buffer_test.cc
@@ -199,6 +199,7 @@
 
 #endif  // CONFIG_WEBM_IO
 
+#if !CONFIG_REALTIME_ONLY
 // Class for testing passing in external frame buffers to libaom.
 class ExternalFrameBufferMD5Test
     : public ::libaom_test::DecoderTest,
@@ -298,6 +299,7 @@
   int num_buffers_;
   ExternalFrameBufferList fb_list_;
 };
+#endif  // !CONFIG_REALTIME_ONLY
 
 #if CONFIG_WEBM_IO
 const char kAV1TestFile[] = "av1-1-b8-03-sizeup.mkv";
@@ -395,6 +397,7 @@
 };
 #endif  // CONFIG_WEBM_IO
 
+#if !CONFIG_REALTIME_ONLY
 // This test runs through the set of test vectors, and decodes them.
 // Libaom will call into the application to allocate a frame buffer when
 // needed. The md5 checksums are computed for each frame in the video file.
@@ -438,6 +441,7 @@
   // Decode frame, and check the md5 matching.
   ASSERT_NO_FATAL_FAILURE(RunLoop(video.get(), cfg));
 }
+#endif  // !CONFIG_REALTIME_ONLY
 
 #if CONFIG_WEBM_IO
 TEST_F(ExternalFrameBufferTest, MinFrameBuffers) {
@@ -447,7 +451,11 @@
   ASSERT_EQ(AOM_CODEC_OK,
             SetFrameBufferFunctions(num_buffers, get_aom_frame_buffer,
                                     release_aom_frame_buffer));
+#if CONFIG_REALTIME_ONLY
+  ASSERT_EQ(AOM_CODEC_UNSUP_FEATURE, DecodeRemainingFrames());
+#else
   ASSERT_EQ(AOM_CODEC_OK, DecodeRemainingFrames());
+#endif
 }
 
 TEST_F(ExternalFrameBufferTest, EightJitterBuffers) {
@@ -459,7 +467,11 @@
   ASSERT_EQ(AOM_CODEC_OK,
             SetFrameBufferFunctions(num_buffers, get_aom_frame_buffer,
                                     release_aom_frame_buffer));
+#if CONFIG_REALTIME_ONLY
+  ASSERT_EQ(AOM_CODEC_UNSUP_FEATURE, DecodeRemainingFrames());
+#else
   ASSERT_EQ(AOM_CODEC_OK, DecodeRemainingFrames());
+#endif
 }
 
 TEST_F(ExternalFrameBufferTest, NotEnoughBuffers) {
@@ -470,10 +482,14 @@
   ASSERT_EQ(AOM_CODEC_OK,
             SetFrameBufferFunctions(num_buffers, get_aom_frame_buffer,
                                     release_aom_frame_buffer));
+#if CONFIG_REALTIME_ONLY
+  ASSERT_EQ(AOM_CODEC_UNSUP_FEATURE, DecodeOneFrame());
+#else
   ASSERT_EQ(AOM_CODEC_OK, DecodeOneFrame());
   // Only run this on long clips. Decoding a very short clip will return
   // AOM_CODEC_OK even with only 2 buffers.
   ASSERT_EQ(AOM_CODEC_MEM_ERROR, DecodeRemainingFrames());
+#endif
 }
 
 TEST_F(ExternalFrameBufferTest, NoRelease) {
@@ -481,8 +497,12 @@
   ASSERT_EQ(AOM_CODEC_OK,
             SetFrameBufferFunctions(num_buffers, get_aom_frame_buffer,
                                     do_not_release_aom_frame_buffer));
+#if CONFIG_REALTIME_ONLY
+  ASSERT_EQ(AOM_CODEC_UNSUP_FEATURE, DecodeOneFrame());
+#else
   ASSERT_EQ(AOM_CODEC_OK, DecodeOneFrame());
   ASSERT_EQ(AOM_CODEC_MEM_ERROR, DecodeRemainingFrames());
+#endif
 }
 
 TEST_F(ExternalFrameBufferTest, NullRealloc) {
@@ -515,11 +535,15 @@
 }
 
 TEST_F(ExternalFrameBufferTest, SetAfterDecode) {
+#if CONFIG_REALTIME_ONLY
+  ASSERT_EQ(AOM_CODEC_UNSUP_FEATURE, DecodeOneFrame());
+#else
   const int num_buffers = AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS;
   ASSERT_EQ(AOM_CODEC_OK, DecodeOneFrame());
   ASSERT_EQ(AOM_CODEC_ERROR,
             SetFrameBufferFunctions(num_buffers, get_aom_frame_buffer,
                                     release_aom_frame_buffer));
+#endif
 }
 
 TEST_F(ExternalFrameBufferNonRefTest, ReleaseNonRefFrameBuffer) {
@@ -527,14 +551,20 @@
   ASSERT_EQ(AOM_CODEC_OK,
             SetFrameBufferFunctions(num_buffers, get_aom_frame_buffer,
                                     release_aom_frame_buffer));
+#if CONFIG_REALTIME_ONLY
+  ASSERT_EQ(AOM_CODEC_UNSUP_FEATURE, DecodeRemainingFrames());
+#else
   ASSERT_EQ(AOM_CODEC_OK, DecodeRemainingFrames());
+#endif
   CheckFrameBufferRelease();
 }
 #endif  // CONFIG_WEBM_IO
 
+#if !CONFIG_REALTIME_ONLY
 AV1_INSTANTIATE_TEST_SUITE(
     ExternalFrameBufferMD5Test,
     ::testing::ValuesIn(libaom_test::kAV1TestVectors,
                         libaom_test::kAV1TestVectors +
                             libaom_test::kNumAV1TestVectors));
+#endif
 }  // namespace
diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc
index 69e4bda..046d810 100644
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -20,7 +20,6 @@
 #include "config/av1_rtcd.h"
 #include "config/aom_dsp_rtcd.h"
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/transform_test_base.h"
 #include "test/util.h"
@@ -72,7 +71,7 @@
     TxfmBaseOutType::mask_ = (1 << TxfmBaseOutType::bit_depth_) - 1;
     TxfmBaseOutType::num_coeffs_ = std::get<3>(this->GetParam());
   }
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
+  virtual void TearDown() {}
 
  protected:
   void RunFwdTxfm(const int16_t *in, OutputType *out, int stride) {
diff --git a/test/fft_test.cc b/test/fft_test.cc
index d23aa01..0725ca8 100644
--- a/test/fft_test.cc
+++ b/test/fft_test.cc
@@ -126,12 +126,14 @@
 TEST_P(FFT2DTest, Benchmark) {
   int n = GetParam().n;
   float sum = 0;
-  for (int i = 0; i < 1000 * (64 - n); ++i) {
+  const int num_trials = 1000 * (64 - n);
+  for (int i = 0; i < num_trials; ++i) {
     input_[i % (n * n)] = 1;
     GetParam().fft(&input_[0], &temp_[0], &output_[0]);
     sum += output_[0];
     input_[i % (n * n)] = 0;
   }
+  EXPECT_NEAR(sum, num_trials, 1e-3);
 }
 
 INSTANTIATE_TEST_SUITE_P(C, FFT2DTest,
@@ -221,12 +223,14 @@
 TEST_P(IFFT2DTest, Benchmark) {
   int n = GetParam().n;
   float sum = 0;
-  for (int i = 0; i < 1000 * (64 - n); ++i) {
+  const int num_trials = 1000 * (64 - n);
+  for (int i = 0; i < num_trials; ++i) {
     input_[i % (n * n)] = 1;
     GetParam().ifft(&input_[0], &temp_[0], &output_[0]);
     sum += output_[0];
     input_[i % (n * n)] = 0;
   }
+  EXPECT_GE(sum, num_trials / 2);
 }
 INSTANTIATE_TEST_SUITE_P(
     C, IFFT2DTest,
diff --git a/test/filterintra_test.cc b/test/filterintra_test.cc
index 14cdd39..795616a 100644
--- a/test/filterintra_test.cc
+++ b/test/filterintra_test.cc
@@ -16,7 +16,6 @@
 #include "config/av1_rtcd.h"
 
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 #include "av1/common/enums.h"
@@ -59,7 +58,6 @@
     delete[] alloc_;
     delete[] predRef_;
     delete[] pred_;
-    libaom_test::ClearSystemState();
   }
 
  protected:
@@ -71,7 +69,7 @@
     while (tstIndex < MaxTestNum) {
       PrepareBuffer();
       predFuncRef_(predRef_, stride, txSize_, &above[1], left, mode_);
-      ASM_REGISTER_STATE_CHECK(
+      API_REGISTER_STATE_CHECK(
           predFunc_(pred_, stride, txSize_, &above[1], left, mode_));
       DiffPred(tstIndex);
       tstIndex += 1;
diff --git a/test/firstpass_test.cc b/test/firstpass_test.cc
new file mode 100644
index 0000000..f7d8f2e
--- /dev/null
+++ b/test/firstpass_test.cc
@@ -0,0 +1,162 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stddef.h>
+
+#include "av1/common/common.h"
+#include "av1/encoder/firstpass.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+TEST(FirstpassTest, FirstpassInfoInitWithExtBuf) {
+  FIRSTPASS_INFO firstpass_info;
+  FIRSTPASS_STATS ext_stats_buf[10];
+  const int ref_stats_size = 10;
+  for (int i = 0; i < ref_stats_size; ++i) {
+    av1_zero(ext_stats_buf[i]);
+    ext_stats_buf[i].frame = i;
+  }
+  aom_codec_err_t ret =
+      av1_firstpass_info_init(&firstpass_info, ext_stats_buf, 10);
+  EXPECT_EQ(firstpass_info.stats_count, ref_stats_size);
+  EXPECT_EQ(firstpass_info.future_stats_count + firstpass_info.past_stats_count,
+            firstpass_info.stats_count);
+  EXPECT_EQ(firstpass_info.cur_index, 0);
+  EXPECT_EQ(ret, AOM_CODEC_OK);
+}
+
+TEST(FirstpassTest, FirstpassInfoInitWithStaticBuf) {
+  FIRSTPASS_INFO firstpass_info;
+  aom_codec_err_t ret = av1_firstpass_info_init(&firstpass_info, NULL, 0);
+  EXPECT_EQ(firstpass_info.stats_count, 0);
+  EXPECT_EQ(firstpass_info.cur_index, 0);
+  EXPECT_EQ(ret, AOM_CODEC_OK);
+}
+
+TEST(FirstpassTest, FirstpassInfoPushPop) {
+  FIRSTPASS_INFO firstpass_info;
+  av1_firstpass_info_init(&firstpass_info, NULL, 0);
+  EXPECT_EQ(firstpass_info.stats_buf_size, FIRSTPASS_INFO_STATIC_BUF_SIZE);
+  for (int i = 0; i < FIRSTPASS_INFO_STATIC_BUF_SIZE; ++i) {
+    FIRSTPASS_STATS stats;
+    av1_zero(stats);
+    stats.frame = i;
+    aom_codec_err_t ret = av1_firstpass_info_push(&firstpass_info, &stats);
+    EXPECT_EQ(ret, AOM_CODEC_OK);
+  }
+  EXPECT_EQ(firstpass_info.stats_count, FIRSTPASS_INFO_STATIC_BUF_SIZE);
+  const int pop_count = FIRSTPASS_INFO_STATIC_BUF_SIZE / 2;
+  for (int i = 0; i < pop_count; ++i) {
+    const FIRSTPASS_STATS *stats = av1_firstpass_info_peek(&firstpass_info, 0);
+    aom_codec_err_t ret =
+        av1_firstpass_info_move_cur_index_and_pop(&firstpass_info);
+    EXPECT_NE(stats, nullptr);
+    EXPECT_EQ(stats->frame, i);
+    EXPECT_EQ(ret, AOM_CODEC_OK);
+  }
+  EXPECT_EQ(firstpass_info.stats_count,
+            FIRSTPASS_INFO_STATIC_BUF_SIZE - pop_count);
+
+  const int push_count = FIRSTPASS_INFO_STATIC_BUF_SIZE / 2;
+  for (int i = 0; i < push_count; ++i) {
+    FIRSTPASS_STATS stats;
+    av1_zero(stats);
+    aom_codec_err_t ret = av1_firstpass_info_push(&firstpass_info, &stats);
+    EXPECT_EQ(ret, AOM_CODEC_OK);
+  }
+  EXPECT_EQ(firstpass_info.stats_count, FIRSTPASS_INFO_STATIC_BUF_SIZE);
+
+  EXPECT_EQ(firstpass_info.stats_count, firstpass_info.stats_buf_size);
+  // Push the stats when the queue is full.
+  FIRSTPASS_STATS stats;
+  av1_zero(stats);
+  aom_codec_err_t ret = av1_firstpass_info_push(&firstpass_info, &stats);
+  EXPECT_EQ(ret, AOM_CODEC_ERROR);
+}
+
+TEST(FirstpassTest, FirstpassInfoTotalStats) {
+  FIRSTPASS_INFO firstpass_info;
+  av1_firstpass_info_init(&firstpass_info, NULL, 0);
+  EXPECT_EQ(firstpass_info.total_stats.frame, 0);
+  for (int i = 0; i < 10; ++i) {
+    FIRSTPASS_STATS stats;
+    av1_zero(stats);
+    stats.count = 1;
+    av1_firstpass_info_push(&firstpass_info, &stats);
+  }
+  EXPECT_EQ(firstpass_info.total_stats.count, 10);
+}
+
+TEST(FirstpassTest, FirstpassInfoMoveCurr) {
+  FIRSTPASS_INFO firstpass_info;
+  av1_firstpass_info_init(&firstpass_info, NULL, 0);
+  int frame_cnt = 0;
+  EXPECT_EQ(firstpass_info.stats_buf_size, FIRSTPASS_INFO_STATIC_BUF_SIZE);
+  for (int i = 0; i < FIRSTPASS_INFO_STATIC_BUF_SIZE; ++i) {
+    FIRSTPASS_STATS stats;
+    av1_zero(stats);
+    stats.frame = frame_cnt;
+    ++frame_cnt;
+    aom_codec_err_t ret = av1_firstpass_info_push(&firstpass_info, &stats);
+    EXPECT_EQ(ret, AOM_CODEC_OK);
+  }
+  EXPECT_EQ(firstpass_info.cur_index, firstpass_info.start_index);
+  aom_codec_err_t ret = av1_firstpass_info_pop(&firstpass_info);
+  // We cannot pop when cur_index == start_index
+  EXPECT_EQ(ret, AOM_CODEC_ERROR);
+  int ref_frame_cnt = 0;
+  const int move_count = FIRSTPASS_INFO_STATIC_BUF_SIZE * 2 / 3;
+  for (int i = 0; i < move_count; ++i) {
+    const FIRSTPASS_STATS *this_stats =
+        av1_firstpass_info_peek(&firstpass_info, 0);
+    EXPECT_EQ(this_stats->frame, ref_frame_cnt);
+    ++ref_frame_cnt;
+    av1_firstpass_info_move_cur_index(&firstpass_info);
+  }
+  EXPECT_EQ(firstpass_info.future_stats_count,
+            FIRSTPASS_INFO_STATIC_BUF_SIZE - move_count);
+  EXPECT_EQ(firstpass_info.past_stats_count, move_count);
+  EXPECT_EQ(firstpass_info.stats_count, FIRSTPASS_INFO_STATIC_BUF_SIZE);
+
+  const int test_count = FIRSTPASS_INFO_STATIC_BUF_SIZE / 2;
+  for (int i = 0; i < test_count; ++i) {
+    aom_codec_err_t ret = av1_firstpass_info_pop(&firstpass_info);
+    EXPECT_EQ(ret, AOM_CODEC_OK);
+  }
+
+  // Pop #test_count stats
+  for (int i = 0; i < test_count; ++i) {
+    FIRSTPASS_STATS stats;
+    av1_zero(stats);
+    stats.frame = frame_cnt;
+    ++frame_cnt;
+    aom_codec_err_t ret = av1_firstpass_info_push(&firstpass_info, &stats);
+    EXPECT_EQ(ret, AOM_CODEC_OK);
+  }
+
+  // peek and move #test_count stats
+  for (int i = 0; i < test_count; ++i) {
+    const FIRSTPASS_STATS *this_stats =
+        av1_firstpass_info_peek(&firstpass_info, 0);
+    EXPECT_EQ(this_stats->frame, ref_frame_cnt);
+    ++ref_frame_cnt;
+    av1_firstpass_info_move_cur_index(&firstpass_info);
+  }
+
+  // pop #test_count stats
+  for (int i = 0; i < test_count; ++i) {
+    aom_codec_err_t ret = av1_firstpass_info_pop(&firstpass_info);
+    EXPECT_EQ(ret, AOM_CODEC_OK);
+  }
+}
+
+}  // namespace
diff --git a/test/frame_error_test.cc b/test/frame_error_test.cc
index 6478f09..7acda9f 100644
--- a/test/frame_error_test.cc
+++ b/test/frame_error_test.cc
@@ -20,7 +20,6 @@
 #include "aom_ports/aom_timer.h"
 #include "aom_ports/mem.h"
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/util.h"
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
@@ -44,7 +43,7 @@
   virtual void SetUp() {
     rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
   }
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
+  virtual void TearDown() {}
 
  protected:
   void RandomValues(frame_error_func test_impl, int width, int height);
diff --git a/test/frame_size_tests.cc b/test/frame_size_tests.cc
index 38b6a63..2365a20 100644
--- a/test/frame_size_tests.cc
+++ b/test/frame_size_tests.cc
@@ -73,6 +73,7 @@
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 }
 
+#if !CONFIG_REALTIME_ONLY
 typedef struct {
   unsigned int width;
   unsigned int height;
@@ -129,5 +130,6 @@
 AV1_INSTANTIATE_TEST_SUITE(AV1LosslessFrameSizeTests,
                            ::testing::ValuesIn(FrameSizeTestParams),
                            testing::Values(::libaom_test::kAllIntra));
+#endif  // !CONFIG_REALTIME_ONLY
 
 }  // namespace
diff --git a/test/function_equivalence_test.h b/test/function_equivalence_test.h
index a299c48..a7116b1 100644
--- a/test/function_equivalence_test.h
+++ b/test/function_equivalence_test.h
@@ -16,7 +16,6 @@
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/util.h"
 
 using libaom_test::ACMRandom;
@@ -60,7 +59,7 @@
 
   virtual void SetUp() { params_ = this->GetParam(); }
 
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
+  virtual void TearDown() {}
 
  protected:
   ACMRandom rng_;
diff --git a/test/fwht4x4_test.cc b/test/fwht4x4_test.cc
index b600d26..2e27adf 100644
--- a/test/fwht4x4_test.cc
+++ b/test/fwht4x4_test.cc
@@ -20,7 +20,6 @@
 #include "config/av1_rtcd.h"
 #include "config/aom_dsp_rtcd.h"
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/transform_test_base.h"
 #include "test/util.h"
@@ -45,14 +44,26 @@
   av1_fwht4x4_c(in, out, stride);
 }
 
-void iwht4x4_10(const tran_low_t *in, uint8_t *out, int stride) {
+void iwht4x4_10_c(const tran_low_t *in, uint8_t *out, int stride) {
   av1_highbd_iwht4x4_16_add_c(in, out, stride, 10);
 }
 
-void iwht4x4_12(const tran_low_t *in, uint8_t *out, int stride) {
+void iwht4x4_12_c(const tran_low_t *in, uint8_t *out, int stride) {
   av1_highbd_iwht4x4_16_add_c(in, out, stride, 12);
 }
 
+#if HAVE_SSE4_1
+
+void iwht4x4_10_sse4_1(const tran_low_t *in, uint8_t *out, int stride) {
+  av1_highbd_iwht4x4_16_add_sse4_1(in, out, stride, 10);
+}
+
+void iwht4x4_12_sse4_1(const tran_low_t *in, uint8_t *out, int stride) {
+  av1_highbd_iwht4x4_16_add_sse4_1(in, out, stride, 12);
+}
+
+#endif
+
 class Trans4x4WHT : public libaom_test::TransformTestBase<tran_low_t>,
                     public ::testing::TestWithParam<Dct4x4Param> {
  public:
@@ -69,7 +80,7 @@
     num_coeffs_ = GET_PARAM(4);
     fwd_txfm_c_ = GET_PARAM(5);
   }
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
+  virtual void TearDown() {}
 
  protected:
   void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
@@ -118,7 +129,7 @@
         aom_usec_timer c_timer_;
         aom_usec_timer_start(&c_timer_);
         for (int i = 0; i < numIter; i++) {
-          ASM_REGISTER_STATE_CHECK(
+          API_REGISTER_STATE_CHECK(
               fwd_txfm_c_(input_block, output_ref_block, stride));
         }
         aom_usec_timer_mark(&c_timer_);
@@ -127,7 +138,7 @@
         aom_usec_timer_start(&simd_timer_);
 
         for (int i = 0; i < numIter; i++) {
-          ASM_REGISTER_STATE_CHECK(
+          API_REGISTER_STATE_CHECK(
               fwd_txfm_(input_block, output_block, stride));
         }
         aom_usec_timer_mark(&simd_timer_);
@@ -177,19 +188,35 @@
 
 INSTANTIATE_TEST_SUITE_P(
     C, Trans4x4WHT,
-    ::testing::Values(make_tuple(&av1_highbd_fwht4x4_c, &iwht4x4_10, DCT_DCT,
+    ::testing::Values(make_tuple(&av1_highbd_fwht4x4_c, &iwht4x4_10_c, DCT_DCT,
                                  AOM_BITS_10, 16, static_cast<FdctFunc>(NULL)),
-                      make_tuple(&av1_highbd_fwht4x4_c, &iwht4x4_12, DCT_DCT,
+                      make_tuple(&av1_highbd_fwht4x4_c, &iwht4x4_12_c, DCT_DCT,
                                  AOM_BITS_12, 16,
                                  static_cast<FdctFunc>(NULL))));
+
+#if HAVE_SSE4_1
+
+INSTANTIATE_TEST_SUITE_P(
+    SSE4_1, Trans4x4WHT,
+    ::testing::Values(make_tuple(&av1_highbd_fwht4x4_sse4_1, &iwht4x4_10_sse4_1,
+                                 DCT_DCT, AOM_BITS_10, 16,
+                                 static_cast<FdctFunc>(NULL)),
+                      make_tuple(&av1_highbd_fwht4x4_sse4_1, &iwht4x4_12_sse4_1,
+                                 DCT_DCT, AOM_BITS_12, 16,
+                                 static_cast<FdctFunc>(NULL))));
+
+#endif  // HAVE_SSE4_1
+
 #if HAVE_NEON
 
 INSTANTIATE_TEST_SUITE_P(
     NEON, Trans4x4WHT,
-    ::testing::Values(make_tuple(&av1_highbd_fwht4x4_neon, &iwht4x4_10, DCT_DCT,
-                                 AOM_BITS_10, 16, &av1_highbd_fwht4x4_c),
-                      make_tuple(&av1_highbd_fwht4x4_neon, &iwht4x4_12, DCT_DCT,
-                                 AOM_BITS_12, 16, &av1_highbd_fwht4x4_c)));
+    ::testing::Values(make_tuple(&av1_highbd_fwht4x4_neon, &iwht4x4_10_c,
+                                 DCT_DCT, AOM_BITS_10, 16,
+                                 &av1_highbd_fwht4x4_c),
+                      make_tuple(&av1_highbd_fwht4x4_neon, &iwht4x4_12_c,
+                                 DCT_DCT, AOM_BITS_12, 16,
+                                 &av1_highbd_fwht4x4_c)));
 
 #endif  // HAVE_NEON
 
diff --git a/test/hadamard_test.cc b/test/hadamard_test.cc
index 7903259..8813f33 100644
--- a/test/hadamard_test.cc
+++ b/test/hadamard_test.cc
@@ -16,7 +16,6 @@
 #include "config/aom_dsp_rtcd.h"
 
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 
@@ -24,16 +23,20 @@
 
 using libaom_test::ACMRandom;
 
-typedef void (*HadamardFunc)(const int16_t *a, ptrdiff_t a_stride,
-                             tran_low_t *b);
+using HadamardFunc = void (*)(const int16_t *a, ptrdiff_t a_stride,
+                              tran_low_t *b);
+// Low precision version of Hadamard Transform
+using HadamardLPFunc = void (*)(const int16_t *a, ptrdiff_t a_stride,
+                                int16_t *b);
 
-void HadamardLoop(const tran_low_t *a, tran_low_t *out) {
-  tran_low_t b[8];
+template <typename OutputType>
+void HadamardLoop(const OutputType *a, OutputType *out) {
+  OutputType b[8];
   for (int i = 0; i < 8; i += 2) {
     b[i + 0] = a[i * 8] + a[(i + 1) * 8];
     b[i + 1] = a[i * 8] - a[(i + 1) * 8];
   }
-  tran_low_t c[8];
+  OutputType c[8];
   for (int i = 0; i < 8; i += 4) {
     c[i + 0] = b[i + 0] + b[i + 2];
     c[i + 1] = b[i + 1] + b[i + 3];
@@ -50,19 +53,21 @@
   out[5] = c[3] - c[7];
 }
 
-void ReferenceHadamard8x8(const int16_t *a, int a_stride, tran_low_t *b) {
-  tran_low_t input[64];
-  tran_low_t buf[64];
+template <typename OutputType>
+void ReferenceHadamard8x8(const int16_t *a, int a_stride, OutputType *b) {
+  OutputType input[64];
+  OutputType buf[64];
   for (int i = 0; i < 8; ++i) {
     for (int j = 0; j < 8; ++j) {
-      input[i * 8 + j] = static_cast<tran_low_t>(a[i * a_stride + j]);
+      input[i * 8 + j] = static_cast<OutputType>(a[i * a_stride + j]);
     }
   }
   for (int i = 0; i < 8; ++i) HadamardLoop(input + i, buf + i * 8);
   for (int i = 0; i < 8; ++i) HadamardLoop(buf + i, b + i * 8);
 }
 
-void ReferenceHadamard16x16(const int16_t *a, int a_stride, tran_low_t *b) {
+template <typename OutputType>
+void ReferenceHadamard16x16(const int16_t *a, int a_stride, OutputType *b) {
   /* The source is a 16x16 block. The destination is rearranged to 8x32.
    * Input is 9 bit. */
   ReferenceHadamard8x8(a + 0 + 0 * a_stride, a_stride, b + 0);
@@ -73,16 +78,16 @@
   /* Overlay the 8x8 blocks and combine. */
   for (int i = 0; i < 64; ++i) {
     /* 8x8 steps the range up to 15 bits. */
-    const tran_low_t a0 = b[0];
-    const tran_low_t a1 = b[64];
-    const tran_low_t a2 = b[128];
-    const tran_low_t a3 = b[192];
+    const OutputType a0 = b[0];
+    const OutputType a1 = b[64];
+    const OutputType a2 = b[128];
+    const OutputType a3 = b[192];
 
     /* Prevent the result from escaping int16_t. */
-    const tran_low_t b0 = (a0 + a1) >> 1;
-    const tran_low_t b1 = (a0 - a1) >> 1;
-    const tran_low_t b2 = (a2 + a3) >> 1;
-    const tran_low_t b3 = (a2 - a3) >> 1;
+    const OutputType b0 = (a0 + a1) >> 1;
+    const OutputType b1 = (a0 - a1) >> 1;
+    const OutputType b2 = (a2 + a3) >> 1;
+    const OutputType b3 = (a2 - a3) >> 1;
 
     /* Store a 16 bit value. */
     b[0] = b0 + b2;
@@ -94,22 +99,23 @@
   }
 }
 
-void ReferenceHadamard32x32(const int16_t *a, int a_stride, tran_low_t *b) {
+template <typename OutputType>
+void ReferenceHadamard32x32(const int16_t *a, int a_stride, OutputType *b) {
   ReferenceHadamard16x16(a + 0 + 0 * a_stride, a_stride, b + 0);
   ReferenceHadamard16x16(a + 16 + 0 * a_stride, a_stride, b + 256);
   ReferenceHadamard16x16(a + 0 + 16 * a_stride, a_stride, b + 512);
   ReferenceHadamard16x16(a + 16 + 16 * a_stride, a_stride, b + 768);
 
   for (int i = 0; i < 256; ++i) {
-    const tran_low_t a0 = b[0];
-    const tran_low_t a1 = b[256];
-    const tran_low_t a2 = b[512];
-    const tran_low_t a3 = b[768];
+    const OutputType a0 = b[0];
+    const OutputType a1 = b[256];
+    const OutputType a2 = b[512];
+    const OutputType a3 = b[768];
 
-    const tran_low_t b0 = (a0 + a1) >> 2;
-    const tran_low_t b1 = (a0 - a1) >> 2;
-    const tran_low_t b2 = (a2 + a3) >> 2;
-    const tran_low_t b3 = (a2 - a3) >> 2;
+    const OutputType b0 = (a0 + a1) >> 2;
+    const OutputType b1 = (a0 - a1) >> 2;
+    const OutputType b2 = (a2 + a3) >> 2;
+    const OutputType b3 = (a2 - a3) >> 2;
 
     b[0] = b0 + b2;
     b[256] = b1 + b3;
@@ -120,51 +126,63 @@
   }
 }
 
-struct HadamardFuncWithSize {
-  HadamardFuncWithSize(HadamardFunc f, int s) : func(f), block_size(s) {}
-  HadamardFunc func;
+template <typename OutputType>
+void ReferenceHadamard(const int16_t *a, int a_stride, OutputType *b, int bwh) {
+  if (bwh == 32)
+    ReferenceHadamard32x32(a, a_stride, b);
+  else if (bwh == 16)
+    ReferenceHadamard16x16(a, a_stride, b);
+  else if (bwh == 8) {
+    ReferenceHadamard8x8(a, a_stride, b);
+  } else {
+    GTEST_FAIL() << "Invalid Hadamard transform size " << bwh << std::endl;
+  }
+}
+
+template <typename HadamardFuncType>
+struct FuncWithSize {
+  FuncWithSize(HadamardFuncType f, int s) : func(f), block_size(s) {}
+  HadamardFuncType func;
   int block_size;
 };
 
-std::ostream &operator<<(std::ostream &os, const HadamardFuncWithSize &hfs) {
+using HadamardFuncWithSize = FuncWithSize<HadamardFunc>;
+using HadamardLPFuncWithSize = FuncWithSize<HadamardLPFunc>;
+
+template <typename HadamardFuncType>
+std::ostream &operator<<(std::ostream &os,
+                         const FuncWithSize<HadamardFuncType> &hfs) {
   return os << "block size: " << hfs.block_size;
 }
 
-class HadamardTestBase : public ::testing::TestWithParam<HadamardFuncWithSize> {
+template <typename OutputType, typename HadamardFuncType>
+class HadamardTestBase
+    : public ::testing::TestWithParam<FuncWithSize<HadamardFuncType>> {
  public:
-  virtual void SetUp() {
-    h_func_ = GetParam().func;
-    bwh_ = GetParam().block_size;
+  explicit HadamardTestBase(const FuncWithSize<HadamardFuncType> &func_param) {
+    h_func_ = func_param.func;
+    bwh_ = func_param.block_size;
     block_size_ = bwh_ * bwh_;
-    rnd_.Reset(ACMRandom::DeterministicSeed());
   }
 
+  virtual void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
+
   virtual int16_t Rand() = 0;
 
-  void ReferenceHadamard(const int16_t *a, int a_stride, tran_low_t *b,
-                         int bwh) {
-    if (bwh == 32)
-      ReferenceHadamard32x32(a, a_stride, b);
-    else if (bwh == 16)
-      ReferenceHadamard16x16(a, a_stride, b);
-    else
-      ReferenceHadamard8x8(a, a_stride, b);
-  }
-
   void CompareReferenceRandom() {
     const int kMaxBlockSize = 32 * 32;
     DECLARE_ALIGNED(16, int16_t, a[kMaxBlockSize]);
-    DECLARE_ALIGNED(16, tran_low_t, b[kMaxBlockSize]);
+    DECLARE_ALIGNED(16, OutputType, b[kMaxBlockSize]);
     memset(a, 0, sizeof(a));
     memset(b, 0, sizeof(b));
 
-    tran_low_t b_ref[kMaxBlockSize];
+    OutputType b_ref[kMaxBlockSize];
     memset(b_ref, 0, sizeof(b_ref));
 
     for (int i = 0; i < block_size_; ++i) a[i] = Rand();
 
     ReferenceHadamard(a, bwh_, b_ref, bwh_);
-    ASM_REGISTER_STATE_CHECK(h_func_(a, bwh_, b));
+    API_REGISTER_STATE_CHECK(h_func_(a, bwh_, b));
 
     // The order of the output is not important. Sort before checking.
     std::sort(b, b + block_size_);
@@ -175,17 +193,17 @@
   void VaryStride() {
     const int kMaxBlockSize = 32 * 32;
     DECLARE_ALIGNED(16, int16_t, a[kMaxBlockSize * 8]);
-    DECLARE_ALIGNED(16, tran_low_t, b[kMaxBlockSize]);
+    DECLARE_ALIGNED(16, OutputType, b[kMaxBlockSize]);
     memset(a, 0, sizeof(a));
     for (int i = 0; i < block_size_ * 8; ++i) a[i] = Rand();
 
-    tran_low_t b_ref[kMaxBlockSize];
+    OutputType b_ref[kMaxBlockSize];
     for (int i = 8; i < 64; i += 8) {
       memset(b, 0, sizeof(b));
       memset(b_ref, 0, sizeof(b_ref));
 
       ReferenceHadamard(a, i, b_ref, bwh_);
-      ASM_REGISTER_STATE_CHECK(h_func_(a, i, b));
+      API_REGISTER_STATE_CHECK(h_func_(a, i, b));
 
       // The order of the output is not important. Sort before checking.
       std::sort(b, b + block_size_);
@@ -197,7 +215,7 @@
   void SpeedTest(int times) {
     const int kMaxBlockSize = 32 * 32;
     DECLARE_ALIGNED(16, int16_t, input[kMaxBlockSize]);
-    DECLARE_ALIGNED(16, tran_low_t, output[kMaxBlockSize]);
+    DECLARE_ALIGNED(16, OutputType, output[kMaxBlockSize]);
     memset(input, 1, sizeof(input));
     memset(output, 0, sizeof(output));
 
@@ -218,11 +236,12 @@
  private:
   int bwh_;
   int block_size_;
-  HadamardFunc h_func_;
+  HadamardFuncType h_func_;
 };
 
-class HadamardLowbdTest : public HadamardTestBase {
+class HadamardLowbdTest : public HadamardTestBase<tran_low_t, HadamardFunc> {
  public:
+  HadamardLowbdTest() : HadamardTestBase(GetParam()) {}
   virtual int16_t Rand() { return rnd_.Rand9Signed(); }
 };
 
@@ -230,6 +249,8 @@
 
 TEST_P(HadamardLowbdTest, VaryStride) { VaryStride(); }
 
+TEST_P(HadamardLowbdTest, DISABLED_SpeedTest) { SpeedTest(1000000); }
+
 INSTANTIATE_TEST_SUITE_P(
     C, HadamardLowbdTest,
     ::testing::Values(HadamardFuncWithSize(&aom_hadamard_8x8_c, 8),
@@ -258,4 +279,44 @@
                       HadamardFuncWithSize(&aom_hadamard_16x16_neon, 16)));
 #endif  // HAVE_NEON
 
+// Tests for low precision
+class HadamardLowbdLPTest : public HadamardTestBase<int16_t, HadamardLPFunc> {
+ public:
+  HadamardLowbdLPTest() : HadamardTestBase(GetParam()) {}
+  virtual int16_t Rand() { return rnd_.Rand9Signed(); }
+};
+
+TEST_P(HadamardLowbdLPTest, CompareReferenceRandom) {
+  CompareReferenceRandom();
+}
+
+TEST_P(HadamardLowbdLPTest, VaryStride) { VaryStride(); }
+
+TEST_P(HadamardLowbdLPTest, DISABLED_SpeedTest) { SpeedTest(1000000); }
+
+INSTANTIATE_TEST_SUITE_P(
+    C, HadamardLowbdLPTest,
+    ::testing::Values(HadamardLPFuncWithSize(&aom_hadamard_lp_8x8_c, 8),
+                      HadamardLPFuncWithSize(&aom_hadamard_lp_16x16_c, 16)));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, HadamardLowbdLPTest,
+    ::testing::Values(HadamardLPFuncWithSize(&aom_hadamard_lp_8x8_sse2, 8),
+                      HadamardLPFuncWithSize(&aom_hadamard_lp_16x16_sse2, 16)));
+#endif  // HAVE_SSE2
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, HadamardLowbdLPTest,
+    ::testing::Values(HadamardLPFuncWithSize(&aom_hadamard_lp_16x16_avx2, 16)));
+#endif  // HAVE_AVX2
+
+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, HadamardLowbdLPTest,
+    ::testing::Values(HadamardLPFuncWithSize(&aom_hadamard_lp_8x8_neon, 8),
+                      HadamardLPFuncWithSize(&aom_hadamard_lp_16x16_neon, 16)));
+#endif  // HAVE_NEON
+
 }  // namespace
diff --git a/test/hbd_metrics_test.cc b/test/hbd_metrics_test.cc
index 8044b51..39c2b4c 100644
--- a/test/hbd_metrics_test.cc
+++ b/test/hbd_metrics_test.cc
@@ -88,7 +88,7 @@
 double compute_aomssim(const YV12_BUFFER_CONFIG *source,
                        const YV12_BUFFER_CONFIG *dest) {
   double ssim, weight;
-  aom_calc_ssim(source, dest, &weight, &ssim);
+  aom_lowbd_calc_ssim(source, dest, &weight, &ssim);
   return 100 * pow(ssim / weight, 8.0);
 }
 
diff --git a/test/hiprec_convolve_test_util.cc b/test/hiprec_convolve_test_util.cc
index 956af7f..04e16bc 100644
--- a/test/hiprec_convolve_test_util.cc
+++ b/test/hiprec_convolve_test_util.cc
@@ -85,7 +85,7 @@
   rnd_.Reset(ACMRandom::DeterministicSeed());
 }
 
-void AV1HiprecConvolveTest::TearDown() { libaom_test::ClearSystemState(); }
+void AV1HiprecConvolveTest::TearDown() {}
 
 void AV1HiprecConvolveTest::RunCheckOutput(hiprec_convolve_func test_impl) {
   const int w = 128, h = 128;
@@ -220,9 +220,7 @@
   rnd_.Reset(ACMRandom::DeterministicSeed());
 }
 
-void AV1HighbdHiprecConvolveTest::TearDown() {
-  libaom_test::ClearSystemState();
-}
+void AV1HighbdHiprecConvolveTest::TearDown() {}
 
 void AV1HighbdHiprecConvolveTest::RunCheckOutput(
     highbd_hiprec_convolve_func test_impl) {
diff --git a/test/hiprec_convolve_test_util.h b/test/hiprec_convolve_test_util.h
index 6b6da4e..e064ba6 100644
--- a/test/hiprec_convolve_test_util.h
+++ b/test/hiprec_convolve_test_util.h
@@ -18,7 +18,6 @@
 
 #include "test/acm_random.h"
 #include "test/util.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 
diff --git a/test/horz_superres_test.cc b/test/horz_superres_test.cc
index 9733344..2f0f3fd 100644
--- a/test/horz_superres_test.cc
+++ b/test/horz_superres_test.cc
@@ -52,7 +52,7 @@
 }
 
 const TestVideoParam kTestVideoVectors[] = {
-  { "park_joy_90p_8_420.y4m", AOM_IMG_FMT_I420, AOM_BITS_8, 0, 5, 0, 25.5,
+  { "park_joy_90p_8_420.y4m", AOM_IMG_FMT_I420, AOM_BITS_8, 0, 5, 0, 25.4,
     45.0 },
 #if CONFIG_AV1_HIGHBITDEPTH
   { "park_joy_90p_10_444.y4m", AOM_IMG_FMT_I44416, AOM_BITS_10, 1, 5, 0, 27.0,
diff --git a/test/intra_edge_test.cc b/test/intra_edge_test.cc
index f7702c9..84e712d 100644
--- a/test/intra_edge_test.cc
+++ b/test/intra_edge_test.cc
@@ -73,7 +73,7 @@
  protected:
   void Execute(uint8_t *edge_tst) {
     params_.ref_func(edge_ref_, size_);
-    ASM_REGISTER_STATE_CHECK(params_.tst_func(edge_tst, size_));
+    API_REGISTER_STATE_CHECK(params_.tst_func(edge_tst, size_));
   }
 };
 
@@ -117,7 +117,7 @@
  protected:
   void Execute(uint16_t *edge_tst) {
     params_.ref_func(edge_ref_, size_, bit_depth_);
-    ASM_REGISTER_STATE_CHECK(params_.tst_func(edge_tst, size_, bit_depth_));
+    API_REGISTER_STATE_CHECK(params_.tst_func(edge_tst, size_, bit_depth_));
   }
   int bit_depth_;
 };
@@ -202,7 +202,7 @@
  protected:
   void Execute(uint8_t *edge_tst) {
     params_.ref_func(edge_ref_, size_, strength_);
-    ASM_REGISTER_STATE_CHECK(params_.tst_func(edge_tst, size_, strength_));
+    API_REGISTER_STATE_CHECK(params_.tst_func(edge_tst, size_, strength_));
   }
 };
 
@@ -240,7 +240,7 @@
  protected:
   void Execute(uint16_t *edge_tst) {
     params_.ref_func(edge_ref_, size_, strength_);
-    ASM_REGISTER_STATE_CHECK(params_.tst_func(edge_tst, size_, strength_));
+    API_REGISTER_STATE_CHECK(params_.tst_func(edge_tst, size_, strength_));
   }
   int bit_depth_;
 };
@@ -284,7 +284,7 @@
   }
   edge_tst_ = &edge_tst_data_[kOffset];
   for (int iter = 0; iter < test_count; ++iter) {
-    ASM_REGISTER_STATE_CHECK(params_.tst_func(edge_tst_, size_));
+    API_REGISTER_STATE_CHECK(params_.tst_func(edge_tst_, size_));
   }
 }
 
@@ -298,7 +298,7 @@
   }
   edge_tst_ = &edge_tst_data_[kOffset];
   for (int iter = 0; iter < test_count; ++iter) {
-    ASM_REGISTER_STATE_CHECK(params_.tst_func(edge_tst_, size_, bit_depth_));
+    API_REGISTER_STATE_CHECK(params_.tst_func(edge_tst_, size_, bit_depth_));
   }
 }
 
@@ -311,7 +311,7 @@
   }
   edge_tst_ = &edge_tst_data_[kOffset];
   for (int iter = 0; iter < test_count; ++iter) {
-    ASM_REGISTER_STATE_CHECK(params_.tst_func(edge_tst_, size_, strength_));
+    API_REGISTER_STATE_CHECK(params_.tst_func(edge_tst_, size_, strength_));
     // iterate over filter strengths (1,2,3)
     strength_ = (strength_ == 3) ? 1 : strength_ + 1;
   }
@@ -328,7 +328,7 @@
   }
   edge_tst_ = &edge_tst_data_[kOffset];
   for (int iter = 0; iter < test_count; ++iter) {
-    ASM_REGISTER_STATE_CHECK(params_.tst_func(edge_tst_, size_, strength_));
+    API_REGISTER_STATE_CHECK(params_.tst_func(edge_tst_, size_, strength_));
     // iterate over filter strengths (1,2,3)
     strength_ = (strength_ == 3) ? 1 : strength_ + 1;
   }
diff --git a/test/intrabc_test.cc b/test/intrabc_test.cc
index b57eb6f..2c60596 100644
--- a/test/intrabc_test.cc
+++ b/test/intrabc_test.cc
@@ -153,8 +153,10 @@
   xd.plane[2].subsampling_x = 1;
   xd.plane[2].subsampling_y = 1;
 
+  SequenceHeader seq_params = {};
   AV1_COMMON cm;
   memset(&cm, 0, sizeof(cm));
+  cm.seq_params = &seq_params;
 
   for (const DvTestCase &dv_case : kDvCases) {
     const int mi_row = xd.tile.mi_row_start + dv_case.mi_row_offset;
diff --git a/test/intrapred_test.cc b/test/intrapred_test.cc
index b04ab50..53f8c19 100644
--- a/test/intrapred_test.cc
+++ b/test/intrapred_test.cc
@@ -17,7 +17,6 @@
 #include "config/aom_dsp_rtcd.h"
 
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 #include "av1/common/blockd.h"
@@ -199,7 +198,7 @@
   void Predict() {
     const int bit_depth = params_.bit_depth;
     params_.ref_fn(ref_dst_, stride_, above_row_, left_col_, bit_depth);
-    ASM_REGISTER_STATE_CHECK(
+    API_REGISTER_STATE_CHECK(
         params_.pred_fn(dst_, stride_, above_row_, left_col_, bit_depth));
   }
   void PredictRefSpeedTest(int num) {
@@ -223,7 +222,7 @@
  protected:
   void Predict() {
     params_.ref_fn(ref_dst_, stride_, above_row_, left_col_);
-    ASM_REGISTER_STATE_CHECK(
+    API_REGISTER_STATE_CHECK(
         params_.pred_fn(dst_, stride_, above_row_, left_col_));
   }
   void PredictRefSpeedTest(int num) {
diff --git a/test/invalid_file_test.cc b/test/invalid_file_test.cc
index 77839fa..acc8986 100644
--- a/test/invalid_file_test.cc
+++ b/test/invalid_file_test.cc
@@ -103,8 +103,7 @@
     const DecodeParam input = GET_PARAM(1);
     aom_codec_dec_cfg_t cfg = { 0, 0, 0, !FORCE_HIGHBITDEPTH_DECODING };
     cfg.threads = input.threads;
-    const std::string filename = input.filename;
-    libaom_test::IVFVideoSource decode_video(filename);
+    libaom_test::IVFVideoSource decode_video(input.filename);
     decode_video.Init();
 
     // The result file holds a list of expected integer results, one for each
@@ -151,6 +150,7 @@
   { 1, "invalid-oss-fuzz-10779.ivf", NULL },
   { 1, "invalid-oss-fuzz-11477.ivf", NULL },
   { 1, "invalid-oss-fuzz-11479.ivf", "invalid-oss-fuzz-11479.ivf.res.2" },
+  { 1, "invalid-oss-fuzz-33030.ivf", NULL },
 #endif
 };
 
diff --git a/test/kf_test.cc b/test/kf_test.cc
index cc2cc89..0cef8db 100644
--- a/test/kf_test.cc
+++ b/test/kf_test.cc
@@ -18,6 +18,8 @@
 #include "test/i420_video_source.h"
 #include "test/util.h"
 
+#define NUM_LAG_VALUES 3
+
 namespace {
 typedef struct {
   const unsigned int min_kf_dist;
@@ -100,10 +102,36 @@
   aom_rc_mode end_usage_check_;
 };
 
+// Because valgrind builds take a very long time to run, use a lower
+// resolution video for valgrind runs.
+const char *TestFileName() {
+#if AOM_VALGRIND_BUILD
+  return "hantro_collage_w176h144.yuv";
+#else
+  return "hantro_collage_w352h288.yuv";
+#endif  // AOM_VALGRIND_BUILD
+}
+
+int TestFileWidth() {
+#if AOM_VALGRIND_BUILD
+  return 176;
+#else
+  return 352;
+#endif  // AOM_VALGRIND_BUILD
+}
+
+int TestFileHeight() {
+#if AOM_VALGRIND_BUILD
+  return 144;
+#else
+  return 288;
+#endif  // AOM_VALGRIND_BUILD
+}
+
 TEST_P(KeyFrameIntervalTestLarge, KeyFrameIntervalTest) {
-  libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                     cfg_.g_timebase.den, cfg_.g_timebase.num,
-                                     0, 75);
+  libaom_test::I420VideoSource video(TestFileName(), TestFileWidth(),
+                                     TestFileHeight(), cfg_.g_timebase.den,
+                                     cfg_.g_timebase.num, 0, 75);
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
   ASSERT_EQ(is_kf_interval_violated_, false) << kf_dist_param_;
 }
@@ -168,6 +196,10 @@
     return AOM_CODEC_OK == res_dec;
   }
 
+  void Frame1IsKey();
+  void ForcedFrameIsKey();
+  void ForcedFrameIsKeyCornerCases();
+
   ::libaom_test::TestMode encoding_mode_;
   int auto_alt_ref_;
   int fwd_kf_enabled_;
@@ -178,18 +210,22 @@
   bool is_kf_placement_violated_;
 };
 
-TEST_P(ForcedKeyTestLarge, Frame1IsKey) {
+void ForcedKeyTestLarge::Frame1IsKey() {
   const aom_rational timebase = { 1, 30 };
-  const int lag_values[] = { 3, 15, 25, -1 };
+  // 1st element of this 2D array is for good encoding mode and 2nd element
+  // is for RT encoding mode.
+  const int lag_values[2][NUM_LAG_VALUES] = { { 3, 15, 25 }, { 0, -1, -1 } };
+  int is_realtime = (encoding_mode_ == ::libaom_test::kRealTime);
 
   forced_kf_frame_num_ = 1;
-  for (int i = 0; lag_values[i] != -1; ++i) {
+  for (int i = 0; i < NUM_LAG_VALUES; ++i) {
+    if (lag_values[is_realtime][i] == -1) continue;
     frame_num_ = 0;
-    cfg_.g_lag_in_frames = lag_values[i];
+    cfg_.g_lag_in_frames = lag_values[is_realtime][i];
     is_kf_placement_violated_ = false;
-    libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                       timebase.den, timebase.num, 0,
-                                       fwd_kf_enabled_ ? 60 : 30);
+    libaom_test::I420VideoSource video(
+        TestFileName(), TestFileWidth(), TestFileHeight(), timebase.den,
+        timebase.num, 0, fwd_kf_enabled_ ? 60 : 30);
     ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
     ASSERT_EQ(is_kf_placement_violated_, false)
         << "Frame #" << frame_num_ << " isn't a keyframe!";
@@ -198,7 +234,7 @@
 
 // This class checks the presence and placement of application
 // forced key frames.
-TEST_P(ForcedKeyTestLarge, ForcedFrameIsKey) {
+void ForcedKeyTestLarge::ForcedFrameIsKey() {
   const aom_rational timebase = { 1, 30 };
   const int lag_values[] = { 3, 15, 25, -1 };
 
@@ -207,9 +243,9 @@
     forced_kf_frame_num_ = lag_values[i] - 1;
     cfg_.g_lag_in_frames = lag_values[i];
     is_kf_placement_violated_ = false;
-    libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                       timebase.den, timebase.num, 0,
-                                       fwd_kf_enabled_ ? 60 : 30);
+    libaom_test::I420VideoSource video(
+        TestFileName(), TestFileWidth(), TestFileHeight(), timebase.den,
+        timebase.num, 0, fwd_kf_enabled_ ? 60 : 30);
     ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
     ASSERT_EQ(is_kf_placement_violated_, false)
         << "Frame #" << frame_num_ << " isn't a keyframe!";
@@ -227,19 +263,20 @@
   }
 }
 
-TEST_P(ForcedKeyTestLarge, ForcedFrameIsKeyCornerCases) {
+void ForcedKeyTestLarge::ForcedFrameIsKeyCornerCases() {
   const aom_rational timebase = { 1, 30 };
   const int kf_offsets[] = { -2, -1, 1, 2, 0 };
   cfg_.g_lag_in_frames = 35;
+  if (encoding_mode_ == ::libaom_test::kRealTime) cfg_.g_lag_in_frames = 0;
 
   for (int i = 0; kf_offsets[i] != 0; ++i) {
     frame_num_ = 0;
     forced_kf_frame_num_ = (int)cfg_.kf_max_dist + kf_offsets[i];
     forced_kf_frame_num_ = forced_kf_frame_num_ > 0 ? forced_kf_frame_num_ : 1;
     is_kf_placement_violated_ = false;
-    libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                       timebase.den, timebase.num, 0,
-                                       fwd_kf_enabled_ ? 60 : 30);
+    libaom_test::I420VideoSource video(
+        TestFileName(), TestFileWidth(), TestFileHeight(), timebase.den,
+        timebase.num, 0, fwd_kf_enabled_ ? 60 : 30);
     ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
     ASSERT_EQ(is_kf_placement_violated_, false)
         << "Frame #" << frame_num_ << " isn't a keyframe!";
@@ -252,6 +289,18 @@
                            ::testing::ValuesIn(kfTestParams),
                            ::testing::Values(AOM_Q, AOM_VBR, AOM_CBR, AOM_CQ));
 
+TEST_P(ForcedKeyTestLarge, Frame1IsKey) { Frame1IsKey(); }
+TEST_P(ForcedKeyTestLarge, ForcedFrameIsKey) { ForcedFrameIsKey(); }
+TEST_P(ForcedKeyTestLarge, ForcedFrameIsKeyCornerCases) {
+  ForcedFrameIsKeyCornerCases();
+}
+
+class ForcedKeyRTTestLarge : public ForcedKeyTestLarge {};
+
+TEST_P(ForcedKeyRTTestLarge, Frame1IsKey) { Frame1IsKey(); }
+TEST_P(ForcedKeyRTTestLarge, ForcedFrameIsKeyCornerCases) {
+  ForcedFrameIsKeyCornerCases();
+}
 // TODO(anyone): Add CBR to list of rc_modes once forced kf placement after
 // lag in frames bug is fixed.
 AV1_INSTANTIATE_TEST_SUITE(ForcedKeyTestLarge,
@@ -260,4 +309,9 @@
                            ::testing::Values(0, 1), ::testing::Values(0, 1),
                            ::testing::Values(2, 5),
                            ::testing::Values(AOM_Q, AOM_VBR, AOM_CQ));
+AV1_INSTANTIATE_TEST_SUITE(ForcedKeyRTTestLarge,
+                           ::testing::Values(::libaom_test::kRealTime),
+                           ::testing::Values(0), ::testing::Values(0),
+                           ::testing::Values(7, 9),
+                           ::testing::Values(AOM_Q, AOM_VBR, AOM_CBR));
 }  // namespace
diff --git a/test/level_test.cc b/test/level_test.cc
index f512c5a..972ade0 100644
--- a/test/level_test.cc
+++ b/test/level_test.cc
@@ -76,7 +76,7 @@
 };
 
 TEST_P(LevelTest, TestTargetLevelApi) {
-  static aom_codec_iface_t *codec = &aom_codec_av1_cx_algo;
+  static aom_codec_iface_t *codec = aom_codec_av1_cx();
   aom_codec_ctx_t enc;
   aom_codec_enc_cfg_t cfg;
   EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(codec, &cfg, 0));
diff --git a/test/lossless_test.cc b/test/lossless_test.cc
index 92ab299..c14bc06 100644
--- a/test/lossless_test.cc
+++ b/test/lossless_test.cc
@@ -24,13 +24,14 @@
 const int kMaxPsnr = 100;
 
 class LosslessTestLarge
-    : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode,
-                                                 aom_rc_mode>,
+    : public ::libaom_test::CodecTestWith3Params<libaom_test::TestMode,
+                                                 aom_rc_mode, int>,
       public ::libaom_test::EncoderTest {
  protected:
   LosslessTestLarge()
       : EncoderTest(GET_PARAM(0)), psnr_(kMaxPsnr), nframes_(0),
-        encoding_mode_(GET_PARAM(1)), rc_end_usage_(GET_PARAM(2)) {}
+        encoding_mode_(GET_PARAM(1)), rc_end_usage_(GET_PARAM(2)),
+        cpu_used_(GET_PARAM(3)) {}
 
   virtual ~LosslessTestLarge() {}
 
@@ -47,6 +48,7 @@
       if (cfg_.rc_max_quantizer > 0 || cfg_.rc_min_quantizer > 0) {
         encoder->Control(AV1E_SET_LOSSLESS, 1);
       }
+      encoder->Control(AOME_SET_CPUUSED, cpu_used_);
     }
   }
 
@@ -79,6 +81,7 @@
   unsigned int nframes_;
   libaom_test::TestMode encoding_mode_;
   aom_rc_mode rc_end_usage_;
+  int cpu_used_;
   int base_qindex_;
 };
 
@@ -136,8 +139,33 @@
   EXPECT_GE(psnr_lossless, kMaxPsnr);
 }
 
+class LosslessAllIntraTestLarge : public LosslessTestLarge {};
+
+TEST_P(LosslessAllIntraTestLarge, TestLossLessEncodingCtrl) {
+  const aom_rational timebase = { 33333333, 1000000000 };
+  cfg_.g_timebase = timebase;
+  // Intentionally set Q > 0, to make sure control can be used to activate
+  // lossless
+  cfg_.rc_min_quantizer = 10;
+  cfg_.rc_max_quantizer = 20;
+
+  init_flags_ = AOM_CODEC_USE_PSNR;
+
+  libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                     timebase.den, timebase.num, 0, 5);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  const double psnr_lossless = GetMinPsnr();
+  EXPECT_GE(psnr_lossless, kMaxPsnr);
+}
+
 AV1_INSTANTIATE_TEST_SUITE(LosslessTestLarge,
                            ::testing::Values(::libaom_test::kOnePassGood,
                                              ::libaom_test::kTwoPassGood),
-                           ::testing::Values(AOM_Q, AOM_VBR, AOM_CBR, AOM_CQ));
+                           ::testing::Values(AOM_Q, AOM_VBR, AOM_CBR, AOM_CQ),
+                           ::testing::Values(0));  // cpu_used
+
+AV1_INSTANTIATE_TEST_SUITE(LosslessAllIntraTestLarge,
+                           ::testing::Values(::libaom_test::kAllIntra),
+                           ::testing::Values(AOM_Q),
+                           ::testing::Values(6, 9));  // cpu_used
 }  // namespace
diff --git a/test/lpf_test.cc b/test/lpf_test.cc
index a30d02d..3fc3822 100644
--- a/test/lpf_test.cc
+++ b/test/lpf_test.cc
@@ -20,7 +20,6 @@
 #include "config/aom_dsp_rtcd.h"
 
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 #include "av1/common/av1_loopfilter.h"
@@ -136,7 +135,7 @@
     mask_ = (1 << bit_depth_) - 1;
   }
 
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
+  virtual void TearDown() {}
 
  protected:
   int bit_depth_;
@@ -200,7 +199,7 @@
     InitInput<a, b>(s, ref_s, &rnd, *limit, mask_, p, i);                      \
     call_filter(ref_s + 8 + p * 8, p, blimit, limit, thresh, bit_depth_,       \
                 ref_loopfilter_op_);                                           \
-    ASM_REGISTER_STATE_CHECK(call_filter(s + 8 + p * 8, p, blimit, limit,      \
+    API_REGISTER_STATE_CHECK(call_filter(s + 8 + p * 8, p, blimit, limit,      \
                                          thresh, bit_depth_, loopfilter_op_)); \
     for (int j = 0; j < kNumCoeffs; ++j) {                                     \
       err_count += ref_s[j] != s[j];                                           \
@@ -248,7 +247,7 @@
     }                                                                          \
     call_filter(ref_s + 8 + p * 8, p, blimit, limit, thresh, bit_depth_,       \
                 ref_loopfilter_op_);                                           \
-    ASM_REGISTER_STATE_CHECK(call_filter(s + 8 + p * 8, p, blimit, limit,      \
+    API_REGISTER_STATE_CHECK(call_filter(s + 8 + p * 8, p, blimit, limit,      \
                                          thresh, bit_depth_, loopfilter_op_)); \
     for (int j = 0; j < kNumCoeffs; ++j) {                                     \
       err_count += ref_s[j] != s[j];                                           \
@@ -336,7 +335,7 @@
     InitInput<a, b>(s, ref_s, &rnd, limit, mask_, p, i);                       \
     call_dualfilter(ref_s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1,   \
                     limit1, thresh1, bit_depth_, ref_loopfilter_op_);          \
-    ASM_REGISTER_STATE_CHECK(                                                  \
+    API_REGISTER_STATE_CHECK(                                                  \
         call_dualfilter(s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1,   \
                         limit1, thresh1, bit_depth_, loopfilter_op_));         \
     for (int j = 0; j < kNumCoeffs; ++j) {                                     \
@@ -397,7 +396,7 @@
     }                                                                          \
     call_dualfilter(ref_s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1,   \
                     limit1, thresh1, bit_depth_, ref_loopfilter_op_);          \
-    ASM_REGISTER_STATE_CHECK(                                                  \
+    API_REGISTER_STATE_CHECK(                                                  \
         call_dualfilter(s + 8 + p * 8, p, blimit0, limit0, thresh0, blimit1,   \
                         limit1, thresh1, bit_depth_, loopfilter_op_));         \
     for (int j = 0; j < kNumCoeffs; ++j) {                                     \
diff --git a/test/masked_sad_test.cc b/test/masked_sad_test.cc
index df7b3f8..91f7982 100644
--- a/test/masked_sad_test.cc
+++ b/test/masked_sad_test.cc
@@ -15,7 +15,6 @@
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 
@@ -59,7 +58,7 @@
                        int msk_stride, int inv_mask, unsigned sads[],
                        int times) = 0;
 
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
+  virtual void TearDown() {}
   void runMaskedSADTest(int run_times);
 };
 
@@ -133,7 +132,7 @@
                             second_pred, msk, msk_stride, invert_mask);
   } else {
     for (int repeat = 0; repeat < times; ++repeat) {
-      ASM_REGISTER_STATE_CHECK(
+      API_REGISTER_STATE_CHECK(
           sads[0] = maskedSAD_op_(src_ptr, src_stride, ref_ptr[0], ref_stride,
                                   second_pred, msk, msk_stride, invert_mask));
     }
@@ -157,7 +156,7 @@
                               int msk_stride, int invert_mask, unsigned sads[],
                               int times) {
   if (times == 1) {
-    ASM_REGISTER_STATE_CHECK(maskedSAD_op_(src_ptr, src_stride, ref_ptr,
+    API_REGISTER_STATE_CHECK(maskedSAD_op_(src_ptr, src_stride, ref_ptr,
                                            ref_stride, second_pred, msk,
                                            msk_stride, invert_mask, sads));
   } else {
@@ -254,7 +253,7 @@
     ref_maskedSAD_op_ = GET_PARAM(1);
   }
 
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
+  virtual void TearDown() {}
   void runHighbdMaskedSADTest(int run_times);
 
  protected:
@@ -299,7 +298,7 @@
       const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
       aom_usec_timer_start(&timer);
       if (run_times == 1) {
-        ASM_REGISTER_STATE_CHECK(ret = maskedSAD_op_(src8_ptr, src_stride,
+        API_REGISTER_STATE_CHECK(ret = maskedSAD_op_(src8_ptr, src_stride,
                                                      ref8_ptr, ref_stride,
                                                      second_pred8_ptr, msk_ptr,
                                                      msk_stride, invert_mask));
diff --git a/test/masked_variance_test.cc b/test/masked_variance_test.cc
index afffce9..4a4cb1a 100644
--- a/test/masked_variance_test.cc
+++ b/test/masked_variance_test.cc
@@ -16,7 +16,6 @@
 
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 
@@ -50,7 +49,7 @@
     ref_func_ = GET_PARAM(1);
   }
 
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
+  virtual void TearDown() {}
 
  protected:
   MaskedSubPixelVarianceFunc opt_func_;
@@ -94,7 +93,7 @@
           ref_ret = ref_func_(src_ptr, src_stride, xoffset, yoffset, ref_ptr,
                               ref_stride, second_pred_ptr, msk_ptr, msk_stride,
                               invert_mask, &ref_sse);
-          ASM_REGISTER_STATE_CHECK(
+          API_REGISTER_STATE_CHECK(
               opt_ret = opt_func_(src_ptr, src_stride, xoffset, yoffset,
                                   ref_ptr, ref_stride, second_pred_ptr, msk_ptr,
                                   msk_stride, invert_mask, &opt_sse));
@@ -147,7 +146,7 @@
           ref_ret = ref_func_(src_ptr, src_stride, xoffset, yoffset, ref_ptr,
                               ref_stride, second_pred_ptr, msk_ptr, msk_stride,
                               invert_mask, &ref_sse);
-          ASM_REGISTER_STATE_CHECK(
+          API_REGISTER_STATE_CHECK(
               opt_ret = opt_func_(src_ptr, src_stride, xoffset, yoffset,
                                   ref_ptr, ref_stride, second_pred_ptr, msk_ptr,
                                   msk_stride, invert_mask, &opt_sse));
@@ -187,7 +186,7 @@
     bit_depth_ = GET_PARAM(2);
   }
 
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
+  virtual void TearDown() {}
 
  protected:
   MaskedSubPixelVarianceFunc opt_func_;
@@ -230,7 +229,7 @@
           ref_ret = ref_func_(src8_ptr, src_stride, xoffset, yoffset, ref8_ptr,
                               ref_stride, second_pred8_ptr, msk_ptr, msk_stride,
                               invert_mask, &ref_sse);
-          ASM_REGISTER_STATE_CHECK(
+          API_REGISTER_STATE_CHECK(
               opt_ret = opt_func_(src8_ptr, src_stride, xoffset, yoffset,
                                   ref8_ptr, ref_stride, second_pred8_ptr,
                                   msk_ptr, msk_stride, invert_mask, &opt_sse));
@@ -291,7 +290,7 @@
           ref_ret = ref_func_(src8_ptr, src_stride, xoffset, yoffset, ref8_ptr,
                               ref_stride, second_pred8_ptr, msk_ptr, msk_stride,
                               invert_mask, &ref_sse);
-          ASM_REGISTER_STATE_CHECK(
+          API_REGISTER_STATE_CHECK(
               opt_ret = opt_func_(src8_ptr, src_stride, xoffset, yoffset,
                                   ref8_ptr, ref_stride, second_pred8_ptr,
                                   msk_ptr, msk_stride, invert_mask, &opt_sse));
diff --git a/test/metadata_test.cc b/test/metadata_test.cc
index fd3d5c4..b7b7f14 100644
--- a/test/metadata_test.cc
+++ b/test/metadata_test.cc
@@ -34,7 +34,7 @@
 const uint8_t kMetadataPayloadCll[kMetadataPayloadSizeCll] = { 0xB5, 0x01, 0x02,
                                                                0x03 };
 
-#if CONFIG_AV1_ENCODER
+#if CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY
 
 const size_t kMetadataObuSizeT35 = 28;
 const uint8_t kMetadataObuT35[kMetadataObuSizeT35] = {
@@ -193,7 +193,7 @@
 AV1_INSTANTIATE_TEST_SUITE(MetadataEncodeTest,
                            ::testing::Values(::libaom_test::kOnePassGood));
 
-#endif  // CONFIG_AV1_ENCODER
+#endif  // CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY
 }  // namespace
 
 TEST(MetadataTest, MetadataAllocation) {
diff --git a/test/monochrome_test.cc b/test/monochrome_test.cc
index 6395c22..a71cc9b 100644
--- a/test/monochrome_test.cc
+++ b/test/monochrome_test.cc
@@ -20,16 +20,45 @@
 
 namespace {
 
+const unsigned int kCqLevel = 18;
+const double kMaxPsnr = 100.0;
+
+// kPsnrThreshold represents the psnr threshold used to validate the quality of
+// the first frame. The indices, 0 and 1 correspond to non-allintra and allintra
+// encoding modes.
+const double kPsnrThreshold[2] = { 29.0, 41.5 };
+
+// kPsnrFluctuation represents the maximum allowed psnr fluctuation w.r.t first
+// frame. The indices, 0 and 1 correspond to non-allintra and allintra encoding
+// modes.
+const double kPsnrFluctuation[2] = { 2.5, 0.3 };
+
 class MonochromeTest
-    : public ::libaom_test::CodecTestWithParam<libaom_test::TestMode>,
+    : public ::libaom_test::CodecTestWith3Params<libaom_test::TestMode, int,
+                                                 int>,
       public ::libaom_test::EncoderTest {
  protected:
-  MonochromeTest() : EncoderTest(GET_PARAM(0)), frame0_psnr_y_(0.) {}
+  MonochromeTest()
+      : EncoderTest(GET_PARAM(0)), lossless_(GET_PARAM(2)),
+        frame0_psnr_y_(0.0) {}
 
   virtual ~MonochromeTest() {}
 
   virtual void SetUp() { InitializeConfig(GET_PARAM(1)); }
 
+  virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                                  ::libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_CPUUSED, GET_PARAM(3));
+      if (mode_ == ::libaom_test::kAllIntra) {
+        encoder->Control(AOME_SET_CQ_LEVEL, kCqLevel);
+      }
+      if (lossless_) {
+        encoder->Control(AV1E_SET_LOSSLESS, 1);
+      }
+    }
+  }
+
   virtual void DecompressedFrameHook(const aom_image_t &img,
                                      aom_codec_pts_t pts) {
     (void)pts;
@@ -68,15 +97,23 @@
   }
 
   virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+    // Check average PSNR value is >= 100 db in case of lossless encoding.
+    if (lossless_) {
+      EXPECT_GE(pkt->data.psnr.psnr[0], kMaxPsnr);
+      return;
+    }
+    const bool is_allintra = (mode_ == ::libaom_test::kAllIntra);
     // Check that the initial Y PSNR value is 'high enough', and check that
     // subsequent Y PSNR values are 'close' to this initial value.
-    if (frame0_psnr_y_ == 0.) {
+    if (frame0_psnr_y_ == 0.0) {
       frame0_psnr_y_ = pkt->data.psnr.psnr[1];
-      EXPECT_GT(frame0_psnr_y_, 29.);
+      EXPECT_GT(frame0_psnr_y_, kPsnrThreshold[is_allintra]);
     }
-    EXPECT_NEAR(pkt->data.psnr.psnr[1], frame0_psnr_y_, 2.5);
+    EXPECT_NEAR(pkt->data.psnr.psnr[1], frame0_psnr_y_,
+                kPsnrFluctuation[is_allintra]);
   }
 
+  int lossless_;
   std::vector<int> chroma_value_list_;
   double frame0_psnr_y_;
 };
@@ -87,9 +124,6 @@
 
   init_flags_ = AOM_CODEC_USE_PSNR;
 
-  cfg_.g_w = 352;
-  cfg_.g_h = 288;
-
   cfg_.rc_buf_initial_sz = 500;
   cfg_.rc_buf_optimal_sz = 600;
   cfg_.rc_buf_sz = 1000;
@@ -98,13 +132,10 @@
   cfg_.rc_undershoot_pct = 50;
   cfg_.rc_overshoot_pct = 50;
   cfg_.rc_end_usage = AOM_CBR;
-  cfg_.kf_mode = AOM_KF_AUTO;
   cfg_.g_lag_in_frames = 1;
   cfg_.kf_min_dist = cfg_.kf_max_dist = 3000;
   // Enable dropped frames.
   cfg_.rc_dropframe_thresh = 1;
-  // Disable error_resilience mode.
-  cfg_.g_error_resilient = 0;
   // Run at low bitrate.
   cfg_.rc_target_bitrate = 40;
   // Set monochrome encoding flag
@@ -121,8 +152,33 @@
   }
 }
 
+class MonochromeAllIntraTest : public MonochromeTest {};
+
+TEST_P(MonochromeAllIntraTest, TestMonochromeEncoding) {
+  ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+                                       30, 1, 0, 5);
+  init_flags_ = AOM_CODEC_USE_PSNR;
+  // Set monochrome encoding flag
+  cfg_.monochrome = 1;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  // Check that the chroma planes are equal across all frames
+  std::vector<int>::const_iterator iter = chroma_value_list_.begin();
+  int initial_chroma_value = *iter;
+  for (; iter != chroma_value_list_.end(); ++iter) {
+    // Check that all decoded frames have the same constant chroma planes.
+    EXPECT_EQ(*iter, initial_chroma_value);
+  }
+}
+
 AV1_INSTANTIATE_TEST_SUITE(MonochromeTest,
                            ::testing::Values(::libaom_test::kOnePassGood,
-                                             ::libaom_test::kTwoPassGood));
+                                             ::libaom_test::kTwoPassGood),
+                           ::testing::Values(0),   // lossless
+                           ::testing::Values(0));  // cpu_used
 
+AV1_INSTANTIATE_TEST_SUITE(MonochromeAllIntraTest,
+                           ::testing::Values(::libaom_test::kAllIntra),
+                           ::testing::Values(0, 1),   // lossless
+                           ::testing::Values(6, 9));  // cpu_used
 }  // namespace
diff --git a/test/noise_model_test.cc b/test/noise_model_test.cc
index 8584bd8..c12c080 100644
--- a/test/noise_model_test.cc
+++ b/test/noise_model_test.cc
@@ -212,9 +212,10 @@
   aom_noise_strength_solver_free(&solver);
 }
 
-TEST(NoiseStrengthLut, LutInitNegativeSize) {
+TEST(NoiseStrengthLut, LutInitNegativeOrZeroSize) {
   aom_noise_strength_lut_t lut;
   ASSERT_FALSE(aom_noise_strength_lut_init(&lut, -1));
+  ASSERT_FALSE(aom_noise_strength_lut_init(&lut, 0));
 }
 
 TEST(NoiseStrengthLut, LutEvalSinglePoint) {
diff --git a/test/obmc_sad_test.cc b/test/obmc_sad_test.cc
index a8290b2..9b70366 100644
--- a/test/obmc_sad_test.cc
+++ b/test/obmc_sad_test.cc
@@ -55,7 +55,7 @@
 
     const unsigned int ref_res = params_.ref_func(pre, pre_stride, wsrc, mask);
     unsigned int tst_res;
-    ASM_REGISTER_STATE_CHECK(tst_res =
+    API_REGISTER_STATE_CHECK(tst_res =
                                  params_.tst_func(pre, pre_stride, wsrc, mask));
 
     ASSERT_EQ(ref_res, tst_res);
@@ -78,7 +78,7 @@
 
     const unsigned int ref_res = params_.ref_func(pre, pre_stride, wsrc, mask);
     unsigned int tst_res;
-    ASM_REGISTER_STATE_CHECK(tst_res =
+    API_REGISTER_STATE_CHECK(tst_res =
                                  params_.tst_func(pre, pre_stride, wsrc, mask));
 
     ASSERT_EQ(ref_res, tst_res);
@@ -172,7 +172,7 @@
     const unsigned int ref_res =
         params_.ref_func(CONVERT_TO_BYTEPTR(pre), pre_stride, wsrc, mask);
     unsigned int tst_res;
-    ASM_REGISTER_STATE_CHECK(
+    API_REGISTER_STATE_CHECK(
         tst_res =
             params_.tst_func(CONVERT_TO_BYTEPTR(pre), pre_stride, wsrc, mask));
 
@@ -197,7 +197,7 @@
     const unsigned int ref_res =
         params_.ref_func(CONVERT_TO_BYTEPTR(pre), pre_stride, wsrc, mask);
     unsigned int tst_res;
-    ASM_REGISTER_STATE_CHECK(
+    API_REGISTER_STATE_CHECK(
         tst_res =
             params_.tst_func(CONVERT_TO_BYTEPTR(pre), pre_stride, wsrc, mask));
 
diff --git a/test/obmc_variance_test.cc b/test/obmc_variance_test.cc
index 58d2ad6..03b38f7 100644
--- a/test/obmc_variance_test.cc
+++ b/test/obmc_variance_test.cc
@@ -60,7 +60,7 @@
     const unsigned int ref_res =
         params_.ref_func(pre, pre_stride, wsrc, mask, &ref_sse);
     unsigned int tst_res;
-    ASM_REGISTER_STATE_CHECK(
+    API_REGISTER_STATE_CHECK(
         tst_res = params_.tst_func(pre, pre_stride, wsrc, mask, &tst_sse));
 
     ASSERT_EQ(ref_res, tst_res);
@@ -86,7 +86,7 @@
     const unsigned int ref_res =
         params_.ref_func(pre, pre_stride, wsrc, mask, &ref_sse);
     unsigned int tst_res;
-    ASM_REGISTER_STATE_CHECK(
+    API_REGISTER_STATE_CHECK(
         tst_res = params_.tst_func(pre, pre_stride, wsrc, mask, &tst_sse));
 
     ASSERT_EQ(ref_res, tst_res);
@@ -219,7 +219,7 @@
     const unsigned int ref_res = params_.ref_func(
         CONVERT_TO_BYTEPTR(pre), pre_stride, wsrc, mask, &ref_sse);
     unsigned int tst_res;
-    ASM_REGISTER_STATE_CHECK(tst_res = params_.tst_func(CONVERT_TO_BYTEPTR(pre),
+    API_REGISTER_STATE_CHECK(tst_res = params_.tst_func(CONVERT_TO_BYTEPTR(pre),
                                                         pre_stride, wsrc, mask,
                                                         &tst_sse));
 
@@ -246,7 +246,7 @@
     const unsigned int ref_res = params_.ref_func(
         CONVERT_TO_BYTEPTR(pre), pre_stride, wsrc, mask, &ref_sse);
     unsigned int tst_res;
-    ASM_REGISTER_STATE_CHECK(tst_res = params_.tst_func(CONVERT_TO_BYTEPTR(pre),
+    API_REGISTER_STATE_CHECK(tst_res = params_.tst_func(CONVERT_TO_BYTEPTR(pre),
                                                         pre_stride, wsrc, mask,
                                                         &tst_sse));
 
diff --git a/test/quant_test.cc b/test/quant_test.cc
index 9fca953..a042af1 100644
--- a/test/quant_test.cc
+++ b/test/quant_test.cc
@@ -20,6 +20,13 @@
 
 namespace {
 
+const ::libaom_test::TestMode kTestMode[] =
+#if CONFIG_REALTIME_ONLY
+    { ::libaom_test::kRealTime };
+#else
+    { ::libaom_test::kRealTime, ::libaom_test::kOnePassGood };
+#endif
+
 class QMTest
     : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>,
       public ::libaom_test::EncoderTest {
@@ -41,6 +48,11 @@
       encoder->Control(AV1E_SET_QM_MAX, qm_max_);
 
       encoder->Control(AOME_SET_MAX_INTRA_BITRATE_PCT, 100);
+      if (mode_ == ::libaom_test::kRealTime) {
+        encoder->Control(AV1E_SET_ALLOW_WARPED_MOTION, 0);
+        encoder->Control(AV1E_SET_ENABLE_GLOBAL_MOTION, 0);
+        encoder->Control(AV1E_SET_ENABLE_OBMC, 0);
+      }
     }
   }
 
@@ -75,11 +87,10 @@
 // encodes and decodes without a mismatch.
 TEST_P(QMTest, TestNoMisMatchQM3) { DoTest(9, 15); }
 
-AV1_INSTANTIATE_TEST_SUITE(QMTest,
-                           ::testing::Values(::libaom_test::kRealTime,
-                                             ::libaom_test::kOnePassGood),
+AV1_INSTANTIATE_TEST_SUITE(QMTest, ::testing::ValuesIn(kTestMode),
                            ::testing::Range(5, 9));
 
+#if !CONFIG_REALTIME_ONLY
 typedef struct {
   const unsigned int min_q;
   const unsigned int max_q;
@@ -173,4 +184,5 @@
                                              ::libaom_test::kTwoPassGood),
                            ::testing::ValuesIn(QuantTestParams),
                            ::testing::Values(AOM_Q, AOM_VBR, AOM_CBR, AOM_CQ));
+#endif  // !CONFIG_REALTIME_ONLY
 }  // namespace
diff --git a/test/quantize_func_test.cc b/test/quantize_func_test.cc
index 3d79cf8..f3f5c85 100644
--- a/test/quantize_func_test.cc
+++ b/test/quantize_func_test.cc
@@ -22,7 +22,6 @@
 #include "av1/encoder/encoder.h"
 #include "av1/common/scan.h"
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 
@@ -100,7 +99,6 @@
     qtab_ = NULL;
     aom_free(coeff_);
     coeff_ = NULL;
-    libaom_test::ClearSystemState();
   }
 
   void InitQuantizer() {
@@ -159,7 +157,7 @@
                  qcoeff_ref, dqcoeff_ref, dequant, &eob[0], sc->scan,
                  sc->iscan);
 
-      ASM_REGISTER_STATE_CHECK(quant_(coeff_ptr, n_coeffs, zbin, round, quant,
+      API_REGISTER_STATE_CHECK(quant_(coeff_ptr, n_coeffs, zbin, round, quant,
                                       quant_shift, qcoeff, dqcoeff, dequant,
                                       &eob[1], sc->scan, sc->iscan));
 
@@ -270,7 +268,7 @@
                quant_shift_ptr, qcoeff_ref_ptr, dqcoeff_ref_ptr, dequant_ptr,
                eob_ref_ptr, scan, iscan);
 
-    ASM_REGISTER_STATE_CHECK(quant_(
+    API_REGISTER_STATE_CHECK(quant_(
         coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr,
         qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan));
   }
@@ -589,4 +587,5 @@
                    static_cast<TX_SIZE>(TX_32X32), TYPE_B, AOM_BITS_8)));
 
 #endif  // HAVE_AVX
+
 }  // namespace
diff --git a/test/quantize_lp_func_test.cc b/test/quantize_lp_func_test.cc
new file mode 100644
index 0000000..898b810
--- /dev/null
+++ b/test/quantize_lp_func_test.cc
@@ -0,0 +1,351 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tuple>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_codec.h"
+#include "aom_ports/aom_timer.h"
+#include "av1/encoder/encoder.h"
+#include "av1/common/scan.h"
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+namespace {
+using libaom_test::ACMRandom;
+
+#define QUAN_LP_PARAM_LIST                                                 \
+  const int16_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr,   \
+      const int16_t *quant_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, \
+      const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan,  \
+      const int16_t *iscan
+
+typedef void (*QuantizeFunc)(QUAN_LP_PARAM_LIST);
+
+using std::tuple;
+typedef tuple<QuantizeFunc, QuantizeFunc, TX_SIZE, aom_bit_depth_t>
+    QuantizeParam;
+
+typedef struct {
+  QUANTS quant;
+  Dequants dequant;
+} QuanTable;
+
+const int kTestNum = 1000;
+
+template <typename CoeffType>
+class QuantizeTestBase : public ::testing::TestWithParam<QuantizeParam> {
+ protected:
+  QuantizeTestBase()
+      : quant_ref_(GET_PARAM(0)), quant_(GET_PARAM(1)), tx_size_(GET_PARAM(2)),
+        bd_(GET_PARAM(3)) {}
+
+  virtual ~QuantizeTestBase() {}
+
+  virtual void SetUp() {
+    qtab_ = reinterpret_cast<QuanTable *>(aom_memalign(32, sizeof(*qtab_)));
+    const int n_coeffs = coeff_num();
+    coeff_ = reinterpret_cast<CoeffType *>(
+        aom_memalign(32, 6 * n_coeffs * sizeof(CoeffType)));
+    InitQuantizer();
+  }
+
+  virtual void TearDown() {
+    aom_free(qtab_);
+    qtab_ = NULL;
+    aom_free(coeff_);
+    coeff_ = NULL;
+  }
+
+  void InitQuantizer() {
+    av1_build_quantizer(bd_, 0, 0, 0, 0, 0, &qtab_->quant, &qtab_->dequant);
+  }
+
+  virtual void RunQuantizeFunc(const CoeffType *coeff_ptr, intptr_t n_coeffs,
+                               const int16_t *round_ptr,
+                               const int16_t *quant_ptr, CoeffType *qcoeff_ptr,
+                               CoeffType *qcoeff_ref_ptr,
+                               CoeffType *dqcoeff_ptr,
+                               CoeffType *dqcoeff_ref_ptr,
+                               const int16_t *dequant_ptr,
+                               uint16_t *eob_ref_ptr, uint16_t *eob_ptr,
+                               const int16_t *scan, const int16_t *iscan) = 0;
+
+  void QuantizeRun(bool is_loop, int q = 0, int test_num = 1) {
+    CoeffType *coeff_ptr = coeff_;
+    const intptr_t n_coeffs = coeff_num();
+
+    CoeffType *qcoeff_ref = coeff_ptr + n_coeffs;
+    CoeffType *dqcoeff_ref = qcoeff_ref + n_coeffs;
+
+    CoeffType *qcoeff = dqcoeff_ref + n_coeffs;
+    CoeffType *dqcoeff = qcoeff + n_coeffs;
+    uint16_t *eob = (uint16_t *)(dqcoeff + n_coeffs);
+
+    // Testing uses 2-D DCT scan order table
+    const SCAN_ORDER *const sc = get_default_scan(tx_size_, DCT_DCT);
+
+    // Testing uses luminance quantization table
+    const int16_t *round = 0;
+    const int16_t *quant = 0;
+    round = qtab_->quant.y_round_fp[q];
+    quant = qtab_->quant.y_quant_fp[q];
+
+    const int16_t *dequant = qtab_->dequant.y_dequant_QTX[q];
+
+    for (int i = 0; i < test_num; ++i) {
+      if (is_loop) FillCoeffRandom();
+
+      memset(qcoeff_ref, 0, 5 * n_coeffs * sizeof(*qcoeff_ref));
+
+      RunQuantizeFunc(coeff_ptr, n_coeffs, round, quant, qcoeff, qcoeff_ref,
+                      dqcoeff, dqcoeff_ref, dequant, &eob[0], &eob[1], sc->scan,
+                      sc->iscan);
+
+      quant_ref_(coeff_ptr, n_coeffs, round, quant, qcoeff_ref, dqcoeff_ref,
+                 dequant, &eob[0], sc->scan, sc->iscan);
+
+      API_REGISTER_STATE_CHECK(quant_(coeff_ptr, n_coeffs, round, quant, qcoeff,
+                                      dqcoeff, dequant, &eob[1], sc->scan,
+                                      sc->iscan));
+
+      for (int j = 0; j < n_coeffs; ++j) {
+        ASSERT_EQ(qcoeff_ref[j], qcoeff[j])
+            << "Q mismatch on test: " << i << " at position: " << j
+            << " Q: " << q << " coeff: " << coeff_ptr[j];
+      }
+
+      for (int j = 0; j < n_coeffs; ++j) {
+        ASSERT_EQ(dqcoeff_ref[j], dqcoeff[j])
+            << "Dq mismatch on test: " << i << " at position: " << j
+            << " Q: " << q << " coeff: " << coeff_ptr[j];
+      }
+
+      ASSERT_EQ(eob[0], eob[1])
+          << "eobs mismatch on test: " << i << " Q: " << q;
+    }
+  }
+
+  void CompareResults(const CoeffType *buf_ref, const CoeffType *buf, int size,
+                      const char *text, int q, int number) {
+    int i;
+    for (i = 0; i < size; ++i) {
+      ASSERT_EQ(buf_ref[i], buf[i]) << text << " mismatch on test: " << number
+                                    << " at position: " << i << " Q: " << q;
+    }
+  }
+
+  int coeff_num() const { return av1_get_max_eob(tx_size_); }
+
+  void FillCoeff(CoeffType c) {
+    const int n_coeffs = coeff_num();
+    for (int i = 0; i < n_coeffs; ++i) {
+      coeff_[i] = c;
+    }
+  }
+
+  void FillCoeffRandom() {
+    const int n_coeffs = coeff_num();
+    FillCoeffZero();
+    int num = rnd_.Rand16() % n_coeffs;
+    for (int i = 0; i < num; ++i) {
+      coeff_[i] = GetRandomCoeff();
+    }
+  }
+
+  void FillCoeffRandomRows(int num) {
+    FillCoeffZero();
+    for (int i = 0; i < num; ++i) {
+      coeff_[i] = GetRandomCoeff();
+    }
+  }
+
+  void FillCoeffZero() { FillCoeff(0); }
+
+  void FillCoeffConstant() {
+    CoeffType c = GetRandomCoeff();
+    FillCoeff(c);
+  }
+
+  void FillDcOnly() {
+    FillCoeffZero();
+    coeff_[0] = GetRandomCoeff();
+  }
+
+  void FillDcLargeNegative() {
+    FillCoeffZero();
+    // Generate a qcoeff which contains 512/-512 (0x0100/0xFE00) to catch issues
+    // like BUG=883 where the constant being compared was incorrectly
+    // initialized.
+    coeff_[0] = -8191;
+  }
+
+  CoeffType GetRandomCoeff() {
+    CoeffType coeff;
+    if (bd_ == AOM_BITS_8) {
+      coeff =
+          clamp(static_cast<int16_t>(rnd_.Rand16()), INT16_MIN + 1, INT16_MAX);
+    } else {
+      CoeffType min = -(1 << (7 + bd_));
+      CoeffType max = -min - 1;
+      coeff = clamp(static_cast<CoeffType>(rnd_.Rand31()), min, max);
+    }
+    return coeff;
+  }
+
+  ACMRandom rnd_;
+  QuanTable *qtab_;
+  CoeffType *coeff_;
+  QuantizeFunc quant_ref_;
+  QuantizeFunc quant_;
+  TX_SIZE tx_size_;
+  aom_bit_depth_t bd_;
+};
+
+class FullPrecisionQuantizeLpTest : public QuantizeTestBase<int16_t> {
+  void RunQuantizeFunc(const int16_t *coeff_ptr, intptr_t n_coeffs,
+                       const int16_t *round_ptr, const int16_t *quant_ptr,
+                       int16_t *qcoeff_ptr, int16_t *qcoeff_ref_ptr,
+                       int16_t *dqcoeff_ptr, int16_t *dqcoeff_ref_ptr,
+                       const int16_t *dequant_ptr, uint16_t *eob_ref_ptr,
+                       uint16_t *eob_ptr, const int16_t *scan,
+                       const int16_t *iscan) override {
+    quant_ref_(coeff_ptr, n_coeffs, round_ptr, quant_ptr, qcoeff_ref_ptr,
+               dqcoeff_ref_ptr, dequant_ptr, eob_ref_ptr, scan, iscan);
+
+    API_REGISTER_STATE_CHECK(quant_(coeff_ptr, n_coeffs, round_ptr, quant_ptr,
+                                    qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+                                    eob_ptr, scan, iscan));
+  }
+};
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(FullPrecisionQuantizeLpTest);
+
+TEST_P(FullPrecisionQuantizeLpTest, ZeroInput) {
+  FillCoeffZero();
+  QuantizeRun(false);
+}
+
+TEST_P(FullPrecisionQuantizeLpTest, LargeNegativeInput) {
+  FillDcLargeNegative();
+  QuantizeRun(false, 0, 1);
+}
+
+TEST_P(FullPrecisionQuantizeLpTest, DcOnlyInput) {
+  FillDcOnly();
+  QuantizeRun(false, 0, 1);
+}
+
+TEST_P(FullPrecisionQuantizeLpTest, RandomInput) {
+  QuantizeRun(true, 0, kTestNum);
+}
+
+TEST_P(FullPrecisionQuantizeLpTest, MultipleQ) {
+  for (int q = 0; q < QINDEX_RANGE; ++q) {
+    QuantizeRun(true, q, kTestNum);
+  }
+}
+
+// Force the coeff to be half the value of the dequant.  This exposes a
+// mismatch found in av1_quantize_fp_sse2().
+TEST_P(FullPrecisionQuantizeLpTest, CoeffHalfDequant) {
+  FillCoeff(16);
+  QuantizeRun(false, 25, 1);
+}
+
+TEST_P(FullPrecisionQuantizeLpTest, DISABLED_Speed) {
+  int16_t *coeff_ptr = coeff_;
+  const intptr_t n_coeffs = coeff_num();
+
+  int16_t *qcoeff_ref = coeff_ptr + n_coeffs;
+  int16_t *dqcoeff_ref = qcoeff_ref + n_coeffs;
+
+  int16_t *qcoeff = dqcoeff_ref + n_coeffs;
+  int16_t *dqcoeff = qcoeff + n_coeffs;
+  uint16_t *eob = (uint16_t *)(dqcoeff + n_coeffs);
+
+  // Testing uses 2-D DCT scan order table
+  const SCAN_ORDER *const sc = get_default_scan(tx_size_, DCT_DCT);
+
+  // Testing uses luminance quantization table
+  const int q = 22;
+  const int16_t *round_fp = qtab_->quant.y_round_fp[q];
+  const int16_t *quant_fp = qtab_->quant.y_quant_fp[q];
+  const int16_t *dequant = qtab_->dequant.y_dequant_QTX[q];
+  const int kNumTests = 5000000;
+  aom_usec_timer timer, simd_timer;
+  int rows = tx_size_high[tx_size_];
+  int cols = tx_size_wide[tx_size_];
+  rows = AOMMIN(32, rows);
+  cols = AOMMIN(32, cols);
+  for (int cnt = 0; cnt <= rows; cnt++) {
+    FillCoeffRandomRows(cnt * cols);
+
+    aom_usec_timer_start(&timer);
+    for (int n = 0; n < kNumTests; ++n) {
+      quant_ref_(coeff_ptr, n_coeffs, round_fp, quant_fp, qcoeff, dqcoeff,
+                 dequant, eob, sc->scan, sc->iscan);
+    }
+    aom_usec_timer_mark(&timer);
+
+    aom_usec_timer_start(&simd_timer);
+    for (int n = 0; n < kNumTests; ++n) {
+      quant_(coeff_ptr, n_coeffs, round_fp, quant_fp, qcoeff, dqcoeff, dequant,
+             eob, sc->scan, sc->iscan);
+    }
+    aom_usec_timer_mark(&simd_timer);
+
+    const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+    const int simd_elapsed_time =
+        static_cast<int>(aom_usec_timer_elapsed(&simd_timer));
+    printf("c_time = %d \t simd_time = %d \t Gain = %f \n", elapsed_time,
+           simd_elapsed_time, ((float)elapsed_time / simd_elapsed_time));
+  }
+}
+
+using std::make_tuple;
+
+#if HAVE_AVX2
+const QuantizeParam kQParamArrayAVX2[] = {
+  // av1_quantize_lp is only called in nonrd_pickmode.c, and is used for 16X16,
+  // 8X8, and 4X4.
+  make_tuple(&av1_quantize_lp_c, &av1_quantize_lp_avx2,
+             static_cast<TX_SIZE>(TX_16X16), AOM_BITS_8),
+  make_tuple(&av1_quantize_lp_c, &av1_quantize_lp_avx2,
+             static_cast<TX_SIZE>(TX_8X8), AOM_BITS_8),
+  make_tuple(&av1_quantize_lp_c, &av1_quantize_lp_avx2,
+             static_cast<TX_SIZE>(TX_4X4), AOM_BITS_8)
+};
+
+INSTANTIATE_TEST_SUITE_P(AVX2, FullPrecisionQuantizeLpTest,
+                         ::testing::ValuesIn(kQParamArrayAVX2));
+#endif
+
+#if HAVE_SSE2
+const QuantizeParam kQParamArraySSE2[] = {
+  make_tuple(&av1_quantize_lp_c, &av1_quantize_lp_sse2,
+             static_cast<TX_SIZE>(TX_16X16), AOM_BITS_8),
+  make_tuple(&av1_quantize_lp_c, &av1_quantize_lp_sse2,
+             static_cast<TX_SIZE>(TX_8X8), AOM_BITS_8),
+  make_tuple(&av1_quantize_lp_c, &av1_quantize_lp_sse2,
+             static_cast<TX_SIZE>(TX_4X4), AOM_BITS_8)
+};
+
+INSTANTIATE_TEST_SUITE_P(SSE2, FullPrecisionQuantizeLpTest,
+                         ::testing::ValuesIn(kQParamArraySSE2));
+#endif
+
+}  // namespace
diff --git a/test/ratectrl_test.cc b/test/ratectrl_test.cc
new file mode 100644
index 0000000..4b462e3
--- /dev/null
+++ b/test/ratectrl_test.cc
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/tpl_model.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+TEST(RatectrlTest, QModeGetQIndexTest) {
+  int base_q_index = 36;
+  int gf_update_type = INTNL_ARF_UPDATE;
+  int gf_pyramid_level = 1;
+  int arf_q = 100;
+  int q_index = av1_q_mode_get_q_index(base_q_index, gf_update_type,
+                                       gf_pyramid_level, arf_q);
+  EXPECT_EQ(q_index, arf_q);
+
+  gf_update_type = INTNL_ARF_UPDATE;
+  gf_pyramid_level = 3;
+  q_index = av1_q_mode_get_q_index(base_q_index, gf_update_type,
+                                   gf_pyramid_level, arf_q);
+  EXPECT_LT(q_index, arf_q);
+
+  gf_update_type = LF_UPDATE;
+  q_index = av1_q_mode_get_q_index(base_q_index, gf_update_type,
+                                   gf_pyramid_level, arf_q);
+  EXPECT_EQ(q_index, base_q_index);
+}
+
+#if !CONFIG_REALTIME_ONLY
+// TODO(angiebird): Move this test to tpl_mode_test.cc
+TEST(RatectrlTest, QModeComputeGOPQIndicesTest) {
+  const int base_q_index = 80;
+  double qstep_ratio_list[5] = { 0.5, 1, 1, 1, 0.5 };
+  const aom_bit_depth_t bit_depth = AOM_BITS_8;
+
+  const int gf_frame_index = 0;
+  GF_GROUP gf_group = {};
+  gf_group.size = 5;
+  const int layer_depth[5] = { 1, 3, 2, 3, 1 };
+  const int update_type[5] = { KF_UPDATE, INTNL_ARF_UPDATE,
+                               INTNL_OVERLAY_UPDATE, INTNL_ARF_UPDATE,
+                               ARF_UPDATE };
+
+  for (int i = 0; i < gf_group.size; i++) {
+    gf_group.layer_depth[i] = layer_depth[i];
+    gf_group.update_type[i] = update_type[i];
+  }
+
+  const int arf_q = av1_get_q_index_from_qstep_ratio(
+      base_q_index, qstep_ratio_list[0], bit_depth);
+
+  av1_q_mode_compute_gop_q_indices(gf_frame_index, base_q_index,
+                                   qstep_ratio_list, bit_depth, &gf_group,
+                                   gf_group.q_val);
+
+  for (int i = 0; i < gf_group.size; i++) {
+    if (layer_depth[i] == 1) {
+      EXPECT_EQ(gf_group.q_val[i], arf_q);
+    } else {
+      EXPECT_GT(gf_group.q_val[i], arf_q);
+    }
+  }
+}
+#endif  // !CONFIG_REALTIME_ONLY
+
+}  // namespace
diff --git a/test/rd_test.cc b/test/rd_test.cc
new file mode 100644
index 0000000..0c481fc
--- /dev/null
+++ b/test/rd_test.cc
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <vector>
+
+#include "av1/common/quant_common.h"
+#include "av1/encoder/rd.h"
+#include "aom/aom_codec.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+TEST(RdTest, GetDeltaqOffsetValueTest1) {
+  aom_bit_depth_t bit_depth = AOM_BITS_8;
+  double beta = 4;
+  int q_index = 29;
+  int dc_q_step =
+      av1_dc_quant_QTX(q_index, 0, static_cast<aom_bit_depth_t>(bit_depth));
+  EXPECT_EQ(dc_q_step, 32);
+
+  int ref_new_dc_q_step = static_cast<int>(round(dc_q_step / sqrt(beta)));
+  EXPECT_EQ(ref_new_dc_q_step, 16);
+
+  int delta_q = av1_get_deltaq_offset(bit_depth, q_index, beta);
+  int new_dc_q_step = av1_dc_quant_QTX(q_index, delta_q,
+                                       static_cast<aom_bit_depth_t>(bit_depth));
+
+  EXPECT_EQ(new_dc_q_step, ref_new_dc_q_step);
+}
+
+TEST(RdTest, GetDeltaqOffsetValueTest2) {
+  aom_bit_depth_t bit_depth = AOM_BITS_8;
+  double beta = 1.0 / 4.0;
+  int q_index = 29;
+  int dc_q_step =
+      av1_dc_quant_QTX(q_index, 0, static_cast<aom_bit_depth_t>(bit_depth));
+  EXPECT_EQ(dc_q_step, 32);
+
+  int ref_new_dc_q_step = static_cast<int>(round(dc_q_step / sqrt(beta)));
+  EXPECT_EQ(ref_new_dc_q_step, 64);
+
+  int delta_q = av1_get_deltaq_offset(bit_depth, q_index, beta);
+  int new_dc_q_step = av1_dc_quant_QTX(q_index, delta_q,
+                                       static_cast<aom_bit_depth_t>(bit_depth));
+
+  EXPECT_EQ(new_dc_q_step, ref_new_dc_q_step);
+}
+
+TEST(RdTest, GetDeltaqOffsetBoundaryTest1) {
+  aom_bit_depth_t bit_depth = AOM_BITS_8;
+  double beta = 0.000000001;
+  std::vector<int> q_index_ls = { 254, 255 };
+  for (auto q_index : q_index_ls) {
+    int delta_q = av1_get_deltaq_offset(bit_depth, q_index, beta);
+    EXPECT_EQ(q_index + delta_q, 255);
+  }
+}
+
+TEST(RdTest, GetDeltaqOffsetBoundaryTest2) {
+  aom_bit_depth_t bit_depth = AOM_BITS_8;
+  double beta = 100;
+  std::vector<int> q_index_ls = { 1, 0 };
+  for (auto q_index : q_index_ls) {
+    int delta_q = av1_get_deltaq_offset(bit_depth, q_index, beta);
+    EXPECT_EQ(q_index + delta_q, 0);
+  }
+}
+
+TEST(RdTest, GetDeltaqOffsetUnitaryTest1) {
+  aom_bit_depth_t bit_depth = AOM_BITS_8;
+  double beta = 1;
+  for (int q_index = 0; q_index < 255; ++q_index) {
+    int delta_q = av1_get_deltaq_offset(bit_depth, q_index, beta);
+    EXPECT_EQ(delta_q, 0);
+  }
+}
+
+}  // namespace
diff --git a/test/reconinter_test.cc b/test/reconinter_test.cc
index 7e440c9..ec97db7 100644
--- a/test/reconinter_test.cc
+++ b/test/reconinter_test.cc
@@ -21,7 +21,6 @@
 #include "av1/common/scan.h"
 #include "av1/common/txb_common.h"
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
@@ -51,7 +50,7 @@
  public:
   virtual ~BuildCompDiffwtdMaskTest() {}
 
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
+  virtual void TearDown() {}
   void RunTest(buildcompdiffwtdmaskd_func test_impl, const int is_speed,
                const DIFFWTD_MASK_TYPE type);
 
@@ -79,7 +78,7 @@
     : public ::testing::TestWithParam<BuildCompDiffwtdMaskD16Param> {
  public:
   ~BuildCompDiffwtdMaskD16Test() {}
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
+  virtual void TearDown() {}
   void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
 
  protected:
diff --git a/test/register_state_check.h b/test/register_state_check.h
index d404621..4bf5120 100644
--- a/test/register_state_check.h
+++ b/test/register_state_check.h
@@ -18,18 +18,13 @@
 
 #include "aom/aom_integer.h"
 
-// ASM_REGISTER_STATE_CHECK(asm_function)
-//   Minimally validates the environment pre & post function execution. This
-//   variant should be used with assembly functions which are not expected to
-//   fully restore the system state. See platform implementations of
-//   RegisterStateCheck for details.
-//
-// API_REGISTER_STATE_CHECK(api_function)
-//   Performs all the checks done by ASM_REGISTER_STATE_CHECK() and any
-//   additional checks to ensure the environment is in a consistent state pre &
-//   post function execution. This variant should be used with API functions.
-//   See platform implementations of RegisterStateCheckXXX for details.
-//
+// API_REGISTER_STATE_CHECK(function)
+//   Validates the environment pre & post function execution to ensure the
+//   environment is in a consistent state. This should be used with API
+//   functions and assembly functions which are not expected to fully restore
+//   the system state.
+//   See platform implementations of RegisterStateCheck and
+//   RegisterStateCheckMMX for details.
 
 #if defined(_WIN64) && ARCH_X86_64
 
@@ -81,13 +76,6 @@
   bool initialized_;
   CONTEXT pre_context_;
 };
-
-#define ASM_REGISTER_STATE_CHECK(statement)    \
-  do {                                         \
-    libaom_test::RegisterStateCheck reg_check; \
-    statement;                                 \
-  } while (false)
-
 }  // namespace libaom_test
 
 #else
@@ -95,15 +83,11 @@
 namespace libaom_test {
 
 class RegisterStateCheck {};
-#define ASM_REGISTER_STATE_CHECK(statement) statement
-
 }  // namespace libaom_test
 
 #endif  // _WIN64 && ARCH_X86_64
 
-#if ARCH_X86 || ARCH_X86_64
-#if defined(__GNUC__)
-
+#if (ARCH_X86 || ARCH_X86_64) && defined(__GNUC__)
 namespace libaom_test {
 
 // Checks the FPU tag word pre/post execution to ensure emms has been called.
@@ -129,20 +113,23 @@
 
   uint16_t pre_fpu_env_[14];
 };
-
-#define API_REGISTER_STATE_CHECK(statement)       \
-  do {                                            \
-    libaom_test::RegisterStateCheckMMX reg_check; \
-    ASM_REGISTER_STATE_CHECK(statement);          \
-  } while (false)
-
 }  // namespace libaom_test
 
-#endif  // __GNUC__
-#endif  // ARCH_X86 || ARCH_X86_64
+#else
+namespace libaom_test {
 
-#ifndef API_REGISTER_STATE_CHECK
-#define API_REGISTER_STATE_CHECK ASM_REGISTER_STATE_CHECK
-#endif
+class RegisterStateCheckMMX {};
+}  // namespace libaom_test
+
+#endif  // (ARCH_X86 || ARCH_X86_64) && defined(__GNUC__)
+
+#define API_REGISTER_STATE_CHECK(statement)           \
+  do {                                                \
+    libaom_test::RegisterStateCheck reg_check;        \
+    libaom_test::RegisterStateCheckMMX reg_check_mmx; \
+    statement;                                        \
+    (void)reg_check_mmx;                              \
+    (void)reg_check;                                  \
+  } while (false)
 
 #endif  // AOM_TEST_REGISTER_STATE_CHECK_H_
diff --git a/test/resize_test.cc b/test/resize_test.cc
index cb09a9a..68d6101 100644
--- a/test/resize_test.cc
+++ b/test/resize_test.cc
@@ -203,6 +203,17 @@
 
   virtual void SetUp() { InitializeConfig(GET_PARAM(1)); }
 
+  virtual void PreEncodeFrameHook(libaom_test::VideoSource *video,
+                                  libaom_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      if (GET_PARAM(1) == ::libaom_test::kRealTime) {
+        encoder->Control(AV1E_SET_AQ_MODE, 3);
+        encoder->Control(AOME_SET_CPUUSED, 5);
+        encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
+      }
+    }
+  }
+
   virtual void DecompressedFrameHook(const aom_image_t &img,
                                      aom_codec_pts_t pts) {
     frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h));
@@ -241,6 +252,7 @@
 const unsigned int kStepDownFrame = 3;
 const unsigned int kStepUpFrame = 6;
 
+#if !CONFIG_REALTIME_ONLY
 class ResizeInternalTestLarge : public ResizeTest {
  protected:
 #if WRITE_COMPRESSED_STREAM
@@ -362,6 +374,10 @@
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 }
 
+AV1_INSTANTIATE_TEST_SUITE(ResizeInternalTestLarge,
+                           ::testing::Values(::libaom_test::kOnePassGood));
+#endif
+
 class ResizeRealtimeTest
     : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>,
       public ::libaom_test::EncoderTest {
@@ -375,6 +391,9 @@
                                   libaom_test::Encoder *encoder) {
     if (video->frame() == 0) {
       encoder->Control(AV1E_SET_AQ_MODE, 3);
+      encoder->Control(AV1E_SET_ALLOW_WARPED_MOTION, 0);
+      encoder->Control(AV1E_SET_ENABLE_GLOBAL_MOTION, 0);
+      encoder->Control(AV1E_SET_ENABLE_OBMC, 0);
       encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
       encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
     }
@@ -786,6 +805,7 @@
   }
 }
 
+#if !CONFIG_REALTIME_ONLY
 // This class is used to check if there are any fatal
 // failures while encoding with resize-mode > 0
 class ResizeModeTestLarge
@@ -833,16 +853,6 @@
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 }
 
-AV1_INSTANTIATE_TEST_SUITE(ResizeTest,
-                           ::testing::Values(::libaom_test::kRealTime));
-AV1_INSTANTIATE_TEST_SUITE(ResizeInternalTestLarge,
-                           ::testing::Values(::libaom_test::kOnePassGood));
-AV1_INSTANTIATE_TEST_SUITE(ResizeRealtimeTest,
-                           ::testing::Values(::libaom_test::kRealTime),
-                           ::testing::Range(5, 10));
-AV1_INSTANTIATE_TEST_SUITE(ResizeCspTest,
-                           ::testing::Values(::libaom_test::kRealTime));
-
 // TODO(anyone): Enable below test once resize issues are fixed
 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(ResizeModeTestLarge);
 // AV1_INSTANTIATE_TEST_SUITE(
@@ -851,4 +861,14 @@
 //    ::libaom_test::kTwoPassGood),
 //    ::testing::Values(1, 2), ::testing::Values(8, 12, 16),
 //    ::testing::Values(8, 12, 16), ::testing::Range(2, 7));
+#endif  // !CONFIG_REALTIME_ONLY
+
+AV1_INSTANTIATE_TEST_SUITE(ResizeTest,
+                           ::testing::Values(::libaom_test::kRealTime));
+AV1_INSTANTIATE_TEST_SUITE(ResizeRealtimeTest,
+                           ::testing::Values(::libaom_test::kRealTime),
+                           ::testing::Range(6, 10));
+AV1_INSTANTIATE_TEST_SUITE(ResizeCspTest,
+                           ::testing::Values(::libaom_test::kRealTime));
+
 }  // namespace
diff --git a/test/rt_end_to_end_test.cc b/test/rt_end_to_end_test.cc
index e8a1a40..5e360f2 100644
--- a/test/rt_end_to_end_test.cc
+++ b/test/rt_end_to_end_test.cc
@@ -36,19 +36,22 @@
                            { 6, { { 0, 35.3 }, { 3, 36.2 } } },
                            { 7, { { 0, 34.9 }, { 3, 35.8 } } },
                            { 8, { { 0, 35.0 }, { 3, 35.8 } } },
-                           { 9, { { 0, 34.9 }, { 3, 35.5 } } } } },
+                           { 9, { { 0, 34.9 }, { 3, 35.5 } } },
+                           { 10, { { 0, 34.7 }, { 3, 35.3 } } } } },
                        { "paris_352_288_30.y4m",
                          { { 5, { { 0, 36.2 }, { 3, 36.7 } } },
                            { 6, { { 0, 36.1 }, { 3, 36.5 } } },
                            { 7, { { 0, 35.5 }, { 3, 36.0 } } },
                            { 8, { { 0, 36.0 }, { 3, 36.5 } } },
-                           { 9, { { 0, 35.5 }, { 3, 36.1 } } } } },
+                           { 9, { { 0, 35.5 }, { 3, 36.0 } } },
+                           { 10, { { 0, 35.3 }, { 3, 35.9 } } } } },
                        { "niklas_1280_720_30.y4m",
-                         { { 5, { { 0, 34.4 }, { 3, 34.4 } } },
+                         { { 5, { { 0, 34.4 }, { 3, 34.30 } } },
                            { 6, { { 0, 34.2 }, { 3, 34.2 } } },
                            { 7, { { 0, 33.6 }, { 3, 33.6 } } },
                            { 8, { { 0, 33.48 }, { 3, 33.48 } } },
-                           { 9, { { 0, 33.4 }, { 3, 33.4 } } } } } };
+                           { 9, { { 0, 33.4 }, { 3, 33.4 } } },
+                           { 10, { { 0, 33.2 }, { 3, 33.2 } } } } } };
 
 typedef struct {
   const char *filename;
@@ -125,6 +128,7 @@
       encoder->Control(AV1E_SET_COEFF_COST_UPD_FREQ, 2);
       encoder->Control(AV1E_SET_MODE_COST_UPD_FREQ, 2);
       encoder->Control(AV1E_SET_MV_COST_UPD_FREQ, 2);
+      encoder->Control(AV1E_SET_DV_COST_UPD_FREQ, 2);
     }
   }
 
@@ -175,13 +179,13 @@
 TEST_P(RTEndToEndTestThreaded, EndtoEndPSNRTest) { DoTest(); }
 
 AV1_INSTANTIATE_TEST_SUITE(RTEndToEndTest, ::testing::ValuesIn(kTestVectors),
-                           ::testing::Range(5, 10),
+                           ::testing::Range(5, 11),
                            ::testing::Values<unsigned int>(0, 3),
                            ::testing::Values(1), ::testing::Values(1));
 
 AV1_INSTANTIATE_TEST_SUITE(RTEndToEndTestThreaded,
                            ::testing::ValuesIn(kTestVectors),
-                           ::testing::Range(5, 10),
+                           ::testing::Range(5, 11),
                            ::testing::Values<unsigned int>(0, 3),
                            ::testing::Range(2, 5), ::testing::Range(2, 5));
 }  // namespace
diff --git a/test/sad_test.cc b/test/sad_test.cc
index afd84a8..a73c849 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -20,7 +20,6 @@
 #include "config/aom_dsp_rtcd.h"
 
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 #include "aom/aom_codec.h"
@@ -129,7 +128,7 @@
     comp_pred16_test_ = NULL;
   }
 
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
+  virtual void TearDown() {}
 
  protected:
   // Handle up to 4 128x128 blocks, with stride up to 256
@@ -363,7 +362,7 @@
     const uint8_t *references[] = { GetReference(0), GetReference(1),
                                     GetReference(2), GetReference(3) };
 
-    ASM_REGISTER_STATE_CHECK(GET_PARAM(2)(
+    API_REGISTER_STATE_CHECK(GET_PARAM(2)(
         source_data_, source_stride_, references, reference_stride_, results));
   }
 
@@ -398,7 +397,7 @@
     const uint8_t *references[] = { GetReference(0), GetReference(1),
                                     GetReference(2), GetReference(3) };
 
-    ASM_REGISTER_STATE_CHECK(GET_PARAM(2)(
+    API_REGISTER_STATE_CHECK(GET_PARAM(2)(
         source_data_, source_stride_, references, reference_stride_, results));
   }
 
@@ -433,7 +432,7 @@
     const uint8_t *references[] = { GetReference(0), GetReference(1),
                                     GetReference(2), GetReference(3) };
 
-    ASM_REGISTER_STATE_CHECK(GET_PARAM(2)(source_data_, source_stride_,
+    API_REGISTER_STATE_CHECK(GET_PARAM(2)(source_data_, source_stride_,
                                           references, reference_stride_,
                                           second_pred_, results));
   }
@@ -469,7 +468,7 @@
     unsigned int ret;
     const uint8_t *const reference = GetReference(block_idx);
 
-    ASM_REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_,
+    API_REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_,
                                                 reference, reference_stride_));
     return ret;
   }
@@ -500,7 +499,7 @@
     unsigned int ret;
     const uint8_t *const reference = GetReference(block_idx);
 
-    ASM_REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_,
+    API_REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_,
                                                 reference, reference_stride_));
     return ret;
   }
@@ -531,7 +530,7 @@
     unsigned int ret;
     const uint8_t *const reference = GetReference(block_idx);
 
-    ASM_REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_,
+    API_REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_,
                                                 reference, reference_stride_,
                                                 second_pred_));
     return ret;
@@ -556,7 +555,7 @@
   void dist_wtd_comp_avg(int block_idx) {
     const uint8_t *const reference = GetReference(block_idx);
 
-    ASM_REGISTER_STATE_CHECK(GET_PARAM(2)(comp_pred_test_, second_pred_, width_,
+    API_REGISTER_STATE_CHECK(GET_PARAM(2)(comp_pred_test_, second_pred_, width_,
                                           height_, reference, reference_stride_,
                                           &jcp_param_));
   }
@@ -564,8 +563,8 @@
   void CheckCompAvg() {
     for (int j = 0; j < 2; ++j) {
       for (int i = 0; i < 4; ++i) {
-        jcp_param_.fwd_offset = quant_dist_lookup_table[j][i][0];
-        jcp_param_.bck_offset = quant_dist_lookup_table[j][i][1];
+        jcp_param_.fwd_offset = quant_dist_lookup_table[i][j];
+        jcp_param_.bck_offset = quant_dist_lookup_table[i][1 - j];
 
         ReferenceDistWtdCompAvg(0);
         dist_wtd_comp_avg(0);
@@ -589,7 +588,7 @@
     unsigned int ret;
     const uint8_t *const reference = GetReference(block_idx);
 
-    ASM_REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_,
+    API_REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_,
                                                 reference, reference_stride_,
                                                 GET_PARAM(0), GET_PARAM(1)));
     return ret;
@@ -623,7 +622,7 @@
     unsigned int ret;
     const uint8_t *const reference = GetReference(block_idx);
 
-    ASM_REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_,
+    API_REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_,
                                                 reference, reference_stride_,
                                                 second_pred_, &jcp_param_));
     return ret;
@@ -632,8 +631,8 @@
   void CheckSAD() {
     for (int j = 0; j < 2; ++j) {
       for (int i = 0; i < 4; ++i) {
-        jcp_param_.fwd_offset = quant_dist_lookup_table[j][i][0];
-        jcp_param_.bck_offset = quant_dist_lookup_table[j][i][1];
+        jcp_param_.fwd_offset = quant_dist_lookup_table[i][j];
+        jcp_param_.bck_offset = quant_dist_lookup_table[i][1 - j];
 
         const unsigned int reference_sad = ReferenceDistWtdSADavg(0);
         const unsigned int exp_sad = dist_wtd_SAD_avg(0);
@@ -705,9 +704,7 @@
   source_stride_ = tmp_stride;
 }
 
-#define SPEED_TEST (0)
-#if SPEED_TEST
-TEST_P(SADTest, Speed) {
+TEST_P(SADTest, DISABLED_Speed) {
   const int tmp_stride = source_stride_;
   source_stride_ >>= 1;
   FillRandom(source_data_, source_stride_);
@@ -715,7 +712,6 @@
   SpeedSAD();
   source_stride_ = tmp_stride;
 }
-#endif
 
 TEST_P(SADSkipTest, MaxRef) {
   FillConstant(source_data_, source_stride_, 0);
@@ -762,8 +758,7 @@
   source_stride_ = tmp_stride;
 }
 
-#if SPEED_TEST
-TEST_P(SADSkipTest, Speed) {
+TEST_P(SADSkipTest, DISABLED_Speed) {
   const int tmp_stride = source_stride_;
   source_stride_ >>= 1;
   FillRandom(source_data_, source_stride_);
@@ -771,7 +766,6 @@
   SpeedSAD();
   source_stride_ = tmp_stride;
 }
-#endif
 
 TEST_P(SADavgTest, MaxRef) {
   FillConstant(source_data_, source_stride_, 0);
@@ -1020,8 +1014,7 @@
   source_data_ = tmp_source_data;
 }
 
-#if SPEED_TEST
-TEST_P(SADx4Test, Speed) {
+TEST_P(SADx4Test, DISABLED_Speed) {
   FillRandom(source_data_, source_stride_);
   FillRandom(GetReference(0), reference_stride_);
   FillRandom(GetReference(1), reference_stride_);
@@ -1029,7 +1022,6 @@
   FillRandom(GetReference(3), reference_stride_);
   SpeedSAD();
 }
-#endif
 
 // SADSkipx4
 TEST_P(SADSkipx4Test, MaxRef) {
@@ -1104,8 +1096,7 @@
   source_data_ = tmp_source_data;
 }
 
-#if SPEED_TEST
-TEST_P(SADSkipx4Test, Speed) {
+TEST_P(SADSkipx4Test, DISABLED_Speed) {
   FillRandom(source_data_, source_stride_);
   FillRandom(GetReference(0), reference_stride_);
   FillRandom(GetReference(1), reference_stride_);
@@ -1113,12 +1104,10 @@
   FillRandom(GetReference(3), reference_stride_);
   SpeedSAD();
 }
-#endif
 
 using std::make_tuple;
 
-#if SPEED_TEST
-TEST_P(SADx4AvgTest, Speed) {
+TEST_P(SADx4AvgTest, DISABLED_Speed) {
   int tmp_stride = reference_stride_;
   reference_stride_ >>= 1;
   FillRandom(source_data_, source_stride_);
@@ -1130,7 +1119,6 @@
   SpeedSAD();
   reference_stride_ = tmp_stride;
 }
-#endif
 
 TEST_P(SADx4AvgTest, MaxRef) {
   FillConstant(source_data_, source_stride_, 0);
diff --git a/test/selfguided_filter_test.cc b/test/selfguided_filter_test.cc
index 48ec461..94e89ae 100644
--- a/test/selfguided_filter_test.cc
+++ b/test/selfguided_filter_test.cc
@@ -17,7 +17,6 @@
 #include "config/av1_rtcd.h"
 
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 
@@ -45,7 +44,7 @@
   virtual ~AV1SelfguidedFilterTest() {}
   virtual void SetUp() {}
 
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
+  virtual void TearDown() {}
 
  protected:
   void RunSpeedTest() {
@@ -231,7 +230,7 @@
   virtual ~AV1HighbdSelfguidedFilterTest() {}
   virtual void SetUp() {}
 
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
+  virtual void TearDown() {}
 
  protected:
   void RunSpeedTest() {
diff --git a/test/sharpness_test.cc b/test/sharpness_test.cc
new file mode 100644
index 0000000..d50d0a3
--- /dev/null
+++ b/test/sharpness_test.cc
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <unordered_map>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+
+namespace {
+const unsigned int kFrames = 10;
+const int kBitrate = 500;
+const unsigned int kCqLevel = 18;
+
+// List of psnr thresholds for different test combinations
+// keys: test-mode, cpu-used, sharpness.
+const std::unordered_map<
+    int, std::unordered_map<int, std::unordered_map<int, double>>>
+    kPsnrThreshold = { { static_cast<int>(::libaom_test::kTwoPassGood),
+                         { { 2, { { 2, 37.6 }, { 5, 37.6 } } },
+                           { 4, { { 2, 37.5 }, { 5, 37.5 } } },
+                           { 6, { { 2, 37.5 }, { 5, 37.5 } } } } },
+                       { static_cast<int>(::libaom_test::kAllIntra),
+                         { { 3, { { 2, 42.2 }, { 5, 42.2 } } },
+                           { 6, { { 2, 41.8 }, { 4, 41.9 }, { 5, 41.9 } } },
+                           { 9, { { 2, 41.1 }, { 5, 41.1 } } } } } };
+
+// This class is used to test sharpness parameter configured through control
+// call using AOME_SET_SHARPNESS for different encoder configurations.
+class SharpnessTest
+    : public ::libaom_test::CodecTestWith3Params<libaom_test::TestMode, int,
+                                                 int>,
+      public ::libaom_test::EncoderTest {
+ protected:
+  SharpnessTest()
+      : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+        cpu_used_(GET_PARAM(2)), sharpness_level_(GET_PARAM(3)), psnr_(0.0),
+        nframes_(0) {}
+
+  ~SharpnessTest() override {}
+
+  void SetUp() override {
+    InitializeConfig(encoding_mode_);
+    if (encoding_mode_ == ::libaom_test::kTwoPassGood) {
+      cfg_.rc_target_bitrate = kBitrate;
+      cfg_.g_lag_in_frames = 5;
+    }
+  }
+
+  void BeginPassHook(unsigned int) override {
+    psnr_ = 0.0;
+    nframes_ = 0;
+  }
+
+  void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) override {
+    psnr_ += pkt->data.psnr.psnr[0];
+    nframes_++;
+  }
+
+  void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+                          ::libaom_test::Encoder *encoder) override {
+    if (video->frame() == 0) {
+      encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+      encoder->Control(AOME_SET_SHARPNESS, sharpness_level_);
+      if (encoding_mode_ == ::libaom_test::kTwoPassGood) {
+        encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+        encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+        encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+      } else if (encoding_mode_ == ::libaom_test::kAllIntra) {
+        encoder->Control(AOME_SET_CQ_LEVEL, kCqLevel);
+      }
+    }
+  }
+
+  double GetAveragePsnr() const {
+    if (nframes_) return psnr_ / nframes_;
+    return 0.0;
+  }
+
+  double GetPsnrThreshold() {
+    return kPsnrThreshold.at(encoding_mode_).at(cpu_used_).at(sharpness_level_);
+  }
+
+  void DoTest() {
+    init_flags_ = AOM_CODEC_USE_PSNR;
+
+    std::unique_ptr<libaom_test::VideoSource> video(
+        new libaom_test::Y4mVideoSource("paris_352_288_30.y4m", 0, kFrames));
+    ASSERT_TRUE(video.get() != NULL);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+    const double psnr = GetAveragePsnr();
+    EXPECT_GT(psnr, GetPsnrThreshold())
+        << "encoding mode = " << encoding_mode_ << ", cpu used = " << cpu_used_
+        << ", sharpness level = " << sharpness_level_;
+  }
+
+ private:
+  const libaom_test::TestMode encoding_mode_;
+  const int cpu_used_;
+  const int sharpness_level_;
+  double psnr_;
+  unsigned int nframes_;
+};
+
+class SharpnessTestLarge : public SharpnessTest {};
+
+class SharpnessAllIntraTest : public SharpnessTest {};
+
+class SharpnessAllIntraTestLarge : public SharpnessTest {};
+
+TEST_P(SharpnessTestLarge, SharpnessPSNRTest) { DoTest(); }
+
+TEST_P(SharpnessAllIntraTest, SharpnessPSNRTest) { DoTest(); }
+
+TEST_P(SharpnessAllIntraTestLarge, SharpnessPSNRTest) { DoTest(); }
+
+AV1_INSTANTIATE_TEST_SUITE(SharpnessTestLarge,
+                           ::testing::Values(::libaom_test::kTwoPassGood),
+                           ::testing::Values(2, 4, 6),  // cpu_used
+                           ::testing::Values(2, 5));    // sharpness level
+
+AV1_INSTANTIATE_TEST_SUITE(SharpnessAllIntraTest,
+                           ::testing::Values(::libaom_test::kAllIntra),
+                           ::testing::Values(6),   // cpu_used
+                           ::testing::Values(4));  // sharpness level
+
+AV1_INSTANTIATE_TEST_SUITE(SharpnessAllIntraTestLarge,
+                           ::testing::Values(::libaom_test::kAllIntra),
+                           ::testing::Values(3, 6, 9),  // cpu_used
+                           ::testing::Values(2, 5));    // sharpness level
+}  // namespace
diff --git a/test/simd_impl.h b/test/simd_impl.h
index 61fda00..8535e37 100644
--- a/test/simd_impl.h
+++ b/test/simd_impl.h
@@ -13,7 +13,6 @@
 
 #define SIMD_CHECK 1
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "aom_dsp/aom_simd_inline.h"
 #include "aom_dsp/simd/v256_intrinsics_c.h"
@@ -30,7 +29,7 @@
     name = std::get<2>(this->GetParam());
   }
 
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
+  virtual void TearDown() {}
 
  protected:
   uint32_t mask, maskwidth;
diff --git a/test/sse_sum_test.cc b/test/sse_sum_test.cc
index 3e24e89..ecb89e4 100644
--- a/test/sse_sum_test.cc
+++ b/test/sse_sum_test.cc
@@ -21,7 +21,6 @@
 
 #include "aom_ports/mem.h"
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 #include "test/function_equivalence_test.h"
@@ -50,10 +49,7 @@
     ASSERT_TRUE(src_ != NULL);
   }
 
-  virtual void TearDown() {
-    libaom_test::ClearSystemState();
-    aom_free(src_);
-  }
+  virtual void TearDown() { aom_free(src_); }
   void RunTest(int isRandom);
   void RunSpeedTest();
 
diff --git a/test/subtract_test.cc b/test/subtract_test.cc
index 9ac56fc..65724e1 100644
--- a/test/subtract_test.cc
+++ b/test/subtract_test.cc
@@ -17,7 +17,6 @@
 #include "config/aom_dsp_rtcd.h"
 
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 #include "av1/common/blockd.h"
@@ -33,7 +32,7 @@
 
 class AV1SubtractBlockTest : public ::testing::TestWithParam<SubtractFunc> {
  public:
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
+  virtual void TearDown() {}
 };
 
 using libaom_test::ACMRandom;
diff --git a/test/sum_squares_test.cc b/test/sum_squares_test.cc
index 4f26a3d..65fde3e 100644
--- a/test/sum_squares_test.cc
+++ b/test/sum_squares_test.cc
@@ -21,7 +21,6 @@
 
 #include "aom_ports/mem.h"
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 #include "test/function_equivalence_test.h"
@@ -52,10 +51,7 @@
     ASSERT_TRUE(src_ != NULL);
   }
 
-  virtual void TearDown() {
-    libaom_test::ClearSystemState();
-    aom_free(src_);
-  }
+  virtual void TearDown() { aom_free(src_); }
   void RunTest(int isRandom);
   void RunSpeedTest();
 
@@ -103,7 +99,7 @@
     }
     const uint64_t res_ref = params_.ref_func(src_, stride, width, height);
     uint64_t res_tst;
-    ASM_REGISTER_STATE_CHECK(res_tst =
+    API_REGISTER_STATE_CHECK(res_tst =
                                  params_.tst_func(src_, stride, width, height));
 
     if (!failed) {
@@ -208,7 +204,7 @@
 
     const uint64_t ref_res = params_.ref_func(src, N);
     uint64_t tst_res;
-    ASM_REGISTER_STATE_CHECK(tst_res = params_.tst_func(src, N));
+    API_REGISTER_STATE_CHECK(tst_res = params_.tst_func(src, N));
 
     ASSERT_EQ(ref_res, tst_res);
   }
@@ -229,7 +225,7 @@
 
     const uint64_t ref_res = params_.ref_func(src, N);
     uint64_t tst_res;
-    ASM_REGISTER_STATE_CHECK(tst_res = params_.tst_func(src, N));
+    API_REGISTER_STATE_CHECK(tst_res = params_.tst_func(src, N));
 
     ASSERT_EQ(ref_res, tst_res);
   }
@@ -268,7 +264,6 @@
   }
 
   virtual void TearDown() {
-    libaom_test::ClearSystemState();
     aom_free(src_);
     aom_free(ref_);
   }
@@ -453,10 +448,7 @@
     ASSERT_TRUE(src_ != NULL);
   }
 
-  virtual void TearDown() {
-    libaom_test::ClearSystemState();
-    aom_free(src_);
-  }
+  virtual void TearDown() { aom_free(src_); }
   void RunTest(int isRandom, int width, int height, int run_times);
 
   void GenRandomData(int width, int height, int stride) {
@@ -599,10 +591,7 @@
     ASSERT_TRUE(src_ != NULL);
   }
 
-  virtual void TearDown() {
-    libaom_test::ClearSystemState();
-    aom_free(src_);
-  }
+  virtual void TearDown() { aom_free(src_); }
   void RunTest(int isRandom);
   void RunSpeedTest();
 
@@ -651,7 +640,7 @@
 
     const uint64_t res_ref = params_.ref_func(src_, stride, width, height);
     uint64_t res_tst;
-    ASM_REGISTER_STATE_CHECK(res_tst =
+    API_REGISTER_STATE_CHECK(res_tst =
                                  params_.tst_func(src_, stride, width, height));
 
     if (!failed) {
@@ -730,10 +719,7 @@
     ASSERT_TRUE(src_ != NULL);
   }
 
-  virtual void TearDown() {
-    libaom_test::ClearSystemState();
-    aom_free(src_);
-  }
+  virtual void TearDown() { aom_free(src_); }
   void RunTest(int isRandom);
   void RunSpeedTest();
 
@@ -783,7 +769,7 @@
     const uint64_t res_ref =
         params_.ref_func(CONVERT_TO_BYTEPTR(src_), stride, width, height);
     uint64_t res_tst;
-    ASM_REGISTER_STATE_CHECK(
+    API_REGISTER_STATE_CHECK(
         res_tst =
             params_.tst_func(CONVERT_TO_BYTEPTR(src_), stride, width, height));
 
diff --git a/test/svc_datarate_test.cc b/test/svc_datarate_test.cc
index 85e9594..243fcc1 100644
--- a/test/svc_datarate_test.cc
+++ b/test/svc_datarate_test.cc
@@ -73,6 +73,7 @@
     memset(&layer_id_, 0, sizeof(aom_svc_layer_id_t));
     memset(&svc_params_, 0, sizeof(aom_svc_params_t));
     memset(&ref_frame_config_, 0, sizeof(aom_svc_ref_frame_config_t));
+    memset(&ref_frame_comp_pred_, 0, sizeof(aom_svc_ref_frame_comp_pred_t));
     drop_frames_ = 0;
     for (int i = 0; i < 1000; i++) drop_frames_list_[i] = 1000;
     decoded_nframes_ = 0;
@@ -80,6 +81,8 @@
     mismatch_psnr_ = 0.0;
     set_frame_level_er_ = 0;
     multi_ref_ = 0;
+    use_fixed_mode_svc_ = 0;
+    comp_pred_ = 0;
   }
 
   virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
@@ -107,11 +110,16 @@
     }
     // Set the reference/update flags, layer_id, and reference_map
     // buffer index.
-    frame_flags_ =
-        set_layer_pattern(video->frame(), &layer_id_, &ref_frame_config_,
-                          spatial_layer_id, multi_ref_);
+    frame_flags_ = set_layer_pattern(video->frame(), &layer_id_,
+                                     &ref_frame_config_, &ref_frame_comp_pred_,
+                                     spatial_layer_id, multi_ref_, comp_pred_);
     encoder->Control(AV1E_SET_SVC_LAYER_ID, &layer_id_);
-    encoder->Control(AV1E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config_);
+    // The AV1E_SET_SVC_REF_FRAME_CONFIG and AV1E_SET_SVC_REF_FRAME_COMP_PRED
+    // APIs are for the flexible SVC mode (i.e., use_fixed_mode_svc == 0).
+    if (!use_fixed_mode_svc_) {
+      encoder->Control(AV1E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config_);
+      encoder->Control(AV1E_SET_SVC_REF_FRAME_COMP_PRED, &ref_frame_comp_pred_);
+    }
     if (set_frame_level_er_) {
       int mode =
           (layer_id_.spatial_layer_id > 0 || layer_id_.temporal_layer_id > 0);
@@ -165,13 +173,15 @@
   unsigned int GetDecodedFrames() { return decoded_nframes_; }
 
   // Layer pattern configuration.
-  virtual int set_layer_pattern(int frame_cnt, aom_svc_layer_id_t *layer_id,
-                                aom_svc_ref_frame_config_t *ref_frame_config,
-                                int spatial_layer, int multi_ref) {
+  virtual int set_layer_pattern(
+      int frame_cnt, aom_svc_layer_id_t *layer_id,
+      aom_svc_ref_frame_config_t *ref_frame_config,
+      aom_svc_ref_frame_comp_pred_t *ref_frame_comp_pred, int spatial_layer,
+      int multi_ref, int comp_pred) {
     int lag_index = 0;
     int base_count = frame_cnt >> 2;
     layer_id->spatial_layer_id = spatial_layer;
-    // Set the referende map buffer idx for the 7 references:
+    // Set the reference map buffer idx for the 7 references:
     // LAST_FRAME (0), LAST2_FRAME(1), LAST3_FRAME(2), GOLDEN_FRAME(3),
     // BWDREF_FRAME(4), ALTREF2_FRAME(5), ALTREF_FRAME(6).
     for (int i = 0; i < INTER_REFS_PER_FRAME; i++) {
@@ -179,6 +189,11 @@
       ref_frame_config->reference[i] = 0;
     }
     for (int i = 0; i < REF_FRAMES; i++) ref_frame_config->refresh[i] = 0;
+    if (comp_pred) {
+      ref_frame_comp_pred->use_comp_pred[0] = 1;  // GOLDEN_LAST
+      ref_frame_comp_pred->use_comp_pred[1] = 1;  // LAST2_LAST
+      ref_frame_comp_pred->use_comp_pred[2] = 1;  // ALTREF_LAST
+    }
     // Set layer_flags to 0 when using ref_frame_config->reference.
     int layer_flags = 0;
     // Always reference LAST.
@@ -459,7 +474,7 @@
     for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
       ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.80)
           << " The datarate for the file is lower than target by too much!";
-      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.30)
+      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.35)
           << " The datarate for the file is greater than target by too much!";
     }
     // Top temporal layers are non_reference, so exlcude them from
@@ -690,6 +705,48 @@
     }
   }
 
+  virtual void BasicRateTargetingFixedModeSVC3TL3SLHDTest() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 0;
+
+    ::libaom_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+    const int bitrate_array[2] = { 600, 1200 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    ResetModel();
+    number_temporal_layers_ = 3;
+    number_spatial_layers_ = 3;
+    use_fixed_mode_svc_ = 1;
+    // SL0
+    const int bitrate_sl0 = 1 * cfg_.rc_target_bitrate / 8;
+    target_layer_bitrate_[0] = 50 * bitrate_sl0 / 100;
+    target_layer_bitrate_[1] = 70 * bitrate_sl0 / 100;
+    target_layer_bitrate_[2] = bitrate_sl0;
+    // SL1
+    const int bitrate_sl1 = 3 * cfg_.rc_target_bitrate / 8;
+    target_layer_bitrate_[3] = 50 * bitrate_sl1 / 100;
+    target_layer_bitrate_[4] = 70 * bitrate_sl1 / 100;
+    target_layer_bitrate_[5] = bitrate_sl1;
+    // SL2
+    const int bitrate_sl2 = 4 * cfg_.rc_target_bitrate / 8;
+    target_layer_bitrate_[6] = 50 * bitrate_sl2 / 100;
+    target_layer_bitrate_[7] = 70 * bitrate_sl2 / 100;
+    target_layer_bitrate_[8] = bitrate_sl2;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
+      ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.70)
+          << " The datarate for the file is lower than target by too much!";
+      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.45)
+          << " The datarate for the file is greater than target by too much!";
+    }
+  }
+
   virtual void BasicRateTargetingSVC3TL3SLHDMT2Test() {
     cfg_.rc_buf_initial_sz = 500;
     cfg_.rc_buf_optimal_sz = 500;
@@ -939,7 +996,7 @@
     for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
       ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.80)
           << " The datarate for the file is lower than target by too much!";
-      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.30)
+      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.35)
           << " The datarate for the file is greater than target by too much!";
     }
     // Test that no mismatches have been found.
@@ -984,7 +1041,7 @@
     for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
       ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.80)
           << " The datarate for the file is lower than target by too much!";
-      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.30)
+      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.35)
           << " The datarate for the file is greater than target by too much!";
     }
     // Test that no mismatches have been found.
@@ -1029,7 +1086,7 @@
     for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
       ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.80)
           << " The datarate for the file is lower than target by too much!";
-      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.30)
+      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.35)
           << " The datarate for the file is greater than target by too much!";
     }
     // Test that no mismatches have been found.
@@ -1075,7 +1132,7 @@
     for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
       ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.80)
           << " The datarate for the file is lower than target by too much!";
-      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.30)
+      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.35)
           << " The datarate for the file is greater than target by too much!";
     }
     // Test that no mismatches have been found.
@@ -1085,6 +1142,40 @@
     EXPECT_EQ((int)GetMismatchFrames(), 0);
   }
 
+  virtual void BasicRateTargetingSVC3TL1SLMultiRefCompoundTest() {
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_min_quantizer = 0;
+    cfg_.rc_max_quantizer = 63;
+    cfg_.rc_end_usage = AOM_CBR;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 0;
+
+    ::libaom_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30,
+                                         1, 0, 400);
+    cfg_.g_w = 640;
+    cfg_.g_h = 480;
+    const int bitrate_array[2] = { 400, 800 };
+    cfg_.rc_target_bitrate = bitrate_array[GET_PARAM(4)];
+    ResetModel();
+    multi_ref_ = 1;
+    comp_pred_ = 1;
+    number_temporal_layers_ = 3;
+    number_spatial_layers_ = 1;
+    target_layer_bitrate_[0] = 50 * cfg_.rc_target_bitrate / 100;
+    target_layer_bitrate_[1] = 70 * cfg_.rc_target_bitrate / 100;
+    target_layer_bitrate_[2] = cfg_.rc_target_bitrate;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    for (int i = 0; i < number_temporal_layers_ * number_spatial_layers_; i++) {
+      ASSERT_GE(effective_datarate_tl[i], target_layer_bitrate_[i] * 0.80)
+          << " The datarate for the file is lower than target by too much!";
+      ASSERT_LE(effective_datarate_tl[i], target_layer_bitrate_[i] * 1.60)
+          << " The datarate for the file is greater than target by too much!";
+    }
+  }
+
   int layer_frame_cnt_;
   int superframe_cnt_;
   int number_temporal_layers_;
@@ -1093,6 +1184,7 @@
   int target_layer_bitrate_[AOM_MAX_LAYERS];
   aom_svc_params_t svc_params_;
   aom_svc_ref_frame_config_t ref_frame_config_;
+  aom_svc_ref_frame_comp_pred_t ref_frame_comp_pred_;
   aom_svc_layer_id_t layer_id_;
   double effective_datarate_tl[AOM_MAX_LAYERS];
   unsigned int drop_frames_;
@@ -1102,6 +1194,8 @@
   double mismatch_psnr_;
   int set_frame_level_er_;
   int multi_ref_;
+  int use_fixed_mode_svc_;
+  int comp_pred_;
 };
 
 // Check basic rate targeting for CBR, for 3 temporal layers, 1 spatial.
@@ -1143,6 +1237,12 @@
 }
 
 // Check basic rate targeting for CBR, for 3 spatial, 3 temporal layers,
+// for fixed mode SVC.
+TEST_P(DatarateTestSVC, BasicRateTargetingFixedModeSVC3TL3SLHD) {
+  BasicRateTargetingFixedModeSVC3TL3SLHDTest();
+}
+
+// Check basic rate targeting for CBR, for 3 spatial, 3 temporal layers,
 // for 2 threads, 2 tile_columns, row-mt enabled.
 TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL3SLHDMT2) {
   BasicRateTargetingSVC3TL3SLHDMT2Test();
@@ -1206,6 +1306,13 @@
   BasicRateTargetingSVC3TL1SLDropAllEnhFrameERTest();
 }
 
+// Check basic rate targeting for CBR, for 3 temporal layers, 1 spatial layer,
+// with compound prediction on, for pattern with two additional references
+// (golden and altref), both updated on base TL0 frames.
+TEST_P(DatarateTestSVC, BasicRateTargetingSVC3TL1SLMultiRefCompound) {
+  BasicRateTargetingSVC3TL1SLMultiRefCompoundTest();
+}
+
 AV1_INSTANTIATE_TEST_SUITE(DatarateTestSVC,
                            ::testing::Values(::libaom_test::kRealTime),
                            ::testing::Range(7, 10),
diff --git a/test/temporal_filter_test.cc b/test/temporal_filter_test.cc
index d665c85..6b933d8 100644
--- a/test/temporal_filter_test.cc
+++ b/test/temporal_filter_test.cc
@@ -24,7 +24,6 @@
 #include "av1/encoder/encoder.h"
 #include "av1/encoder/temporal_filter.h"
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
 #include "test/function_equivalence_test.h"
@@ -72,7 +71,6 @@
   }
 
   virtual void TearDown() {
-    libaom_test::ClearSystemState();
     aom_free(src1_);
     aom_free(src2_);
   }
@@ -325,7 +323,6 @@
   }
 
   virtual void TearDown() {
-    libaom_test::ClearSystemState();
     aom_free(src1_);
     aom_free(src2_);
   }
diff --git a/test/test-data.sha1 b/test/test-data.sha1
index b8cd5ce..686d055 100644
--- a/test/test-data.sha1
+++ b/test/test-data.sha1
@@ -1,3 +1,4 @@
+a0edab4ab4054127474074d967a33616ccdccc76 *hantro_collage_w176h144.yuv
 d5dfb0151c9051f8c85999255645d7a23916d3c0 *hantro_collage_w352h288.yuv
 b87815bf86020c592ccc7a846ba2e28ec8043902 *hantro_odd.yuv
 26b7f64399b84db4b4c9c915d743ec5c2619d4b9 *invalid-bug-1814.ivf
@@ -36,6 +37,8 @@
 d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-oss-fuzz-16437.ivf.res.2
 e821070cea8eb687be102a1a118e0341c2e9df69 *invalid-oss-fuzz-24706.ivf
 d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-oss-fuzz-24706.ivf.res
+c0c32af28c5c6672d14e76d197894723e8a07b07 *invalid-oss-fuzz-33030.ivf
+fb38337e7d6203618fcfce4bc2dc17d5a4f00638 *invalid-oss-fuzz-33030.ivf.res
 ccbe4081557eb44820a0e6337c4a094421826b9a *invalid-oss-fuzz-9288.ivf
 67c54283fe1a26ccf02cc991e4f9a1eea3ac5e78 *invalid-oss-fuzz-9288.ivf.res
 c0960f032484579f967881cc025b71cfd7a79ee1 *invalid-oss-fuzz-9463.ivf
diff --git a/test/test.cmake b/test/test.cmake
index a7b0c9a..3efcfac 100644
--- a/test/test.cmake
+++ b/test/test.cmake
@@ -19,17 +19,19 @@
 include("${AOM_ROOT}/test/test_data_util.cmake")
 
 set(AOM_UNIT_TEST_DATA_LIST_FILE "${AOM_ROOT}/test/test-data.sha1")
+set(AOM_IDE_TEST_FOLDER "test")
+set(AOM_IDE_TESTDATA_FOLDER "testdata")
 
 list(APPEND AOM_UNIT_TEST_WRAPPER_SOURCES "${AOM_GEN_SRC_DIR}/usage_exit.c"
             "${AOM_ROOT}/test/test_libaom.cc")
 
 list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
             "${AOM_ROOT}/test/acm_random.h"
+            "${AOM_ROOT}/test/aom_image_test.cc"
             "${AOM_ROOT}/test/aom_integer_test.cc"
             "${AOM_ROOT}/test/av1_config_test.cc"
             "${AOM_ROOT}/test/av1_key_value_api_test.cc"
             "${AOM_ROOT}/test/block_test.cc"
-            "${AOM_ROOT}/test/clear_system_state.h"
             "${AOM_ROOT}/test/codec_factory.h"
             "${AOM_ROOT}/test/function_equivalence_test.h"
             "${AOM_ROOT}/test/log2_test.cc"
@@ -48,7 +50,7 @@
               "${AOM_ROOT}/test/decode_test_driver.h")
 endif()
 
-if(CONFIG_INTERNAL_STATS)
+if(CONFIG_INTERNAL_STATS AND CONFIG_AV1_HIGHBITDEPTH)
   list(APPEND AOM_UNIT_TEST_COMMON_SOURCES
               "${AOM_ROOT}/test/hbd_metrics_test.cc")
 endif()
@@ -59,12 +61,19 @@
             "${AOM_ROOT}/test/invalid_file_test.cc"
             "${AOM_ROOT}/test/test_vector_test.cc"
             "${AOM_ROOT}/test/ivf_video_source.h")
+if(CONFIG_REALTIME_ONLY)
+  list(REMOVE_ITEM AOM_UNIT_TEST_DECODER_SOURCES
+                   "${AOM_ROOT}/test/invalid_file_test.cc"
+                   "${AOM_ROOT}/test/test_vector_test.cc")
+endif()
 
 list(APPEND AOM_UNIT_TEST_ENCODER_SOURCES
             "${AOM_ROOT}/test/active_map_test.cc"
             "${AOM_ROOT}/test/aq_segment_test.cc"
+            "${AOM_ROOT}/test/av1_external_partition_test.cc"
             "${AOM_ROOT}/test/borders_test.cc"
             "${AOM_ROOT}/test/cpu_speed_test.cc"
+            "${AOM_ROOT}/test/cpu_used_firstpass_test.cc"
             "${AOM_ROOT}/test/datarate_test.cc"
             "${AOM_ROOT}/test/datarate_test.h"
             "${AOM_ROOT}/test/svc_datarate_test.cc"
@@ -72,7 +81,7 @@
             "${AOM_ROOT}/test/encode_small_width_height_test.cc"
             "${AOM_ROOT}/test/encode_test_driver.cc"
             "${AOM_ROOT}/test/encode_test_driver.h"
-            "${AOM_ROOT}/test/end_to_end_test.cc"
+            "${AOM_ROOT}/test/end_to_end_psnr_test.cc"
             "${AOM_ROOT}/test/gf_pyr_height_test.cc"
             "${AOM_ROOT}/test/rt_end_to_end_test.cc"
             "${AOM_ROOT}/test/frame_size_tests.cc"
@@ -82,17 +91,35 @@
             "${AOM_ROOT}/test/monochrome_test.cc"
             "${AOM_ROOT}/test/resize_test.cc"
             "${AOM_ROOT}/test/scalability_test.cc"
+            "${AOM_ROOT}/test/sharpness_test.cc"
             "${AOM_ROOT}/test/y4m_test.cc"
             "${AOM_ROOT}/test/y4m_video_source.h"
             "${AOM_ROOT}/test/yuv_video_source.h"
             "${AOM_ROOT}/test/time_stamp_test.cc")
 
+if(CONFIG_REALTIME_ONLY)
+  list(REMOVE_ITEM AOM_UNIT_TEST_ENCODER_SOURCES
+                   "${AOM_ROOT}/test/av1_external_partition_test.cc"
+                   "${AOM_ROOT}/test/borders_test.cc"
+                   "${AOM_ROOT}/test/cpu_speed_test.cc"
+                   "${AOM_ROOT}/test/cpu_used_firstpass_test.cc"
+                   "${AOM_ROOT}/test/end_to_end_psnr_test.cc"
+                   "${AOM_ROOT}/test/gf_pyr_height_test.cc"
+                   "${AOM_ROOT}/test/horz_superres_test.cc"
+                   "${AOM_ROOT}/test/level_test.cc"
+                   "${AOM_ROOT}/test/monochrome_test.cc"
+                   "${AOM_ROOT}/test/sharpness_test.cc")
+endif()
+
 if(CONFIG_AV1_TEMPORAL_DENOISING AND (HAVE_SSE2 OR HAVE_NEON))
   list(APPEND AOM_UNIT_TEST_ENCODER_SOURCES
               "${AOM_ROOT}/test/av1_temporal_denoiser_test.cc")
 endif()
 
-list(APPEND AOM_DECODE_PERF_TEST_SOURCES "${AOM_ROOT}/test/decode_perf_test.cc")
+if(NOT CONFIG_REALTIME_ONLY)
+  list(APPEND AOM_DECODE_PERF_TEST_SOURCES
+              "${AOM_ROOT}/test/decode_perf_test.cc")
+endif()
 list(APPEND AOM_ENCODE_PERF_TEST_SOURCES "${AOM_ROOT}/test/encode_perf_test.cc")
 list(APPEND AOM_UNIT_TEST_WEBM_SOURCES "${AOM_ROOT}/test/webm_video_source.h")
 list(APPEND AOM_TEST_INTRA_PRED_SPEED_SOURCES "${AOM_GEN_SRC_DIR}/usage_exit.c"
@@ -140,17 +167,33 @@
                 "${AOM_ROOT}/test/kf_test.cc"
                 "${AOM_ROOT}/test/lossless_test.cc"
                 "${AOM_ROOT}/test/quant_test.cc"
+                "${AOM_ROOT}/test/ratectrl_test.cc"
+                "${AOM_ROOT}/test/rd_test.cc"
                 "${AOM_ROOT}/test/sb_multipass_test.cc"
                 "${AOM_ROOT}/test/screen_content_test.cc"
                 "${AOM_ROOT}/test/segment_binarization_sync.cc"
                 "${AOM_ROOT}/test/still_picture_test.cc"
+                "${AOM_ROOT}/test/temporal_filter_test.cc"
                 "${AOM_ROOT}/test/tile_config_test.cc"
                 "${AOM_ROOT}/test/tile_independence_test.cc"
-                "${AOM_ROOT}/test/temporal_filter_test.cc")
+                "${AOM_ROOT}/test/tpl_model_test.cc")
     if(CONFIG_REALTIME_ONLY)
       list(REMOVE_ITEM AOM_UNIT_TEST_COMMON_SOURCES
+                       "${AOM_ROOT}/test/altref_test.cc"
+                       "${AOM_ROOT}/test/av1_encoder_parms_get_to_decoder.cc"
+                       "${AOM_ROOT}/test/av1_ext_tile_test.cc"
                        "${AOM_ROOT}/test/cnn_test.cc"
-                       "${AOM_ROOT}/test/selfguided_filter_test.cc")
+                       "${AOM_ROOT}/test/decode_multithreaded_test.cc"
+                       "${AOM_ROOT}/test/error_resilience_test.cc"
+                       "${AOM_ROOT}/test/fwd_kf_test.cc"
+                       "${AOM_ROOT}/test/kf_test.cc"
+                       "${AOM_ROOT}/test/lossless_test.cc"
+                       "${AOM_ROOT}/test/sb_multipass_test.cc"
+                       "${AOM_ROOT}/test/selfguided_filter_test.cc"
+                       "${AOM_ROOT}/test/screen_content_test.cc"
+                       "${AOM_ROOT}/test/still_picture_test.cc"
+                       "${AOM_ROOT}/test/tile_independence_test.cc"
+                       "${AOM_ROOT}/test/tpl_model_test.cc")
     endif()
     if(NOT CONFIG_AV1_HIGHBITDEPTH)
       list(REMOVE_ITEM AOM_UNIT_TEST_COMMON_SOURCES
@@ -205,6 +248,7 @@
               "${AOM_ROOT}/test/av1_inv_txfm2d_test.cc"
               "${AOM_ROOT}/test/av1_nn_predict_test.cc"
               "${AOM_ROOT}/test/av1_round_shift_array_test.cc"
+              "${AOM_ROOT}/test/av1_softmax_test.cc"
               "${AOM_ROOT}/test/av1_txfm_test.cc"
               "${AOM_ROOT}/test/av1_txfm_test.h"
               "${AOM_ROOT}/test/av1_wedge_utils_test.cc"
@@ -215,9 +259,12 @@
               "${AOM_ROOT}/test/comp_avg_pred_test.h"
               "${AOM_ROOT}/test/comp_mask_variance_test.cc"
               "${AOM_ROOT}/test/edge_detect_test.cc"
+              "${AOM_ROOT}/test/encodemb_test.cc"
               "${AOM_ROOT}/test/encodetxb_test.cc"
+              "${AOM_ROOT}/test/end_to_end_ssim_test.cc"
               "${AOM_ROOT}/test/error_block_test.cc"
               "${AOM_ROOT}/test/fft_test.cc"
+              "${AOM_ROOT}/test/firstpass_test.cc"
               "${AOM_ROOT}/test/fwht4x4_test.cc"
               "${AOM_ROOT}/test/fdct4x4_test.cc"
               "${AOM_ROOT}/test/hadamard_test.cc"
@@ -230,6 +277,7 @@
               "${AOM_ROOT}/test/obmc_variance_test.cc"
               "${AOM_ROOT}/test/pickrst_test.cc"
               "${AOM_ROOT}/test/quantize_func_test.cc"
+              "${AOM_ROOT}/test/quantize_lp_func_test.cc"
               "${AOM_ROOT}/test/sad_test.cc"
               "${AOM_ROOT}/test/subtract_test.cc"
               "${AOM_ROOT}/test/reconinter_test.cc"
@@ -246,7 +294,10 @@
 
   if(CONFIG_REALTIME_ONLY)
     list(REMOVE_ITEM AOM_UNIT_TEST_ENCODER_SOURCES
+                     "${AOM_ROOT}/test/end_to_end_ssim_test.cc"
+                     "${AOM_ROOT}/test/firstpass_test.cc"
                      "${AOM_ROOT}/test/frame_error_test.cc"
+                     "${AOM_ROOT}/test/motion_vector_test.cc"
                      "${AOM_ROOT}/test/obmc_sad_test.cc"
                      "${AOM_ROOT}/test/obmc_variance_test.cc"
                      "${AOM_ROOT}/test/pickrst_test.cc"
@@ -273,7 +324,8 @@
 
   if(NOT (HAVE_SSE2 OR HAVE_NEON))
     list(REMOVE_ITEM AOM_UNIT_TEST_ENCODER_SOURCES
-                     "${AOM_ROOT}/test/quantize_func_test.cc")
+                     "${AOM_ROOT}/test/quantize_func_test.cc"
+                     "${AOM_ROOT}/test/quantize_lp_func_test.cc")
   endif()
 
   if(HAVE_SSE4_1)
@@ -310,6 +362,7 @@
   add_library(
     aom_gtest STATIC
     "${AOM_ROOT}/third_party/googletest/src/googletest/src/gtest-all.cc")
+  set_property(TARGET aom_gtest PROPERTY FOLDER ${AOM_IDE_TEST_FOLDER})
   if(MSVC OR WIN32)
     target_compile_definitions(aom_gtest PRIVATE GTEST_OS_WINDOWS=1)
   elseif(CONFIG_MULTITHREAD AND CMAKE_USE_PTHREADS_INIT)
@@ -329,21 +382,25 @@
   # list into separate object library targets, and then linking them into
   # test_libaom.
   add_library(test_aom_common OBJECT ${AOM_UNIT_TEST_COMMON_SOURCES})
+  set_property(TARGET test_aom_common PROPERTY FOLDER ${AOM_IDE_TEST_FOLDER})
   add_dependencies(test_aom_common aom)
 
   if(CONFIG_AV1_DECODER)
     add_library(test_aom_decoder OBJECT ${AOM_UNIT_TEST_DECODER_SOURCES})
+    set_property(TARGET test_aom_decoder PROPERTY FOLDER ${AOM_IDE_TEST_FOLDER})
     add_dependencies(test_aom_decoder aom)
   endif()
 
   if(CONFIG_AV1_ENCODER)
     add_library(test_aom_encoder OBJECT ${AOM_UNIT_TEST_ENCODER_SOURCES})
+    set_property(TARGET test_aom_encoder PROPERTY FOLDER ${AOM_IDE_TEST_FOLDER})
     add_dependencies(test_aom_encoder aom)
   endif()
 
   add_executable(test_libaom ${AOM_UNIT_TEST_WRAPPER_SOURCES}
                              $<TARGET_OBJECTS:aom_common_app_util>
                              $<TARGET_OBJECTS:test_aom_common>)
+  set_property(TARGET test_libaom PROPERTY FOLDER ${AOM_IDE_TEST_FOLDER})
   list(APPEND AOM_APP_TARGETS test_libaom)
 
   if(CONFIG_AV1_DECODER)
@@ -367,6 +424,8 @@
       add_executable(test_intra_pred_speed
                      ${AOM_TEST_INTRA_PRED_SPEED_SOURCES}
                      $<TARGET_OBJECTS:aom_common_app_util>)
+      set_property(TARGET test_intra_pred_speed
+                   PROPERTY FOLDER ${AOM_IDE_TEST_FOLDER})
       target_link_libraries(test_intra_pred_speed ${AOM_LIB_LINK_TYPE} aom
                             aom_gtest)
       list(APPEND AOM_APP_TARGETS test_intra_pred_speed)
@@ -425,12 +484,15 @@
                 -DAOM_TEST_FILE="${test_file}"
                 -DAOM_TEST_CHECKSUM=${test_file_checksum} -P
                 "${AOM_ROOT}/test/test_data_download_worker.cmake")
+      set_property(TARGET testdata_${test_index}
+                   PROPERTY FOLDER ${AOM_IDE_TESTDATA_FOLDER})
       list(APPEND testdata_targets testdata_${test_index})
     endforeach()
 
     # Create a custom build target for running each test data download target.
     add_custom_target(testdata)
     add_dependencies(testdata ${testdata_targets})
+    set_property(TARGET testdata PROPERTY FOLDER ${AOM_IDE_TESTDATA_FOLDER})
 
     # Skip creation of test run targets when generating for Visual Studio and
     # Xcode unless the user explicitly requests IDE test hosting. This is done
@@ -456,9 +518,11 @@
                                   -DTEST_LIBAOM=$<TARGET_FILE:test_libaom> -P
                                   "${AOM_ROOT}/test/test_runner.cmake"
                           DEPENDS testdata test_libaom)
+        set_property(TARGET ${test_name} PROPERTY FOLDER ${AOM_IDE_TEST_FOLDER})
         list(APPEND test_targets ${test_name})
       endforeach()
       add_custom_target(runtests)
+      set_property(TARGET runtests PROPERTY FOLDER ${AOM_IDE_TEST_FOLDER})
       add_dependencies(runtests ${test_targets})
     endif()
   endif()
diff --git a/test/test_data_util.cmake b/test/test_data_util.cmake
index 142a137..e189607 100644
--- a/test/test_data_util.cmake
+++ b/test/test_data_util.cmake
@@ -11,6 +11,7 @@
 
 list(APPEND AOM_TEST_DATA_FILE_NAMES
             "desktop1.320_180.yuv"
+            "hantro_collage_w176h144.yuv"
             "hantro_collage_w352h288.yuv"
             "hantro_odd.yuv"
             "paris_352_288_30.y4m"
@@ -555,6 +556,8 @@
               "invalid-oss-fuzz-16437.ivf.res.2"
               "invalid-oss-fuzz-24706.ivf"
               "invalid-oss-fuzz-24706.ivf.res"
+              "invalid-oss-fuzz-33030.ivf"
+              "invalid-oss-fuzz-33030.ivf.res"
               "invalid-oss-fuzz-9288.ivf"
               "invalid-oss-fuzz-9288.ivf.res"
               "invalid-oss-fuzz-9463.ivf"
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index 54bdbcb..796477e 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -19,7 +19,6 @@
 #include "config/aom_dsp_rtcd.h"
 
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/md5_helper.h"
 #include "aom/aom_integer.h"
 #include "aom_ports/mem.h"
@@ -135,7 +134,6 @@
       pred_funcs[k](intra_pred_test_mem.src, intra_pred_test_mem.stride,
                     intra_pred_test_mem.above, intra_pred_test_mem.left);
     }
-    libaom_test::ClearSystemState();
     aom_usec_timer_mark(&timer);
     const int elapsed_time =
         static_cast<int>(aom_usec_timer_elapsed(&timer) / 1000);
@@ -945,7 +943,6 @@
       pred_funcs[k](intra_pred_test_mem.src, intra_pred_test_mem.stride,
                     intra_pred_test_mem.above, intra_pred_test_mem.left, bd);
     }
-    libaom_test::ClearSystemState();
     aom_usec_timer_mark(&timer);
     const int elapsed_time =
         static_cast<int>(aom_usec_timer_elapsed(&timer) / 1000);
diff --git a/test/tile_config_test.cc b/test/tile_config_test.cc
index 0098903..517d54b 100644
--- a/test/tile_config_test.cc
+++ b/test/tile_config_test.cc
@@ -28,6 +28,14 @@
   const unsigned int tile_cols;
 } uniformTileConfigParam;
 
+const libaom_test::TestMode kTestModeParams[] =
+#if CONFIG_REALTIME_ONLY
+    { ::libaom_test::kRealTime };
+#else
+    { ::libaom_test::kRealTime, ::libaom_test::kOnePassGood,
+      ::libaom_test::kTwoPassGood };
+#endif
+
 static const uniformTileConfigParam uniformTileConfigParams[] = {
   { 128, 0, 0 }, { 128, 0, 2 }, { 128, 2, 0 }, { 128, 1, 2 }, { 128, 2, 2 },
   { 128, 3, 2 }, { 64, 0, 0 },  { 64, 0, 2 },  { 64, 2, 0 },  { 64, 1, 2 },
@@ -254,14 +262,12 @@
 }
 
 AV1_INSTANTIATE_TEST_SUITE(UniformTileConfigTestLarge,
-                           ::testing::Values(::libaom_test::kOnePassGood,
-                                             ::libaom_test::kTwoPassGood),
+                           ::testing::ValuesIn(kTestModeParams),
                            ::testing::ValuesIn(uniformTileConfigParams),
                            ::testing::Values(AOM_Q, AOM_VBR, AOM_CBR, AOM_CQ));
 
 AV1_INSTANTIATE_TEST_SUITE(NonUniformTileConfigTestLarge,
-                           ::testing::Values(::libaom_test::kOnePassGood,
-                                             ::libaom_test::kTwoPassGood),
+                           ::testing::ValuesIn(kTestModeParams),
                            ::testing::ValuesIn(nonUniformTileConfigParams),
                            ::testing::Values(AOM_Q, AOM_VBR, AOM_CBR, AOM_CQ));
 
@@ -352,7 +358,6 @@
 }
 
 AV1_INSTANTIATE_TEST_SUITE(TileGroupTestLarge,
-                           ::testing::Values(::libaom_test::kOnePassGood,
-                                             ::libaom_test::kTwoPassGood),
+                           ::testing::ValuesIn(kTestModeParams),
                            ::testing::ValuesIn(tileGroupTestParams));
 }  // namespace
diff --git a/test/time_stamp_test.cc b/test/time_stamp_test.cc
index 205e5ba..baa0dc0 100644
--- a/test/time_stamp_test.cc
+++ b/test/time_stamp_test.cc
@@ -95,8 +95,13 @@
   video.set_starting_pts(922337170351ll);
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 }
-
+#if CONFIG_REALTIME_ONLY
 AV1_INSTANTIATE_TEST_SUITE(TimestampTest,
-                           ::testing::Values(::libaom_test::kTwoPassGood));
+                           ::testing::Values(::libaom_test::kRealTime));
+#else
+AV1_INSTANTIATE_TEST_SUITE(TimestampTest,
+                           ::testing::Values(::libaom_test::kRealTime,
+                                             ::libaom_test::kTwoPassGood));
+#endif
 
 }  // namespace
diff --git a/test/tools_common.sh b/test/tools_common.sh
index d40709b..4722422 100755
--- a/test/tools_common.sh
+++ b/test/tools_common.sh
@@ -202,12 +202,43 @@
         --test-decode=fatal"
 }
 
+# Echoes realtime encode params for use with aomenc.
+aomenc_encode_test_rt_params() {
+  echo "--limit=${AV1_ENCODE_TEST_FRAME_LIMIT}
+        --test-decode=fatal
+        --enable-tpl-model=0
+        --deltaq-mode=0
+        --enable-order-hint=0
+        --profile=0
+        --static-thresh=0
+        --end-usage=cbr
+        --cpu-used=7
+        --passes=1
+        --usage=1
+        --lag-in-frames=0
+        --aq-mode=3
+        --enable-obmc=0
+        --enable-warped-motion=0
+        --enable-ref-frame-mvs=0
+        --enable-cdef=1
+        --enable-order-hint=0
+        --coeff-cost-upd-freq=3
+        --mode-cost-upd-freq=3
+        --mv-cost-upd-freq=3"
+}
+
 # Echoes yes to stdout when aom_config_option_enabled() reports yes for
 # CONFIG_WEBM_IO.
 webm_io_available() {
   [ "$(aom_config_option_enabled CONFIG_WEBM_IO)" = "yes" ] && echo yes
 }
 
+# Echoes yes to stdout when aom_config_option_enabled() reports yes for
+# CONFIG_REALTIME_ONLY.
+realtime_only_build() {
+  [ "$(aom_config_option_enabled CONFIG_REALTIME_ONLY)" = "yes" ] && echo yes
+}
+
 # Filters strings from $1 using the filter specified by $2. Filter behavior
 # depends on the presence of $3. When $3 is present, strings that match the
 # filter are excluded. When $3 is omitted, strings matching the filter are
diff --git a/test/tpl_model_test.cc b/test/tpl_model_test.cc
new file mode 100644
index 0000000..da5a821
--- /dev/null
+++ b/test/tpl_model_test.cc
@@ -0,0 +1,426 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cstdlib>
+#include <vector>
+
+#include "av1/encoder/cost.h"
+#include "av1/encoder/tpl_model.h"
+#include "av1/encoder/encoder.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace {
+
+double laplace_prob(double q_step, double b, double zero_bin_ratio,
+                    int qcoeff) {
+  int abs_qcoeff = abs(qcoeff);
+  double z0 = fmax(exp(-zero_bin_ratio / 2 * q_step / b), TPL_EPSILON);
+  if (abs_qcoeff == 0) {
+    double p0 = 1 - z0;
+    return p0;
+  } else {
+    assert(abs_qcoeff > 0);
+    double z = fmax(exp(-q_step / b), TPL_EPSILON);
+    double p = z0 / 2 * (1 - z) * pow(z, abs_qcoeff - 1);
+    return p;
+  }
+}
+TEST(TplModelTest, ExponentialEntropyBoundaryTest1) {
+  double b = 0;
+  double q_step = 1;
+  double entropy = av1_exponential_entropy(q_step, b);
+  EXPECT_NEAR(entropy, 0, 0.00001);
+}
+
+TEST(TplModelTest, TransformCoeffEntropyTest1) {
+  // Check the consistency between av1_estimate_coeff_entropy() and
+  // laplace_prob()
+  double b = 1;
+  double q_step = 1;
+  double zero_bin_ratio = 2;
+  for (int qcoeff = -256; qcoeff < 256; ++qcoeff) {
+    double rate = av1_estimate_coeff_entropy(q_step, b, zero_bin_ratio, qcoeff);
+    double prob = laplace_prob(q_step, b, zero_bin_ratio, qcoeff);
+    double ref_rate = -log2(prob);
+    EXPECT_DOUBLE_EQ(rate, ref_rate);
+  }
+}
+
+TEST(TplModelTest, TransformCoeffEntropyTest2) {
+  // Check the consistency between av1_estimate_coeff_entropy(), laplace_prob()
+  // and av1_laplace_entropy()
+  double b = 1;
+  double q_step = 1;
+  double zero_bin_ratio = 2;
+  double est_expected_rate = 0;
+  for (int qcoeff = -20; qcoeff < 20; ++qcoeff) {
+    double rate = av1_estimate_coeff_entropy(q_step, b, zero_bin_ratio, qcoeff);
+    double prob = laplace_prob(q_step, b, zero_bin_ratio, qcoeff);
+    est_expected_rate += prob * rate;
+  }
+  double expected_rate = av1_laplace_entropy(q_step, b, zero_bin_ratio);
+  EXPECT_NEAR(expected_rate, est_expected_rate, 0.001);
+}
+
+TEST(TplModelTest, DeltaRateCostZeroFlow) {
+  // When srcrf_dist equal to recrf_dist, av1_delta_rate_cost should return 0
+  int64_t srcrf_dist = 256;
+  int64_t recrf_dist = 256;
+  int64_t delta_rate = 512;
+  int pixel_num = 256;
+  int64_t rate_cost =
+      av1_delta_rate_cost(delta_rate, recrf_dist, srcrf_dist, pixel_num);
+  EXPECT_EQ(rate_cost, 0);
+}
+
+// a reference function of av1_delta_rate_cost() with delta_rate using bit as
+// basic unit
+double ref_delta_rate_cost(int64_t delta_rate, double src_rec_ratio,
+                           int pixel_count) {
+  assert(src_rec_ratio <= 1 && src_rec_ratio >= 0);
+  double bits_per_pixel = (double)delta_rate / pixel_count;
+  double p = pow(2, bits_per_pixel);
+  double flow_rate_per_pixel =
+      sqrt(p * p / (src_rec_ratio * p * p + (1 - src_rec_ratio)));
+  double rate_cost = pixel_count * log2(flow_rate_per_pixel);
+  return rate_cost;
+}
+
+TEST(TplModelTest, DeltaRateCostReference) {
+  const int64_t scale = TPL_DEP_COST_SCALE_LOG2 + AV1_PROB_COST_SHIFT;
+  std::vector<int64_t> srcrf_dist_arr = { 256, 257, 312 };
+  std::vector<int64_t> recrf_dist_arr = { 512, 288, 620 };
+  std::vector<int64_t> delta_rate_arr = { 10, 278, 100 };
+  for (size_t t = 0; t < srcrf_dist_arr.size(); ++t) {
+    int64_t srcrf_dist = srcrf_dist_arr[t];
+    int64_t recrf_dist = recrf_dist_arr[t];
+    int64_t delta_rate = delta_rate_arr[t];
+    int64_t scaled_delta_rate = delta_rate << scale;
+    int pixel_count = 256;
+    int64_t rate_cost = av1_delta_rate_cost(scaled_delta_rate, recrf_dist,
+                                            srcrf_dist, pixel_count);
+    rate_cost >>= scale;
+    double src_rec_ratio = (double)srcrf_dist / recrf_dist;
+    double ref_rate_cost =
+        ref_delta_rate_cost(delta_rate, src_rec_ratio, pixel_count);
+    EXPECT_NEAR((double)rate_cost, ref_rate_cost, 1);
+  }
+}
+
+TEST(TplModelTest, GetOverlapAreaHasOverlap) {
+  // The block a's area is [10, 17) x [18, 24).
+  // The block b's area is [8, 15) x [17, 23).
+  // The overlapping area between block a and block b is [10, 15) x [18, 23).
+  // Therefore, the size of the area is (15 - 10) * (23 - 18) = 25.
+  int row_a = 10;
+  int col_a = 18;
+  int row_b = 8;
+  int col_b = 17;
+  int height = 7;
+  int width = 6;
+  int overlap_area =
+      av1_get_overlap_area(row_a, col_a, row_b, col_b, width, height);
+  EXPECT_EQ(overlap_area, 25);
+}
+
+TEST(TplModelTest, GetOverlapAreaNoOverlap) {
+  // The block a's area is [10, 14) x [18, 22).
+  // The block b's area is [5, 9) x [5, 9).
+  // There is no overlapping area between block a and block b.
+  // Therefore, the return value should be zero.
+  int row_a = 10;
+  int col_a = 18;
+  int row_b = 5;
+  int col_b = 5;
+  int height = 4;
+  int width = 4;
+  int overlap_area =
+      av1_get_overlap_area(row_a, col_a, row_b, col_b, width, height);
+  EXPECT_EQ(overlap_area, 0);
+}
+
+TEST(TPLModelTest, EstimateFrameRateTest) {
+  /*
+   * Transform size: 16x16
+   * Frame count: 16
+   * Transform block count: 20
+   */
+  const int txfm_size = 256;  // 16x16
+  const int frame_count = 16;
+  int q_index_list[16];
+  int valid_list[16];
+  TplTxfmStats stats_list[16];
+
+  for (int i = 0; i < frame_count; i++) {
+    q_index_list[i] = 1;
+    valid_list[i] = 1;
+    stats_list[i].txfm_block_count = 8;
+
+    for (int j = 0; j < txfm_size; j++) {
+      stats_list[i].abs_coeff_sum[j] = 0;
+    }
+  }
+
+  double result = av1_estimate_gop_bitrate(q_index_list, frame_count,
+                                           stats_list, valid_list, NULL);
+  EXPECT_NEAR(result, 0, 0.1);
+}
+
+TEST(TPLModelTest, TxfmStatsInitTest) {
+  TplTxfmStats tpl_txfm_stats;
+  av1_init_tpl_txfm_stats(&tpl_txfm_stats);
+  EXPECT_EQ(tpl_txfm_stats.coeff_num, 256);
+  EXPECT_EQ(tpl_txfm_stats.txfm_block_count, 0);
+  for (int i = 0; i < tpl_txfm_stats.coeff_num; ++i) {
+    EXPECT_DOUBLE_EQ(tpl_txfm_stats.abs_coeff_sum[i], 0);
+  }
+}
+
+TEST(TPLModelTest, TxfmStatsAccumulateTest) {
+  TplTxfmStats sub_stats;
+  av1_init_tpl_txfm_stats(&sub_stats);
+  sub_stats.txfm_block_count = 17;
+  for (int i = 0; i < sub_stats.coeff_num; ++i) {
+    sub_stats.abs_coeff_sum[i] = i;
+  }
+
+  TplTxfmStats accumulated_stats;
+  av1_init_tpl_txfm_stats(&accumulated_stats);
+  accumulated_stats.txfm_block_count = 13;
+  for (int i = 0; i < accumulated_stats.coeff_num; ++i) {
+    accumulated_stats.abs_coeff_sum[i] = 5 * i;
+  }
+
+  av1_accumulate_tpl_txfm_stats(&sub_stats, &accumulated_stats);
+  EXPECT_DOUBLE_EQ(accumulated_stats.txfm_block_count, 30);
+  for (int i = 0; i < accumulated_stats.coeff_num; ++i) {
+    EXPECT_DOUBLE_EQ(accumulated_stats.abs_coeff_sum[i], 6 * i);
+  }
+}
+
+TEST(TPLModelTest, TxfmStatsRecordTest) {
+  TplTxfmStats stats1;
+  TplTxfmStats stats2;
+  av1_init_tpl_txfm_stats(&stats1);
+  av1_init_tpl_txfm_stats(&stats2);
+
+  tran_low_t coeff[256];
+  for (int i = 0; i < 256; ++i) {
+    coeff[i] = i;
+  }
+  av1_record_tpl_txfm_block(&stats1, coeff);
+  EXPECT_EQ(stats1.txfm_block_count, 1);
+
+  // We record the same transform block twice for testing purposes.
+  av1_record_tpl_txfm_block(&stats2, coeff);
+  av1_record_tpl_txfm_block(&stats2, coeff);
+  EXPECT_EQ(stats2.txfm_block_count, 2);
+
+  EXPECT_EQ(stats1.coeff_num, 256);
+  EXPECT_EQ(stats2.coeff_num, 256);
+  for (int i = 0; i < 256; ++i) {
+    EXPECT_DOUBLE_EQ(stats2.abs_coeff_sum[i], 2 * stats1.abs_coeff_sum[i]);
+  }
+}
+
+/*
+ * Helper method to brute-force search for the closest q_index
+ * that achieves the specified bit budget.
+ */
+int find_gop_q_iterative(double bit_budget, const double *qstep_ratio_list,
+                         GF_GROUP gf_group, const int *stats_valid_list,
+                         TplTxfmStats *stats_list, int gf_frame_index,
+                         aom_bit_depth_t bit_depth) {
+  // Brute force iterative method to find the optimal q.
+  // Use the result to test against the binary search result.
+
+  // Initial estimate when q = 255
+  av1_q_mode_compute_gop_q_indices(gf_frame_index, 255, qstep_ratio_list,
+                                   bit_depth, &gf_group, gf_group.q_val);
+  double curr_estimate = av1_estimate_gop_bitrate(
+      gf_group.q_val, gf_group.size, stats_list, stats_valid_list, NULL);
+  double best_estimate_budget_distance = fabs(curr_estimate - bit_budget);
+  int best_q = 255;
+
+  // Start at q = 254 because we already have an estimate for q = 255.
+  for (int q = 254; q >= 0; q--) {
+    av1_q_mode_compute_gop_q_indices(gf_frame_index, q, qstep_ratio_list,
+                                     bit_depth, &gf_group, gf_group.q_val);
+    curr_estimate = av1_estimate_gop_bitrate(
+        gf_group.q_val, gf_group.size, stats_list, stats_valid_list, NULL);
+    double curr_estimate_budget_distance = fabs(curr_estimate - bit_budget);
+    if (curr_estimate_budget_distance <= best_estimate_budget_distance) {
+      best_estimate_budget_distance = curr_estimate_budget_distance;
+      best_q = q;
+    }
+  }
+  return best_q;
+}
+
+TEST(TplModelTest, QModeEstimateBaseQTest) {
+  GF_GROUP gf_group = {};
+  gf_group.size = 25;
+  TplTxfmStats stats_list[25];
+  int q_index_list[25];
+  const int gf_group_update_types[25] = { 0, 3, 6, 6, 6, 1, 5, 1, 5, 6, 1, 5, 1,
+                                          5, 6, 6, 1, 5, 1, 5, 6, 1, 5, 1, 4 };
+  int stats_valid_list[25] = { 0 };
+  const int gf_frame_index = 0;
+  const aom_bit_depth_t bit_depth = AOM_BITS_8;
+  const double scale_factor = 1.0;
+
+  double qstep_ratio_list[25];
+  for (int i = 0; i < 25; i++) {
+    qstep_ratio_list[i] = 1;
+  }
+
+  for (int i = 0; i < gf_group.size; i++) {
+    stats_valid_list[i] = 1;
+    gf_group.update_type[i] = gf_group_update_types[i];
+    stats_list[i].txfm_block_count = 8;
+
+    for (int j = 0; j < 256; j++) {
+      stats_list[i].abs_coeff_sum[j] = 1000 + j;
+    }
+  }
+
+  // Test multiple bit budgets.
+  const std::vector<double> bit_budgets = { 0,      100,    1000,   10000,
+                                            100000, 300000, 500000, 750000,
+                                            800000, DBL_MAX };
+
+  for (double bit_budget : bit_budgets) {
+    // Binary search method to find the optimal q.
+    const int result = av1_q_mode_estimate_base_q(
+        &gf_group, stats_list, stats_valid_list, bit_budget, gf_frame_index,
+        bit_depth, scale_factor, qstep_ratio_list, q_index_list, NULL);
+    const int test_result = find_gop_q_iterative(
+        bit_budget, qstep_ratio_list, gf_group, stats_valid_list, stats_list,
+        gf_frame_index, bit_depth);
+
+    if (bit_budget == 0) {
+      EXPECT_EQ(result, 255);
+    } else if (bit_budget == DBL_MAX) {
+      EXPECT_EQ(result, 0);
+    }
+
+    EXPECT_EQ(result, test_result);
+  }
+}
+
+TEST(TplModelTest, ComputeMVDifferenceTest) {
+  TplDepFrame tpl_frame_small;
+  tpl_frame_small.is_valid = true;
+  tpl_frame_small.mi_rows = 4;
+  tpl_frame_small.mi_cols = 4;
+  tpl_frame_small.stride = 1;
+  uint8_t right_shift_small = 1;
+  int step_small = 1 << right_shift_small;
+
+  // Test values for motion vectors.
+  int mv_vals_small[4] = { 1, 2, 3, 4 };
+  int index = 0;
+
+  // 4x4 blocks means we need to allocate a 4 size array.
+  // According to av1_tpl_ptr_pos:
+  // (row >> right_shift) * stride + (col >> right_shift)
+  // (4 >> 1) * 1 + (4 >> 1) = 4
+  TplDepStats stats_buf_small[4];
+  tpl_frame_small.tpl_stats_ptr = stats_buf_small;
+
+  for (int row = 0; row < tpl_frame_small.mi_rows; row += step_small) {
+    for (int col = 0; col < tpl_frame_small.mi_cols; col += step_small) {
+      TplDepStats tpl_stats;
+      tpl_stats.ref_frame_index[0] = 0;
+      int_mv mv;
+      mv.as_mv.row = mv_vals_small[index];
+      mv.as_mv.col = mv_vals_small[index];
+      index++;
+      tpl_stats.mv[0] = mv;
+      tpl_frame_small.tpl_stats_ptr[av1_tpl_ptr_pos(
+          row, col, tpl_frame_small.stride, right_shift_small)] = tpl_stats;
+    }
+  }
+
+  int_mv result_mv =
+      av1_compute_mv_difference(&tpl_frame_small, 1, 1, step_small,
+                                tpl_frame_small.stride, right_shift_small);
+
+  // Expect the result to be exactly equal to 1 because this is the difference
+  // between neighboring motion vectors in this instance.
+  EXPECT_EQ(result_mv.as_mv.row, 1);
+  EXPECT_EQ(result_mv.as_mv.col, 1);
+}
+
+TEST(TplModelTest, ComputeMVBitsTest) {
+  TplDepFrame tpl_frame;
+  tpl_frame.is_valid = true;
+  tpl_frame.mi_rows = 16;
+  tpl_frame.mi_cols = 16;
+  tpl_frame.stride = 24;
+  uint8_t right_shift = 2;
+  int step = 1 << right_shift;
+  // Test values for motion vectors.
+  int mv_vals_ordered[16] = { 1, 2,  3,  4,  5,  6,  7,  8,
+                              9, 10, 11, 12, 13, 14, 15, 16 };
+  int mv_vals[16] = { 1, 16, 2, 15, 3, 14, 4, 13, 5, 12, 6, 11, 7, 10, 8, 9 };
+  int index = 0;
+
+  // 16x16 blocks means we need to allocate a 100 size array.
+  // According to av1_tpl_ptr_pos:
+  // (row >> right_shift) * stride + (col >> right_shift)
+  // (16 >> 2) * 24 + (16 >> 2) = 100
+  TplDepStats stats_buf[100];
+  tpl_frame.tpl_stats_ptr = stats_buf;
+
+  for (int row = 0; row < tpl_frame.mi_rows; row += step) {
+    for (int col = 0; col < tpl_frame.mi_cols; col += step) {
+      TplDepStats tpl_stats;
+      tpl_stats.ref_frame_index[0] = 0;
+      int_mv mv;
+      mv.as_mv.row = mv_vals_ordered[index];
+      mv.as_mv.col = mv_vals_ordered[index];
+      index++;
+      tpl_stats.mv[0] = mv;
+      tpl_frame.tpl_stats_ptr[av1_tpl_ptr_pos(row, col, tpl_frame.stride,
+                                              right_shift)] = tpl_stats;
+    }
+  }
+
+  double result = av1_tpl_compute_frame_mv_entropy(&tpl_frame, right_shift);
+
+  // Expect the result to be low because the motion vectors are ordered.
+  // The estimation algorithm takes this into account and reduces the cost.
+  EXPECT_NEAR(result, 20, 5);
+
+  index = 0;
+  for (int row = 0; row < tpl_frame.mi_rows; row += step) {
+    for (int col = 0; col < tpl_frame.mi_cols; col += step) {
+      TplDepStats tpl_stats;
+      tpl_stats.ref_frame_index[0] = 0;
+      int_mv mv;
+      mv.as_mv.row = mv_vals[index];
+      mv.as_mv.col = mv_vals[index];
+      index++;
+      tpl_stats.mv[0] = mv;
+      tpl_frame.tpl_stats_ptr[av1_tpl_ptr_pos(row, col, tpl_frame.stride,
+                                              right_shift)] = tpl_stats;
+    }
+  }
+
+  result = av1_tpl_compute_frame_mv_entropy(&tpl_frame, right_shift);
+
+  // Expect the result to be higher because the vectors are not ordered.
+  // Neighboring vectors will have different values, increasing the cost.
+  EXPECT_NEAR(result, 70, 5);
+}
+
+}  // namespace
diff --git a/test/transform_test_base.h b/test/transform_test_base.h
index 68f5cc7..6897967 100644
--- a/test/transform_test_base.h
+++ b/test/transform_test_base.h
@@ -80,12 +80,12 @@
         }
       }
 
-      ASM_REGISTER_STATE_CHECK(
+      API_REGISTER_STATE_CHECK(
           RunFwdTxfm(test_input_block, test_temp_block, pitch_));
       if (bit_depth_ == AOM_BITS_8) {
-        ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_));
+        API_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_));
       } else {
-        ASM_REGISTER_STATE_CHECK(
+        API_REGISTER_STATE_CHECK(
             RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_));
       }
 
@@ -148,7 +148,7 @@
       }
 
       fwd_txfm_ref(input_block, output_ref_block, stride, &txfm_param_);
-      ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_block, output_block, stride));
+      API_REGISTER_STATE_CHECK(RunFwdTxfm(input_block, output_block, stride));
 
       // The minimum quant value is 4.
       for (j = 0; j < height_; ++j) {
@@ -198,7 +198,7 @@
       fwd_txfm_ref(input_block, trans_block, pitch_, &txfm_param_);
 
       inv_txfm_ref(trans_block, output_ref_block, stride, &txfm_param_);
-      ASM_REGISTER_STATE_CHECK(RunInvTxfm(trans_block, output_block, stride));
+      API_REGISTER_STATE_CHECK(RunInvTxfm(trans_block, output_block, stride));
 
       for (j = 0; j < height_; ++j) {
         for (k = 0; k < pitch_; ++k) {
@@ -238,7 +238,7 @@
       }
 
       fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, &txfm_param_);
-      ASM_REGISTER_STATE_CHECK(
+      API_REGISTER_STATE_CHECK(
           RunFwdTxfm(input_extreme_block, output_block, pitch_));
 
       int row_length = FindRowLength();
@@ -292,9 +292,9 @@
       fwd_txfm_ref(in, coeff, pitch_, &txfm_param_);
 
       if (bit_depth_ == AOM_BITS_8) {
-        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
+        API_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
       } else {
-        ASM_REGISTER_STATE_CHECK(
+        API_REGISTER_STATE_CHECK(
             RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), pitch_));
       }
 
diff --git a/test/variance_test.cc b/test/variance_test.cc
index fa90305..61a3fdf 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -20,7 +20,6 @@
 #include "config/aom_dsp_rtcd.h"
 
 #include "test/acm_random.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "aom/aom_codec.h"
 #include "aom/aom_integer.h"
@@ -338,7 +337,7 @@
  public:
   SumOfSquaresTest() : func_(GetParam()) {}
 
-  virtual ~SumOfSquaresTest() { libaom_test::ClearSystemState(); }
+  virtual ~SumOfSquaresTest() {}
 
  protected:
   void ConstTest();
@@ -355,7 +354,7 @@
     for (int i = 0; i < 256; ++i) {
       mem[i] = v;
     }
-    ASM_REGISTER_STATE_CHECK(res = func_(mem));
+    API_REGISTER_STATE_CHECK(res = func_(mem));
     EXPECT_EQ(256u * (v * v), res);
   }
 }
@@ -369,7 +368,7 @@
 
     const unsigned int expected = mb_ss_ref(mem);
     unsigned int res;
-    ASM_REGISTER_STATE_CHECK(res = func_(mem));
+    API_REGISTER_STATE_CHECK(res = func_(mem));
     EXPECT_EQ(expected, res);
   }
 }
@@ -434,7 +433,6 @@
     aom_free(dst_);
     src_ = NULL;
     dst_ = NULL;
-    libaom_test::ClearSystemState();
   }
 
  protected:
@@ -503,9 +501,9 @@
       dst_[k] = rnd_.Rand8();
       src_[k] = rnd_.Rand8();
     }
-    ASM_REGISTER_STATE_CHECK(
+    API_REGISTER_STATE_CHECK(
         mse_ref = aom_mse_wxh_16bit_c(dst_, dstride, src_, sstride, w, h));
-    ASM_REGISTER_STATE_CHECK(
+    API_REGISTER_STATE_CHECK(
         mse_mod = params_.func(dst_, dstride, src_, sstride, w, h));
     EXPECT_EQ(mse_ref, mse_mod)
         << "ref mse: " << mse_ref << " mod mse: " << mse_mod;
@@ -545,7 +543,6 @@
     delete[] ref_;
     src_ = NULL;
     ref_ = NULL;
-    libaom_test::ClearSystemState();
   }
 
  protected:
@@ -602,7 +599,7 @@
         for (int k = 0; k < block_size(); ++k) ref16[k] = j << byte_shift();
       }
       unsigned int sse, var;
-      ASM_REGISTER_STATE_CHECK(
+      API_REGISTER_STATE_CHECK(
           var = params_.func(src_, width(), ref_, width(), &sse));
       EXPECT_EQ(0u, var) << "src values: " << i << " ref values: " << j;
     }
@@ -623,7 +620,7 @@
     }
     unsigned int sse1, sse2, var1, var2;
     const int stride = width();
-    ASM_REGISTER_STATE_CHECK(
+    API_REGISTER_STATE_CHECK(
         var1 = params_.func(src_, stride, ref_, stride, &sse1));
     var2 =
         variance_ref(src_, ref_, params_.log2width, params_.log2height, stride,
@@ -652,7 +649,7 @@
     unsigned int sse1, sse2;
     unsigned int var1, var2;
 
-    ASM_REGISTER_STATE_CHECK(
+    API_REGISTER_STATE_CHECK(
         var1 = params_.func(src_, src_stride, ref_, ref_stride, &sse1));
     var2 = variance_ref(src_, ref_, params_.log2width, params_.log2height,
                         src_stride, ref_stride, &sse2, use_high_bit_depth(),
@@ -675,7 +672,7 @@
     aom_memset16(CONVERT_TO_SHORTPTR(ref_) + half, 0, half);
   }
   unsigned int sse, var, expected;
-  ASM_REGISTER_STATE_CHECK(
+  API_REGISTER_STATE_CHECK(
       var = params_.func(src_, width(), ref_, width(), &sse));
   expected = block_size() * 255 * 255 / 4;
   EXPECT_EQ(expected, var);
@@ -719,7 +716,7 @@
     }
     unsigned int sse1, sse2;
     const int stride = width();
-    ASM_REGISTER_STATE_CHECK(params_.func(src_, stride, ref_, stride, &sse1));
+    API_REGISTER_STATE_CHECK(params_.func(src_, stride, ref_, stride, &sse1));
     variance_ref(src_, ref_, params_.log2width, params_.log2height, stride,
                  stride, &sse2, false, AOM_BITS_8);
     EXPECT_EQ(sse1, sse2);
@@ -736,7 +733,7 @@
     unsigned int sse2;
     unsigned int var1;
     const int stride = width();
-    ASM_REGISTER_STATE_CHECK(var1 = params_.func(src_, stride, ref_, stride));
+    API_REGISTER_STATE_CHECK(var1 = params_.func(src_, stride, ref_, stride));
     variance_ref(src_, ref_, params_.log2width, params_.log2height, stride,
                  stride, &sse2, false, AOM_BITS_8);
     EXPECT_EQ(var1, sse2);
@@ -748,7 +745,7 @@
   memset(src_, 255, block_size());
   memset(ref_, 0, block_size());
   unsigned int sse;
-  ASM_REGISTER_STATE_CHECK(params_.func(src_, width(), ref_, width(), &sse));
+  API_REGISTER_STATE_CHECK(params_.func(src_, width(), ref_, width(), &sse));
   const unsigned int expected = block_size() * 255 * 255;
   EXPECT_EQ(expected, sse);
 }
@@ -758,7 +755,7 @@
   memset(src_, 255, block_size());
   memset(ref_, 0, block_size());
   unsigned int var;
-  ASM_REGISTER_STATE_CHECK(var = params_.func(src_, width(), ref_, width()));
+  API_REGISTER_STATE_CHECK(var = params_.func(src_, width(), ref_, width()));
   const unsigned int expected = block_size() * 255 * 255;
   EXPECT_EQ(expected, var);
 }
@@ -805,7 +802,6 @@
       aom_free(CONVERT_TO_SHORTPTR(ref_));
       aom_free(CONVERT_TO_SHORTPTR(sec_));
     }
-    libaom_test::ClearSystemState();
   }
 
  protected:
@@ -850,7 +846,7 @@
       }
       unsigned int sse1, sse2;
       unsigned int var1;
-      ASM_REGISTER_STATE_CHECK(
+      API_REGISTER_STATE_CHECK(
           var1 = params_.func(ref_, width() + 1, x, y, src_, width(), &sse1));
       const unsigned int var2 = subpel_variance_ref(
           ref_, src_, params_.log2width, params_.log2height, x, y, &sse2,
@@ -883,7 +879,7 @@
       }
       unsigned int sse1, sse2;
       unsigned int var1;
-      ASM_REGISTER_STATE_CHECK(
+      API_REGISTER_STATE_CHECK(
           var1 = params_.func(ref_, width() + 1, x, y, src_, width(), &sse1));
       const unsigned int var2 = subpel_variance_ref(
           ref_, src_, params_.log2width, params_.log2height, x, y, &sse2,
@@ -968,7 +964,7 @@
       }
       uint32_t sse1, sse2;
       uint32_t var1, var2;
-      ASM_REGISTER_STATE_CHECK(var1 = params_.func(ref_, width() + 1, x, y,
+      API_REGISTER_STATE_CHECK(var1 = params_.func(ref_, width() + 1, x, y,
                                                    src_, width(), &sse1, sec_));
       var2 = subpel_avg_variance_ref(ref_, src_, sec_, params_.log2width,
                                      params_.log2height, x, y, &sse2,
@@ -1004,9 +1000,9 @@
         for (int y0 = 0; y0 < 4; ++y0) {
           uint32_t sse1, sse2;
           uint32_t var1, var2;
-          jcp_param_.fwd_offset = quant_dist_lookup_table[x0][y0][0];
-          jcp_param_.bck_offset = quant_dist_lookup_table[x0][y0][1];
-          ASM_REGISTER_STATE_CHECK(var1 = params_.func(ref_, width() + 0, x, y,
+          jcp_param_.fwd_offset = quant_dist_lookup_table[y0][x0];
+          jcp_param_.bck_offset = quant_dist_lookup_table[y0][1 - x0];
+          API_REGISTER_STATE_CHECK(var1 = params_.func(ref_, width() + 0, x, y,
                                                        src_, width(), &sse1,
                                                        sec_, &jcp_param_));
           var2 = dist_wtd_subpel_avg_variance_ref(
@@ -1060,7 +1056,6 @@
     }
     aom_free(wsrc_);
     aom_free(mask_);
-    libaom_test::ClearSystemState();
   }
 
  protected:
@@ -1100,7 +1095,7 @@
 
       uint32_t sse1, sse2;
       uint32_t var1, var2;
-      ASM_REGISTER_STATE_CHECK(
+      API_REGISTER_STATE_CHECK(
           var1 = params_.func(pre_, width() + 1, x, y, wsrc_, mask_, &sse1));
       var2 = obmc_subpel_variance_ref(
           pre_, params_.log2width, params_.log2height, x, y, wsrc_, mask_,
@@ -1137,7 +1132,7 @@
 
       uint32_t sse1, sse2;
       uint32_t var1, var2;
-      ASM_REGISTER_STATE_CHECK(
+      API_REGISTER_STATE_CHECK(
           var1 = params_.func(pre_, width() + 1, x, y, wsrc_, mask_, &sse1));
       var2 = obmc_subpel_variance_ref(
           pre_, params_.log2width, params_.log2height, x, y, wsrc_, mask_,
@@ -1169,7 +1164,7 @@
   for (int i = 0; i < run_time; ++i) {
     int x = rnd_(8);
     int y = rnd_(8);
-    ASM_REGISTER_STATE_CHECK(
+    API_REGISTER_STATE_CHECK(
         params_.func(pre_, stride, x, y, wsrc_, mask_, &sse1));
   }
   aom_usec_timer_mark(&timer);
@@ -1433,7 +1428,6 @@
     aom_free(dst_);
     src_ = NULL;
     dst_ = NULL;
-    libaom_test::ClearSystemState();
   }
 
  protected:
@@ -1501,9 +1495,9 @@
       dst_[k] = rnd_.Rand16() & mask();
       src_[k] = rnd_.Rand16() & mask();
     }
-    ASM_REGISTER_STATE_CHECK(mse_ref = aom_mse_wxh_16bit_highbd_c(
+    API_REGISTER_STATE_CHECK(mse_ref = aom_mse_wxh_16bit_highbd_c(
                                  dst_, dstride, src_, sstride, w, h));
-    ASM_REGISTER_STATE_CHECK(
+    API_REGISTER_STATE_CHECK(
         mse_mod = params_.func(dst_, dstride, src_, sstride, w, h));
     EXPECT_EQ(mse_ref, mse_mod)
         << "ref mse: " << mse_ref << " mod mse: " << mse_mod;
diff --git a/test/warp_filter_test_util.cc b/test/warp_filter_test_util.cc
index 07a2e3f..85fcb9c 100644
--- a/test/warp_filter_test_util.cc
+++ b/test/warp_filter_test_util.cc
@@ -109,7 +109,7 @@
 AV1WarpFilterTest::~AV1WarpFilterTest() {}
 void AV1WarpFilterTest::SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
 
-void AV1WarpFilterTest::TearDown() { libaom_test::ClearSystemState(); }
+void AV1WarpFilterTest::TearDown() {}
 
 void AV1WarpFilterTest::RunSpeedTest(warp_affine_func test_impl) {
   const int w = 128, h = 128;
@@ -226,8 +226,8 @@
                 conv_params.use_dist_wtd_comp_avg = 0;
               } else {
                 conv_params.use_dist_wtd_comp_avg = 1;
-                conv_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
-                conv_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
+                conv_params.fwd_offset = quant_dist_lookup_table[jj][ii];
+                conv_params.bck_offset = quant_dist_lookup_table[jj][1 - ii];
               }
               av1_warp_affine_c(mat, input, w, h, stride, output, 32, 32, out_w,
                                 out_h, out_w, sub_x, sub_y, &conv_params, alpha,
@@ -240,8 +240,8 @@
                 conv_params.use_dist_wtd_comp_avg = 0;
               } else {
                 conv_params.use_dist_wtd_comp_avg = 1;
-                conv_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
-                conv_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
+                conv_params.fwd_offset = quant_dist_lookup_table[jj][ii];
+                conv_params.bck_offset = quant_dist_lookup_table[jj][1 - ii];
               }
               test_impl(mat, input, w, h, stride, output2, 32, 32, out_w, out_h,
                         out_w, sub_x, sub_y, &conv_params, alpha, beta, gamma,
@@ -301,7 +301,7 @@
   rnd_.Reset(ACMRandom::DeterministicSeed());
 }
 
-void AV1HighbdWarpFilterTest::TearDown() { libaom_test::ClearSystemState(); }
+void AV1HighbdWarpFilterTest::TearDown() {}
 
 void AV1HighbdWarpFilterTest::RunSpeedTest(highbd_warp_affine_func test_impl) {
   const int w = 128, h = 128;
@@ -424,8 +424,8 @@
                 conv_params.use_dist_wtd_comp_avg = 0;
               } else {
                 conv_params.use_dist_wtd_comp_avg = 1;
-                conv_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
-                conv_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
+                conv_params.fwd_offset = quant_dist_lookup_table[jj][ii];
+                conv_params.bck_offset = quant_dist_lookup_table[jj][1 - ii];
               }
 
               av1_highbd_warp_affine_c(mat, input, w, h, stride, output, 32, 32,
@@ -441,8 +441,8 @@
                 conv_params.use_dist_wtd_comp_avg = 0;
               } else {
                 conv_params.use_dist_wtd_comp_avg = 1;
-                conv_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
-                conv_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
+                conv_params.fwd_offset = quant_dist_lookup_table[jj][ii];
+                conv_params.bck_offset = quant_dist_lookup_table[jj][1 - ii];
               }
               test_impl(mat, input, w, h, stride, output2, 32, 32, out_w, out_h,
                         out_w, sub_x, sub_y, bd, &conv_params, alpha, beta,
diff --git a/test/warp_filter_test_util.h b/test/warp_filter_test_util.h
index 66a6e24..583f312 100644
--- a/test/warp_filter_test_util.h
+++ b/test/warp_filter_test_util.h
@@ -20,7 +20,6 @@
 #include "third_party/googletest/src/googletest/include/gtest/gtest.h"
 #include "test/acm_random.h"
 #include "test/util.h"
-#include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 
 #include "av1/common/mv.h"
diff --git a/tools/gop_bitrate/analyze_data.py b/tools/gop_bitrate/analyze_data.py
new file mode 100644
index 0000000..4e006b9
--- /dev/null
+++ b/tools/gop_bitrate/analyze_data.py
@@ -0,0 +1,18 @@
+with open('experiment.txt', 'r') as file:
+    lines = file.readlines()
+    curr_filename = ''
+    keyframe = 0
+    actual_value = 0
+    estimate_value = 0
+    print('filename, estimated value (b), actual value (b)')
+    for line in lines:
+        if line.startswith('input:'):
+            curr_filename = line[13:].strip()
+        if line.startswith('estimated'):
+            estimate_value = float(line[19:].strip())
+        if line.startswith('frame:'):
+            actual_value += float(line[line.find('size')+6:line.find('total')-2])
+        if line.startswith('****'):
+            print(f'{curr_filename}, {estimate_value}, {actual_value}')
+            estimate_value = 0
+            actual_value = 0
diff --git a/tools/gop_bitrate/encode_all_script.sh b/tools/gop_bitrate/encode_all_script.sh
new file mode 100755
index 0000000..0689b33
--- /dev/null
+++ b/tools/gop_bitrate/encode_all_script.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+#INPUT=media/cheer_sif.y4m
+OUTPUT=test.webm
+LIMIT=17
+CPU_USED=3
+CQ_LEVEL=36
+
+for input in media/*
+do
+  echo "****" >> experiment.txt
+  echo "input: $input" >> experiment.txt
+  ./aomenc --limit=$LIMIT --codec=av1 --cpu-used=$CPU_USED --end-usage=q --cq-level=$CQ_LEVEL --psnr --threads=0 --profile=0 --lag-in-frames=35 --min-q=0 --max-q=63 --auto-alt-ref=1 --passes=2 --kf-max-dist=160 --kf-min-dist=0 --drop-frame=0 --static-thresh=0 --minsection-pct=0 --maxsection-pct=2000 --arnr-maxframes=7 --arnr-strength=5 --sharpness=0 --undershoot-pct=100 --overshoot-pct=100 --frame-parallel=0 --tile-columns=0 -o $OUTPUT $input >> experiment.txt
+done
diff --git a/tools/gop_bitrate/python/bitrate_accuracy.py b/tools/gop_bitrate/python/bitrate_accuracy.py
new file mode 100644
index 0000000..2a5da6a
--- /dev/null
+++ b/tools/gop_bitrate/python/bitrate_accuracy.py
@@ -0,0 +1,185 @@
+import numpy as np
+
+# Model A only.
+# Uses least squares regression to find the solution
+# when there is one unknown variable.
+def lstsq_solution(A, B):
+    A_inv = np.linalg.pinv(A)
+    x = np.matmul(A_inv, B)
+    return x[0][0]
+
+# Model B only.
+# Uses the pseudoinverse matrix to find the solution
+# when there are two unknown variables.
+def pinv_solution(A, mv, B):
+    new_A = np.concatenate((A, mv), axis=1)
+    new_A_inv = np.linalg.pinv(new_A)
+    new_x = np.matmul(new_A_inv, B)
+    print("pinv solution:", new_x[0][0], new_x[1][0])
+    return (new_x[0][0], new_x[1][0])
+
+# Model A only.
+# Finds the coefficient to multiply A by to minimize
+# the percentage error between A and B.
+def minimize_percentage_error_model_a(A, B):
+    R = np.divide(A, B)
+    num = 0
+    den = 0
+    best_x = 0
+    best_error = 100
+    for r_i in R:
+        num += r_i
+        den += r_i**2
+    if den == 0:
+        return 0
+    return (num/den)[0]
+
+# Model B only.
+# Finds the coefficients to multiply to the frame bitrate
+# and the motion vector bitrate to minimize the percent error.
+def minimize_percentage_error_model_b(r_e, r_m, r_f):
+    r_ef = np.divide(r_e, r_f)
+    r_mf = np.divide(r_m, r_f)
+    sum_ef = np.sum(r_ef)
+    sum_ef_sq = np.sum(np.square(r_ef))
+    sum_mf = np.sum(r_mf)
+    sum_mf_sq = np.sum(np.square(r_mf))
+    sum_ef_mf = np.sum(np.multiply(r_ef, r_mf))
+    # Divides x by y. If y is zero, returns 0.
+    divide = lambda x, y : 0 if y == 0 else x / y
+    # Set up and solve the matrix equation
+    A = np.array([[1, divide(sum_ef_mf, sum_ef_sq)],[divide(sum_ef_mf, sum_mf_sq), 1]])
+    B = np.array([divide(sum_ef, sum_ef_sq), divide(sum_mf, sum_mf_sq)])
+    A_inv = np.linalg.pinv(A)
+    x = np.matmul(A_inv, B)
+    return x
+
+# Model A only.
+# Calculates the least squares error between A and B
+# using coefficients in X.
+def average_lstsq_error(A, B, x):
+    error = 0
+    n = 0
+    for i, a in enumerate(A):
+        a = a[0]
+        b = B[i][0]
+        if b == 0:
+            continue
+        n += 1
+        error += (b - x*a)**2
+    if n == 0:
+        return None
+    error /= n
+    return error
+
+# Model A only.
+# Calculates the average percentage error between A and B.
+def average_percent_error_model_a(A, B, x):
+    error = 0
+    n = 0
+    for i, a in enumerate(A):
+        a = a[0]
+        b = B[i][0]
+        if b == 0:
+            continue
+        n += 1
+        error_i = (abs(x*a-b)/b)*100
+        error += error_i
+    error /= n
+    return error
+
+# Model B only.
+# Calculates the average percentage error between A and B.
+def average_percent_error_model_b(A, M, B, x):
+    error = 0
+    for i, a in enumerate(A):
+        a = a[0]
+        mv = M[i]
+        b = B[i][0]
+        if b == 0:
+            continue
+        estimate = x[0]*a
+        estimate += x[1]*mv
+        error += abs(estimate - b) / b
+    error *= 100
+    error /= A.shape[0]
+    return error
+
+def average_squared_error_model_a(A, B, x):
+    error = 0
+    n = 0
+    for i, a in enumerate(A):
+        a = a[0]
+        b = B[i][0]
+        if b == 0:
+            continue
+        n += 1
+        error_i = (1 - x*(a/b))**2
+        error += error_i
+    error /= n
+    error = error**0.5
+    return error * 100
+
+def average_squared_error_model_b(A, M, B, x):
+    error = 0
+    n = 0
+    for i, a in enumerate(A):
+        a = a[0]
+        b = B[i][0]
+        mv = M[i]
+        if b == 0:
+            continue
+        n += 1
+        error_i = 1 - ((x[0]*a + x[1]*mv)/b)
+        error_i = error_i**2
+        error += error_i
+    error /= n
+    error = error**0.5
+    return error * 100
+
+# Traverses the data and prints out one value for
+# each update type.
+def print_solutions(file_path):
+    data = np.genfromtxt(file_path, delimiter="\t")
+    prev_update = 0
+    split_list_indices = list()
+    for i, val in enumerate(data):
+        if prev_update != val[3]:
+            split_list_indices.append(i)
+            prev_update = val[3]
+    split = np.split(data, split_list_indices)
+    for array in split:
+        A, mv, B, update = np.hsplit(array, 4)
+        z = np.where(B == 0)[0]
+        r_e = np.delete(A, z, axis=0)
+        r_m = np.delete(mv, z, axis=0)
+        r_f = np.delete(B, z, axis=0)
+        A = r_e
+        mv = r_m
+        B = r_f
+        all_zeros = not A.any()
+        if all_zeros:
+            continue
+        print("update type:", update[0][0])
+        x_ls = lstsq_solution(A, B)
+        x_a = minimize_percentage_error_model_a(A, B)
+        x_b = minimize_percentage_error_model_b(A, mv, B)
+        percent_error_a = average_percent_error_model_a(A, B, x_a)
+        percent_error_b = average_percent_error_model_b(A, mv, B, x_b)[0]
+        baseline_percent_error_a = average_percent_error_model_a(A, B, 1)
+        baseline_percent_error_b = average_percent_error_model_b(A, mv, B, [1, 1])[0]
+
+        squared_error_a = average_squared_error_model_a(A, B, x_a)
+        squared_error_b = average_squared_error_model_b(A, mv, B, x_b)[0]
+        baseline_squared_error_a = average_squared_error_model_a(A, B, 1)
+        baseline_squared_error_b = average_squared_error_model_b(A, mv, B, [1, 1])[0]
+
+        print("model,\tframe_coeff,\tmv_coeff,\terror,\tbaseline_error")
+        print("Model A %_error,\t" + str(x_a) + ",\t" + str(0) + ",\t" + str(percent_error_a) + ",\t" + str(baseline_percent_error_a))
+        print("Model A sq_error,\t" + str(x_a) + ",\t" + str(0) + ",\t" + str(squared_error_a) + ",\t" + str(baseline_squared_error_a))
+        print("Model B %_error,\t" + str(x_b[0]) + ",\t" + str(x_b[1]) + ",\t" + str(percent_error_b) + ",\t" + str(baseline_percent_error_b))
+        print("Model B sq_error,\t" + str(x_b[0]) + ",\t" + str(x_b[1]) + ",\t" + str(squared_error_b) + ",\t" + str(baseline_squared_error_b))
+        print()
+
+if __name__ == "__main__":
+    print_solutions("data2/all_lowres_target_lt600_data.txt")